This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re/pat.t: Add todo test for #38133
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
51#define PL_lex_brackstack (PL_parser->lex_brackstack)
52#define PL_lex_casemods (PL_parser->lex_casemods)
53#define PL_lex_casestack (PL_parser->lex_casestack)
54#define PL_lex_defer (PL_parser->lex_defer)
55#define PL_lex_dojoin (PL_parser->lex_dojoin)
56#define PL_lex_expect (PL_parser->lex_expect)
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
67#define PL_pending_ident (PL_parser->pending_ident)
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7
DM
89
90#ifdef PERL_MAD
91# define PL_endwhite (PL_parser->endwhite)
92# define PL_faketokens (PL_parser->faketokens)
93# define PL_lasttoke (PL_parser->lasttoke)
94# define PL_nextwhite (PL_parser->nextwhite)
95# define PL_realtokenstart (PL_parser->realtokenstart)
96# define PL_skipwhite (PL_parser->skipwhite)
97# define PL_thisclose (PL_parser->thisclose)
98# define PL_thismad (PL_parser->thismad)
99# define PL_thisopen (PL_parser->thisopen)
100# define PL_thisstuff (PL_parser->thisstuff)
101# define PL_thistoken (PL_parser->thistoken)
102# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
103# define PL_thiswhite (PL_parser->thiswhite)
104# define PL_nexttoke (PL_parser->nexttoke)
105# define PL_curforce (PL_parser->curforce)
106#else
107# define PL_nexttoke (PL_parser->nexttoke)
108# define PL_nexttype (PL_parser->nexttype)
109# define PL_nextval (PL_parser->nextval)
199e78b7
DM
110#endif
111
16173588
NC
112/* This can't be done with embed.fnc, because struct yy_parser contains a
113 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
114static int
115S_pending_ident(pTHX);
199e78b7 116
0bd48802 117static const char ident_too_long[] = "Identifier too long";
8903cb82 118
29595ff2 119#ifdef PERL_MAD
29595ff2 120# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 121# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 122#else
5db06880 123# define CURMAD(slot,sv)
9ded7720 124# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
125#endif
126
a7aaec61
Z
127#define XENUMMASK 0x3f
128#define XFAKEEOF 0x40
129#define XFAKEBRACK 0x80
9059aa12 130
39e02b42
JH
131#ifdef USE_UTF8_SCRIPTS
132# define UTF (!IN_BYTES)
2b9d42f0 133#else
746b446a 134# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 135#endif
a0ed51b3 136
b1fc3636
CJ
137/* The maximum number of characters preceding the unrecognized one to display */
138#define UNRECOGNIZED_PRECEDE_COUNT 10
139
61f0cdd9 140/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
141 * 1999-02-27 mjd-perl-patch@plover.com */
142#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
143
bf4acbe4 144#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 145
ffb4593c
NT
146/* LEX_* are values for PL_lex_state, the state of the lexer.
147 * They are arranged oddly so that the guard on the switch statement
79072805
LW
148 * can get by with a single comparison (if the compiler is smart enough).
149 */
150
fb73857a 151/* #define LEX_NOTPARSING 11 is done in perl.h. */
152
b6007c36
DM
153#define LEX_NORMAL 10 /* normal code (ie not within "...") */
154#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
155#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
156#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
157#define LEX_INTERPSTART 6 /* expecting the start of a $var */
158
159 /* at end of code, eg "$x" followed by: */
160#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
161#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
162
163#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
164 string or after \E, $foo, etc */
165#define LEX_INTERPCONST 2 /* NOT USED */
166#define LEX_FORMLINE 1 /* expecting a format line */
167#define LEX_KNOWNEXT 0 /* next token known; just return it */
168
79072805 169
bbf60fe6 170#ifdef DEBUGGING
27da23d5 171static const char* const lex_state_names[] = {
bbf60fe6
DM
172 "KNOWNEXT",
173 "FORMLINE",
174 "INTERPCONST",
175 "INTERPCONCAT",
176 "INTERPENDMAYBE",
177 "INTERPEND",
178 "INTERPSTART",
179 "INTERPPUSH",
180 "INTERPCASEMOD",
181 "INTERPNORMAL",
182 "NORMAL"
183};
184#endif
185
79072805
LW
186#ifdef ff_next
187#undef ff_next
d48672a2
LW
188#endif
189
79072805 190#include "keywords.h"
fe14fcc3 191
ffb4593c
NT
192/* CLINE is a macro that ensures PL_copline has a sane value */
193
ae986130
LW
194#ifdef CLINE
195#undef CLINE
196#endif
57843af0 197#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 198
5db06880 199#ifdef PERL_MAD
29595ff2
NC
200# define SKIPSPACE0(s) skipspace0(s)
201# define SKIPSPACE1(s) skipspace1(s)
202# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
203# define PEEKSPACE(s) skipspace2(s,0)
204#else
205# define SKIPSPACE0(s) skipspace(s)
206# define SKIPSPACE1(s) skipspace(s)
207# define SKIPSPACE2(s,tsv) skipspace(s)
208# define PEEKSPACE(s) skipspace(s)
209#endif
210
ffb4593c
NT
211/*
212 * Convenience functions to return different tokens and prime the
9cbb5ea2 213 * lexer for the next token. They all take an argument.
ffb4593c
NT
214 *
215 * TOKEN : generic token (used for '(', DOLSHARP, etc)
216 * OPERATOR : generic operator
217 * AOPERATOR : assignment operator
218 * PREBLOCK : beginning the block after an if, while, foreach, ...
219 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
220 * PREREF : *EXPR where EXPR is not a simple identifier
221 * TERM : expression term
222 * LOOPX : loop exiting command (goto, last, dump, etc)
223 * FTST : file test operator
224 * FUN0 : zero-argument function
2d2e263d 225 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
226 * BOop : bitwise or or xor
227 * BAop : bitwise and
228 * SHop : shift operator
229 * PWop : power operator
9cbb5ea2 230 * PMop : pattern-matching operator
ffb4593c
NT
231 * Aop : addition-level operator
232 * Mop : multiplication-level operator
233 * Eop : equality-testing operator
e5edeb50 234 * Rop : relational operator <= != gt
ffb4593c
NT
235 *
236 * Also see LOP and lop() below.
237 */
238
998054bd 239#ifdef DEBUGGING /* Serve -DT. */
704d4215 240# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 241#else
bbf60fe6 242# define REPORT(retval) (retval)
998054bd
SC
243#endif
244
bbf60fe6
DM
245#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
246#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
247#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
248#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
249#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
250#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
251#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
252#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
253#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
254#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
255#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
256#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
257#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
258#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
259#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
260#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
261#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
262#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
263#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
264#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 265
a687059c
LW
266/* This bit of chicanery makes a unary function followed by
267 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
268 * The UNIDOR macro is for unary functions that can be followed by the //
269 * operator (such as C<shift // 0>).
a687059c 270 */
376fcdbf 271#define UNI2(f,x) { \
6154021b 272 pl_yylval.ival = f; \
376fcdbf
AL
273 PL_expect = x; \
274 PL_bufptr = s; \
275 PL_last_uni = PL_oldbufptr; \
276 PL_last_lop_op = f; \
277 if (*s == '(') \
278 return REPORT( (int)FUNC1 ); \
29595ff2 279 s = PEEKSPACE(s); \
376fcdbf
AL
280 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
281 }
6f33ba73
RGS
282#define UNI(f) UNI2(f,XTERM)
283#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 284
376fcdbf 285#define UNIBRACK(f) { \
6154021b 286 pl_yylval.ival = f; \
376fcdbf
AL
287 PL_bufptr = s; \
288 PL_last_uni = PL_oldbufptr; \
289 if (*s == '(') \
290 return REPORT( (int)FUNC1 ); \
29595ff2 291 s = PEEKSPACE(s); \
376fcdbf
AL
292 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
293 }
79072805 294
9f68db38 295/* grandfather return to old style */
6154021b 296#define OLDLOP(f) return(pl_yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP)
79072805 297
8fa7f367
JH
298#ifdef DEBUGGING
299
6154021b 300/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
301enum token_type {
302 TOKENTYPE_NONE,
303 TOKENTYPE_IVAL,
6154021b 304 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
305 TOKENTYPE_PVAL,
306 TOKENTYPE_OPVAL,
307 TOKENTYPE_GVVAL
308};
309
6d4a66ac
NC
310static struct debug_tokens {
311 const int token;
312 enum token_type type;
313 const char *name;
314} const debug_tokens[] =
9041c2e3 315{
bbf60fe6
DM
316 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
317 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
318 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
319 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
320 { ARROW, TOKENTYPE_NONE, "ARROW" },
321 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
322 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
323 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
324 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
325 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 326 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
327 { DO, TOKENTYPE_NONE, "DO" },
328 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
329 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
330 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
331 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
332 { ELSE, TOKENTYPE_NONE, "ELSE" },
333 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
334 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
335 { FOR, TOKENTYPE_IVAL, "FOR" },
336 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
337 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
338 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
339 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
340 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
341 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 342 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
343 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
344 { IF, TOKENTYPE_IVAL, "IF" },
345 { LABEL, TOKENTYPE_PVAL, "LABEL" },
346 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
347 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
348 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
349 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
350 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
351 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
352 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
353 { MY, TOKENTYPE_IVAL, "MY" },
354 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
355 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
356 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
357 { OROP, TOKENTYPE_IVAL, "OROP" },
358 { OROR, TOKENTYPE_NONE, "OROR" },
359 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
360 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
361 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
362 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
363 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
364 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
365 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
366 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
367 { PREINC, TOKENTYPE_NONE, "PREINC" },
368 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
369 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
370 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
371 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
372 { SUB, TOKENTYPE_NONE, "SUB" },
373 { THING, TOKENTYPE_OPVAL, "THING" },
374 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
375 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
376 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
377 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
378 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
379 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 380 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
381 { WHILE, TOKENTYPE_IVAL, "WHILE" },
382 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 383 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 384 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
385};
386
6154021b 387/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 388
bbf60fe6 389STATIC int
704d4215 390S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 391{
97aff369 392 dVAR;
7918f24d
NC
393
394 PERL_ARGS_ASSERT_TOKEREPORT;
395
bbf60fe6 396 if (DEBUG_T_TEST) {
bd61b366 397 const char *name = NULL;
bbf60fe6 398 enum token_type type = TOKENTYPE_NONE;
f54cb97a 399 const struct debug_tokens *p;
396482e1 400 SV* const report = newSVpvs("<== ");
bbf60fe6 401
f54cb97a 402 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
403 if (p->token == (int)rv) {
404 name = p->name;
405 type = p->type;
406 break;
407 }
408 }
409 if (name)
54667de8 410 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
411 else if ((char)rv > ' ' && (char)rv < '~')
412 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
413 else if (!rv)
396482e1 414 sv_catpvs(report, "EOF");
bbf60fe6
DM
415 else
416 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
417 switch (type) {
418 case TOKENTYPE_NONE:
419 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
420 break;
421 case TOKENTYPE_IVAL:
704d4215 422 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
423 break;
424 case TOKENTYPE_OPNUM:
425 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 426 PL_op_name[lvalp->ival]);
bbf60fe6
DM
427 break;
428 case TOKENTYPE_PVAL:
704d4215 429 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
430 break;
431 case TOKENTYPE_OPVAL:
704d4215 432 if (lvalp->opval) {
401441c0 433 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
434 PL_op_name[lvalp->opval->op_type]);
435 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 436 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 437 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
438 }
439
440 }
401441c0 441 else
396482e1 442 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
443 break;
444 }
b6007c36 445 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
446 };
447 return (int)rv;
998054bd
SC
448}
449
b6007c36
DM
450
451/* print the buffer with suitable escapes */
452
453STATIC void
15f169a1 454S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 455{
396482e1 456 SV* const tmp = newSVpvs("");
7918f24d
NC
457
458 PERL_ARGS_ASSERT_PRINTBUF;
459
b6007c36
DM
460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
461 SvREFCNT_dec(tmp);
462}
463
8fa7f367
JH
464#endif
465
8290c323
NC
466static int
467S_deprecate_commaless_var_list(pTHX) {
468 PL_expect = XTERM;
469 deprecate("comma-less variable list");
470 return REPORT(','); /* grandfather non-comma-format format */
471}
472
ffb4593c
NT
473/*
474 * S_ao
475 *
c963b151
BD
476 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
477 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
478 */
479
76e3520e 480STATIC int
cea2e8a9 481S_ao(pTHX_ int toketype)
a0d0e21e 482{
97aff369 483 dVAR;
3280af22
NIS
484 if (*PL_bufptr == '=') {
485 PL_bufptr++;
a0d0e21e 486 if (toketype == ANDAND)
6154021b 487 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 488 else if (toketype == OROR)
6154021b 489 pl_yylval.ival = OP_ORASSIGN;
c963b151 490 else if (toketype == DORDOR)
6154021b 491 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
492 toketype = ASSIGNOP;
493 }
494 return toketype;
495}
496
ffb4593c
NT
497/*
498 * S_no_op
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
508 */
509
76e3520e 510STATIC void
15f169a1 511S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 512{
97aff369 513 dVAR;
9d4ba2ae
AL
514 char * const oldbp = PL_bufptr;
515 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 516
7918f24d
NC
517 PERL_ARGS_ASSERT_NO_OP;
518
1189a94a
GS
519 if (!s)
520 s = oldbp;
07c798fb 521 else
1189a94a 522 PL_bufptr = s;
cea2e8a9 523 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
524 if (ckWARN_d(WARN_SYNTAX)) {
525 if (is_first)
526 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
527 "\t(Missing semicolon on previous line?)\n");
528 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 529 const char *t;
c35e046a
AL
530 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
531 NOOP;
56da5a46
RGS
532 if (t < PL_bufptr && isSPACE(*t))
533 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
534 "\t(Do you need to predeclare %.*s?)\n",
551405c4 535 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
536 }
537 else {
538 assert(s >= oldbp);
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 540 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 541 }
07c798fb 542 }
3280af22 543 PL_bufptr = oldbp;
8990e307
LW
544}
545
ffb4593c
NT
546/*
547 * S_missingterm
548 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 549 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
550 * If we're in a delimited string and the delimiter is a control
551 * character, it's reformatted into a two-char sequence like ^C.
552 * This is fatal.
553 */
554
76e3520e 555STATIC void
cea2e8a9 556S_missingterm(pTHX_ char *s)
8990e307 557{
97aff369 558 dVAR;
8990e307
LW
559 char tmpbuf[3];
560 char q;
561 if (s) {
9d4ba2ae 562 char * const nl = strrchr(s,'\n');
d2719217 563 if (nl)
8990e307
LW
564 *nl = '\0';
565 }
463559e7 566 else if (isCNTRL(PL_multi_close)) {
8990e307 567 *tmpbuf = '^';
585ec06d 568 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
569 tmpbuf[2] = '\0';
570 s = tmpbuf;
571 }
572 else {
eb160463 573 *tmpbuf = (char)PL_multi_close;
8990e307
LW
574 tmpbuf[1] = '\0';
575 s = tmpbuf;
576 }
577 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 578 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 579}
79072805 580
ef89dcc3 581#define FEATURE_IS_ENABLED(name) \
0d863452 582 ((0 != (PL_hints & HINT_LOCALIZE_HH)) \
89529cee 583 && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name)))
4a731d7b 584/* The longest string we pass in. */
1863b879 585#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1)
4a731d7b 586
0d863452
RH
587/*
588 * S_feature_is_enabled
589 * Check whether the named feature is enabled.
590 */
591STATIC bool
15f169a1 592S_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 593{
97aff369 594 dVAR;
0d863452 595 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 596 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
597
598 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
599
4a731d7b
NC
600 assert(namelen <= MAX_FEATURE_LEN);
601 memcpy(&he_name[8], name, namelen);
d4c19fe8 602
7b9ef140 603 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
604}
605
ffb4593c 606/*
9cbb5ea2
GS
607 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
608 * utf16-to-utf8-reversed.
ffb4593c
NT
609 */
610
c39cd008
GS
611#ifdef PERL_CR_FILTER
612static void
613strip_return(SV *sv)
614{
95a20fc0 615 register const char *s = SvPVX_const(sv);
9d4ba2ae 616 register const char * const e = s + SvCUR(sv);
7918f24d
NC
617
618 PERL_ARGS_ASSERT_STRIP_RETURN;
619
c39cd008
GS
620 /* outer loop optimized to do nothing if there are no CR-LFs */
621 while (s < e) {
622 if (*s++ == '\r' && *s == '\n') {
623 /* hit a CR-LF, need to copy the rest */
624 register char *d = s - 1;
625 *d++ = *s++;
626 while (s < e) {
627 if (*s == '\r' && s[1] == '\n')
628 s++;
629 *d++ = *s++;
630 }
631 SvCUR(sv) -= s - d;
632 return;
633 }
634 }
635}
a868473f 636
76e3520e 637STATIC I32
c39cd008 638S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 639{
f54cb97a 640 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
641 if (count > 0 && !maxlen)
642 strip_return(sv);
643 return count;
a868473f
NIS
644}
645#endif
646
ffb4593c 647/*
8eaa0acf
Z
648=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
649
650Creates and initialises a new lexer/parser state object, supplying
651a context in which to lex and parse from a new source of Perl code.
652A pointer to the new state object is placed in L</PL_parser>. An entry
653is made on the save stack so that upon unwinding the new state object
654will be destroyed and the former value of L</PL_parser> will be restored.
655Nothing else need be done to clean up the parsing context.
656
657The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
658non-null, provides a string (in SV form) containing code to be parsed.
659A copy of the string is made, so subsequent modification of I<line>
660does not affect parsing. I<rsfp>, if non-null, provides an input stream
661from which code will be read to be parsed. If both are non-null, the
662code in I<line> comes first and must consist of complete lines of input,
663and I<rsfp> supplies the remainder of the source.
664
665The I<flags> parameter is reserved for future use, and must always
666be zero.
667
668=cut
669*/
ffb4593c 670
a0d0e21e 671void
8eaa0acf 672Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 673{
97aff369 674 dVAR;
6ef55633 675 const char *s = NULL;
8990e307 676 STRLEN len;
5486870f 677 yy_parser *parser, *oparser;
8eaa0acf
Z
678 if (flags)
679 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
680
681 /* create and initialise a parser */
682
199e78b7 683 Newxz(parser, 1, yy_parser);
5486870f 684 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
685 PL_parser = parser;
686
28ac2b49
Z
687 parser->stack = NULL;
688 parser->ps = NULL;
689 parser->stack_size = 0;
acdf0a21 690
e3abe207
DM
691 /* on scope exit, free this parser and restore any outer one */
692 SAVEPARSER(parser);
7c4baf47 693 parser->saved_curcop = PL_curcop;
e3abe207 694
acdf0a21 695 /* initialise lexer state */
8990e307 696
fb205e7a
DM
697#ifdef PERL_MAD
698 parser->curforce = -1;
699#else
700 parser->nexttoke = 0;
701#endif
ca4cfd28 702 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 703 parser->copline = NOLINE;
5afb0a62 704 parser->lex_state = LEX_NORMAL;
c2598295 705 parser->expect = XSTATE;
2f9285f8 706 parser->rsfp = rsfp;
f07ec6dd 707 parser->rsfp_filters = newAV();
2f9285f8 708
199e78b7
DM
709 Newx(parser->lex_brackstack, 120, char);
710 Newx(parser->lex_casestack, 12, char);
711 *parser->lex_casestack = '\0';
02b34bbe 712
10efb74f
NC
713 if (line) {
714 s = SvPV_const(line, len);
715 } else {
716 len = 0;
717 }
bdc0bf6f 718
10efb74f 719 if (!len) {
bdc0bf6f 720 parser->linestr = newSVpvs("\n;");
805700c1 721 } else {
719a9bb0 722 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 723 if (s[len-1] != ';')
bdc0bf6f 724 sv_catpvs(parser->linestr, "\n;");
8990e307 725 }
f06b5848
DM
726 parser->oldoldbufptr =
727 parser->oldbufptr =
728 parser->bufptr =
729 parser->linestart = SvPVX(parser->linestr);
730 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
731 parser->last_lop = parser->last_uni = NULL;
737c24fc
Z
732
733 parser->in_pod = 0;
79072805 734}
a687059c 735
e3abe207
DM
736
737/* delete a parser object */
738
739void
740Perl_parser_free(pTHX_ const yy_parser *parser)
741{
7918f24d
NC
742 PERL_ARGS_ASSERT_PARSER_FREE;
743
7c4baf47 744 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
745 SvREFCNT_dec(parser->linestr);
746
2f9285f8
DM
747 if (parser->rsfp == PerlIO_stdin())
748 PerlIO_clearerr(parser->rsfp);
799361c3
SH
749 else if (parser->rsfp && (!parser->old_parser ||
750 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 751 PerlIO_close(parser->rsfp);
5486870f 752 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 753
e3abe207
DM
754 Safefree(parser->lex_brackstack);
755 Safefree(parser->lex_casestack);
756 PL_parser = parser->old_parser;
757 Safefree(parser);
758}
759
760
ffb4593c 761/*
f0e67a1d
Z
762=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
763
764Buffer scalar containing the chunk currently under consideration of the
765text currently being lexed. This is always a plain string scalar (for
766which C<SvPOK> is true). It is not intended to be used as a scalar by
767normal scalar means; instead refer to the buffer directly by the pointer
768variables described below.
769
770The lexer maintains various C<char*> pointers to things in the
771C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
772reallocated, all of these pointers must be updated. Don't attempt to
773do this manually, but rather use L</lex_grow_linestr> if you need to
774reallocate the buffer.
775
776The content of the text chunk in the buffer is commonly exactly one
777complete line of input, up to and including a newline terminator,
778but there are situations where it is otherwise. The octets of the
779buffer may be intended to be interpreted as either UTF-8 or Latin-1.
780The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
781flag on this scalar, which may disagree with it.
782
783For direct examination of the buffer, the variable
784L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
785lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
786of these pointers is usually preferable to examination of the scalar
787through normal scalar means.
788
789=for apidoc AmxU|char *|PL_parser-E<gt>bufend
790
791Direct pointer to the end of the chunk of text currently being lexed, the
792end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
793+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
794always located at the end of the buffer, and does not count as part of
795the buffer's contents.
796
797=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
798
799Points to the current position of lexing inside the lexer buffer.
800Characters around this point may be freely examined, within
801the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
802L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
803interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
804
805Lexing code (whether in the Perl core or not) moves this pointer past
806the characters that it consumes. It is also expected to perform some
807bookkeeping whenever a newline character is consumed. This movement
808can be more conveniently performed by the function L</lex_read_to>,
809which handles newlines appropriately.
810
811Interpretation of the buffer's octets can be abstracted out by
812using the slightly higher-level functions L</lex_peek_unichar> and
813L</lex_read_unichar>.
814
815=for apidoc AmxU|char *|PL_parser-E<gt>linestart
816
817Points to the start of the current line inside the lexer buffer.
818This is useful for indicating at which column an error occurred, and
819not much else. This must be updated by any lexing code that consumes
820a newline; the function L</lex_read_to> handles this detail.
821
822=cut
823*/
824
825/*
826=for apidoc Amx|bool|lex_bufutf8
827
828Indicates whether the octets in the lexer buffer
829(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
830of Unicode characters. If not, they should be interpreted as Latin-1
831characters. This is analogous to the C<SvUTF8> flag for scalars.
832
833In UTF-8 mode, it is not guaranteed that the lexer buffer actually
834contains valid UTF-8. Lexing code must be robust in the face of invalid
835encoding.
836
837The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
838is significant, but not the whole story regarding the input character
839encoding. Normally, when a file is being read, the scalar contains octets
840and its C<SvUTF8> flag is off, but the octets should be interpreted as
841UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
842however, the scalar may have the C<SvUTF8> flag on, and in this case its
843octets should be interpreted as UTF-8 unless the C<use bytes> pragma
844is in effect. This logic may change in the future; use this function
845instead of implementing the logic yourself.
846
847=cut
848*/
849
850bool
851Perl_lex_bufutf8(pTHX)
852{
853 return UTF;
854}
855
856/*
857=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
858
859Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
860at least I<len> octets (including terminating NUL). Returns a
861pointer to the reallocated buffer. This is necessary before making
862any direct modification of the buffer that would increase its length.
863L</lex_stuff_pvn> provides a more convenient way to insert text into
864the buffer.
865
866Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
867this function updates all of the lexer's variables that point directly
868into the buffer.
869
870=cut
871*/
872
873char *
874Perl_lex_grow_linestr(pTHX_ STRLEN len)
875{
876 SV *linestr;
877 char *buf;
878 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
879 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
880 linestr = PL_parser->linestr;
881 buf = SvPVX(linestr);
882 if (len <= SvLEN(linestr))
883 return buf;
884 bufend_pos = PL_parser->bufend - buf;
885 bufptr_pos = PL_parser->bufptr - buf;
886 oldbufptr_pos = PL_parser->oldbufptr - buf;
887 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
888 linestart_pos = PL_parser->linestart - buf;
889 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
890 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
891 buf = sv_grow(linestr, len);
892 PL_parser->bufend = buf + bufend_pos;
893 PL_parser->bufptr = buf + bufptr_pos;
894 PL_parser->oldbufptr = buf + oldbufptr_pos;
895 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
896 PL_parser->linestart = buf + linestart_pos;
897 if (PL_parser->last_uni)
898 PL_parser->last_uni = buf + last_uni_pos;
899 if (PL_parser->last_lop)
900 PL_parser->last_lop = buf + last_lop_pos;
901 return buf;
902}
903
904/*
83aa740e 905=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
906
907Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
908immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
909reallocating the buffer if necessary. This means that lexing code that
910runs later will see the characters as if they had appeared in the input.
911It is not recommended to do this as part of normal parsing, and most
912uses of this facility run the risk of the inserted characters being
913interpreted in an unintended manner.
914
915The string to be inserted is represented by I<len> octets starting
916at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
917according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
918The characters are recoded for the lexer buffer, according to how the
919buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 920to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
921function is more convenient.
922
923=cut
924*/
925
926void
83aa740e 927Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 928{
749123ff 929 dVAR;
f0e67a1d
Z
930 char *bufptr;
931 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
932 if (flags & ~(LEX_STUFF_UTF8))
933 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
934 if (UTF) {
935 if (flags & LEX_STUFF_UTF8) {
936 goto plain_copy;
937 } else {
938 STRLEN highhalf = 0;
83aa740e 939 const char *p, *e = pv+len;
f0e67a1d
Z
940 for (p = pv; p != e; p++)
941 highhalf += !!(((U8)*p) & 0x80);
942 if (!highhalf)
943 goto plain_copy;
944 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
945 bufptr = PL_parser->bufptr;
946 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
947 SvCUR_set(PL_parser->linestr,
948 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
949 PL_parser->bufend += len+highhalf;
950 for (p = pv; p != e; p++) {
951 U8 c = (U8)*p;
952 if (c & 0x80) {
953 *bufptr++ = (char)(0xc0 | (c >> 6));
954 *bufptr++ = (char)(0x80 | (c & 0x3f));
955 } else {
956 *bufptr++ = (char)c;
957 }
958 }
959 }
960 } else {
961 if (flags & LEX_STUFF_UTF8) {
962 STRLEN highhalf = 0;
83aa740e 963 const char *p, *e = pv+len;
f0e67a1d
Z
964 for (p = pv; p != e; p++) {
965 U8 c = (U8)*p;
966 if (c >= 0xc4) {
967 Perl_croak(aTHX_ "Lexing code attempted to stuff "
968 "non-Latin-1 character into Latin-1 input");
969 } else if (c >= 0xc2 && p+1 != e &&
970 (((U8)p[1]) & 0xc0) == 0x80) {
971 p++;
972 highhalf++;
973 } else if (c >= 0x80) {
974 /* malformed UTF-8 */
975 ENTER;
976 SAVESPTR(PL_warnhook);
977 PL_warnhook = PERL_WARNHOOK_FATAL;
978 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
979 LEAVE;
980 }
981 }
982 if (!highhalf)
983 goto plain_copy;
984 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
985 bufptr = PL_parser->bufptr;
986 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
987 SvCUR_set(PL_parser->linestr,
988 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
989 PL_parser->bufend += len-highhalf;
990 for (p = pv; p != e; p++) {
991 U8 c = (U8)*p;
992 if (c & 0x80) {
993 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
994 p++;
995 } else {
996 *bufptr++ = (char)c;
997 }
998 }
999 } else {
1000 plain_copy:
1001 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1002 bufptr = PL_parser->bufptr;
1003 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1004 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1005 PL_parser->bufend += len;
1006 Copy(pv, bufptr, len, char);
1007 }
1008 }
1009}
1010
1011/*
9dcc53ea
Z
1012=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1013
1014Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1015immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1016reallocating the buffer if necessary. This means that lexing code that
1017runs later will see the characters as if they had appeared in the input.
1018It is not recommended to do this as part of normal parsing, and most
1019uses of this facility run the risk of the inserted characters being
1020interpreted in an unintended manner.
1021
1022The string to be inserted is represented by octets starting at I<pv>
1023and continuing to the first nul. These octets are interpreted as either
1024UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1025in I<flags>. The characters are recoded for the lexer buffer, according
1026to how the buffer is currently being interpreted (L</lex_bufutf8>).
1027If it is not convenient to nul-terminate a string to be inserted, the
1028L</lex_stuff_pvn> function is more appropriate.
1029
1030=cut
1031*/
1032
1033void
1034Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1035{
1036 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1037 lex_stuff_pvn(pv, strlen(pv), flags);
1038}
1039
1040/*
f0e67a1d
Z
1041=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1042
1043Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1044immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1045reallocating the buffer if necessary. This means that lexing code that
1046runs later will see the characters as if they had appeared in the input.
1047It is not recommended to do this as part of normal parsing, and most
1048uses of this facility run the risk of the inserted characters being
1049interpreted in an unintended manner.
1050
1051The string to be inserted is the string value of I<sv>. The characters
1052are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1053being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1054not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1055need to construct a scalar.
1056
1057=cut
1058*/
1059
1060void
1061Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1062{
1063 char *pv;
1064 STRLEN len;
1065 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1066 if (flags)
1067 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1068 pv = SvPV(sv, len);
1069 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1070}
1071
1072/*
1073=for apidoc Amx|void|lex_unstuff|char *ptr
1074
1075Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1076I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1077This hides the discarded text from any lexing code that runs later,
1078as if the text had never appeared.
1079
1080This is not the normal way to consume lexed text. For that, use
1081L</lex_read_to>.
1082
1083=cut
1084*/
1085
1086void
1087Perl_lex_unstuff(pTHX_ char *ptr)
1088{
1089 char *buf, *bufend;
1090 STRLEN unstuff_len;
1091 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1092 buf = PL_parser->bufptr;
1093 if (ptr < buf)
1094 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1095 if (ptr == buf)
1096 return;
1097 bufend = PL_parser->bufend;
1098 if (ptr > bufend)
1099 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1100 unstuff_len = ptr - buf;
1101 Move(ptr, buf, bufend+1-ptr, char);
1102 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1103 PL_parser->bufend = bufend - unstuff_len;
1104}
1105
1106/*
1107=for apidoc Amx|void|lex_read_to|char *ptr
1108
1109Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1110to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1111performing the correct bookkeeping whenever a newline character is passed.
1112This is the normal way to consume lexed text.
1113
1114Interpretation of the buffer's octets can be abstracted out by
1115using the slightly higher-level functions L</lex_peek_unichar> and
1116L</lex_read_unichar>.
1117
1118=cut
1119*/
1120
1121void
1122Perl_lex_read_to(pTHX_ char *ptr)
1123{
1124 char *s;
1125 PERL_ARGS_ASSERT_LEX_READ_TO;
1126 s = PL_parser->bufptr;
1127 if (ptr < s || ptr > PL_parser->bufend)
1128 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1129 for (; s != ptr; s++)
1130 if (*s == '\n') {
1131 CopLINE_inc(PL_curcop);
1132 PL_parser->linestart = s+1;
1133 }
1134 PL_parser->bufptr = ptr;
1135}
1136
1137/*
1138=for apidoc Amx|void|lex_discard_to|char *ptr
1139
1140Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1141up to I<ptr>. The remaining content of the buffer will be moved, and
1142all pointers into the buffer updated appropriately. I<ptr> must not
1143be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1144it is not permitted to discard text that has yet to be lexed.
1145
1146Normally it is not necessarily to do this directly, because it suffices to
1147use the implicit discarding behaviour of L</lex_next_chunk> and things
1148based on it. However, if a token stretches across multiple lines,
1f317c95 1149and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1150that purpose, then after completion of the token it would be wise to
1151explicitly discard the now-unneeded earlier lines, to avoid future
1152multi-line tokens growing the buffer without bound.
1153
1154=cut
1155*/
1156
1157void
1158Perl_lex_discard_to(pTHX_ char *ptr)
1159{
1160 char *buf;
1161 STRLEN discard_len;
1162 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1163 buf = SvPVX(PL_parser->linestr);
1164 if (ptr < buf)
1165 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1166 if (ptr == buf)
1167 return;
1168 if (ptr > PL_parser->bufptr)
1169 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1170 discard_len = ptr - buf;
1171 if (PL_parser->oldbufptr < ptr)
1172 PL_parser->oldbufptr = ptr;
1173 if (PL_parser->oldoldbufptr < ptr)
1174 PL_parser->oldoldbufptr = ptr;
1175 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1176 PL_parser->last_uni = NULL;
1177 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1178 PL_parser->last_lop = NULL;
1179 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1180 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1181 PL_parser->bufend -= discard_len;
1182 PL_parser->bufptr -= discard_len;
1183 PL_parser->oldbufptr -= discard_len;
1184 PL_parser->oldoldbufptr -= discard_len;
1185 if (PL_parser->last_uni)
1186 PL_parser->last_uni -= discard_len;
1187 if (PL_parser->last_lop)
1188 PL_parser->last_lop -= discard_len;
1189}
1190
1191/*
1192=for apidoc Amx|bool|lex_next_chunk|U32 flags
1193
1194Reads in the next chunk of text to be lexed, appending it to
1195L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1196looked to the end of the current chunk and wants to know more. It is
1197usual, but not necessary, for lexing to have consumed the entirety of
1198the current chunk at this time.
1199
1200If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1201chunk (i.e., the current chunk has been entirely consumed), normally the
1202current chunk will be discarded at the same time that the new chunk is
1203read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1204will not be discarded. If the current chunk has not been entirely
1205consumed, then it will not be discarded regardless of the flag.
1206
1207Returns true if some new text was added to the buffer, or false if the
1208buffer has reached the end of the input text.
1209
1210=cut
1211*/
1212
1213#define LEX_FAKE_EOF 0x80000000
1214
1215bool
1216Perl_lex_next_chunk(pTHX_ U32 flags)
1217{
1218 SV *linestr;
1219 char *buf;
1220 STRLEN old_bufend_pos, new_bufend_pos;
1221 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1222 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1223 bool got_some_for_debugger = 0;
f0e67a1d
Z
1224 bool got_some;
1225 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1226 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1227 linestr = PL_parser->linestr;
1228 buf = SvPVX(linestr);
1229 if (!(flags & LEX_KEEP_PREVIOUS) &&
1230 PL_parser->bufptr == PL_parser->bufend) {
1231 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1232 linestart_pos = 0;
1233 if (PL_parser->last_uni != PL_parser->bufend)
1234 PL_parser->last_uni = NULL;
1235 if (PL_parser->last_lop != PL_parser->bufend)
1236 PL_parser->last_lop = NULL;
1237 last_uni_pos = last_lop_pos = 0;
1238 *buf = 0;
1239 SvCUR(linestr) = 0;
1240 } else {
1241 old_bufend_pos = PL_parser->bufend - buf;
1242 bufptr_pos = PL_parser->bufptr - buf;
1243 oldbufptr_pos = PL_parser->oldbufptr - buf;
1244 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1245 linestart_pos = PL_parser->linestart - buf;
1246 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1247 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1248 }
1249 if (flags & LEX_FAKE_EOF) {
1250 goto eof;
1251 } else if (!PL_parser->rsfp) {
1252 got_some = 0;
1253 } else if (filter_gets(linestr, old_bufend_pos)) {
1254 got_some = 1;
17cc9359 1255 got_some_for_debugger = 1;
f0e67a1d 1256 } else {
580561a3
Z
1257 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1258 sv_setpvs(linestr, "");
f0e67a1d
Z
1259 eof:
1260 /* End of real input. Close filehandle (unless it was STDIN),
1261 * then add implicit termination.
1262 */
1263 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1264 PerlIO_clearerr(PL_parser->rsfp);
1265 else if (PL_parser->rsfp)
1266 (void)PerlIO_close(PL_parser->rsfp);
1267 PL_parser->rsfp = NULL;
737c24fc 1268 PL_parser->in_pod = 0;
f0e67a1d
Z
1269#ifdef PERL_MAD
1270 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1271 PL_faketokens = 1;
1272#endif
1273 if (!PL_in_eval && PL_minus_p) {
1274 sv_catpvs(linestr,
1275 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1276 PL_minus_n = PL_minus_p = 0;
1277 } else if (!PL_in_eval && PL_minus_n) {
1278 sv_catpvs(linestr, /*{*/";}");
1279 PL_minus_n = 0;
1280 } else
1281 sv_catpvs(linestr, ";");
1282 got_some = 1;
1283 }
1284 buf = SvPVX(linestr);
1285 new_bufend_pos = SvCUR(linestr);
1286 PL_parser->bufend = buf + new_bufend_pos;
1287 PL_parser->bufptr = buf + bufptr_pos;
1288 PL_parser->oldbufptr = buf + oldbufptr_pos;
1289 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1290 PL_parser->linestart = buf + linestart_pos;
1291 if (PL_parser->last_uni)
1292 PL_parser->last_uni = buf + last_uni_pos;
1293 if (PL_parser->last_lop)
1294 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1295 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1296 PL_curstash != PL_debstash) {
1297 /* debugger active and we're not compiling the debugger code,
1298 * so store the line into the debugger's array of lines
1299 */
1300 update_debugger_info(NULL, buf+old_bufend_pos,
1301 new_bufend_pos-old_bufend_pos);
1302 }
1303 return got_some;
1304}
1305
1306/*
1307=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1308
1309Looks ahead one (Unicode) character in the text currently being lexed.
1310Returns the codepoint (unsigned integer value) of the next character,
1311or -1 if lexing has reached the end of the input text. To consume the
1312peeked character, use L</lex_read_unichar>.
1313
1314If the next character is in (or extends into) the next chunk of input
1315text, the next chunk will be read in. Normally the current chunk will be
1316discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1317then the current chunk will not be discarded.
1318
1319If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1320is encountered, an exception is generated.
1321
1322=cut
1323*/
1324
1325I32
1326Perl_lex_peek_unichar(pTHX_ U32 flags)
1327{
749123ff 1328 dVAR;
f0e67a1d
Z
1329 char *s, *bufend;
1330 if (flags & ~(LEX_KEEP_PREVIOUS))
1331 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1332 s = PL_parser->bufptr;
1333 bufend = PL_parser->bufend;
1334 if (UTF) {
1335 U8 head;
1336 I32 unichar;
1337 STRLEN len, retlen;
1338 if (s == bufend) {
1339 if (!lex_next_chunk(flags))
1340 return -1;
1341 s = PL_parser->bufptr;
1342 bufend = PL_parser->bufend;
1343 }
1344 head = (U8)*s;
1345 if (!(head & 0x80))
1346 return head;
1347 if (head & 0x40) {
1348 len = PL_utf8skip[head];
1349 while ((STRLEN)(bufend-s) < len) {
1350 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1351 break;
1352 s = PL_parser->bufptr;
1353 bufend = PL_parser->bufend;
1354 }
1355 }
1356 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1357 if (retlen == (STRLEN)-1) {
1358 /* malformed UTF-8 */
1359 ENTER;
1360 SAVESPTR(PL_warnhook);
1361 PL_warnhook = PERL_WARNHOOK_FATAL;
1362 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1363 LEAVE;
1364 }
1365 return unichar;
1366 } else {
1367 if (s == bufend) {
1368 if (!lex_next_chunk(flags))
1369 return -1;
1370 s = PL_parser->bufptr;
1371 }
1372 return (U8)*s;
1373 }
1374}
1375
1376/*
1377=for apidoc Amx|I32|lex_read_unichar|U32 flags
1378
1379Reads the next (Unicode) character in the text currently being lexed.
1380Returns the codepoint (unsigned integer value) of the character read,
1381and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1382if lexing has reached the end of the input text. To non-destructively
1383examine the next character, use L</lex_peek_unichar> instead.
1384
1385If the next character is in (or extends into) the next chunk of input
1386text, the next chunk will be read in. Normally the current chunk will be
1387discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1388then the current chunk will not be discarded.
1389
1390If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1391is encountered, an exception is generated.
1392
1393=cut
1394*/
1395
1396I32
1397Perl_lex_read_unichar(pTHX_ U32 flags)
1398{
1399 I32 c;
1400 if (flags & ~(LEX_KEEP_PREVIOUS))
1401 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1402 c = lex_peek_unichar(flags);
1403 if (c != -1) {
1404 if (c == '\n')
1405 CopLINE_inc(PL_curcop);
1406 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1407 }
1408 return c;
1409}
1410
1411/*
1412=for apidoc Amx|void|lex_read_space|U32 flags
1413
1414Reads optional spaces, in Perl style, in the text currently being
1415lexed. The spaces may include ordinary whitespace characters and
1416Perl-style comments. C<#line> directives are processed if encountered.
1417L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1418at a non-space character (or the end of the input text).
1419
1420If spaces extend into the next chunk of input text, the next chunk will
1421be read in. Normally the current chunk will be discarded at the same
1422time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1423chunk will not be discarded.
1424
1425=cut
1426*/
1427
f0998909
Z
1428#define LEX_NO_NEXT_CHUNK 0x80000000
1429
f0e67a1d
Z
1430void
1431Perl_lex_read_space(pTHX_ U32 flags)
1432{
1433 char *s, *bufend;
1434 bool need_incline = 0;
f0998909 1435 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1436 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1437#ifdef PERL_MAD
1438 if (PL_skipwhite) {
1439 sv_free(PL_skipwhite);
1440 PL_skipwhite = NULL;
1441 }
1442 if (PL_madskills)
1443 PL_skipwhite = newSVpvs("");
1444#endif /* PERL_MAD */
1445 s = PL_parser->bufptr;
1446 bufend = PL_parser->bufend;
1447 while (1) {
1448 char c = *s;
1449 if (c == '#') {
1450 do {
1451 c = *++s;
1452 } while (!(c == '\n' || (c == 0 && s == bufend)));
1453 } else if (c == '\n') {
1454 s++;
1455 PL_parser->linestart = s;
1456 if (s == bufend)
1457 need_incline = 1;
1458 else
1459 incline(s);
1460 } else if (isSPACE(c)) {
1461 s++;
1462 } else if (c == 0 && s == bufend) {
1463 bool got_more;
1464#ifdef PERL_MAD
1465 if (PL_madskills)
1466 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1467#endif /* PERL_MAD */
f0998909
Z
1468 if (flags & LEX_NO_NEXT_CHUNK)
1469 break;
f0e67a1d
Z
1470 PL_parser->bufptr = s;
1471 CopLINE_inc(PL_curcop);
1472 got_more = lex_next_chunk(flags);
1473 CopLINE_dec(PL_curcop);
1474 s = PL_parser->bufptr;
1475 bufend = PL_parser->bufend;
1476 if (!got_more)
1477 break;
1478 if (need_incline && PL_parser->rsfp) {
1479 incline(s);
1480 need_incline = 0;
1481 }
1482 } else {
1483 break;
1484 }
1485 }
1486#ifdef PERL_MAD
1487 if (PL_madskills)
1488 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1489#endif /* PERL_MAD */
1490 PL_parser->bufptr = s;
1491}
1492
1493/*
ffb4593c
NT
1494 * S_incline
1495 * This subroutine has nothing to do with tilting, whether at windmills
1496 * or pinball tables. Its name is short for "increment line". It
57843af0 1497 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1498 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1499 * # line 500 "foo.pm"
1500 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1501 */
1502
76e3520e 1503STATIC void
d9095cec 1504S_incline(pTHX_ const char *s)
463ee0b2 1505{
97aff369 1506 dVAR;
d9095cec
NC
1507 const char *t;
1508 const char *n;
1509 const char *e;
463ee0b2 1510
7918f24d
NC
1511 PERL_ARGS_ASSERT_INCLINE;
1512
57843af0 1513 CopLINE_inc(PL_curcop);
463ee0b2
LW
1514 if (*s++ != '#')
1515 return;
d4c19fe8
AL
1516 while (SPACE_OR_TAB(*s))
1517 s++;
73659bf1
GS
1518 if (strnEQ(s, "line", 4))
1519 s += 4;
1520 else
1521 return;
084592ab 1522 if (SPACE_OR_TAB(*s))
73659bf1 1523 s++;
4e553d73 1524 else
73659bf1 1525 return;
d4c19fe8
AL
1526 while (SPACE_OR_TAB(*s))
1527 s++;
463ee0b2
LW
1528 if (!isDIGIT(*s))
1529 return;
d4c19fe8 1530
463ee0b2
LW
1531 n = s;
1532 while (isDIGIT(*s))
1533 s++;
07714eb4 1534 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1535 return;
bf4acbe4 1536 while (SPACE_OR_TAB(*s))
463ee0b2 1537 s++;
73659bf1 1538 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1539 s++;
73659bf1
GS
1540 e = t + 1;
1541 }
463ee0b2 1542 else {
c35e046a
AL
1543 t = s;
1544 while (!isSPACE(*t))
1545 t++;
73659bf1 1546 e = t;
463ee0b2 1547 }
bf4acbe4 1548 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1549 e++;
1550 if (*e != '\n' && *e != '\0')
1551 return; /* false alarm */
1552
f4dd75d9 1553 if (t - s > 0) {
d9095cec 1554 const STRLEN len = t - s;
8a5ee598 1555#ifndef USE_ITHREADS
19bad673
NC
1556 SV *const temp_sv = CopFILESV(PL_curcop);
1557 const char *cf;
1558 STRLEN tmplen;
1559
1560 if (temp_sv) {
1561 cf = SvPVX(temp_sv);
1562 tmplen = SvCUR(temp_sv);
1563 } else {
1564 cf = NULL;
1565 tmplen = 0;
1566 }
1567
42d9b98d 1568 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1569 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1570 * to *{"::_<newfilename"} */
44867030
NC
1571 /* However, the long form of evals is only turned on by the
1572 debugger - usually they're "(eval %lu)" */
1573 char smallbuf[128];
1574 char *tmpbuf;
1575 GV **gvp;
d9095cec 1576 STRLEN tmplen2 = len;
798b63bc 1577 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1578 tmpbuf = smallbuf;
1579 else
2ae0db35 1580 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1581 tmpbuf[0] = '_';
1582 tmpbuf[1] = '<';
2ae0db35 1583 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1584 tmplen += 2;
8a5ee598
RGS
1585 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1586 if (gvp) {
44867030
NC
1587 char *tmpbuf2;
1588 GV *gv2;
1589
1590 if (tmplen2 + 2 <= sizeof smallbuf)
1591 tmpbuf2 = smallbuf;
1592 else
1593 Newx(tmpbuf2, tmplen2 + 2, char);
1594
1595 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1596 /* Either they malloc'd it, or we malloc'd it,
1597 so no prefix is present in ours. */
1598 tmpbuf2[0] = '_';
1599 tmpbuf2[1] = '<';
1600 }
1601
1602 memcpy(tmpbuf2 + 2, s, tmplen2);
1603 tmplen2 += 2;
1604
8a5ee598 1605 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1606 if (!isGV(gv2)) {
8a5ee598 1607 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1608 /* adjust ${"::_<newfilename"} to store the new file name */
1609 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
3cb1dbc6
NC
1610 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1611 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
e5527e4b 1612 }
44867030
NC
1613
1614 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1615 }
e66cf94c 1616 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1617 }
8a5ee598 1618#endif
05ec9bb3 1619 CopFILE_free(PL_curcop);
d9095cec 1620 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1621 }
57843af0 1622 CopLINE_set(PL_curcop, atoi(n)-1);
463ee0b2
LW
1623}
1624
29595ff2 1625#ifdef PERL_MAD
cd81e915 1626/* skip space before PL_thistoken */
29595ff2
NC
1627
1628STATIC char *
1629S_skipspace0(pTHX_ register char *s)
1630{
7918f24d
NC
1631 PERL_ARGS_ASSERT_SKIPSPACE0;
1632
29595ff2
NC
1633 s = skipspace(s);
1634 if (!PL_madskills)
1635 return s;
cd81e915
NC
1636 if (PL_skipwhite) {
1637 if (!PL_thiswhite)
6b29d1f5 1638 PL_thiswhite = newSVpvs("");
cd81e915
NC
1639 sv_catsv(PL_thiswhite, PL_skipwhite);
1640 sv_free(PL_skipwhite);
1641 PL_skipwhite = 0;
1642 }
1643 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1644 return s;
1645}
1646
cd81e915 1647/* skip space after PL_thistoken */
29595ff2
NC
1648
1649STATIC char *
1650S_skipspace1(pTHX_ register char *s)
1651{
d4c19fe8 1652 const char *start = s;
29595ff2
NC
1653 I32 startoff = start - SvPVX(PL_linestr);
1654
7918f24d
NC
1655 PERL_ARGS_ASSERT_SKIPSPACE1;
1656
29595ff2
NC
1657 s = skipspace(s);
1658 if (!PL_madskills)
1659 return s;
1660 start = SvPVX(PL_linestr) + startoff;
cd81e915 1661 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1662 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1663 PL_thistoken = newSVpvn(tstart, start - tstart);
1664 }
1665 PL_realtokenstart = -1;
1666 if (PL_skipwhite) {
1667 if (!PL_nextwhite)
6b29d1f5 1668 PL_nextwhite = newSVpvs("");
cd81e915
NC
1669 sv_catsv(PL_nextwhite, PL_skipwhite);
1670 sv_free(PL_skipwhite);
1671 PL_skipwhite = 0;
29595ff2
NC
1672 }
1673 return s;
1674}
1675
1676STATIC char *
1677S_skipspace2(pTHX_ register char *s, SV **svp)
1678{
c35e046a
AL
1679 char *start;
1680 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1681 const I32 startoff = s - SvPVX(PL_linestr);
1682
7918f24d
NC
1683 PERL_ARGS_ASSERT_SKIPSPACE2;
1684
29595ff2
NC
1685 s = skipspace(s);
1686 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1687 if (!PL_madskills || !svp)
1688 return s;
1689 start = SvPVX(PL_linestr) + startoff;
cd81e915 1690 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1691 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1692 PL_thistoken = newSVpvn(tstart, start - tstart);
1693 PL_realtokenstart = -1;
29595ff2 1694 }
cd81e915 1695 if (PL_skipwhite) {
29595ff2 1696 if (!*svp)
6b29d1f5 1697 *svp = newSVpvs("");
cd81e915
NC
1698 sv_setsv(*svp, PL_skipwhite);
1699 sv_free(PL_skipwhite);
1700 PL_skipwhite = 0;
29595ff2
NC
1701 }
1702
1703 return s;
1704}
1705#endif
1706
80a702cd 1707STATIC void
15f169a1 1708S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1709{
1710 AV *av = CopFILEAVx(PL_curcop);
1711 if (av) {
b9f83d2f 1712 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1713 if (orig_sv)
1714 sv_setsv(sv, orig_sv);
1715 else
1716 sv_setpvn(sv, buf, len);
80a702cd
RGS
1717 (void)SvIOK_on(sv);
1718 SvIV_set(sv, 0);
1719 av_store(av, (I32)CopLINE(PL_curcop), sv);
1720 }
1721}
1722
ffb4593c
NT
1723/*
1724 * S_skipspace
1725 * Called to gobble the appropriate amount and type of whitespace.
1726 * Skips comments as well.
1727 */
1728
76e3520e 1729STATIC char *
cea2e8a9 1730S_skipspace(pTHX_ register char *s)
a687059c 1731{
5db06880 1732#ifdef PERL_MAD
f0e67a1d
Z
1733 char *start = s;
1734#endif /* PERL_MAD */
7918f24d 1735 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1736#ifdef PERL_MAD
cd81e915
NC
1737 if (PL_skipwhite) {
1738 sv_free(PL_skipwhite);
f0e67a1d 1739 PL_skipwhite = NULL;
5db06880 1740 }
f0e67a1d 1741#endif /* PERL_MAD */
3280af22 1742 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1743 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1744 s++;
f0e67a1d
Z
1745 } else {
1746 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1747 PL_bufptr = s;
f0998909
Z
1748 lex_read_space(LEX_KEEP_PREVIOUS |
1749 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1750 LEX_NO_NEXT_CHUNK : 0));
3280af22 1751 s = PL_bufptr;
f0e67a1d
Z
1752 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1753 if (PL_linestart > PL_bufptr)
1754 PL_bufptr = PL_linestart;
1755 return s;
463ee0b2 1756 }
5db06880 1757#ifdef PERL_MAD
f0e67a1d
Z
1758 if (PL_madskills)
1759 PL_skipwhite = newSVpvn(start, s-start);
1760#endif /* PERL_MAD */
5db06880 1761 return s;
a687059c 1762}
378cc40b 1763
ffb4593c
NT
1764/*
1765 * S_check_uni
1766 * Check the unary operators to ensure there's no ambiguity in how they're
1767 * used. An ambiguous piece of code would be:
1768 * rand + 5
1769 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1770 * the +5 is its argument.
1771 */
1772
76e3520e 1773STATIC void
cea2e8a9 1774S_check_uni(pTHX)
ba106d47 1775{
97aff369 1776 dVAR;
d4c19fe8
AL
1777 const char *s;
1778 const char *t;
2f3197b3 1779
3280af22 1780 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1781 return;
3280af22
NIS
1782 while (isSPACE(*PL_last_uni))
1783 PL_last_uni++;
c35e046a
AL
1784 s = PL_last_uni;
1785 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1786 s++;
3280af22 1787 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1788 return;
6136c704 1789
9b387841
NC
1790 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1791 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1792 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1793}
1794
ffb4593c
NT
1795/*
1796 * LOP : macro to build a list operator. Its behaviour has been replaced
1797 * with a subroutine, S_lop() for which LOP is just another name.
1798 */
1799
a0d0e21e
LW
1800#define LOP(f,x) return lop(f,x,s)
1801
ffb4593c
NT
1802/*
1803 * S_lop
1804 * Build a list operator (or something that might be one). The rules:
1805 * - if we have a next token, then it's a list operator [why?]
1806 * - if the next thing is an opening paren, then it's a function
1807 * - else it's a list operator
1808 */
1809
76e3520e 1810STATIC I32
a0be28da 1811S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1812{
97aff369 1813 dVAR;
7918f24d
NC
1814
1815 PERL_ARGS_ASSERT_LOP;
1816
6154021b 1817 pl_yylval.ival = f;
35c8bce7 1818 CLINE;
3280af22
NIS
1819 PL_expect = x;
1820 PL_bufptr = s;
1821 PL_last_lop = PL_oldbufptr;
eb160463 1822 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1823#ifdef PERL_MAD
1824 if (PL_lasttoke)
1825 return REPORT(LSTOP);
1826#else
3280af22 1827 if (PL_nexttoke)
bbf60fe6 1828 return REPORT(LSTOP);
5db06880 1829#endif
79072805 1830 if (*s == '(')
bbf60fe6 1831 return REPORT(FUNC);
29595ff2 1832 s = PEEKSPACE(s);
79072805 1833 if (*s == '(')
bbf60fe6 1834 return REPORT(FUNC);
79072805 1835 else
bbf60fe6 1836 return REPORT(LSTOP);
79072805
LW
1837}
1838
5db06880
NC
1839#ifdef PERL_MAD
1840 /*
1841 * S_start_force
1842 * Sets up for an eventual force_next(). start_force(0) basically does
1843 * an unshift, while start_force(-1) does a push. yylex removes items
1844 * on the "pop" end.
1845 */
1846
1847STATIC void
1848S_start_force(pTHX_ int where)
1849{
1850 int i;
1851
cd81e915 1852 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1853 where = PL_lasttoke;
cd81e915
NC
1854 assert(PL_curforce < 0 || PL_curforce == where);
1855 if (PL_curforce != where) {
5db06880
NC
1856 for (i = PL_lasttoke; i > where; --i) {
1857 PL_nexttoke[i] = PL_nexttoke[i-1];
1858 }
1859 PL_lasttoke++;
1860 }
cd81e915 1861 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1862 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1863 PL_curforce = where;
1864 if (PL_nextwhite) {
5db06880 1865 if (PL_madskills)
6b29d1f5 1866 curmad('^', newSVpvs(""));
cd81e915 1867 CURMAD('_', PL_nextwhite);
5db06880
NC
1868 }
1869}
1870
1871STATIC void
1872S_curmad(pTHX_ char slot, SV *sv)
1873{
1874 MADPROP **where;
1875
1876 if (!sv)
1877 return;
cd81e915
NC
1878 if (PL_curforce < 0)
1879 where = &PL_thismad;
5db06880 1880 else
cd81e915 1881 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1882
cd81e915 1883 if (PL_faketokens)
76f68e9b 1884 sv_setpvs(sv, "");
5db06880
NC
1885 else {
1886 if (!IN_BYTES) {
1887 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1888 SvUTF8_on(sv);
1889 else if (PL_encoding) {
1890 sv_recode_to_utf8(sv, PL_encoding);
1891 }
1892 }
1893 }
1894
1895 /* keep a slot open for the head of the list? */
1896 if (slot != '_' && *where && (*where)->mad_key == '^') {
1897 (*where)->mad_key = slot;
daba3364 1898 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1899 (*where)->mad_val = (void*)sv;
1900 }
1901 else
1902 addmad(newMADsv(slot, sv), where, 0);
1903}
1904#else
b3f24c00
MHM
1905# define start_force(where) NOOP
1906# define curmad(slot, sv) NOOP
5db06880
NC
1907#endif
1908
ffb4593c
NT
1909/*
1910 * S_force_next
9cbb5ea2 1911 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1912 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1913 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1914 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1915 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1916 */
1917
4e553d73 1918STATIC void
cea2e8a9 1919S_force_next(pTHX_ I32 type)
79072805 1920{
97aff369 1921 dVAR;
704d4215
GG
1922#ifdef DEBUGGING
1923 if (DEBUG_T_TEST) {
1924 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1925 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1926 }
1927#endif
5db06880 1928#ifdef PERL_MAD
cd81e915 1929 if (PL_curforce < 0)
5db06880 1930 start_force(PL_lasttoke);
cd81e915 1931 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1932 if (PL_lex_state != LEX_KNOWNEXT)
1933 PL_lex_defer = PL_lex_state;
1934 PL_lex_state = LEX_KNOWNEXT;
1935 PL_lex_expect = PL_expect;
cd81e915 1936 PL_curforce = -1;
5db06880 1937#else
3280af22
NIS
1938 PL_nexttype[PL_nexttoke] = type;
1939 PL_nexttoke++;
1940 if (PL_lex_state != LEX_KNOWNEXT) {
1941 PL_lex_defer = PL_lex_state;
1942 PL_lex_expect = PL_expect;
1943 PL_lex_state = LEX_KNOWNEXT;
79072805 1944 }
5db06880 1945#endif
79072805
LW
1946}
1947
28ac2b49
Z
1948void
1949Perl_yyunlex(pTHX)
1950{
a7aaec61
Z
1951 int yyc = PL_parser->yychar;
1952 if (yyc != YYEMPTY) {
1953 if (yyc) {
1954 start_force(-1);
1955 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1956 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
1957 PL_lex_brackets--;
1958 yyc |= (1<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1959 }
1960 force_next(yyc);
1961 }
28ac2b49
Z
1962 PL_parser->yychar = YYEMPTY;
1963 }
1964}
1965
d0a148a6 1966STATIC SV *
15f169a1 1967S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1968{
97aff369 1969 dVAR;
740cce10 1970 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1971 !IN_BYTES
1972 && UTF
1973 && !is_ascii_string((const U8*)start, len)
740cce10 1974 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1975 return sv;
1976}
1977
ffb4593c
NT
1978/*
1979 * S_force_word
1980 * When the lexer knows the next thing is a word (for instance, it has
1981 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1982 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1983 * lookahead.
ffb4593c
NT
1984 *
1985 * Arguments:
b1b65b59 1986 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1987 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1988 * int check_keyword : if true, Perl checks to make sure the word isn't
1989 * a keyword (do this if the word is a label, e.g. goto FOO)
1990 * int allow_pack : if true, : characters will also be allowed (require,
1991 * use, etc. do this)
9cbb5ea2 1992 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1993 */
1994
76e3520e 1995STATIC char *
cea2e8a9 1996S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 1997{
97aff369 1998 dVAR;
463ee0b2
LW
1999 register char *s;
2000 STRLEN len;
4e553d73 2001
7918f24d
NC
2002 PERL_ARGS_ASSERT_FORCE_WORD;
2003
29595ff2 2004 start = SKIPSPACE1(start);
463ee0b2 2005 s = start;
7e2040f0 2006 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2007 (allow_pack && *s == ':') ||
15f0808c 2008 (allow_initial_tick && *s == '\'') )
a0d0e21e 2009 {
3280af22 2010 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2011 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2012 return start;
cd81e915 2013 start_force(PL_curforce);
5db06880
NC
2014 if (PL_madskills)
2015 curmad('X', newSVpvn(start,s-start));
463ee0b2 2016 if (token == METHOD) {
29595ff2 2017 s = SKIPSPACE1(s);
463ee0b2 2018 if (*s == '(')
3280af22 2019 PL_expect = XTERM;
463ee0b2 2020 else {
3280af22 2021 PL_expect = XOPERATOR;
463ee0b2 2022 }
79072805 2023 }
e74e6b3d 2024 if (PL_madskills)
63575281 2025 curmad('g', newSVpvs( "forced" ));
9ded7720 2026 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2027 = (OP*)newSVOP(OP_CONST,0,
2028 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2029 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2030 force_next(token);
2031 }
2032 return s;
2033}
2034
ffb4593c
NT
2035/*
2036 * S_force_ident
9cbb5ea2 2037 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2038 * text only contains the "foo" portion. The first argument is a pointer
2039 * to the "foo", and the second argument is the type symbol to prefix.
2040 * Forces the next token to be a "WORD".
9cbb5ea2 2041 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2042 */
2043
76e3520e 2044STATIC void
bfed75c6 2045S_force_ident(pTHX_ register const char *s, int kind)
79072805 2046{
97aff369 2047 dVAR;
7918f24d
NC
2048
2049 PERL_ARGS_ASSERT_FORCE_IDENT;
2050
c35e046a 2051 if (*s) {
90e5519e
NC
2052 const STRLEN len = strlen(s);
2053 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2054 start_force(PL_curforce);
9ded7720 2055 NEXTVAL_NEXTTOKE.opval = o;
79072805 2056 force_next(WORD);
748a9306 2057 if (kind) {
11343788 2058 o->op_private = OPpCONST_ENTERED;
55497cff 2059 /* XXX see note in pp_entereval() for why we forgo typo
2060 warnings if the symbol must be introduced in an eval.
2061 GSAR 96-10-12 */
90e5519e
NC
2062 gv_fetchpvn_flags(s, len,
2063 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2064 : GV_ADD,
2065 kind == '$' ? SVt_PV :
2066 kind == '@' ? SVt_PVAV :
2067 kind == '%' ? SVt_PVHV :
a0d0e21e 2068 SVt_PVGV
90e5519e 2069 );
748a9306 2070 }
79072805
LW
2071 }
2072}
2073
1571675a
GS
2074NV
2075Perl_str_to_version(pTHX_ SV *sv)
2076{
2077 NV retval = 0.0;
2078 NV nshift = 1.0;
2079 STRLEN len;
cfd0369c 2080 const char *start = SvPV_const(sv,len);
9d4ba2ae 2081 const char * const end = start + len;
504618e9 2082 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2083
2084 PERL_ARGS_ASSERT_STR_TO_VERSION;
2085
1571675a 2086 while (start < end) {
ba210ebe 2087 STRLEN skip;
1571675a
GS
2088 UV n;
2089 if (utf)
9041c2e3 2090 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2091 else {
2092 n = *(U8*)start;
2093 skip = 1;
2094 }
2095 retval += ((NV)n)/nshift;
2096 start += skip;
2097 nshift *= 1000;
2098 }
2099 return retval;
2100}
2101
4e553d73 2102/*
ffb4593c
NT
2103 * S_force_version
2104 * Forces the next token to be a version number.
e759cc13
RGS
2105 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2106 * and if "guessing" is TRUE, then no new token is created (and the caller
2107 * must use an alternative parsing method).
ffb4593c
NT
2108 */
2109
76e3520e 2110STATIC char *
e759cc13 2111S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2112{
97aff369 2113 dVAR;
5f66b61c 2114 OP *version = NULL;
44dcb63b 2115 char *d;
5db06880
NC
2116#ifdef PERL_MAD
2117 I32 startoff = s - SvPVX(PL_linestr);
2118#endif
89bfa8cd 2119
7918f24d
NC
2120 PERL_ARGS_ASSERT_FORCE_VERSION;
2121
29595ff2 2122 s = SKIPSPACE1(s);
89bfa8cd 2123
44dcb63b 2124 d = s;
dd629d5b 2125 if (*d == 'v')
44dcb63b 2126 d++;
44dcb63b 2127 if (isDIGIT(*d)) {
e759cc13
RGS
2128 while (isDIGIT(*d) || *d == '_' || *d == '.')
2129 d++;
5db06880
NC
2130#ifdef PERL_MAD
2131 if (PL_madskills) {
cd81e915 2132 start_force(PL_curforce);
5db06880
NC
2133 curmad('X', newSVpvn(s,d-s));
2134 }
2135#endif
4e4da3ac 2136 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2137 SV *ver;
8d08d9ba
DG
2138#ifdef USE_LOCALE_NUMERIC
2139 char *loc = setlocale(LC_NUMERIC, "C");
2140#endif
6154021b 2141 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2142#ifdef USE_LOCALE_NUMERIC
2143 setlocale(LC_NUMERIC, loc);
2144#endif
6154021b 2145 version = pl_yylval.opval;
dd629d5b
GS
2146 ver = cSVOPx(version)->op_sv;
2147 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2148 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2149 SvNV_set(ver, str_to_version(ver));
1571675a 2150 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2151 }
89bfa8cd 2152 }
5db06880
NC
2153 else if (guessing) {
2154#ifdef PERL_MAD
2155 if (PL_madskills) {
cd81e915
NC
2156 sv_free(PL_nextwhite); /* let next token collect whitespace */
2157 PL_nextwhite = 0;
5db06880
NC
2158 s = SvPVX(PL_linestr) + startoff;
2159 }
2160#endif
e759cc13 2161 return s;
5db06880 2162 }
89bfa8cd 2163 }
2164
5db06880
NC
2165#ifdef PERL_MAD
2166 if (PL_madskills && !version) {
cd81e915
NC
2167 sv_free(PL_nextwhite); /* let next token collect whitespace */
2168 PL_nextwhite = 0;
5db06880
NC
2169 s = SvPVX(PL_linestr) + startoff;
2170 }
2171#endif
89bfa8cd 2172 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2173 start_force(PL_curforce);
9ded7720 2174 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2175 force_next(WORD);
89bfa8cd 2176
e759cc13 2177 return s;
89bfa8cd 2178}
2179
ffb4593c 2180/*
91152fc1
DG
2181 * S_force_strict_version
2182 * Forces the next token to be a version number using strict syntax rules.
2183 */
2184
2185STATIC char *
2186S_force_strict_version(pTHX_ char *s)
2187{
2188 dVAR;
2189 OP *version = NULL;
2190#ifdef PERL_MAD
2191 I32 startoff = s - SvPVX(PL_linestr);
2192#endif
2193 const char *errstr = NULL;
2194
2195 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2196
2197 while (isSPACE(*s)) /* leading whitespace */
2198 s++;
2199
2200 if (is_STRICT_VERSION(s,&errstr)) {
2201 SV *ver = newSV(0);
2202 s = (char *)scan_version(s, ver, 0);
2203 version = newSVOP(OP_CONST, 0, ver);
2204 }
4e4da3ac
Z
2205 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2206 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2207 {
91152fc1
DG
2208 PL_bufptr = s;
2209 if (errstr)
2210 yyerror(errstr); /* version required */
2211 return s;
2212 }
2213
2214#ifdef PERL_MAD
2215 if (PL_madskills && !version) {
2216 sv_free(PL_nextwhite); /* let next token collect whitespace */
2217 PL_nextwhite = 0;
2218 s = SvPVX(PL_linestr) + startoff;
2219 }
2220#endif
2221 /* NOTE: The parser sees the package name and the VERSION swapped */
2222 start_force(PL_curforce);
2223 NEXTVAL_NEXTTOKE.opval = version;
2224 force_next(WORD);
2225
2226 return s;
2227}
2228
2229/*
ffb4593c
NT
2230 * S_tokeq
2231 * Tokenize a quoted string passed in as an SV. It finds the next
2232 * chunk, up to end of string or a backslash. It may make a new
2233 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2234 * turns \\ into \.
2235 */
2236
76e3520e 2237STATIC SV *
cea2e8a9 2238S_tokeq(pTHX_ SV *sv)
79072805 2239{
97aff369 2240 dVAR;
79072805
LW
2241 register char *s;
2242 register char *send;
2243 register char *d;
b3ac6de7
IZ
2244 STRLEN len = 0;
2245 SV *pv = sv;
79072805 2246
7918f24d
NC
2247 PERL_ARGS_ASSERT_TOKEQ;
2248
79072805 2249 if (!SvLEN(sv))
b3ac6de7 2250 goto finish;
79072805 2251
a0d0e21e 2252 s = SvPV_force(sv, len);
21a311ee 2253 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2254 goto finish;
463ee0b2 2255 send = s + len;
dcb21ed6
NC
2256 /* This is relying on the SV being "well formed" with a trailing '\0' */
2257 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2258 s++;
2259 if (s == send)
b3ac6de7 2260 goto finish;
79072805 2261 d = s;
be4731d2 2262 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2263 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2264 }
79072805
LW
2265 while (s < send) {
2266 if (*s == '\\') {
a0d0e21e 2267 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2268 s++; /* all that, just for this */
2269 }
2270 *d++ = *s++;
2271 }
2272 *d = '\0';
95a20fc0 2273 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2274 finish:
3280af22 2275 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2276 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2277 return sv;
2278}
2279
ffb4593c
NT
2280/*
2281 * Now come three functions related to double-quote context,
2282 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2283 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2284 * interact with PL_lex_state, and create fake ( ... ) argument lists
2285 * to handle functions and concatenation.
2286 * They assume that whoever calls them will be setting up a fake
2287 * join call, because each subthing puts a ',' after it. This lets
2288 * "lower \luPpEr"
2289 * become
2290 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2291 *
2292 * (I'm not sure whether the spurious commas at the end of lcfirst's
2293 * arguments and join's arguments are created or not).
2294 */
2295
2296/*
2297 * S_sublex_start
6154021b 2298 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2299 *
2300 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2301 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2302 *
2303 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2304 *
2305 * Everything else becomes a FUNC.
2306 *
2307 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2308 * had an OP_CONST or OP_READLINE). This just sets us up for a
2309 * call to S_sublex_push().
2310 */
2311
76e3520e 2312STATIC I32
cea2e8a9 2313S_sublex_start(pTHX)
79072805 2314{
97aff369 2315 dVAR;
6154021b 2316 register const I32 op_type = pl_yylval.ival;
79072805
LW
2317
2318 if (op_type == OP_NULL) {
6154021b 2319 pl_yylval.opval = PL_lex_op;
5f66b61c 2320 PL_lex_op = NULL;
79072805
LW
2321 return THING;
2322 }
2323 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2324 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2325
2326 if (SvTYPE(sv) == SVt_PVIV) {
2327 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2328 STRLEN len;
96a5add6 2329 const char * const p = SvPV_const(sv, len);
740cce10 2330 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2331 SvREFCNT_dec(sv);
2332 sv = nsv;
4e553d73 2333 }
6154021b 2334 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2335 PL_lex_stuff = NULL;
6f33ba73
RGS
2336 /* Allow <FH> // "foo" */
2337 if (op_type == OP_READLINE)
2338 PL_expect = XTERMORDORDOR;
79072805
LW
2339 return THING;
2340 }
e3f73d4e
RGS
2341 else if (op_type == OP_BACKTICK && PL_lex_op) {
2342 /* readpipe() vas overriden */
2343 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2344 pl_yylval.opval = PL_lex_op;
9b201d7d 2345 PL_lex_op = NULL;
e3f73d4e
RGS
2346 PL_lex_stuff = NULL;
2347 return THING;
2348 }
79072805 2349
3280af22 2350 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2351 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2352 PL_sublex_info.sub_op = PL_lex_op;
2353 PL_lex_state = LEX_INTERPPUSH;
55497cff 2354
3280af22
NIS
2355 PL_expect = XTERM;
2356 if (PL_lex_op) {
6154021b 2357 pl_yylval.opval = PL_lex_op;
5f66b61c 2358 PL_lex_op = NULL;
55497cff 2359 return PMFUNC;
2360 }
2361 else
2362 return FUNC;
2363}
2364
ffb4593c
NT
2365/*
2366 * S_sublex_push
2367 * Create a new scope to save the lexing state. The scope will be
2368 * ended in S_sublex_done. Returns a '(', starting the function arguments
2369 * to the uc, lc, etc. found before.
2370 * Sets PL_lex_state to LEX_INTERPCONCAT.
2371 */
2372
76e3520e 2373STATIC I32
cea2e8a9 2374S_sublex_push(pTHX)
55497cff 2375{
27da23d5 2376 dVAR;
f46d017c 2377 ENTER;
55497cff 2378
3280af22 2379 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2380 SAVEBOOL(PL_lex_dojoin);
3280af22 2381 SAVEI32(PL_lex_brackets);
3280af22
NIS
2382 SAVEI32(PL_lex_casemods);
2383 SAVEI32(PL_lex_starts);
651b5b28 2384 SAVEI8(PL_lex_state);
7766f137 2385 SAVEVPTR(PL_lex_inpat);
98246f1e 2386 SAVEI16(PL_lex_inwhat);
57843af0 2387 SAVECOPLINE(PL_curcop);
3280af22 2388 SAVEPPTR(PL_bufptr);
8452ff4b 2389 SAVEPPTR(PL_bufend);
3280af22
NIS
2390 SAVEPPTR(PL_oldbufptr);
2391 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2392 SAVEPPTR(PL_last_lop);
2393 SAVEPPTR(PL_last_uni);
3280af22
NIS
2394 SAVEPPTR(PL_linestart);
2395 SAVESPTR(PL_linestr);
8edd5f42
RGS
2396 SAVEGENERICPV(PL_lex_brackstack);
2397 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2398
2399 PL_linestr = PL_lex_stuff;
a0714e2c 2400 PL_lex_stuff = NULL;
3280af22 2401
9cbb5ea2
GS
2402 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2403 = SvPVX(PL_linestr);
3280af22 2404 PL_bufend += SvCUR(PL_linestr);
bd61b366 2405 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2406 SAVEFREESV(PL_linestr);
2407
2408 PL_lex_dojoin = FALSE;
2409 PL_lex_brackets = 0;
a02a5408
JC
2410 Newx(PL_lex_brackstack, 120, char);
2411 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2412 PL_lex_casemods = 0;
2413 *PL_lex_casestack = '\0';
2414 PL_lex_starts = 0;
2415 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2416 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2417
2418 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2419 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2420 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2421 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2422 else
5f66b61c 2423 PL_lex_inpat = NULL;
79072805 2424
55497cff 2425 return '(';
79072805
LW
2426}
2427
ffb4593c
NT
2428/*
2429 * S_sublex_done
2430 * Restores lexer state after a S_sublex_push.
2431 */
2432
76e3520e 2433STATIC I32
cea2e8a9 2434S_sublex_done(pTHX)
79072805 2435{
27da23d5 2436 dVAR;
3280af22 2437 if (!PL_lex_starts++) {
396482e1 2438 SV * const sv = newSVpvs("");
9aa983d2
JH
2439 if (SvUTF8(PL_linestr))
2440 SvUTF8_on(sv);
3280af22 2441 PL_expect = XOPERATOR;
6154021b 2442 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2443 return THING;
2444 }
2445
3280af22
NIS
2446 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2447 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2448 return yylex();
79072805
LW
2449 }
2450
ffb4593c 2451 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2452 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2453 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2454 PL_linestr = PL_lex_repl;
2455 PL_lex_inpat = 0;
2456 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2457 PL_bufend += SvCUR(PL_linestr);
bd61b366 2458 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2459 SAVEFREESV(PL_linestr);
2460 PL_lex_dojoin = FALSE;
2461 PL_lex_brackets = 0;
3280af22
NIS
2462 PL_lex_casemods = 0;
2463 *PL_lex_casestack = '\0';
2464 PL_lex_starts = 0;
25da4f38 2465 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2466 PL_lex_state = LEX_INTERPNORMAL;
2467 PL_lex_starts++;
e9fa98b2
HS
2468 /* we don't clear PL_lex_repl here, so that we can check later
2469 whether this is an evalled subst; that means we rely on the
2470 logic to ensure sublex_done() is called again only via the
2471 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2472 }
e9fa98b2 2473 else {
3280af22 2474 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2475 PL_lex_repl = NULL;
e9fa98b2 2476 }
79072805 2477 return ',';
ffed7fef
LW
2478 }
2479 else {
5db06880
NC
2480#ifdef PERL_MAD
2481 if (PL_madskills) {
cd81e915
NC
2482 if (PL_thiswhite) {
2483 if (!PL_endwhite)
6b29d1f5 2484 PL_endwhite = newSVpvs("");
cd81e915
NC
2485 sv_catsv(PL_endwhite, PL_thiswhite);
2486 PL_thiswhite = 0;
2487 }
2488 if (PL_thistoken)
76f68e9b 2489 sv_setpvs(PL_thistoken,"");
5db06880 2490 else
cd81e915 2491 PL_realtokenstart = -1;
5db06880
NC
2492 }
2493#endif
f46d017c 2494 LEAVE;
3280af22
NIS
2495 PL_bufend = SvPVX(PL_linestr);
2496 PL_bufend += SvCUR(PL_linestr);
2497 PL_expect = XOPERATOR;
09bef843 2498 PL_sublex_info.sub_inwhat = 0;
79072805 2499 return ')';
ffed7fef
LW
2500 }
2501}
2502
02aa26ce
NT
2503/*
2504 scan_const
2505
2506 Extracts a pattern, double-quoted string, or transliteration. This
2507 is terrifying code.
2508
94def140 2509 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2510 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2511 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2512
94def140
TS
2513 Returns a pointer to the character scanned up to. If this is
2514 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2515 successfully parsed), will leave an OP for the substring scanned
6154021b 2516 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2517 by looking at the next characters herself.
2518
02aa26ce
NT
2519 In patterns:
2520 backslashes:
ff3f963a 2521 constants: \N{NAME} only
02aa26ce
NT
2522 case and quoting: \U \Q \E
2523 stops on @ and $, but not for $ as tail anchor
2524
2525 In transliterations:
2526 characters are VERY literal, except for - not at the start or end
94def140
TS
2527 of the string, which indicates a range. If the range is in bytes,
2528 scan_const expands the range to the full set of intermediate
2529 characters. If the range is in utf8, the hyphen is replaced with
2530 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2531
2532 In double-quoted strings:
2533 backslashes:
2534 double-quoted style: \r and \n
ff3f963a 2535 constants: \x31, etc.
94def140 2536 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2537 case and quoting: \U \Q \E
2538 stops on @ and $
2539
2540 scan_const does *not* construct ops to handle interpolated strings.
2541 It stops processing as soon as it finds an embedded $ or @ variable
2542 and leaves it to the caller to work out what's going on.
2543
94def140
TS
2544 embedded arrays (whether in pattern or not) could be:
2545 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2546
2547 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2548
2549 $ in pattern could be $foo or could be tail anchor. Assumption:
2550 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2551 followed by one of "()| \r\n\t"
02aa26ce
NT
2552
2553 \1 (backreferences) are turned into $1
2554
2555 The structure of the code is
2556 while (there's a character to process) {
94def140
TS
2557 handle transliteration ranges
2558 skip regexp comments /(?#comment)/ and codes /(?{code})/
2559 skip #-initiated comments in //x patterns
2560 check for embedded arrays
02aa26ce
NT
2561 check for embedded scalars
2562 if (backslash) {
94def140 2563 deprecate \1 in substitution replacements
02aa26ce
NT
2564 handle string-changing backslashes \l \U \Q \E, etc.
2565 switch (what was escaped) {
94def140 2566 handle \- in a transliteration (becomes a literal -)
ff3f963a 2567 if a pattern and not \N{, go treat as regular character
94def140
TS
2568 handle \132 (octal characters)
2569 handle \x15 and \x{1234} (hex characters)
ff3f963a 2570 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2571 handle \cV (control characters)
2572 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2573 } (end switch)
77a135fe 2574 continue
02aa26ce 2575 } (end if backslash)
77a135fe 2576 handle regular character
02aa26ce 2577 } (end while character to read)
4e553d73 2578
02aa26ce
NT
2579*/
2580
76e3520e 2581STATIC char *
cea2e8a9 2582S_scan_const(pTHX_ char *start)
79072805 2583{
97aff369 2584 dVAR;
3280af22 2585 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2586 SV *sv = newSV(send - start); /* sv for the constant. See
2587 note below on sizing. */
02aa26ce
NT
2588 register char *s = start; /* start of the constant */
2589 register char *d = SvPVX(sv); /* destination for copies */
2590 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2591 bool didrange = FALSE; /* did we just finish a range? */
2b9d42f0 2592 I32 has_utf8 = FALSE; /* Output constant is UTF8 */
77a135fe
KW
2593 I32 this_utf8 = UTF; /* Is the source string assumed
2594 to be UTF8? But, this can
2595 show as true when the source
2596 isn't utf8, as for example
2597 when it is entirely composed
2598 of hex constants */
2599
2600 /* Note on sizing: The scanned constant is placed into sv, which is
2601 * initialized by newSV() assuming one byte of output for every byte of
2602 * input. This routine expects newSV() to allocate an extra byte for a
2603 * trailing NUL, which this routine will append if it gets to the end of
2604 * the input. There may be more bytes of input than output (eg., \N{LATIN
2605 * CAPITAL LETTER A}), or more output than input if the constant ends up
2606 * recoded to utf8, but each time a construct is found that might increase
2607 * the needed size, SvGROW() is called. Its size parameter each time is
2608 * based on the best guess estimate at the time, namely the length used so
2609 * far, plus the length the current construct will occupy, plus room for
2610 * the trailing NUL, plus one byte for every input byte still unscanned */
2611
012bcf8d 2612 UV uv;
4c3a8340
TS
2613#ifdef EBCDIC
2614 UV literal_endpoint = 0;
e294cc5d 2615 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2616#endif
012bcf8d 2617
7918f24d
NC
2618 PERL_ARGS_ASSERT_SCAN_CONST;
2619
bb16bae8 2620 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2621 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2622 /* If we are doing a trans and we know we want UTF8 set expectation */
2623 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2624 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2625 }
2626
2627
79072805 2628 while (s < send || dorange) {
ff3f963a 2629
02aa26ce 2630 /* get transliterations out of the way (they're most literal) */
3280af22 2631 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2632 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2633 if (dorange) {
1ba5c669
JH
2634 I32 i; /* current expanded character */
2635 I32 min; /* first character in range */
2636 I32 max; /* last character in range */
02aa26ce 2637
e294cc5d
JH
2638#ifdef EBCDIC
2639 UV uvmax = 0;
2640#endif
2641
2642 if (has_utf8
2643#ifdef EBCDIC
2644 && !native_range
2645#endif
2646 ) {
9d4ba2ae 2647 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2648 char *e = d++;
2649 while (e-- > c)
2650 *(e + 1) = *e;
25716404 2651 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2652 /* mark the range as done, and continue */
2653 dorange = FALSE;
2654 didrange = TRUE;
2655 continue;
2656 }
2b9d42f0 2657
95a20fc0 2658 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2659#ifdef EBCDIC
2660 SvGROW(sv,
2661 SvLEN(sv) + (has_utf8 ?
2662 (512 - UTF_CONTINUATION_MARK +
2663 UNISKIP(0x100))
2664 : 256));
2665 /* How many two-byte within 0..255: 128 in UTF-8,
2666 * 96 in UTF-8-mod. */
2667#else
9cbb5ea2 2668 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2669#endif
9cbb5ea2 2670 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2671#ifdef EBCDIC
2672 if (has_utf8) {
2673 int j;
2674 for (j = 0; j <= 1; j++) {
2675 char * const c = (char*)utf8_hop((U8*)d, -1);
2676 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2677 if (j)
2678 min = (U8)uv;
2679 else if (uv < 256)
2680 max = (U8)uv;
2681 else {
2682 max = (U8)0xff; /* only to \xff */
2683 uvmax = uv; /* \x{100} to uvmax */
2684 }
2685 d = c; /* eat endpoint chars */
2686 }
2687 }
2688 else {
2689#endif
2690 d -= 2; /* eat the first char and the - */
2691 min = (U8)*d; /* first char in range */
2692 max = (U8)d[1]; /* last char in range */
2693#ifdef EBCDIC
2694 }
2695#endif
8ada0baa 2696
c2e66d9e 2697 if (min > max) {
01ec43d0 2698 Perl_croak(aTHX_
d1573ac7 2699 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2700 (char)min, (char)max);
c2e66d9e
GS
2701 }
2702
c7f1f016 2703#ifdef EBCDIC
4c3a8340
TS
2704 if (literal_endpoint == 2 &&
2705 ((isLOWER(min) && isLOWER(max)) ||
2706 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2707 if (isLOWER(min)) {
2708 for (i = min; i <= max; i++)
2709 if (isLOWER(i))
db42d148 2710 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2711 } else {
2712 for (i = min; i <= max; i++)
2713 if (isUPPER(i))
db42d148 2714 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2715 }
2716 }
2717 else
2718#endif
2719 for (i = min; i <= max; i++)
e294cc5d
JH
2720#ifdef EBCDIC
2721 if (has_utf8) {
2722 const U8 ch = (U8)NATIVE_TO_UTF(i);
2723 if (UNI_IS_INVARIANT(ch))
2724 *d++ = (U8)i;
2725 else {
2726 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2727 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2728 }
2729 }
2730 else
2731#endif
2732 *d++ = (char)i;
2733
2734#ifdef EBCDIC
2735 if (uvmax) {
2736 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2737 if (uvmax > 0x101)
2738 *d++ = (char)UTF_TO_NATIVE(0xff);
2739 if (uvmax > 0x100)
2740 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2741 }
2742#endif
02aa26ce
NT
2743
2744 /* mark the range as done, and continue */
79072805 2745 dorange = FALSE;
01ec43d0 2746 didrange = TRUE;
4c3a8340
TS
2747#ifdef EBCDIC
2748 literal_endpoint = 0;
2749#endif
79072805 2750 continue;
4e553d73 2751 }
02aa26ce
NT
2752
2753 /* range begins (ignore - as first or last char) */
79072805 2754 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2755 if (didrange) {
1fafa243 2756 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2757 }
e294cc5d
JH
2758 if (has_utf8
2759#ifdef EBCDIC
2760 && !native_range
2761#endif
2762 ) {
25716404 2763 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2764 s++;
2765 continue;
2766 }
79072805
LW
2767 dorange = TRUE;
2768 s++;
01ec43d0
GS
2769 }
2770 else {
2771 didrange = FALSE;
4c3a8340
TS
2772#ifdef EBCDIC
2773 literal_endpoint = 0;
e294cc5d 2774 native_range = TRUE;
4c3a8340 2775#endif
01ec43d0 2776 }
79072805 2777 }
02aa26ce
NT
2778
2779 /* if we get here, we're not doing a transliteration */
2780
0f5d15d6
IZ
2781 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2782 except for the last char, which will be done separately. */
3280af22 2783 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2784 if (s[2] == '#') {
e994fd66 2785 while (s+1 < send && *s != ')')
db42d148 2786 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2787 }
2788 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2789 || (s[2] == '?' && s[3] == '{'))
155aba94 2790 {
cc6b7395 2791 I32 count = 1;
0f5d15d6 2792 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2793 char c;
2794
d9f97599
GS
2795 while (count && (c = *regparse)) {
2796 if (c == '\\' && regparse[1])
2797 regparse++;
4e553d73 2798 else if (c == '{')
cc6b7395 2799 count++;
4e553d73 2800 else if (c == '}')
cc6b7395 2801 count--;
d9f97599 2802 regparse++;
cc6b7395 2803 }
e994fd66 2804 if (*regparse != ')')
5bdf89e7 2805 regparse--; /* Leave one char for continuation. */
0f5d15d6 2806 while (s < regparse)
db42d148 2807 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2808 }
748a9306 2809 }
02aa26ce
NT
2810
2811 /* likewise skip #-initiated comments in //x patterns */
3280af22
NIS
2812 else if (*s == '#' && PL_lex_inpat &&
2813 ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
748a9306 2814 while (s+1 < send && *s != '\n')
db42d148 2815 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2816 }
02aa26ce 2817
5d1d4326 2818 /* check for embedded arrays
da6eedaa 2819 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2820 */
1749ea0d
TS
2821 else if (*s == '@' && s[1]) {
2822 if (isALNUM_lazy_if(s+1,UTF))
2823 break;
2824 if (strchr(":'{$", s[1]))
2825 break;
2826 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2827 break; /* in regexp, neither @+ nor @- are interpolated */
2828 }
02aa26ce
NT
2829
2830 /* check for embedded scalars. only stop if we're sure it's a
2831 variable.
2832 */
79072805 2833 else if (*s == '$') {
3280af22 2834 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2835 break;
77772344 2836 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2837 if (s[1] == '\\') {
2838 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2839 "Possible unintended interpolation of $\\ in regex");
77772344 2840 }
79072805 2841 break; /* in regexp, $ might be tail anchor */
77772344 2842 }
79072805 2843 }
02aa26ce 2844
2b9d42f0
NIS
2845 /* End of else if chain - OP_TRANS rejoin rest */
2846
02aa26ce 2847 /* backslashes */
79072805 2848 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2849 char* e; /* Can be used for ending '}', etc. */
2850
79072805 2851 s++;
02aa26ce 2852
7d0fc23c
KW
2853 /* warn on \1 - \9 in substitution replacements, but note that \11
2854 * is an octal; and \19 is \1 followed by '9' */
3280af22 2855 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2856 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2857 {
a2a5de95 2858 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2859 *--s = '$';
2860 break;
2861 }
02aa26ce
NT
2862
2863 /* string-change backslash escapes */
3280af22 2864 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2865 --s;
2866 break;
2867 }
ff3f963a
KW
2868 /* In a pattern, process \N, but skip any other backslash escapes.
2869 * This is because we don't want to translate an escape sequence
2870 * into a meta symbol and have the regex compiler use the meta
2871 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2872 * in spite of this, we do have to process \N here while the proper
2873 * charnames handler is in scope. See bugs #56444 and #62056.
2874 * There is a complication because \N in a pattern may also stand
2875 * for 'match a non-nl', and not mean a charname, in which case its
2876 * processing should be deferred to the regex compiler. To be a
2877 * charname it must be followed immediately by a '{', and not look
2878 * like \N followed by a curly quantifier, i.e., not something like
2879 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2880 * quantifier */
2881 else if (PL_lex_inpat
2882 && (*s != 'N'
2883 || s[1] != '{'
2884 || regcurly(s + 1)))
2885 {
cc74c5bd
TS
2886 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2887 goto default_action;
2888 }
02aa26ce 2889
79072805 2890 switch (*s) {
02aa26ce
NT
2891
2892 /* quoted - in transliterations */
79072805 2893 case '-':
3280af22 2894 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2895 *d++ = *s++;
2896 continue;
2897 }
2898 /* FALL THROUGH */
2899 default:
11b8faa4 2900 {
a2a5de95
NC
2901 if ((isALPHA(*s) || isDIGIT(*s)))
2902 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2903 "Unrecognized escape \\%c passed through",
2904 *s);
11b8faa4 2905 /* default action is to copy the quoted character */
f9a63242 2906 goto default_action;
11b8faa4 2907 }
02aa26ce 2908
632403cc 2909 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2910 case '0': case '1': case '2': case '3':
2911 case '4': case '5': case '6': case '7':
ba210ebe 2912 {
53305cf1
NC
2913 I32 flags = 0;
2914 STRLEN len = 3;
77a135fe 2915 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2916 s += len;
2917 }
012bcf8d 2918 goto NUM_ESCAPE_INSERT;
02aa26ce 2919
f0a2b745
KW
2920 /* eg. \o{24} indicates the octal constant \024 */
2921 case 'o':
2922 {
2923 STRLEN len;
454155d9 2924 const char* error;
f0a2b745 2925
454155d9 2926 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2927 s += len;
454155d9 2928 if (! valid) {
f0a2b745
KW
2929 yyerror(error);
2930 continue;
2931 }
2932 goto NUM_ESCAPE_INSERT;
2933 }
2934
77a135fe 2935 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2936 case 'x':
a0ed51b3
LW
2937 ++s;
2938 if (*s == '{') {
9d4ba2ae 2939 char* const e = strchr(s, '}');
a4c04bdc
NC
2940 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2941 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2942 STRLEN len;
355860ce 2943
53305cf1 2944 ++s;
adaeee49 2945 if (!e) {
a0ed51b3 2946 yyerror("Missing right brace on \\x{}");
355860ce 2947 continue;
ba210ebe 2948 }
53305cf1 2949 len = e - s;
77a135fe 2950 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2951 s = e + 1;
a0ed51b3
LW
2952 }
2953 else {
ba210ebe 2954 {
53305cf1 2955 STRLEN len = 2;
a4c04bdc 2956 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 2957 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
2958 s += len;
2959 }
012bcf8d
GS
2960 }
2961
2962 NUM_ESCAPE_INSERT:
ff3f963a
KW
2963 /* Insert oct or hex escaped character. There will always be
2964 * enough room in sv since such escapes will be longer than any
2965 * UTF-8 sequence they can end up as, except if they force us
2966 * to recode the rest of the string into utf8 */
ba7cea30 2967
77a135fe 2968 /* Here uv is the ordinal of the next character being added in
ff3f963a 2969 * unicode (converted from native). */
77a135fe 2970 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 2971 if (!has_utf8 && uv > 255) {
77a135fe
KW
2972 /* Might need to recode whatever we have accumulated so
2973 * far if it contains any chars variant in utf8 or
2974 * utf-ebcdic. */
2975
2976 SvCUR_set(sv, d - SvPVX_const(sv));
2977 SvPOK_on(sv);
2978 *d = '\0';
77a135fe 2979 /* See Note on sizing above. */
7bf79863
KW
2980 sv_utf8_upgrade_flags_grow(sv,
2981 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
2982 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
2983 d = SvPVX(sv) + SvCUR(sv);
2984 has_utf8 = TRUE;
012bcf8d
GS
2985 }
2986
77a135fe
KW
2987 if (has_utf8) {
2988 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
2989 if (PL_lex_inwhat == OP_TRANS &&
2990 PL_sublex_info.sub_op) {
2991 PL_sublex_info.sub_op->op_private |=
2992 (PL_lex_repl ? OPpTRANS_FROM_UTF
2993 : OPpTRANS_TO_UTF);
f9a63242 2994 }
e294cc5d
JH
2995#ifdef EBCDIC
2996 if (uv > 255 && !dorange)
2997 native_range = FALSE;
2998#endif
012bcf8d 2999 }
a0ed51b3 3000 else {
012bcf8d 3001 *d++ = (char)uv;
a0ed51b3 3002 }
012bcf8d
GS
3003 }
3004 else {
c4d5f83a 3005 *d++ = (char) uv;
a0ed51b3 3006 }
79072805 3007 continue;
02aa26ce 3008
4a2d328f 3009 case 'N':
ff3f963a
KW
3010 /* In a non-pattern \N must be a named character, like \N{LATIN
3011 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3012 * mean to match a non-newline. For non-patterns, named
3013 * characters are converted to their string equivalents. In
3014 * patterns, named characters are not converted to their
3015 * ultimate forms for the same reasons that other escapes
3016 * aren't. Instead, they are converted to the \N{U+...} form
3017 * to get the value from the charnames that is in effect right
3018 * now, while preserving the fact that it was a named character
3019 * so that the regex compiler knows this */
3020
3021 /* This section of code doesn't generally use the
3022 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3023 * a close examination of this macro and determined it is a
3024 * no-op except on utfebcdic variant characters. Every
3025 * character generated by this that would normally need to be
3026 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3027 * needed, and would complicate use of copy(). XXX There are
3028 * other parts of this file where the macro is used
3029 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3030
3031 /* The structure of this section of code (besides checking for
3032 * errors and upgrading to utf8) is:
3033 * Further disambiguate between the two meanings of \N, and if
3034 * not a charname, go process it elsewhere
0a96133f
KW
3035 * If of form \N{U+...}, pass it through if a pattern;
3036 * otherwise convert to utf8
3037 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3038 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3039
3040 /* Here, s points to the 'N'; the test below is guaranteed to
3041 * succeed if we are being called on a pattern as we already
3042 * know from a test above that the next character is a '{'.
3043 * On a non-pattern \N must mean 'named sequence, which
3044 * requires braces */
3045 s++;
3046 if (*s != '{') {
3047 yyerror("Missing braces on \\N{}");
3048 continue;
3049 }
3050 s++;
3051
0a96133f 3052 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3053 if (! (e = strchr(s, '}'))) {
3054 if (! PL_lex_inpat) {
5777a3f7 3055 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3056 } else {
3057 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3058 }
0a96133f 3059 continue;
ff3f963a 3060 }
cddc7ef4 3061
ff3f963a 3062 /* Here it looks like a named character */
cddc7ef4 3063
ff3f963a
KW
3064 if (PL_lex_inpat) {
3065
3066 /* XXX This block is temporary code. \N{} implies that the
3067 * pattern is to have Unicode semantics, and therefore
3068 * currently has to be encoded in utf8. By putting it in
3069 * utf8 now, we save a whole pass in the regular expression
3070 * compiler. Once that code is changed so Unicode
3071 * semantics doesn't necessarily have to be in utf8, this
3072 * block should be removed */
3073 if (!has_utf8) {
77a135fe 3074 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3075 SvPOK_on(sv);
e4f3eed8 3076 *d = '\0';
77a135fe 3077 /* See Note on sizing above. */
7bf79863 3078 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3079 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3080 /* 5 = '\N{' + cur char + NUL */
3081 (STRLEN)(send - s) + 5);
f08d6ad9 3082 d = SvPVX(sv) + SvCUR(sv);
89491803 3083 has_utf8 = TRUE;
ff3f963a
KW
3084 }
3085 }
423cee85 3086
ff3f963a
KW
3087 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3088 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3089 | PERL_SCAN_DISALLOW_PREFIX;
3090 STRLEN len;
3091
3092 /* For \N{U+...}, the '...' is a unicode value even on
3093 * EBCDIC machines */
3094 s += 2; /* Skip to next char after the 'U+' */
3095 len = e - s;
3096 uv = grok_hex(s, &len, &flags, NULL);
3097 if (len == 0 || len != (STRLEN)(e - s)) {
3098 yyerror("Invalid hexadecimal number in \\N{U+...}");
3099 s = e + 1;
3100 continue;
3101 }
3102
3103 if (PL_lex_inpat) {
3104
3105 /* Pass through to the regex compiler unchanged. The
3106 * reason we evaluated the number above is to make sure
0a96133f 3107 * there wasn't a syntax error. */
ff3f963a
KW
3108 s -= 5; /* Include the '\N{U+' */
3109 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3110 d += e - s + 1;
3111 }
3112 else { /* Not a pattern: convert the hex to string */
3113
3114 /* If destination is not in utf8, unconditionally
3115 * recode it to be so. This is because \N{} implies
3116 * Unicode semantics, and scalars have to be in utf8
3117 * to guarantee those semantics */
3118 if (! has_utf8) {
3119 SvCUR_set(sv, d - SvPVX_const(sv));
3120 SvPOK_on(sv);
3121 *d = '\0';
3122 /* See Note on sizing above. */
3123 sv_utf8_upgrade_flags_grow(
3124 sv,
3125 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3126 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3127 d = SvPVX(sv) + SvCUR(sv);
3128 has_utf8 = TRUE;
3129 }
3130
3131 /* Add the string to the output */
3132 if (UNI_IS_INVARIANT(uv)) {
3133 *d++ = (char) uv;
3134 }
3135 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3136 }
3137 }
3138 else { /* Here is \N{NAME} but not \N{U+...}. */
3139
3140 SV *res; /* result from charnames */
3141 const char *str; /* the string in 'res' */
3142 STRLEN len; /* its length */
3143
3144 /* Get the value for NAME */
3145 res = newSVpvn(s, e - s);
3146 res = new_constant( NULL, 0, "charnames",
3147 /* includes all of: \N{...} */
3148 res, NULL, s - 3, e - s + 4 );
3149
3150 /* Most likely res will be in utf8 already since the
3151 * standard charnames uses pack U, but a custom translator
3152 * can leave it otherwise, so make sure. XXX This can be
3153 * revisited to not have charnames use utf8 for characters
3154 * that don't need it when regexes don't have to be in utf8
3155 * for Unicode semantics. If doing so, remember EBCDIC */
3156 sv_utf8_upgrade(res);
3157 str = SvPV_const(res, len);
3158
3159 /* Don't accept malformed input */
3160 if (! is_utf8_string((U8 *) str, len)) {
3161 yyerror("Malformed UTF-8 returned by \\N");
3162 }
3163 else if (PL_lex_inpat) {
3164
3165 if (! len) { /* The name resolved to an empty string */
3166 Copy("\\N{}", d, 4, char);
3167 d += 4;
3168 }
3169 else {
3170 /* In order to not lose information for the regex
3171 * compiler, pass the result in the specially made
3172 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3173 * the code points in hex of each character
3174 * returned by charnames */
3175
3176 const char *str_end = str + len;
3177 STRLEN char_length; /* cur char's byte length */
3178 STRLEN output_length; /* and the number of bytes
3179 after this is translated
3180 into hex digits */
3181 const STRLEN off = d - SvPVX_const(sv);
3182
3183 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3184 * max('U+', '.'); and 1 for NUL */
3185 char hex_string[2 * UTF8_MAXBYTES + 5];
3186
3187 /* Get the first character of the result. */
3188 U32 uv = utf8n_to_uvuni((U8 *) str,
3189 len,
3190 &char_length,
3191 UTF8_ALLOW_ANYUV);
3192
3193 /* The call to is_utf8_string() above hopefully
3194 * guarantees that there won't be an error. But
3195 * it's easy here to make sure. The function just
3196 * above warns and returns 0 if invalid utf8, but
3197 * it can also return 0 if the input is validly a
3198 * NUL. Disambiguate */
3199 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3200 uv = UNICODE_REPLACEMENT;
3201 }
3202
3203 /* Convert first code point to hex, including the
3204 * boiler plate before it */
78c35590 3205 output_length =
3353de27
NC
3206 my_snprintf(hex_string, sizeof(hex_string),
3207 "\\N{U+%X", (unsigned int) uv);
ff3f963a
KW
3208
3209 /* Make sure there is enough space to hold it */
3210 d = off + SvGROW(sv, off
3211 + output_length
3212 + (STRLEN)(send - e)
3213 + 2); /* '}' + NUL */
3214 /* And output it */
3215 Copy(hex_string, d, output_length, char);
3216 d += output_length;
3217
3218 /* For each subsequent character, append dot and
3219 * its ordinal in hex */
3220 while ((str += char_length) < str_end) {
3221 const STRLEN off = d - SvPVX_const(sv);
3222 U32 uv = utf8n_to_uvuni((U8 *) str,
3223 str_end - str,
3224 &char_length,
3225 UTF8_ALLOW_ANYUV);
3226 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3227 uv = UNICODE_REPLACEMENT;
3228 }
3229
78c35590 3230 output_length =
3353de27
NC
3231 my_snprintf(hex_string, sizeof(hex_string),
3232 ".%X", (unsigned int) uv);
ff3f963a
KW
3233
3234 d = off + SvGROW(sv, off
3235 + output_length
3236 + (STRLEN)(send - e)
3237 + 2); /* '}' + NUL */
3238 Copy(hex_string, d, output_length, char);
3239 d += output_length;
3240 }
3241
3242 *d++ = '}'; /* Done. Add the trailing brace */
3243 }
3244 }
3245 else { /* Here, not in a pattern. Convert the name to a
3246 * string. */
3247
3248 /* If destination is not in utf8, unconditionally
3249 * recode it to be so. This is because \N{} implies
3250 * Unicode semantics, and scalars have to be in utf8
3251 * to guarantee those semantics */
3252 if (! has_utf8) {
3253 SvCUR_set(sv, d - SvPVX_const(sv));
3254 SvPOK_on(sv);
3255 *d = '\0';
3256 /* See Note on sizing above. */
3257 sv_utf8_upgrade_flags_grow(sv,
3258 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3259 len + (STRLEN)(send - s) + 1);
3260 d = SvPVX(sv) + SvCUR(sv);
3261 has_utf8 = TRUE;
3262 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3263
3264 /* See Note on sizing above. (NOTE: SvCUR() is not
3265 * set correctly here). */
3266 const STRLEN off = d - SvPVX_const(sv);
3267 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3268 }
3269 Copy(str, d, len, char);
3270 d += len;
423cee85 3271 }
423cee85 3272 SvREFCNT_dec(res);
cb233ae3
KW
3273
3274 /* Deprecate non-approved name syntax */
3275 if (ckWARN_d(WARN_DEPRECATED)) {
3276 bool problematic = FALSE;
3277 char* i = s;
3278
3279 /* For non-ut8 input, look to see that the first
3280 * character is an alpha, then loop through the rest
3281 * checking that each is a continuation */
3282 if (! this_utf8) {
3283 if (! isALPHAU(*i)) problematic = TRUE;
3284 else for (i = s + 1; i < e; i++) {
3285 if (isCHARNAME_CONT(*i)) continue;
3286 problematic = TRUE;
3287 break;
3288 }
3289 }
3290 else {
3291 /* Similarly for utf8. For invariants can check
3292 * directly. We accept anything above the latin1
3293 * range because it is immaterial to Perl if it is
3294 * correct or not, and is expensive to check. But
3295 * it is fairly easy in the latin1 range to convert
3296 * the variants into a single character and check
3297 * those */
3298 if (UTF8_IS_INVARIANT(*i)) {
3299 if (! isALPHAU(*i)) problematic = TRUE;
3300 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3301 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3302 *(i+1)))))
3303 {
3304 problematic = TRUE;
3305 }
3306 }
3307 if (! problematic) for (i = s + UTF8SKIP(s);
3308 i < e;
3309 i+= UTF8SKIP(i))
3310 {
3311 if (UTF8_IS_INVARIANT(*i)) {
3312 if (isCHARNAME_CONT(*i)) continue;
3313 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3314 continue;
3315 } else if (isCHARNAME_CONT(
3316 UNI_TO_NATIVE(
81c14aa2 3317 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3318 {
3319 continue;
3320 }
3321 problematic = TRUE;
3322 break;
3323 }
3324 }
3325 if (problematic) {
6e1bad6c
KW
3326 /* The e-i passed to the final %.*s makes sure that
3327 * should the trailing NUL be missing that this
3328 * print won't run off the end of the string */
cb233ae3 3329 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3330 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3331 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3332 }
3333 }
3334 } /* End \N{NAME} */
ff3f963a
KW
3335#ifdef EBCDIC
3336 if (!dorange)
3337 native_range = FALSE; /* \N{} is defined to be Unicode */
3338#endif
3339 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3340 continue;
3341
02aa26ce 3342 /* \c is a control character */
79072805
LW
3343 case 'c':
3344 s++;
961ce445 3345 if (s < send) {
f9d13529 3346 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3347 }
961ce445
RGS
3348 else {
3349 yyerror("Missing control char name in \\c");
3350 }
79072805 3351 continue;
02aa26ce
NT
3352
3353 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3354 case 'b':
db42d148 3355 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3356 break;
3357 case 'n':
db42d148 3358 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3359 break;
3360 case 'r':
db42d148 3361 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3362 break;
3363 case 'f':
db42d148 3364 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3365 break;
3366 case 't':
db42d148 3367 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3368 break;
34a3fe2a 3369 case 'e':
db42d148 3370 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3371 break;
3372 case 'a':
db42d148 3373 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3374 break;
02aa26ce
NT
3375 } /* end switch */
3376
79072805
LW
3377 s++;
3378 continue;
02aa26ce 3379 } /* end if (backslash) */
4c3a8340
TS
3380#ifdef EBCDIC
3381 else
3382 literal_endpoint++;
3383#endif
02aa26ce 3384
f9a63242 3385 default_action:
77a135fe
KW
3386 /* If we started with encoded form, or already know we want it,
3387 then encode the next character */
3388 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3389 STRLEN len = 1;
77a135fe
KW
3390
3391
3392 /* One might think that it is wasted effort in the case of the
3393 * source being utf8 (this_utf8 == TRUE) to take the next character
3394 * in the source, convert it to an unsigned value, and then convert
3395 * it back again. But the source has not been validated here. The
3396 * routine that does the conversion checks for errors like
3397 * malformed utf8 */
3398
5f66b61c
AL
3399 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3400 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3401 if (!has_utf8) {
3402 SvCUR_set(sv, d - SvPVX_const(sv));
3403 SvPOK_on(sv);
3404 *d = '\0';
77a135fe 3405 /* See Note on sizing above. */
7bf79863
KW
3406 sv_utf8_upgrade_flags_grow(sv,
3407 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3408 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3409 d = SvPVX(sv) + SvCUR(sv);
3410 has_utf8 = TRUE;
3411 } else if (need > len) {
3412 /* encoded value larger than old, may need extra space (NOTE:
3413 * SvCUR() is not set correctly here). See Note on sizing
3414 * above. */
9d4ba2ae 3415 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3416 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3417 }
77a135fe
KW
3418 s += len;
3419
5f66b61c 3420 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3421#ifdef EBCDIC
3422 if (uv > 255 && !dorange)
3423 native_range = FALSE;
3424#endif
2b9d42f0
NIS
3425 }
3426 else {
3427 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3428 }
02aa26ce
NT
3429 } /* while loop to process each character */
3430
3431 /* terminate the string and set up the sv */
79072805 3432 *d = '\0';
95a20fc0 3433 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3434 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3435 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3436
79072805 3437 SvPOK_on(sv);
9f4817db 3438 if (PL_encoding && !has_utf8) {
d0063567
DK
3439 sv_recode_to_utf8(sv, PL_encoding);
3440 if (SvUTF8(sv))
3441 has_utf8 = TRUE;
9f4817db 3442 }
2b9d42f0 3443 if (has_utf8) {
7e2040f0 3444 SvUTF8_on(sv);
2b9d42f0 3445 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3446 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3447 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3448 }
3449 }
79072805 3450
02aa26ce 3451 /* shrink the sv if we allocated more than we used */
79072805 3452 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3453 SvPV_shrink_to_cur(sv);
79072805 3454 }
02aa26ce 3455
6154021b 3456 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3457 if (s > PL_bufptr) {
eb0d8d16
NC
3458 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3459 const char *const key = PL_lex_inpat ? "qr" : "q";
3460 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3461 const char *type;
3462 STRLEN typelen;
3463
3464 if (PL_lex_inwhat == OP_TRANS) {
3465 type = "tr";
3466 typelen = 2;
3467 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3468 type = "s";
3469 typelen = 1;
3470 } else {
3471 type = "qq";
3472 typelen = 2;
3473 }
3474
3475 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3476 type, typelen);
3477 }
6154021b 3478 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3479 } else
8990e307 3480 SvREFCNT_dec(sv);
79072805
LW
3481 return s;
3482}
3483
ffb4593c
NT
3484/* S_intuit_more
3485 * Returns TRUE if there's more to the expression (e.g., a subscript),
3486 * FALSE otherwise.
ffb4593c
NT
3487 *
3488 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3489 *
3490 * ->[ and ->{ return TRUE
3491 * { and [ outside a pattern are always subscripts, so return TRUE
3492 * if we're outside a pattern and it's not { or [, then return FALSE
3493 * if we're in a pattern and the first char is a {
3494 * {4,5} (any digits around the comma) returns FALSE
3495 * if we're in a pattern and the first char is a [
3496 * [] returns FALSE
3497 * [SOMETHING] has a funky algorithm to decide whether it's a
3498 * character class or not. It has to deal with things like
3499 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3500 * anything else returns TRUE
3501 */
3502
9cbb5ea2
GS
3503/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3504
76e3520e 3505STATIC int
cea2e8a9 3506S_intuit_more(pTHX_ register char *s)
79072805 3507{
97aff369 3508 dVAR;
7918f24d
NC
3509
3510 PERL_ARGS_ASSERT_INTUIT_MORE;
3511
3280af22 3512 if (PL_lex_brackets)
79072805
LW
3513 return TRUE;
3514 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3515 return TRUE;
3516 if (*s != '{' && *s != '[')
3517 return FALSE;
3280af22 3518 if (!PL_lex_inpat)
79072805
LW
3519 return TRUE;
3520
3521 /* In a pattern, so maybe we have {n,m}. */
3522 if (*s == '{') {
b3155d95 3523 if (regcurly(s)) {
79072805 3524 return FALSE;
b3155d95 3525 }
79072805 3526 return TRUE;
79072805
LW
3527 }
3528
3529 /* On the other hand, maybe we have a character class */
3530
3531 s++;
3532 if (*s == ']' || *s == '^')
3533 return FALSE;
3534 else {
ffb4593c 3535 /* this is terrifying, and it works */
79072805
LW
3536 int weight = 2; /* let's weigh the evidence */
3537 char seen[256];
f27ffc4a 3538 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3539 const char * const send = strchr(s,']');
3280af22 3540 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3541
3542 if (!send) /* has to be an expression */
3543 return TRUE;
3544
3545 Zero(seen,256,char);
3546 if (*s == '$')
3547 weight -= 3;
3548 else if (isDIGIT(*s)) {
3549 if (s[1] != ']') {
3550 if (isDIGIT(s[1]) && s[2] == ']')
3551 weight -= 10;
3552 }
3553 else
3554 weight -= 100;
3555 }
3556 for (; s < send; s++) {
3557 last_un_char = un_char;
3558 un_char = (unsigned char)*s;
3559 switch (*s) {
3560 case '@':
3561 case '&':
3562 case '$':
3563 weight -= seen[un_char] * 10;
7e2040f0 3564 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3565 int len;
8903cb82 3566 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3567 len = (int)strlen(tmpbuf);
3568 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3569 weight -= 100;
3570 else
3571 weight -= 10;
3572 }
3573 else if (*s == '$' && s[1] &&
93a17b20
LW
3574 strchr("[#!%*<>()-=",s[1])) {
3575 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3576 weight -= 10;
3577 else
3578 weight -= 1;
3579 }
3580 break;
3581 case '\\':
3582 un_char = 254;
3583 if (s[1]) {
93a17b20 3584 if (strchr("wds]",s[1]))
79072805 3585 weight += 100;
10edeb5d 3586 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3587 weight += 1;
93a17b20 3588 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3589 weight += 40;
3590 else if (isDIGIT(s[1])) {
3591 weight += 40;
3592 while (s[1] && isDIGIT(s[1]))
3593 s++;
3594 }
3595 }
3596 else
3597 weight += 100;
3598 break;
3599 case '-':
3600 if (s[1] == '\\')
3601 weight += 50;
93a17b20 3602 if (strchr("aA01! ",last_un_char))
79072805 3603 weight += 30;
93a17b20 3604 if (strchr("zZ79~",s[1]))
79072805 3605 weight += 30;
f27ffc4a
GS
3606 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3607 weight -= 5; /* cope with negative subscript */
79072805
LW
3608 break;
3609 default:
3792a11b
NC
3610 if (!isALNUM(last_un_char)
3611 && !(last_un_char == '$' || last_un_char == '@'
3612 || last_un_char == '&')
3613 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3614 char *d = tmpbuf;
3615 while (isALPHA(*s))
3616 *d++ = *s++;
3617 *d = '\0';
5458a98a 3618 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3619 weight -= 150;
3620 }
3621 if (un_char == last_un_char + 1)
3622 weight += 5;
3623 weight -= seen[un_char];
3624 break;
3625 }
3626 seen[un_char]++;
3627 }
3628 if (weight >= 0) /* probably a character class */
3629 return FALSE;
3630 }
3631
3632 return TRUE;
3633}
ffed7fef 3634
ffb4593c
NT
3635/*
3636 * S_intuit_method
3637 *
3638 * Does all the checking to disambiguate
3639 * foo bar
3640 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3641 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3642 *
3643 * First argument is the stuff after the first token, e.g. "bar".
3644 *
3645 * Not a method if bar is a filehandle.
3646 * Not a method if foo is a subroutine prototyped to take a filehandle.
3647 * Not a method if it's really "Foo $bar"
3648 * Method if it's "foo $bar"
3649 * Not a method if it's really "print foo $bar"
3650 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3651 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3652 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3653 * =>
3654 */
3655
76e3520e 3656STATIC int
62d55b22 3657S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3658{
97aff369 3659 dVAR;
a0d0e21e 3660 char *s = start + (*start == '$');
3280af22 3661 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3662 STRLEN len;
3663 GV* indirgv;
5db06880
NC
3664#ifdef PERL_MAD
3665 int soff;
3666#endif
a0d0e21e 3667
7918f24d
NC
3668 PERL_ARGS_ASSERT_INTUIT_METHOD;
3669
a0d0e21e 3670 if (gv) {
62d55b22 3671 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3672 return 0;
62d55b22
NC
3673 if (cv) {
3674 if (SvPOK(cv)) {
3675 const char *proto = SvPVX_const(cv);
3676 if (proto) {
3677 if (*proto == ';')
3678 proto++;
3679 if (*proto == '*')
3680 return 0;
3681 }
b6c543e3
IZ
3682 }
3683 } else
c35e046a 3684 gv = NULL;
a0d0e21e 3685 }
8903cb82 3686 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3687 /* start is the beginning of the possible filehandle/object,
3688 * and s is the end of it
3689 * tmpbuf is a copy of it
3690 */
3691
a0d0e21e 3692 if (*start == '$') {
3ef1310e
RGS
3693 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3694 isUPPER(*PL_tokenbuf))
a0d0e21e 3695 return 0;
5db06880
NC
3696#ifdef PERL_MAD
3697 len = start - SvPVX(PL_linestr);
3698#endif
29595ff2 3699 s = PEEKSPACE(s);
f0092767 3700#ifdef PERL_MAD
5db06880
NC
3701 start = SvPVX(PL_linestr) + len;
3702#endif
3280af22
NIS
3703 PL_bufptr = start;
3704 PL_expect = XREF;
a0d0e21e
LW
3705 return *s == '(' ? FUNCMETH : METHOD;
3706 }
5458a98a 3707 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3708 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3709 len -= 2;
3710 tmpbuf[len] = '\0';
5db06880
NC
3711#ifdef PERL_MAD
3712 soff = s - SvPVX(PL_linestr);
3713#endif
c3e0f903
GS
3714 goto bare_package;
3715 }
90e5519e 3716 indirgv = gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PVCV);
8ebc5c01 3717 if (indirgv && GvCVu(indirgv))
a0d0e21e
LW
3718 return 0;
3719 /* filehandle or package name makes it a method */
da51bb9b 3720 if (!gv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, 0)) {
5db06880
NC
3721#ifdef PERL_MAD
3722 soff = s - SvPVX(PL_linestr);
3723#endif
29595ff2 3724 s = PEEKSPACE(s);
3280af22 3725 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
55497cff 3726 return 0; /* no assumptions -- "=>" quotes bearword */
c3e0f903 3727 bare_package:
cd81e915 3728 start_force(PL_curforce);
9ded7720 3729 NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
64142370 3730 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
9ded7720 3731 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
5db06880
NC
3732 if (PL_madskills)
3733 curmad('X', newSVpvn(start,SvPVX(PL_linestr) + soff - start));
3280af22 3734 PL_expect = XTERM;
a0d0e21e 3735 force_next(WORD);
3280af22 3736 PL_bufptr = s;
5db06880
NC
3737#ifdef PERL_MAD
3738 PL_bufptr = SvPVX(PL_linestr) + soff; /* restart before space */
3739#endif
a0d0e21e
LW
3740 return *s == '(' ? FUNCMETH : METHOD;
3741 }
3742 }
3743 return 0;
3744}
3745
16d20bd9 3746/* Encoded script support. filter_add() effectively inserts a
4e553d73 3747 * 'pre-processing' function into the current source input stream.
16d20bd9
AD
3748 * Note that the filter function only applies to the current source file
3749 * (e.g., it will not affect files 'require'd or 'use'd by this one).
3750 *
3751 * The datasv parameter (which may be NULL) can be used to pass
3752 * private data to this instance of the filter. The filter function
3753 * can recover the SV using the FILTER_DATA macro and use it to
3754 * store private buffers and state information.
3755 *
3756 * The supplied datasv parameter is upgraded to a PVIO type
4755096e 3757 * and the IoDIRP/IoANY field is used to store the function pointer,
e0c19803 3758 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
16d20bd9
AD
3759 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
3760 * private use must be set using malloc'd pointers.
3761 */
16d20bd9
AD
3762
3763SV *
864dbfa3 3764Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
16d20bd9 3765{
97aff369 3766 dVAR;
f4c556ac 3767 if (!funcp)
a0714e2c 3768 return NULL;
f4c556ac 3769
5486870f
DM
3770 if (!PL_parser)
3771 return NULL;
3772