3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmnU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "invlist_inline.h"
43 #define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
97 static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
99 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
101 #define XENUMMASK 0x3f
102 #define XFAKEEOF 0x40
103 #define XFAKEBRACK 0x80
105 #ifdef USE_UTF8_SCRIPTS
106 # define UTF cBOOL(!IN_BYTES)
108 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
111 /* The maximum number of characters preceding the unrecognized one to display */
112 #define UNRECOGNIZED_PRECEDE_COUNT 10
114 /* In variables named $^X, these are the legal values for X.
115 * 1999-02-27 mjd-perl-patch@plover.com */
116 #define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
118 #define SPACE_OR_TAB(c) isBLANK_A(c)
120 #define HEXFP_PEEK(s) \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
125 /* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
127 * can get by with a single comparison (if the compiler is smart enough).
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
133 /* #define LEX_NOTPARSING 11 is done in perl.h. */
135 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
136 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
141 /* at end of code, eg "$x" followed by: */
142 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
145 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147 #define LEX_INTERPCONST 2 /* NOT USED */
148 #define LEX_FORMLINE 1 /* expecting a format line */
150 /* returned to yyl_try() to request it to retry the parse loop, expected to only
151 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
154 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
155 other token values are 258 or higher (see perly.h), so -1 should be
158 #define YYL_RETRY (-1)
161 static const char* const lex_state_names[] = {
176 #include "keywords.h"
178 /* CLINE is a macro that ensures PL_copline has a sane value */
180 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
183 * Convenience functions to return different tokens and prime the
184 * lexer for the next token. They all take an argument.
186 * TOKEN : generic token (used for '(', DOLSHARP, etc)
187 * OPERATOR : generic operator
188 * AOPERATOR : assignment operator
189 * PREBLOCK : beginning the block after an if, while, foreach, ...
190 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
191 * PREREF : *EXPR where EXPR is not a simple identifier
192 * TERM : expression term
193 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
194 * LOOPX : loop exiting command (goto, last, dump, etc)
195 * FTST : file test operator
196 * FUN0 : zero-argument function
197 * FUN0OP : zero-argument function, with its op created in this file
198 * FUN1 : not used, except for not, which isn't a UNIOP
199 * BOop : bitwise or or xor
201 * BCop : bitwise complement
202 * SHop : shift operator
203 * PWop : power operator
204 * PMop : pattern-matching operator
205 * Aop : addition-level operator
206 * AopNOASSIGN : addition-level operator that is never part of .=
207 * Mop : multiplication-level operator
208 * ChEop : chaining equality-testing operator
209 * NCEop : non-chaining comparison operator at equality precedence
210 * ChRop : chaining relational operator <= != gt
211 * NCRop : non-chaining relational operator isa
213 * Also see LOP and lop() below.
216 #ifdef DEBUGGING /* Serve -DT. */
217 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
219 # define REPORT(retval) (retval)
222 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
223 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
224 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
225 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
226 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
227 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
228 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
229 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
230 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
232 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
234 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
235 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
236 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
237 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
238 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
239 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
240 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
242 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
243 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
244 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
245 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
246 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
247 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
248 #define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
249 #define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
250 #define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
251 #define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
253 /* This bit of chicanery makes a unary function followed by
254 * a parenthesis into a function with one argument, highest precedence.
255 * The UNIDOR macro is for unary functions that can be followed by the //
256 * operator (such as C<shift // 0>).
258 #define UNI3(f,x,have_x) { \
259 pl_yylval.ival = f; \
260 if (have_x) PL_expect = x; \
262 PL_last_uni = PL_oldbufptr; \
263 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
265 return REPORT( (int)FUNC1 ); \
267 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
269 #define UNI(f) UNI3(f,XTERM,1)
270 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
271 #define UNIPROTO(f,optional) { \
272 if (optional) PL_last_uni = PL_oldbufptr; \
276 #define UNIBRACK(f) UNI3(f,0,0)
278 /* grandfather return to old style */
281 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
282 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
283 pl_yylval.ival = (f); \
289 #define COPLINE_INC_WITH_HERELINES \
291 CopLINE_inc(PL_curcop); \
292 if (PL_parser->herelines) \
293 CopLINE(PL_curcop) += PL_parser->herelines, \
294 PL_parser->herelines = 0; \
296 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
297 * is no sublex_push to follow. */
298 #define COPLINE_SET_FROM_MULTI_END \
300 CopLINE_set(PL_curcop, PL_multi_end); \
301 if (PL_multi_end != PL_multi_start) \
302 PL_parser->herelines = 0; \
306 /* A file-local structure for passing around information about subroutines and
307 * related definable words */
317 static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
321 /* how to interpret the pl_yylval associated with the token */
325 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
330 #define DEBUG_TOKEN(Type, Name) \
331 { Name, TOKENTYPE_##Type, #Name }
333 static struct debug_tokens {
335 enum token_type type;
337 } const debug_tokens[] =
339 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
340 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
341 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
342 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
343 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
344 { ARROW, TOKENTYPE_NONE, "ARROW" },
345 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
346 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
347 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
348 { CHEQOP, TOKENTYPE_OPNUM, "CHEQOP" },
349 { CHRELOP, TOKENTYPE_OPNUM, "CHRELOP" },
350 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
351 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
352 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
353 { DO, TOKENTYPE_NONE, "DO" },
354 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
355 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
356 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
357 { ELSE, TOKENTYPE_NONE, "ELSE" },
358 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
359 { FOR, TOKENTYPE_IVAL, "FOR" },
360 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
361 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
362 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
363 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
364 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
365 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
366 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
367 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
368 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
369 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
370 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
371 { IF, TOKENTYPE_IVAL, "IF" },
372 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
373 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
374 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
375 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
376 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
377 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
378 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
379 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
380 { MY, TOKENTYPE_IVAL, "MY" },
381 { NCEQOP, TOKENTYPE_OPNUM, "NCEQOP" },
382 { NCRELOP, TOKENTYPE_OPNUM, "NCRELOP" },
383 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
384 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
385 { OROP, TOKENTYPE_IVAL, "OROP" },
386 { OROR, TOKENTYPE_NONE, "OROR" },
387 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
388 DEBUG_TOKEN (IVAL, PERLY_AMPERSAND),
389 DEBUG_TOKEN (IVAL, PERLY_BRACE_CLOSE),
390 DEBUG_TOKEN (IVAL, PERLY_BRACE_OPEN),
391 DEBUG_TOKEN (IVAL, PERLY_BRACKET_CLOSE),
392 DEBUG_TOKEN (IVAL, PERLY_BRACKET_OPEN),
393 DEBUG_TOKEN (IVAL, PERLY_COLON),
394 DEBUG_TOKEN (IVAL, PERLY_COMMA),
395 DEBUG_TOKEN (IVAL, PERLY_DOT),
396 DEBUG_TOKEN (IVAL, PERLY_EQUAL_SIGN),
397 DEBUG_TOKEN (IVAL, PERLY_EXCLAMATION_MARK),
398 DEBUG_TOKEN (IVAL, PERLY_MINUS),
399 DEBUG_TOKEN (IVAL, PERLY_PAREN_OPEN),
400 DEBUG_TOKEN (IVAL, PERLY_PERCENT_SIGN),
401 DEBUG_TOKEN (IVAL, PERLY_PLUS),
402 DEBUG_TOKEN (IVAL, PERLY_QUESTION_MARK),
403 DEBUG_TOKEN (IVAL, PERLY_SEMICOLON),
404 DEBUG_TOKEN (IVAL, PERLY_SLASH),
405 DEBUG_TOKEN (IVAL, PERLY_SNAIL),
406 DEBUG_TOKEN (IVAL, PERLY_STAR),
407 DEBUG_TOKEN (IVAL, PERLY_TILDE),
408 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
409 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
410 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
411 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
412 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
413 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
414 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
415 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
416 { PREINC, TOKENTYPE_NONE, "PREINC" },
417 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
418 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
419 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
420 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
421 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
422 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
423 { SUB, TOKENTYPE_NONE, "SUB" },
424 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
425 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
426 { THING, TOKENTYPE_OPVAL, "THING" },
427 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
428 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
429 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
430 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
431 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
432 { USE, TOKENTYPE_IVAL, "USE" },
433 { WHEN, TOKENTYPE_IVAL, "WHEN" },
434 { WHILE, TOKENTYPE_IVAL, "WHILE" },
435 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
436 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
437 { 0, TOKENTYPE_NONE, NULL }
442 /* dump the returned token in rv, plus any optional arg in pl_yylval */
445 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
447 PERL_ARGS_ASSERT_TOKEREPORT;
450 const char *name = NULL;
451 enum token_type type = TOKENTYPE_NONE;
452 const struct debug_tokens *p;
453 SV* const report = newSVpvs("<== ");
455 for (p = debug_tokens; p->token; p++) {
456 if (p->token == (int)rv) {
463 Perl_sv_catpv(aTHX_ report, name);
464 else if (isGRAPH(rv))
466 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
468 sv_catpvs(report, " (pending identifier)");
471 sv_catpvs(report, "EOF");
473 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
478 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
480 case TOKENTYPE_OPNUM:
481 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
482 PL_op_name[lvalp->ival]);
485 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
487 case TOKENTYPE_OPVAL:
489 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
490 PL_op_name[lvalp->opval->op_type]);
491 if (lvalp->opval->op_type == OP_CONST) {
492 Perl_sv_catpvf(aTHX_ report, " %s",
493 SvPEEK(cSVOPx_sv(lvalp->opval)));
498 sv_catpvs(report, "(opval=null)");
501 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
507 /* print the buffer with suitable escapes */
510 S_printbuf(pTHX_ const char *const fmt, const char *const s)
512 SV* const tmp = newSVpvs("");
514 PERL_ARGS_ASSERT_PRINTBUF;
516 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
517 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
518 GCC_DIAG_RESTORE_STMT;
527 * This subroutine looks for an '=' next to the operator that has just been
528 * parsed and turns it into an ASSIGNOP if it finds one.
532 S_ao(pTHX_ int toketype)
534 if (*PL_bufptr == '=') {
536 if (toketype == ANDAND)
537 pl_yylval.ival = OP_ANDASSIGN;
538 else if (toketype == OROR)
539 pl_yylval.ival = OP_ORASSIGN;
540 else if (toketype == DORDOR)
541 pl_yylval.ival = OP_DORASSIGN;
544 return REPORT(toketype);
549 * When Perl expects an operator and finds something else, no_op
550 * prints the warning. It always prints "<something> found where
551 * operator expected. It prints "Missing semicolon on previous line?"
552 * if the surprise occurs at the start of the line. "do you need to
553 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
554 * where the compiler doesn't know if foo is a method call or a function.
555 * It prints "Missing operator before end of line" if there's nothing
556 * after the missing operator, or "... before <...>" if there is something
557 * after the missing operator.
559 * PL_bufptr is expected to point to the start of the thing that was found,
560 * and s after the next token or partial token.
564 S_no_op(pTHX_ const char *const what, char *s)
566 char * const oldbp = PL_bufptr;
567 const bool is_first = (PL_oldbufptr == PL_linestart);
569 PERL_ARGS_ASSERT_NO_OP;
575 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
576 if (ckWARN_d(WARN_SYNTAX)) {
578 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
579 "\t(Missing semicolon on previous line?)\n");
580 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
585 for (t = PL_oldoldbufptr;
586 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
587 t += UTF ? UTF8SKIP(t) : 1)
591 if (t < PL_bufptr && isSPACE(*t))
592 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
593 "\t(Do you need to predeclare %" UTF8f "?)\n",
594 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
598 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
599 "\t(Missing operator before %" UTF8f "?)\n",
600 UTF8fARG(UTF, s - oldbp, oldbp));
608 * Complain about missing quote/regexp/heredoc terminator.
609 * If it's called with NULL then it cauterizes the line buffer.
610 * If we're in a delimited string and the delimiter is a control
611 * character, it's reformatted into a two-char sequence like ^C.
616 S_missingterm(pTHX_ char *s, STRLEN len)
618 char tmpbuf[UTF8_MAXBYTES + 1];
623 char * const nl = (char *) my_memrchr(s, '\n', len);
630 else if (PL_multi_close < 32) {
632 tmpbuf[1] = (char)toCTRL(PL_multi_close);
638 if (LIKELY(PL_multi_close < 256)) {
639 *tmpbuf = (char)PL_multi_close;
644 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
651 q = memchr(s, '"', len) ? '\'' : '"';
652 sv = sv_2mortal(newSVpvn(s, len));
655 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
656 " anywhere before EOF", q, SVfARG(sv), q);
662 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
663 * utf16-to-utf8-reversed.
666 #ifdef PERL_CR_FILTER
670 const char *s = SvPVX_const(sv);
671 const char * const e = s + SvCUR(sv);
673 PERL_ARGS_ASSERT_STRIP_RETURN;
675 /* outer loop optimized to do nothing if there are no CR-LFs */
677 if (*s++ == '\r' && *s == '\n') {
678 /* hit a CR-LF, need to copy the rest */
682 if (*s == '\r' && s[1] == '\n')
693 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
695 const I32 count = FILTER_READ(idx+1, sv, maxlen);
696 if (count > 0 && !maxlen)
703 =for apidoc lex_start
705 Creates and initialises a new lexer/parser state object, supplying
706 a context in which to lex and parse from a new source of Perl code.
707 A pointer to the new state object is placed in L</PL_parser>. An entry
708 is made on the save stack so that upon unwinding, the new state object
709 will be destroyed and the former value of L</PL_parser> will be restored.
710 Nothing else need be done to clean up the parsing context.
712 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
713 non-null, provides a string (in SV form) containing code to be parsed.
714 A copy of the string is made, so subsequent modification of C<line>
715 does not affect parsing. C<rsfp>, if non-null, provides an input stream
716 from which code will be read to be parsed. If both are non-null, the
717 code in C<line> comes first and must consist of complete lines of input,
718 and C<rsfp> supplies the remainder of the source.
720 The C<flags> parameter is reserved for future use. Currently it is only
721 used by perl internally, so extensions should always pass zero.
726 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
727 can share filters with the current parser.
728 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
729 caller, hence isn't owned by the parser, so shouldn't be closed on parser
730 destruction. This is used to handle the case of defaulting to reading the
731 script from the standard input because no filename was given on the command
732 line (without getting confused by situation where STDIN has been closed, so
733 the script handle is opened on fd 0) */
736 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
738 const char *s = NULL;
739 yy_parser *parser, *oparser;
741 if (flags && flags & ~LEX_START_FLAGS)
742 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
744 /* create and initialise a parser */
746 Newxz(parser, 1, yy_parser);
747 parser->old_parser = oparser = PL_parser;
750 parser->stack = NULL;
751 parser->stack_max1 = NULL;
754 /* on scope exit, free this parser and restore any outer one */
756 parser->saved_curcop = PL_curcop;
758 /* initialise lexer state */
760 parser->nexttoke = 0;
761 parser->error_count = oparser ? oparser->error_count : 0;
762 parser->copline = parser->preambling = NOLINE;
763 parser->lex_state = LEX_NORMAL;
764 parser->expect = XSTATE;
766 parser->recheck_utf8_validity = TRUE;
767 parser->rsfp_filters =
768 !(flags & LEX_START_SAME_FILTER) || !oparser
770 : MUTABLE_AV(SvREFCNT_inc(
771 oparser->rsfp_filters
772 ? oparser->rsfp_filters
773 : (oparser->rsfp_filters = newAV())
776 Newx(parser->lex_brackstack, 120, char);
777 Newx(parser->lex_casestack, 12, char);
778 *parser->lex_casestack = '\0';
779 Newxz(parser->lex_shared, 1, LEXSHARED);
783 const U8* first_bad_char_loc;
785 s = SvPV_const(line, len);
788 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
790 &first_bad_char_loc)))
792 _force_out_malformed_utf8_message(first_bad_char_loc,
793 (U8 *) s + SvCUR(line),
795 1 /* 1 means die */ );
796 NOT_REACHED; /* NOTREACHED */
799 parser->linestr = flags & LEX_START_COPIED
800 ? SvREFCNT_inc_simple_NN(line)
801 : newSVpvn_flags(s, len, SvUTF8(line));
803 sv_catpvs(parser->linestr, "\n;");
805 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
808 parser->oldoldbufptr =
811 parser->linestart = SvPVX(parser->linestr);
812 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
813 parser->last_lop = parser->last_uni = NULL;
815 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
816 |LEX_DONT_CLOSE_RSFP));
817 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
818 |LEX_DONT_CLOSE_RSFP));
820 parser->in_pod = parser->filtered = 0;
824 /* delete a parser object */
827 Perl_parser_free(pTHX_ const yy_parser *parser)
829 PERL_ARGS_ASSERT_PARSER_FREE;
831 PL_curcop = parser->saved_curcop;
832 SvREFCNT_dec(parser->linestr);
834 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
835 PerlIO_clearerr(parser->rsfp);
836 else if (parser->rsfp && (!parser->old_parser
837 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
838 PerlIO_close(parser->rsfp);
839 SvREFCNT_dec(parser->rsfp_filters);
840 SvREFCNT_dec(parser->lex_stuff);
841 SvREFCNT_dec(parser->lex_sub_repl);
843 Safefree(parser->lex_brackstack);
844 Safefree(parser->lex_casestack);
845 Safefree(parser->lex_shared);
846 PL_parser = parser->old_parser;
851 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
853 I32 nexttoke = parser->nexttoke;
854 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
856 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
857 && parser->nextval[nexttoke].opval
858 && parser->nextval[nexttoke].opval->op_slabbed
859 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
860 op_free(parser->nextval[nexttoke].opval);
861 parser->nextval[nexttoke].opval = NULL;
868 =for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
870 Buffer scalar containing the chunk currently under consideration of the
871 text currently being lexed. This is always a plain string scalar (for
872 which C<SvPOK> is true). It is not intended to be used as a scalar by
873 normal scalar means; instead refer to the buffer directly by the pointer
874 variables described below.
876 The lexer maintains various C<char*> pointers to things in the
877 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
878 reallocated, all of these pointers must be updated. Don't attempt to
879 do this manually, but rather use L</lex_grow_linestr> if you need to
880 reallocate the buffer.
882 The content of the text chunk in the buffer is commonly exactly one
883 complete line of input, up to and including a newline terminator,
884 but there are situations where it is otherwise. The octets of the
885 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
886 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
887 flag on this scalar, which may disagree with it.
889 For direct examination of the buffer, the variable
890 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
891 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
892 of these pointers is usually preferable to examination of the scalar
893 through normal scalar means.
895 =for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
897 Direct pointer to the end of the chunk of text currently being lexed, the
898 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
899 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
900 always located at the end of the buffer, and does not count as part of
901 the buffer's contents.
903 =for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
905 Points to the current position of lexing inside the lexer buffer.
906 Characters around this point may be freely examined, within
907 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
908 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
909 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
911 Lexing code (whether in the Perl core or not) moves this pointer past
912 the characters that it consumes. It is also expected to perform some
913 bookkeeping whenever a newline character is consumed. This movement
914 can be more conveniently performed by the function L</lex_read_to>,
915 which handles newlines appropriately.
917 Interpretation of the buffer's octets can be abstracted out by
918 using the slightly higher-level functions L</lex_peek_unichar> and
919 L</lex_read_unichar>.
921 =for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
923 Points to the start of the current line inside the lexer buffer.
924 This is useful for indicating at which column an error occurred, and
925 not much else. This must be updated by any lexing code that consumes
926 a newline; the function L</lex_read_to> handles this detail.
932 =for apidoc lex_bufutf8
934 Indicates whether the octets in the lexer buffer
935 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
936 of Unicode characters. If not, they should be interpreted as Latin-1
937 characters. This is analogous to the C<SvUTF8> flag for scalars.
939 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
940 contains valid UTF-8. Lexing code must be robust in the face of invalid
943 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
944 is significant, but not the whole story regarding the input character
945 encoding. Normally, when a file is being read, the scalar contains octets
946 and its C<SvUTF8> flag is off, but the octets should be interpreted as
947 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
948 however, the scalar may have the C<SvUTF8> flag on, and in this case its
949 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
950 is in effect. This logic may change in the future; use this function
951 instead of implementing the logic yourself.
957 Perl_lex_bufutf8(pTHX)
963 =for apidoc lex_grow_linestr
965 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
966 at least C<len> octets (including terminating C<NUL>). Returns a
967 pointer to the reallocated buffer. This is necessary before making
968 any direct modification of the buffer that would increase its length.
969 L</lex_stuff_pvn> provides a more convenient way to insert text into
972 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
973 this function updates all of the lexer's variables that point directly
980 Perl_lex_grow_linestr(pTHX_ STRLEN len)
984 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
985 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
988 linestr = PL_parser->linestr;
989 buf = SvPVX(linestr);
990 if (len <= SvLEN(linestr))
993 /* Is the lex_shared linestr SV the same as the current linestr SV?
994 * Only in this case does re_eval_start need adjusting, since it
995 * points within lex_shared->ls_linestr's buffer */
996 current = ( !PL_parser->lex_shared->ls_linestr
997 || linestr == PL_parser->lex_shared->ls_linestr);
999 bufend_pos = PL_parser->bufend - buf;
1000 bufptr_pos = PL_parser->bufptr - buf;
1001 oldbufptr_pos = PL_parser->oldbufptr - buf;
1002 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1003 linestart_pos = PL_parser->linestart - buf;
1004 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1005 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1006 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
1007 PL_parser->lex_shared->re_eval_start - buf : 0;
1009 buf = sv_grow(linestr, len);
1011 PL_parser->bufend = buf + bufend_pos;
1012 PL_parser->bufptr = buf + bufptr_pos;
1013 PL_parser->oldbufptr = buf + oldbufptr_pos;
1014 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1015 PL_parser->linestart = buf + linestart_pos;
1016 if (PL_parser->last_uni)
1017 PL_parser->last_uni = buf + last_uni_pos;
1018 if (PL_parser->last_lop)
1019 PL_parser->last_lop = buf + last_lop_pos;
1020 if (current && PL_parser->lex_shared->re_eval_start)
1021 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
1026 =for apidoc lex_stuff_pvn
1028 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1029 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1030 reallocating the buffer if necessary. This means that lexing code that
1031 runs later will see the characters as if they had appeared in the input.
1032 It is not recommended to do this as part of normal parsing, and most
1033 uses of this facility run the risk of the inserted characters being
1034 interpreted in an unintended manner.
1036 The string to be inserted is represented by C<len> octets starting
1037 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1038 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
1039 The characters are recoded for the lexer buffer, according to how the
1040 buffer is currently being interpreted (L</lex_bufutf8>). If a string
1041 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
1042 function is more convenient.
1044 =for apidoc Amnh||LEX_STUFF_UTF8
1050 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
1053 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1054 if (flags & ~(LEX_STUFF_UTF8))
1055 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1057 if (flags & LEX_STUFF_UTF8) {
1060 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1062 const char *p, *e = pv+len;;
1065 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1066 bufptr = PL_parser->bufptr;
1067 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1068 SvCUR_set(PL_parser->linestr,
1069 SvCUR(PL_parser->linestr) + len+highhalf);
1070 PL_parser->bufend += len+highhalf;
1071 for (p = pv; p != e; p++) {
1072 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
1076 if (flags & LEX_STUFF_UTF8) {
1077 STRLEN highhalf = 0;
1078 const char *p, *e = pv+len;
1079 for (p = pv; p != e; p++) {
1081 if (UTF8_IS_ABOVE_LATIN1(c)) {
1082 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1083 "non-Latin-1 character into Latin-1 input");
1084 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1087 } else assert(UTF8_IS_INVARIANT(c));
1091 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1092 bufptr = PL_parser->bufptr;
1093 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1094 SvCUR_set(PL_parser->linestr,
1095 SvCUR(PL_parser->linestr) + len-highhalf);
1096 PL_parser->bufend += len-highhalf;
1099 if (UTF8_IS_INVARIANT(*p)) {
1105 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1111 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1112 bufptr = PL_parser->bufptr;
1113 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1114 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1115 PL_parser->bufend += len;
1116 Copy(pv, bufptr, len, char);
1122 =for apidoc lex_stuff_pv
1124 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1125 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1126 reallocating the buffer if necessary. This means that lexing code that
1127 runs later will see the characters as if they had appeared in the input.
1128 It is not recommended to do this as part of normal parsing, and most
1129 uses of this facility run the risk of the inserted characters being
1130 interpreted in an unintended manner.
1132 The string to be inserted is represented by octets starting at C<pv>
1133 and continuing to the first nul. These octets are interpreted as either
1134 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1135 in C<flags>. The characters are recoded for the lexer buffer, according
1136 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1137 If it is not convenient to nul-terminate a string to be inserted, the
1138 L</lex_stuff_pvn> function is more appropriate.
1144 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1146 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1147 lex_stuff_pvn(pv, strlen(pv), flags);
1151 =for apidoc lex_stuff_sv
1153 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1154 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1155 reallocating the buffer if necessary. This means that lexing code that
1156 runs later will see the characters as if they had appeared in the input.
1157 It is not recommended to do this as part of normal parsing, and most
1158 uses of this facility run the risk of the inserted characters being
1159 interpreted in an unintended manner.
1161 The string to be inserted is the string value of C<sv>. The characters
1162 are recoded for the lexer buffer, according to how the buffer is currently
1163 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1164 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1165 need to construct a scalar.
1171 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1175 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1179 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1183 =for apidoc lex_unstuff
1185 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1186 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1187 This hides the discarded text from any lexing code that runs later,
1188 as if the text had never appeared.
1190 This is not the normal way to consume lexed text. For that, use
1197 Perl_lex_unstuff(pTHX_ char *ptr)
1201 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1202 buf = PL_parser->bufptr;
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1207 bufend = PL_parser->bufend;
1209 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1210 unstuff_len = ptr - buf;
1211 Move(ptr, buf, bufend+1-ptr, char);
1212 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1213 PL_parser->bufend = bufend - unstuff_len;
1217 =for apidoc lex_read_to
1219 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1220 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1221 performing the correct bookkeeping whenever a newline character is passed.
1222 This is the normal way to consume lexed text.
1224 Interpretation of the buffer's octets can be abstracted out by
1225 using the slightly higher-level functions L</lex_peek_unichar> and
1226 L</lex_read_unichar>.
1232 Perl_lex_read_to(pTHX_ char *ptr)
1235 PERL_ARGS_ASSERT_LEX_READ_TO;
1236 s = PL_parser->bufptr;
1237 if (ptr < s || ptr > PL_parser->bufend)
1238 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1239 for (; s != ptr; s++)
1241 COPLINE_INC_WITH_HERELINES;
1242 PL_parser->linestart = s+1;
1244 PL_parser->bufptr = ptr;
1248 =for apidoc lex_discard_to
1250 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1251 up to C<ptr>. The remaining content of the buffer will be moved, and
1252 all pointers into the buffer updated appropriately. C<ptr> must not
1253 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1254 it is not permitted to discard text that has yet to be lexed.
1256 Normally it is not necessarily to do this directly, because it suffices to
1257 use the implicit discarding behaviour of L</lex_next_chunk> and things
1258 based on it. However, if a token stretches across multiple lines,
1259 and the lexing code has kept multiple lines of text in the buffer for
1260 that purpose, then after completion of the token it would be wise to
1261 explicitly discard the now-unneeded earlier lines, to avoid future
1262 multi-line tokens growing the buffer without bound.
1268 Perl_lex_discard_to(pTHX_ char *ptr)
1272 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1273 buf = SvPVX(PL_parser->linestr);
1275 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1278 if (ptr > PL_parser->bufptr)
1279 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1280 discard_len = ptr - buf;
1281 if (PL_parser->oldbufptr < ptr)
1282 PL_parser->oldbufptr = ptr;
1283 if (PL_parser->oldoldbufptr < ptr)
1284 PL_parser->oldoldbufptr = ptr;
1285 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1286 PL_parser->last_uni = NULL;
1287 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1288 PL_parser->last_lop = NULL;
1289 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1290 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1291 PL_parser->bufend -= discard_len;
1292 PL_parser->bufptr -= discard_len;
1293 PL_parser->oldbufptr -= discard_len;
1294 PL_parser->oldoldbufptr -= discard_len;
1295 if (PL_parser->last_uni)
1296 PL_parser->last_uni -= discard_len;
1297 if (PL_parser->last_lop)
1298 PL_parser->last_lop -= discard_len;
1302 Perl_notify_parser_that_changed_to_utf8(pTHX)
1304 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1305 * off to on. At compile time, this has the effect of entering a 'use
1306 * utf8' section. This means that any input was not previously checked for
1307 * UTF-8 (because it was off), but now we do need to check it, or our
1308 * assumptions about the input being sane could be wrong, and we could
1309 * segfault. This routine just sets a flag so that the next time we look
1310 * at the input we do the well-formed UTF-8 check. If we aren't in the
1311 * proper phase, there may not be a parser object, but if there is, setting
1312 * the flag is harmless */
1315 PL_parser->recheck_utf8_validity = TRUE;
1320 =for apidoc lex_next_chunk
1322 Reads in the next chunk of text to be lexed, appending it to
1323 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1324 looked to the end of the current chunk and wants to know more. It is
1325 usual, but not necessary, for lexing to have consumed the entirety of
1326 the current chunk at this time.
1328 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1329 chunk (i.e., the current chunk has been entirely consumed), normally the
1330 current chunk will be discarded at the same time that the new chunk is
1331 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1332 will not be discarded. If the current chunk has not been entirely
1333 consumed, then it will not be discarded regardless of the flag.
1335 Returns true if some new text was added to the buffer, or false if the
1336 buffer has reached the end of the input text.
1338 =for apidoc Amnh||LEX_KEEP_PREVIOUS
1343 #define LEX_FAKE_EOF 0x80000000
1344 #define LEX_NO_TERM 0x40000000 /* here-doc */
1347 Perl_lex_next_chunk(pTHX_ U32 flags)
1351 STRLEN old_bufend_pos, new_bufend_pos;
1352 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1353 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1354 bool got_some_for_debugger = 0;
1357 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1358 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1359 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1361 linestr = PL_parser->linestr;
1362 buf = SvPVX(linestr);
1363 if (!(flags & LEX_KEEP_PREVIOUS)
1364 && PL_parser->bufptr == PL_parser->bufend)
1366 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1368 if (PL_parser->last_uni != PL_parser->bufend)
1369 PL_parser->last_uni = NULL;
1370 if (PL_parser->last_lop != PL_parser->bufend)
1371 PL_parser->last_lop = NULL;
1372 last_uni_pos = last_lop_pos = 0;
1374 SvCUR_set(linestr, 0);
1376 old_bufend_pos = PL_parser->bufend - buf;
1377 bufptr_pos = PL_parser->bufptr - buf;
1378 oldbufptr_pos = PL_parser->oldbufptr - buf;
1379 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1380 linestart_pos = PL_parser->linestart - buf;
1381 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1382 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1384 if (flags & LEX_FAKE_EOF) {
1386 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1388 } else if (filter_gets(linestr, old_bufend_pos)) {
1390 got_some_for_debugger = 1;
1391 } else if (flags & LEX_NO_TERM) {
1394 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1397 /* End of real input. Close filehandle (unless it was STDIN),
1398 * then add implicit termination.
1400 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1401 PerlIO_clearerr(PL_parser->rsfp);
1402 else if (PL_parser->rsfp)
1403 (void)PerlIO_close(PL_parser->rsfp);
1404 PL_parser->rsfp = NULL;
1405 PL_parser->in_pod = PL_parser->filtered = 0;
1406 if (!PL_in_eval && PL_minus_p) {
1408 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1409 PL_minus_n = PL_minus_p = 0;
1410 } else if (!PL_in_eval && PL_minus_n) {
1411 sv_catpvs(linestr, /*{*/";}");
1414 sv_catpvs(linestr, ";");
1417 buf = SvPVX(linestr);
1418 new_bufend_pos = SvCUR(linestr);
1419 PL_parser->bufend = buf + new_bufend_pos;
1420 PL_parser->bufptr = buf + bufptr_pos;
1423 const U8* first_bad_char_loc;
1424 if (UNLIKELY(! is_utf8_string_loc(
1425 (U8 *) PL_parser->bufptr,
1426 PL_parser->bufend - PL_parser->bufptr,
1427 &first_bad_char_loc)))
1429 _force_out_malformed_utf8_message(first_bad_char_loc,
1430 (U8 *) PL_parser->bufend,
1432 1 /* 1 means die */ );
1433 NOT_REACHED; /* NOTREACHED */
1437 PL_parser->oldbufptr = buf + oldbufptr_pos;
1438 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1439 PL_parser->linestart = buf + linestart_pos;
1440 if (PL_parser->last_uni)
1441 PL_parser->last_uni = buf + last_uni_pos;
1442 if (PL_parser->last_lop)
1443 PL_parser->last_lop = buf + last_lop_pos;
1444 if (PL_parser->preambling != NOLINE) {
1445 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1446 PL_parser->preambling = NOLINE;
1448 if ( got_some_for_debugger
1449 && PERLDB_LINE_OR_SAVESRC
1450 && PL_curstash != PL_debstash)
1452 /* debugger active and we're not compiling the debugger code,
1453 * so store the line into the debugger's array of lines
1455 update_debugger_info(NULL, buf+old_bufend_pos,
1456 new_bufend_pos-old_bufend_pos);
1462 =for apidoc lex_peek_unichar
1464 Looks ahead one (Unicode) character in the text currently being lexed.
1465 Returns the codepoint (unsigned integer value) of the next character,
1466 or -1 if lexing has reached the end of the input text. To consume the
1467 peeked character, use L</lex_read_unichar>.
1469 If the next character is in (or extends into) the next chunk of input
1470 text, the next chunk will be read in. Normally the current chunk will be
1471 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1472 bit set, then the current chunk will not be discarded.
1474 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1475 is encountered, an exception is generated.
1481 Perl_lex_peek_unichar(pTHX_ U32 flags)
1484 if (flags & ~(LEX_KEEP_PREVIOUS))
1485 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1486 s = PL_parser->bufptr;
1487 bufend = PL_parser->bufend;
1493 if (!lex_next_chunk(flags))
1495 s = PL_parser->bufptr;
1496 bufend = PL_parser->bufend;
1499 if (UTF8_IS_INVARIANT(head))
1501 if (UTF8_IS_START(head)) {
1502 len = UTF8SKIP(&head);
1503 while ((STRLEN)(bufend-s) < len) {
1504 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1506 s = PL_parser->bufptr;
1507 bufend = PL_parser->bufend;
1510 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1511 if (retlen == (STRLEN)-1) {
1512 _force_out_malformed_utf8_message((U8 *) s,
1515 1 /* 1 means die */ );
1516 NOT_REACHED; /* NOTREACHED */
1521 if (!lex_next_chunk(flags))
1523 s = PL_parser->bufptr;
1530 =for apidoc lex_read_unichar
1532 Reads the next (Unicode) character in the text currently being lexed.
1533 Returns the codepoint (unsigned integer value) of the character read,
1534 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1535 if lexing has reached the end of the input text. To non-destructively
1536 examine the next character, use L</lex_peek_unichar> instead.
1538 If the next character is in (or extends into) the next chunk of input
1539 text, the next chunk will be read in. Normally the current chunk will be
1540 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1541 bit set, then the current chunk will not be discarded.
1543 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1544 is encountered, an exception is generated.
1550 Perl_lex_read_unichar(pTHX_ U32 flags)
1553 if (flags & ~(LEX_KEEP_PREVIOUS))
1554 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1555 c = lex_peek_unichar(flags);
1558 COPLINE_INC_WITH_HERELINES;
1560 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1562 ++(PL_parser->bufptr);
1568 =for apidoc lex_read_space
1570 Reads optional spaces, in Perl style, in the text currently being
1571 lexed. The spaces may include ordinary whitespace characters and
1572 Perl-style comments. C<#line> directives are processed if encountered.
1573 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1574 at a non-space character (or the end of the input text).
1576 If spaces extend into the next chunk of input text, the next chunk will
1577 be read in. Normally the current chunk will be discarded at the same
1578 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1579 chunk will not be discarded.
1584 #define LEX_NO_INCLINE 0x40000000
1585 #define LEX_NO_NEXT_CHUNK 0x80000000
1588 Perl_lex_read_space(pTHX_ U32 flags)
1591 const bool can_incline = !(flags & LEX_NO_INCLINE);
1592 bool need_incline = 0;
1593 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1594 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1595 s = PL_parser->bufptr;
1596 bufend = PL_parser->bufend;
1602 } while (!(c == '\n' || (c == 0 && s == bufend)));
1603 } else if (c == '\n') {
1606 PL_parser->linestart = s;
1612 } else if (isSPACE(c)) {
1614 } else if (c == 0 && s == bufend) {
1617 if (flags & LEX_NO_NEXT_CHUNK)
1619 PL_parser->bufptr = s;
1620 l = CopLINE(PL_curcop);
1621 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1622 got_more = lex_next_chunk(flags);
1623 CopLINE_set(PL_curcop, l);
1624 s = PL_parser->bufptr;
1625 bufend = PL_parser->bufend;
1628 if (can_incline && need_incline && PL_parser->rsfp) {
1638 PL_parser->bufptr = s;
1643 =for apidoc validate_proto
1645 This function performs syntax checking on a prototype, C<proto>.
1646 If C<warn> is true, any illegal characters or mismatched brackets
1647 will trigger illegalproto warnings, declaring that they were
1648 detected in the prototype for C<name>.
1650 The return value is C<true> if this is a valid prototype, and
1651 C<false> if it is not, regardless of whether C<warn> was C<true> or
1654 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1661 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
1663 STRLEN len, origlen;
1665 bool bad_proto = FALSE;
1666 bool in_brackets = FALSE;
1667 bool after_slash = FALSE;
1668 char greedy_proto = ' ';
1669 bool proto_after_greedy_proto = FALSE;
1670 bool must_be_last = FALSE;
1671 bool underscore = FALSE;
1672 bool bad_proto_after_underscore = FALSE;
1674 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1679 p = SvPV(proto, len);
1681 for (; len--; p++) {
1684 proto_after_greedy_proto = TRUE;
1686 if (!memCHRs(";@%", *p))
1687 bad_proto_after_underscore = TRUE;
1690 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
1697 in_brackets = FALSE;
1698 else if ((*p == '@' || *p == '%')
1702 must_be_last = TRUE;
1711 after_slash = FALSE;
1716 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1719 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1720 origlen, UNI_DISPLAY_ISPRINT)
1721 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1723 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1724 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1725 sv_catpvs(name2, "::");
1726 sv_catsv(name2, (SV *)name);
1730 if (proto_after_greedy_proto)
1731 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1732 "Prototype after '%c' for %" SVf " : %s",
1733 greedy_proto, SVfARG(name), p);
1735 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1736 "Missing ']' in prototype for %" SVf " : %s",
1739 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1740 "Illegal character in prototype for %" SVf " : %s",
1742 if (bad_proto_after_underscore)
1743 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1744 "Illegal character after '_' in prototype for %" SVf " : %s",
1748 return (! (proto_after_greedy_proto || bad_proto) );
1753 * This subroutine has nothing to do with tilting, whether at windmills
1754 * or pinball tables. Its name is short for "increment line". It
1755 * increments the current line number in CopLINE(PL_curcop) and checks
1756 * to see whether the line starts with a comment of the form
1757 * # line 500 "foo.pm"
1758 * If so, it sets the current line number and file to the values in the comment.
1762 S_incline(pTHX_ const char *s, const char *end)
1770 PERL_ARGS_ASSERT_INCLINE;
1774 COPLINE_INC_WITH_HERELINES;
1775 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1776 && s+1 == PL_bufend && *s == ';') {
1777 /* fake newline in string eval */
1778 CopLINE_dec(PL_curcop);
1783 while (SPACE_OR_TAB(*s))
1785 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1786 s += sizeof("line") - 1;
1789 if (SPACE_OR_TAB(*s))
1793 while (SPACE_OR_TAB(*s))
1801 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1803 while (SPACE_OR_TAB(*s))
1805 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
1811 while (*t && !isSPACE(*t))
1815 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1817 if (*e != '\n' && *e != '\0')
1818 return; /* false alarm */
1820 if (!grok_atoUV(n, &uv, &e))
1822 line_num = ((line_t)uv) - 1;
1825 const STRLEN len = t - s;
1827 if (!PL_rsfp && !PL_parser->filtered) {
1828 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1829 * to *{"::_<newfilename"} */
1830 /* However, the long form of evals is only turned on by the
1831 debugger - usually they're "(eval %lu)" */
1832 GV * const cfgv = CopFILEGV(PL_curcop);
1835 STRLEN tmplen2 = len;
1839 if (tmplen2 + 2 <= sizeof smallbuf)
1842 Newx(tmpbuf2, tmplen2 + 2, char);
1847 memcpy(tmpbuf2 + 2, s, tmplen2);
1850 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1852 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1853 /* adjust ${"::_<newfilename"} to store the new file name */
1854 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1855 /* The line number may differ. If that is the case,
1856 alias the saved lines that are in the array.
1857 Otherwise alias the whole array. */
1858 if (CopLINE(PL_curcop) == line_num) {
1859 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1860 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1862 else if (GvAV(cfgv)) {
1863 AV * const av = GvAV(cfgv);
1864 const line_t start = CopLINE(PL_curcop)+1;
1865 SSize_t items = AvFILLp(av) - start;
1867 AV * const av2 = GvAVn(gv2);
1868 SV **svp = AvARRAY(av) + start;
1869 Size_t l = line_num+1;
1870 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1871 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
1876 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1879 CopFILE_free(PL_curcop);
1880 CopFILE_setn(PL_curcop, s, len);
1882 CopLINE_set(PL_curcop, line_num);
1886 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1888 AV *av = CopFILEAVx(PL_curcop);
1891 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1893 sv = *av_fetch(av, 0, 1);
1894 SvUPGRADE(sv, SVt_PVMG);
1896 if (!SvPOK(sv)) SvPVCLEAR(sv);
1898 sv_catsv(sv, orig_sv);
1900 sv_catpvn(sv, buf, len);
1905 if (PL_parser->preambling == NOLINE)
1906 av_store(av, CopLINE(PL_curcop), sv);
1912 * Called to gobble the appropriate amount and type of whitespace.
1913 * Skips comments as well.
1914 * Returns the next character after the whitespace that is skipped.
1917 * Same thing, but look ahead without incrementing line numbers or
1918 * adjusting PL_linestart.
1921 #define skipspace(s) skipspace_flags(s, 0)
1922 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1925 Perl_skipspace_flags(pTHX_ char *s, U32 flags)
1927 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1928 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1929 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1932 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1934 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1935 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1936 LEX_NO_NEXT_CHUNK : 0));
1938 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1939 if (PL_linestart > PL_bufptr)
1940 PL_bufptr = PL_linestart;
1948 * Check the unary operators to ensure there's no ambiguity in how they're
1949 * used. An ambiguous piece of code would be:
1951 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1952 * the +5 is its argument.
1960 if (PL_oldoldbufptr != PL_last_uni)
1962 while (isSPACE(*PL_last_uni))
1965 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
1966 s += UTF ? UTF8SKIP(s) : 1;
1967 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
1970 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1971 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
1972 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1976 * LOP : macro to build a list operator. Its behaviour has been replaced
1977 * with a subroutine, S_lop() for which LOP is just another name.
1980 #define LOP(f,x) return lop(f,x,s)
1984 * Build a list operator (or something that might be one). The rules:
1985 * - if we have a next token, then it's a list operator (no parens) for
1986 * which the next token has already been parsed; e.g.,
1989 * - if the next thing is an opening paren, then it's a function
1990 * - else it's a list operator
1994 S_lop(pTHX_ I32 f, U8 x, char *s)
1996 PERL_ARGS_ASSERT_LOP;
2001 PL_last_lop = PL_oldbufptr;
2002 PL_last_lop_op = (OPCODE)f;
2007 return REPORT(FUNC);
2010 return REPORT(FUNC);
2013 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2014 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
2015 return REPORT(LSTOP);
2021 * When the lexer realizes it knows the next token (for instance,
2022 * it is reordering tokens for the parser) then it can call S_force_next
2023 * to know what token to return the next time the lexer is called. Caller
2024 * will need to set PL_nextval[] and possibly PL_expect to ensure
2025 * the lexer handles the token correctly.
2029 S_force_next(pTHX_ I32 type)
2033 PerlIO_printf(Perl_debug_log, "### forced token:\n");
2034 tokereport(type, &NEXTVAL_NEXTTOKE);
2037 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
2038 PL_nexttype[PL_nexttoke] = type;
2045 * This subroutine handles postfix deref syntax after the arrow has already
2046 * been emitted. @* $* etc. are emitted as two separate tokens right here.
2047 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2048 * only the first, leaving yylex to find the next.
2052 S_postderef(pTHX_ int const funny, char const next)
2054 assert(funny == DOLSHARP
2055 || memCHRs("$@%&*", funny)
2056 || funny == PERLY_DOLLAR
2057 || funny == PERLY_SNAIL
2058 || funny == PERLY_PERCENT_SIGN
2059 || funny == PERLY_AMPERSAND
2060 || funny == PERLY_STAR
2063 PL_expect = XOPERATOR;
2064 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
2065 assert(PERLY_SNAIL == funny || PERLY_DOLLAR == funny || DOLSHARP == funny);
2066 PL_lex_state = LEX_INTERPEND;
2067 if (PERLY_SNAIL == funny)
2068 force_next(POSTJOIN);
2070 force_next(PERLY_STAR);
2074 if (PERLY_SNAIL == funny && PL_lex_state == LEX_INTERPNORMAL
2075 && !PL_lex_brackets)
2077 PL_expect = XOPERATOR;
2086 int yyc = PL_parser->yychar;
2087 if (yyc != YYEMPTY) {
2089 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2090 if (yyc == PERLY_BRACE_OPEN || yyc == HASHBRACK || yyc == PERLY_BRACKET_OPEN) {
2091 PL_lex_allbrackets--;
2093 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2094 } else if (yyc == PERLY_PAREN_OPEN) {
2095 PL_lex_allbrackets--;
2100 PL_parser->yychar = YYEMPTY;
2105 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2107 SV * const sv = newSVpvn_utf8(start, len,
2111 && is_utf8_non_invariant_string((const U8*)start, len));
2117 * When the lexer knows the next thing is a word (for instance, it has
2118 * just seen -> and it knows that the next char is a word char, then
2119 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2123 * char *start : buffer position (must be within PL_linestr)
2124 * int token : PL_next* will be this type of bare word
2125 * (e.g., METHOD,BAREWORD)
2126 * int check_keyword : if true, Perl checks to make sure the word isn't
2127 * a keyword (do this if the word is a label, e.g. goto FOO)
2128 * int allow_pack : if true, : characters will also be allowed (require,
2129 * use, etc. do this)
2133 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2138 PERL_ARGS_ASSERT_FORCE_WORD;
2140 start = skipspace(start);
2142 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2143 || (allow_pack && *s == ':' && s[1] == ':') )
2145 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2146 if (check_keyword) {
2147 char *s2 = PL_tokenbuf;
2149 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2150 s2 += sizeof("CORE::") - 1;
2151 len2 -= sizeof("CORE::") - 1;
2153 if (keyword(s2, len2, 0))
2156 if (token == METHOD) {
2161 PL_expect = XOPERATOR;
2164 NEXTVAL_NEXTTOKE.opval
2165 = newSVOP(OP_CONST,0,
2166 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2167 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2175 * Called when the lexer wants $foo *foo &foo etc, but the program
2176 * text only contains the "foo" portion. The first argument is a pointer
2177 * to the "foo", and the second argument is the type symbol to prefix.
2178 * Forces the next token to be a "BAREWORD".
2179 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2183 S_force_ident(pTHX_ const char *s, int kind)
2185 PERL_ARGS_ASSERT_FORCE_IDENT;
2188 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2189 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2190 UTF ? SVf_UTF8 : 0));
2191 NEXTVAL_NEXTTOKE.opval = o;
2192 force_next(BAREWORD);
2194 o->op_private = OPpCONST_ENTERED;
2195 /* XXX see note in pp_entereval() for why we forgo typo
2196 warnings if the symbol must be introduced in an eval.
2198 gv_fetchpvn_flags(s, len,
2199 (PL_in_eval ? GV_ADDMULTI
2200 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2201 kind == PERLY_DOLLAR ? SVt_PV :
2202 kind == PERLY_SNAIL ? SVt_PVAV :
2203 kind == PERLY_PERCENT_SIGN ? SVt_PVHV :
2211 S_force_ident_maybe_lex(pTHX_ char pit)
2213 NEXTVAL_NEXTTOKE.ival = pit;
2218 Perl_str_to_version(pTHX_ SV *sv)
2223 const char *start = SvPV_const(sv,len);
2224 const char * const end = start + len;
2225 const bool utf = cBOOL(SvUTF8(sv));
2227 PERL_ARGS_ASSERT_STR_TO_VERSION;
2229 while (start < end) {
2233 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2238 retval += ((NV)n)/nshift;
2247 * Forces the next token to be a version number.
2248 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2249 * and if "guessing" is TRUE, then no new token is created (and the caller
2250 * must use an alternative parsing method).
2254 S_force_version(pTHX_ char *s, int guessing)
2259 PERL_ARGS_ASSERT_FORCE_VERSION;
2267 while (isDIGIT(*d) || *d == '_' || *d == '.')
2269 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2271 s = scan_num(s, &pl_yylval);
2272 version = pl_yylval.opval;
2273 ver = cSVOPx(version)->op_sv;
2274 if (SvPOK(ver) && !SvNIOK(ver)) {
2275 SvUPGRADE(ver, SVt_PVNV);
2276 SvNV_set(ver, str_to_version(ver));
2277 SvNOK_on(ver); /* hint that it is a version */
2280 else if (guessing) {
2285 /* NOTE: The parser sees the package name and the VERSION swapped */
2286 NEXTVAL_NEXTTOKE.opval = version;
2287 force_next(BAREWORD);
2293 * S_force_strict_version
2294 * Forces the next token to be a version number using strict syntax rules.
2298 S_force_strict_version(pTHX_ char *s)
2301 const char *errstr = NULL;
2303 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2305 while (isSPACE(*s)) /* leading whitespace */
2308 if (is_STRICT_VERSION(s,&errstr)) {
2310 s = (char *)scan_version(s, ver, 0);
2311 version = newSVOP(OP_CONST, 0, ver);
2313 else if ((*s != ';' && *s != '{' && *s != '}' )
2314 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2318 yyerror(errstr); /* version required */
2322 /* NOTE: The parser sees the package name and the VERSION swapped */
2323 NEXTVAL_NEXTTOKE.opval = version;
2324 force_next(BAREWORD);
2331 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2332 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2333 * unchanged, and a new SV containing the modified input is returned.
2337 S_tokeq(pTHX_ SV *sv)
2344 PERL_ARGS_ASSERT_TOKEQ;
2348 assert (!SvIsCOW(sv));
2349 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2353 /* This is relying on the SV being "well formed" with a trailing '\0' */
2354 while (s < send && !(*s == '\\' && s[1] == '\\'))
2359 if ( PL_hints & HINT_NEW_STRING ) {
2360 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2361 SVs_TEMP | SvUTF8(sv));
2365 if (s + 1 < send && (s[1] == '\\'))
2366 s++; /* all that, just for this */
2371 SvCUR_set(sv, d - SvPVX_const(sv));
2373 if ( PL_hints & HINT_NEW_STRING )
2374 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
2379 * Now come three functions related to double-quote context,
2380 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2381 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2382 * interact with PL_lex_state, and create fake ( ... ) argument lists
2383 * to handle functions and concatenation.
2387 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2392 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2394 * Pattern matching will set PL_lex_op to the pattern-matching op to
2395 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2397 * OP_CONST is easy--just make the new op and return.
2399 * Everything else becomes a FUNC.
2401 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2402 * had an OP_CONST. This just sets us up for a
2403 * call to S_sublex_push().
2407 S_sublex_start(pTHX)
2409 const I32 op_type = pl_yylval.ival;
2411 if (op_type == OP_NULL) {
2412 pl_yylval.opval = PL_lex_op;
2416 if (op_type == OP_CONST) {
2417 SV *sv = PL_lex_stuff;
2418 PL_lex_stuff = NULL;
2421 if (SvTYPE(sv) == SVt_PVIV) {
2422 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2424 const char * const p = SvPV_const(sv, len);
2425 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2429 pl_yylval.opval = newSVOP(op_type, 0, sv);
2433 PL_parser->lex_super_state = PL_lex_state;
2434 PL_parser->lex_sub_inwhat = (U16)op_type;
2435 PL_parser->lex_sub_op = PL_lex_op;
2436 PL_parser->sub_no_recover = FALSE;
2437 PL_parser->sub_error_count = PL_error_count;
2438 PL_lex_state = LEX_INTERPPUSH;
2442 pl_yylval.opval = PL_lex_op;
2452 * Create a new scope to save the lexing state. The scope will be
2453 * ended in S_sublex_done. Returns a '(', starting the function arguments
2454 * to the uc, lc, etc. found before.
2455 * Sets PL_lex_state to LEX_INTERPCONCAT.
2462 const bool is_heredoc = PL_multi_close == '<';
2465 PL_lex_state = PL_parser->lex_super_state;
2466 SAVEI8(PL_lex_dojoin);
2467 SAVEI32(PL_lex_brackets);
2468 SAVEI32(PL_lex_allbrackets);
2469 SAVEI32(PL_lex_formbrack);
2470 SAVEI8(PL_lex_fakeeof);
2471 SAVEI32(PL_lex_casemods);
2472 SAVEI32(PL_lex_starts);
2473 SAVEI8(PL_lex_state);
2474 SAVESPTR(PL_lex_repl);
2475 SAVEVPTR(PL_lex_inpat);
2476 SAVEI16(PL_lex_inwhat);
2479 SAVECOPLINE(PL_curcop);
2480 SAVEI32(PL_multi_end);
2481 SAVEI32(PL_parser->herelines);
2482 PL_parser->herelines = 0;
2484 SAVEIV(PL_multi_close);
2485 SAVEPPTR(PL_bufptr);
2486 SAVEPPTR(PL_bufend);
2487 SAVEPPTR(PL_oldbufptr);
2488 SAVEPPTR(PL_oldoldbufptr);
2489 SAVEPPTR(PL_last_lop);
2490 SAVEPPTR(PL_last_uni);
2491 SAVEPPTR(PL_linestart);
2492 SAVESPTR(PL_linestr);
2493 SAVEGENERICPV(PL_lex_brackstack);
2494 SAVEGENERICPV(PL_lex_casestack);
2495 SAVEGENERICPV(PL_parser->lex_shared);
2496 SAVEBOOL(PL_parser->lex_re_reparsing);
2497 SAVEI32(PL_copline);
2499 /* The here-doc parser needs to be able to peek into outer lexing
2500 scopes to find the body of the here-doc. So we put PL_linestr and
2501 PL_bufptr into lex_shared, to ‘share’ those values.
2503 PL_parser->lex_shared->ls_linestr = PL_linestr;
2504 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2506 PL_linestr = PL_lex_stuff;
2507 PL_lex_repl = PL_parser->lex_sub_repl;
2508 PL_lex_stuff = NULL;
2509 PL_parser->lex_sub_repl = NULL;
2511 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2512 set for an inner quote-like operator and then an error causes scope-
2513 popping. We must not have a PL_lex_stuff value left dangling, as
2514 that breaks assumptions elsewhere. See bug #123617. */
2515 SAVEGENERICSV(PL_lex_stuff);
2516 SAVEGENERICSV(PL_parser->lex_sub_repl);
2518 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2519 = SvPVX(PL_linestr);
2520 PL_bufend += SvCUR(PL_linestr);
2521 PL_last_lop = PL_last_uni = NULL;
2522 SAVEFREESV(PL_linestr);
2523 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2525 PL_lex_dojoin = FALSE;
2526 PL_lex_brackets = PL_lex_formbrack = 0;
2527 PL_lex_allbrackets = 0;
2528 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2529 Newx(PL_lex_brackstack, 120, char);
2530 Newx(PL_lex_casestack, 12, char);
2531 PL_lex_casemods = 0;
2532 *PL_lex_casestack = '\0';
2534 PL_lex_state = LEX_INTERPCONCAT;
2536 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2537 PL_copline = NOLINE;
2539 Newxz(shared, 1, LEXSHARED);
2540 shared->ls_prev = PL_parser->lex_shared;
2541 PL_parser->lex_shared = shared;
2543 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2544 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2545 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2546 PL_lex_inpat = PL_parser->lex_sub_op;
2548 PL_lex_inpat = NULL;
2550 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2551 PL_in_eval &= ~EVAL_RE_REPARSING;
2558 * Restores lexer state after a S_sublex_push.
2564 if (!PL_lex_starts++) {
2565 SV * const sv = newSVpvs("");
2566 if (SvUTF8(PL_linestr))
2568 PL_expect = XOPERATOR;
2569 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2573 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2574 PL_lex_state = LEX_INTERPCASEMOD;
2578 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2579 assert(PL_lex_inwhat != OP_TRANSR);
2581 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2582 PL_linestr = PL_lex_repl;
2584 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2585 PL_bufend += SvCUR(PL_linestr);
2586 PL_last_lop = PL_last_uni = NULL;
2587 PL_lex_dojoin = FALSE;
2588 PL_lex_brackets = 0;
2589 PL_lex_allbrackets = 0;
2590 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2591 PL_lex_casemods = 0;
2592 *PL_lex_casestack = '\0';
2594 if (SvEVALED(PL_lex_repl)) {
2595 PL_lex_state = LEX_INTERPNORMAL;
2597 /* we don't clear PL_lex_repl here, so that we can check later
2598 whether this is an evalled subst; that means we rely on the
2599 logic to ensure sublex_done() is called again only via the
2600 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2603 PL_lex_state = LEX_INTERPCONCAT;
2606 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2607 CopLINE(PL_curcop) +=
2608 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2609 + PL_parser->herelines;
2610 PL_parser->herelines = 0;
2615 const line_t l = CopLINE(PL_curcop);
2617 if (PL_parser->sub_error_count != PL_error_count) {
2618 if (PL_parser->sub_no_recover) {
2623 if (PL_multi_close == '<')
2624 PL_parser->herelines += l - PL_multi_end;
2625 PL_bufend = SvPVX(PL_linestr);
2626 PL_bufend += SvCUR(PL_linestr);
2627 PL_expect = XOPERATOR;
2633 Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2634 const STRLEN context_len, const char ** error_msg)
2636 /* Load the official _charnames module if not already there. The
2637 * parameters are just to give info for any error messages generated:
2638 * char_name a name to look up which is the reason for loading this
2639 * context 'char_name' in the context in the input in which it appears
2640 * context_len how many bytes 'context' occupies
2641 * error_msg *error_msg will be set to any error
2643 * Returns the ^H table if success; otherwise NULL */
2650 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2652 /* This loop is executed 1 1/2 times. On the first time through, if it
2653 * isn't already loaded, try loading it, and iterate just once to see if it
2655 for (i = 0; i < 2; i++) {
2656 table = GvHV(PL_hintgv); /* ^H */
2659 && (PL_hints & HINT_LOCALIZE_HH)
2660 && (cvp = hv_fetchs(table, "charnames", FALSE))
2663 return table; /* Quit if already loaded */
2667 Perl_load_module(aTHX_
2669 newSVpvs("_charnames"),
2671 /* version parameter; no need to specify it, as if we get too early
2672 * a version, will fail anyway, not being able to find 'charnames'
2681 /* Here, it failed; new_constant will give appropriate error messages */
2683 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2684 context, context_len, error_msg);
2691 S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2693 /* This justs wraps get_and_check_backslash_N_name() to output any error
2694 * message it returns. */
2696 const char * error_msg = NULL;
2699 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2701 /* charnames doesn't work well if there have been errors found */
2702 if (PL_error_count > 0) {
2706 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2709 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2716 Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2717 const char* const e,
2719 const char ** error_msg)
2721 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2722 * interior, hence to the "}". Finds what the name resolves to, returning
2723 * an SV* containing it; NULL if no valid one found.
2725 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2726 * doesn't have to be. */
2736 /* Points to the beginning of the \N{... so that any messages include the
2737 * context of what's failing*/
2738 const char* context = s - 3;
2739 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2742 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2745 assert(s > (char *) 3);
2747 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
2749 if (!SvCUR(char_name)) {
2750 SvREFCNT_dec_NN(char_name);
2751 /* diag_listed_as: Unknown charname '%s' */
2752 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
2756 /* Autoload the charnames module */
2758 table = load_charnames(char_name, context, context_len, error_msg);
2759 if (table == NULL) {
2764 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2765 context, context_len, error_msg);
2767 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
2773 /* See if the charnames handler is the Perl core's, and if so, we can skip
2774 * the validation needed for a user-supplied one, as Perl's does its own
2776 cvp = hv_fetchs(table, "charnames", FALSE);
2777 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2778 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2780 const char * const name = HvNAME(stash);
2781 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
2786 /* Here, it isn't Perl's charname handler. We can't rely on a
2787 * user-supplied handler to validate the input name. For non-ut8 input,
2788 * look to see that the first character is legal. Then loop through the
2789 * rest checking that each is a continuation */
2791 /* This code makes the reasonable assumption that the only Latin1-range
2792 * characters that begin a character name alias are alphabetic, otherwise
2793 * would have to create a isCHARNAME_BEGIN macro */
2796 if (! isALPHAU(*s)) {
2801 if (! isCHARNAME_CONT(*s)) {
2804 if (*s == ' ' && *(s-1) == ' ') {
2811 /* Similarly for utf8. For invariants can check directly; for other
2812 * Latin1, can calculate their code point and check; otherwise use an
2814 if (UTF8_IS_INVARIANT(*s)) {
2815 if (! isALPHAU(*s)) {
2819 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2820 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2826 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2827 utf8_to_uvchr_buf((U8 *) s,
2837 if (UTF8_IS_INVARIANT(*s)) {
2838 if (! isCHARNAME_CONT(*s)) {
2841 if (*s == ' ' && *(s-1) == ' ') {
2846 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2847 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2854 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2855 utf8_to_uvchr_buf((U8 *) s,
2865 if (*(s-1) == ' ') {
2866 /* diag_listed_as: charnames alias definitions may not contain
2867 trailing white-space; marked by <-- HERE in %s
2869 *error_msg = Perl_form(aTHX_
2870 "charnames alias definitions may not contain trailing "
2871 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2872 (int)(s - context + 1), context,
2873 (int)(e - s + 1), s + 1);
2877 if (SvUTF8(res)) { /* Don't accept malformed charname value */
2878 const U8* first_bad_char_loc;
2880 const char* const str = SvPV_const(res, len);
2881 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2882 &first_bad_char_loc)))
2884 _force_out_malformed_utf8_message(first_bad_char_loc,
2885 (U8 *) PL_parser->bufend,
2887 0 /* 0 means don't die */ );
2888 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2889 immediately after '%s' */
2890 *error_msg = Perl_form(aTHX_
2891 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2892 (int) context_len, context,
2893 (int) ((char *) first_bad_char_loc - str), str);
2902 /* The final %.*s makes sure that should the trailing NUL be missing
2903 * that this print won't run off the end of the string */
2904 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2906 *error_msg = Perl_form(aTHX_
2907 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2908 (int)(s - context + 1), context,
2909 (int)(e - s + 1), s + 1);
2914 /* diag_listed_as: charnames alias definitions may not contain a
2915 sequence of multiple spaces; marked by <-- HERE
2917 *error_msg = Perl_form(aTHX_
2918 "charnames alias definitions may not contain a sequence of "
2919 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2920 (int)(s - context + 1), context,
2921 (int)(e - s + 1), s + 1);
2928 Extracts the next constant part of a pattern, double-quoted string,
2929 or transliteration. This is terrifying code.
2931 For example, in parsing the double-quoted string "ab\x63$d", it would
2932 stop at the '$' and return an OP_CONST containing 'abc'.
2934 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2935 processing a pattern (PL_lex_inpat is true), a transliteration
2936 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2938 Returns a pointer to the character scanned up to. If this is
2939 advanced from the start pointer supplied (i.e. if anything was
2940 successfully parsed), will leave an OP_CONST for the substring scanned
2941 in pl_yylval. Caller must intuit reason for not parsing further
2942 by looking at the next characters herself.
2946 \N{FOO} => \N{U+hex_for_character_FOO}
2947 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2950 all other \-char, including \N and \N{ apart from \N{ABC}
2953 @ and $ where it appears to be a var, but not for $ as tail anchor
2957 In transliterations:
2958 characters are VERY literal, except for - not at the start or end
2959 of the string, which indicates a range. However some backslash sequences
2960 are recognized: \r, \n, and the like
2961 \007 \o{}, \x{}, \N{}
2962 If all elements in the transliteration are below 256,
2963 scan_const expands the range to the full set of intermediate
2964 characters. If the range is in utf8, the hyphen is replaced with
2965 a certain range mark which will be handled by pmtrans() in op.c.
2967 In double-quoted strings:
2969 all those recognized in transliterations
2970 deprecated backrefs: \1 (in substitution replacements)
2971 case and quoting: \U \Q \E
2974 scan_const does *not* construct ops to handle interpolated strings.
2975 It stops processing as soon as it finds an embedded $ or @ variable
2976 and leaves it to the caller to work out what's going on.
2978 embedded arrays (whether in pattern or not) could be:
2979 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2981 $ in double-quoted strings must be the symbol of an embedded scalar.
2983 $ in pattern could be $foo or could be tail anchor. Assumption:
2984 it's a tail anchor if $ is the last thing in the string, or if it's
2985 followed by one of "()| \r\n\t"
2987 \1 (backreferences) are turned into $1 in substitutions
2989 The structure of the code is
2990 while (there's a character to process) {
2991 handle transliteration ranges
2992 skip regexp comments /(?#comment)/ and codes /(?{code})/
2993 skip #-initiated comments in //x patterns
2994 check for embedded arrays
2995 check for embedded scalars
2997 deprecate \1 in substitution replacements
2998 handle string-changing backslashes \l \U \Q \E, etc.
2999 switch (what was escaped) {
3000 handle \- in a transliteration (becomes a literal -)
3001 if a pattern and not \N{, go treat as regular character
3002 handle \132 (octal characters)
3003 handle \x15 and \x{1234} (hex characters)
3004 handle \N{name} (named characters, also \N{3,5} in a pattern)
3005 handle \cV (control characters)
3006 handle printf-style backslashes (\f, \r, \n, etc)
3009 } (end if backslash)
3010 handle regular character
3011 } (end while character to read)
3016 S_scan_const(pTHX_ char *start)
3018 char *send = PL_bufend; /* end of the constant */
3019 SV *sv = newSV(send - start); /* sv for the constant. See note below
3021 char *s = start; /* start of the constant */
3022 char *d = SvPVX(sv); /* destination for copies */
3023 bool dorange = FALSE; /* are we in a translit range? */
3024 bool didrange = FALSE; /* did we just finish a range? */
3025 bool in_charclass = FALSE; /* within /[...]/ */
3026 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
3027 UTF8? But, this can show as true
3028 when the source isn't utf8, as for
3029 example when it is entirely composed
3031 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
3032 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3033 number of characters found so far
3034 that will expand (into 2 bytes)
3035 should we have to convert to
3037 SV *res; /* result from charnames */
3038 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3039 high-end character is temporarily placed */
3041 /* Does something require special handling in tr/// ? This avoids extra
3042 * work in a less likely case. As such, khw didn't feel it was worth
3043 * adding any branches to the more mainline code to handle this, which
3044 * means that this doesn't get set in some circumstances when things like
3045 * \x{100} get expanded out. As a result there needs to be extra testing
3046 * done in the tr code */
3047 bool has_above_latin1 = FALSE;
3049 /* Note on sizing: The scanned constant is placed into sv, which is
3050 * initialized by newSV() assuming one byte of output for every byte of
3051 * input. This routine expects newSV() to allocate an extra byte for a
3052 * trailing NUL, which this routine will append if it gets to the end of
3053 * the input. There may be more bytes of input than output (eg., \N{LATIN
3054 * CAPITAL LETTER A}), or more output than input if the constant ends up
3055 * recoded to utf8, but each time a construct is found that might increase
3056 * the needed size, SvGROW() is called. Its size parameter each time is
3057 * based on the best guess estimate at the time, namely the length used so
3058 * far, plus the length the current construct will occupy, plus room for
3059 * the trailing NUL, plus one byte for every input byte still unscanned */
3061 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3064 int backslash_N = 0; /* ? was the character from \N{} */
3065 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3066 platform-specific like \x65 */
3069 PERL_ARGS_ASSERT_SCAN_CONST;
3071 assert(PL_lex_inwhat != OP_TRANSR);
3073 /* Protect sv from errors and fatal warnings. */
3074 ENTER_with_name("scan_const");
3077 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3078 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3080 assert(*send == '\0');
3083 || dorange /* Handle tr/// range at right edge of input */
3086 /* get transliterations out of the way (they're most literal) */
3087 if (PL_lex_inwhat == OP_TRANS) {
3089 /* But there isn't any special handling necessary unless there is a
3090 * range, so for most cases we just drop down and handle the value
3091 * as any other. There are two exceptions.
3093 * 1. A hyphen indicates that we are actually going to have a
3094 * range. In this case, skip the '-', set a flag, then drop
3095 * down to handle what should be the end range value.
3096 * 2. After we've handled that value, the next time through, that
3097 * flag is set and we fix up the range.
3099 * Ranges entirely within Latin1 are expanded out entirely, in
3100 * order to make the transliteration a simple table look-up.
3101 * Ranges that extend above Latin1 have to be done differently, so
3102 * there is no advantage to expanding them here, so they are
3103 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3104 * a byte that can't occur in legal UTF-8, and hence can signify a
3105 * hyphen without any possible ambiguity. On EBCDIC machines, if
3106 * the range is expressed as Unicode, the Latin1 portion is
3107 * expanded out even if the range extends above Latin1. This is
3108 * because each code point in it has to be processed here
3109 * individually to get its native translation */
3113 /* Here, we don't think we're in a range. If the new character
3114 * is not a hyphen; or if it is a hyphen, but it's too close to
3115 * either edge to indicate a range, or if we haven't output any
3116 * characters yet then it's a regular character. */
3117 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3120 /* A regular character. Process like any other, but first
3121 * clear any flags */
3125 non_portable_endpoint = 0;
3128 /* The tests here for being above Latin1 and similar ones
3129 * in the following 'else' suffice to find all such
3130 * occurences in the constant, except those added by a
3131 * backslash escape sequence, like \x{100}. Mostly, those
3132 * set 'has_above_latin1' as appropriate */
3133 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3134 has_above_latin1 = TRUE;
3137 /* Drops down to generic code to process current byte */
3139 else { /* Is a '-' in the context where it means a range */
3140 if (didrange) { /* Something like y/A-C-Z// */
3141 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3147 s++; /* Skip past the hyphen */
3149 /* d now points to where the end-range character will be
3150 * placed. Drop down to get that character. We'll finish
3151 * processing the range the next time through the loop */
3153 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3154 has_above_latin1 = TRUE;
3157 /* Drops down to generic code to process current byte */
3159 } /* End of not a range */
3161 /* Here we have parsed a range. Now must handle it. At this
3163 * 'sv' is a SV* that contains the output string we are
3164 * constructing. The final two characters in that string
3165 * are the range start and range end, in order.
3166 * 'd' points to just beyond the range end in the 'sv' string,
3167 * where we would next place something
3172 IV range_max; /* last character in range */
3174 Size_t offset_to_min = 0;
3177 bool convert_unicode;
3178 IV real_range_max = 0;
3180 /* Get the code point values of the range ends. */
3181 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3182 offset_to_max = max_ptr - SvPVX_const(sv);
3184 /* We know the utf8 is valid, because we just constructed
3185 * it ourselves in previous loop iterations */
3186 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3187 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3188 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3190 /* This compensates for not all code setting
3191 * 'has_above_latin1', so that we don't skip stuff that
3192 * should be executed */
3193 if (range_max > 255) {
3194 has_above_latin1 = TRUE;
3198 min_ptr = max_ptr - 1;
3199 range_min = * (U8*) min_ptr;
3200 range_max = * (U8*) max_ptr;
3203 /* If the range is just a single code point, like tr/a-a/.../,
3204 * that code point is already in the output, twice. We can
3205 * just back up over the second instance and avoid all the rest
3206 * of the work. But if it is a variant character, it's been
3207 * counted twice, so decrement. (This unlikely scenario is
3208 * special cased, like the one for a range of 2 code points
3209 * below, only because the main-line code below needs a range
3210 * of 3 or more to work without special casing. Might as well
3211 * get it out of the way now.) */
3212 if (UNLIKELY(range_max == range_min)) {
3214 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3215 utf8_variant_count--;
3221 /* On EBCDIC platforms, we may have to deal with portable
3222 * ranges. These happen if at least one range endpoint is a
3223 * Unicode value (\N{...}), or if the range is a subset of
3224 * [A-Z] or [a-z], and both ends are literal characters,
3225 * like 'A', and not like \x{C1} */
3227 cBOOL(backslash_N) /* \N{} forces Unicode,
3228 hence portable range */
3229 || ( ! non_portable_endpoint
3230 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3231 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3232 if (convert_unicode) {
3234 /* Special handling is needed for these portable ranges.
3235 * They are defined to be in Unicode terms, which includes
3236 * all the Unicode code points between the end points.
3237 * Convert to Unicode to get the Unicode range. Later we
3238 * will convert each code point in the range back to
3240 range_min = NATIVE_TO_UNI(range_min);
3241 range_max = NATIVE_TO_UNI(range_max);
3245 if (range_min > range_max) {
3247 if (convert_unicode) {
3248 /* Need to convert back to native for meaningful
3249 * messages for this platform */
3250 range_min = UNI_TO_NATIVE(range_min);
3251 range_max = UNI_TO_NATIVE(range_max);
3254 /* Use the characters themselves for the error message if
3255 * ASCII printables; otherwise some visible representation
3257 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3259 "Invalid range \"%c-%c\" in transliteration operator",
3260 (char)range_min, (char)range_max);
3263 else if (convert_unicode) {
3264 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3266 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3267 UVXf "}\" in transliteration operator",
3268 range_min, range_max);
3272 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3274 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3275 " in transliteration operator",
3276 range_min, range_max);
3280 /* If the range is exactly two code points long, they are
3281 * already both in the output */
3282 if (UNLIKELY(range_min + 1 == range_max)) {
3286 /* Here the range contains at least 3 code points */
3290 /* If everything in the transliteration is below 256, we
3291 * can avoid special handling later. A translation table
3292 * for each of those bytes is created by op.c. So we
3293 * expand out all ranges to their constituent code points.
3294 * But if we've encountered something above 255, the
3295 * expanding won't help, so skip doing that. But if it's
3296 * EBCDIC, we may have to look at each character below 256
3297 * if we have to convert to/from Unicode values */
3298 if ( has_above_latin1
3300 && (range_min > 255 || ! convert_unicode)
3303 const STRLEN off = d - SvPVX(sv);
3304 const STRLEN extra = 1 + (send - s) + 1;
3307 /* Move the high character one byte to the right; then
3308 * insert between it and the range begin, an illegal
3309 * byte which serves to indicate this is a range (using
3310 * a '-' would be ambiguous). */
3312 if (off + extra > SvLEN(sv)) {
3313 d = off + SvGROW(sv, off + extra);
3314 max_ptr = d - off + offset_to_max;
3318 while (e-- > max_ptr) {
3321 *(e + 1) = (char) RANGE_INDICATOR;
3325 /* Here, we're going to expand out the range. For EBCDIC
3326 * the range can extend above 255 (not so in ASCII), so
3327 * for EBCDIC, split it into the parts above and below
3330 if (range_max > 255) {
3331 real_range_max = range_max;
3337 /* Here we need to expand out the string to contain each
3338 * character in the range. Grow the output to handle this.
3339 * For non-UTF8, we need a byte for each code point in the
3340 * range, minus the three that we've already allocated for: the
3341 * hyphen, the min, and the max. For UTF-8, we need this
3342 * plus an extra byte for each code point that occupies two
3343 * bytes (is variant) when in UTF-8 (except we've already
3344 * allocated for the end points, including if they are
3345 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3346 * platforms, it's easy to calculate a precise number. To
3347 * start, we count the variants in the range, which we need
3348 * elsewhere in this function anyway. (For the case where it
3349 * isn't easy to calculate, 'extras' has been initialized to 0,
3350 * and the calculation is done in a loop further down.) */
3352 if (convert_unicode)
3355 /* This is executed unconditionally on ASCII, and for
3356 * Unicode ranges on EBCDIC. Under these conditions, all
3357 * code points above a certain value are variant; and none
3358 * under that value are. We just need to find out how much
3359 * of the range is above that value. We don't count the
3360 * end points here, as they will already have been counted
3361 * as they were parsed. */
3362 if (range_min >= UTF_CONTINUATION_MARK) {
3364 /* The whole range is made up of variants */
3365 extras = (range_max - 1) - (range_min + 1) + 1;
3367 else if (range_max >= UTF_CONTINUATION_MARK) {
3369 /* Only the higher portion of the range is variants */
3370 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3373 utf8_variant_count += extras;
3376 /* The base growth is the number of code points in the range,
3377 * not including the endpoints, which have already been sized
3378 * for (and output). We don't subtract for the hyphen, as it
3379 * has been parsed but not output, and the SvGROW below is
3380 * based only on what's been output plus what's left to parse.
3382 grow = (range_max - 1) - (range_min + 1) + 1;
3386 /* In some cases in EBCDIC, we haven't yet calculated a
3387 * precise amount needed for the UTF-8 variants. Just
3388 * assume the worst case, that everything will expand by a
3390 if (! convert_unicode) {
3396 /* Otherwise we know exactly how many variants there
3397 * are in the range. */
3402 /* Grow, but position the output to overwrite the range min end
3403 * point, because in some cases we overwrite that */
3404 SvCUR_set(sv, d - SvPVX_const(sv));
3405 offset_to_min = min_ptr - SvPVX_const(sv);
3407 /* See Note on sizing above. */
3408 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3411 + 1 /* Trailing NUL */ );
3413 /* Now, we can expand out the range. */
3415 if (convert_unicode) {
3418 /* Recall that the min and max are now in Unicode terms, so
3419 * we have to convert each character to its native
3422 for (i = range_min; i <= range_max; i++) {
3423 append_utf8_from_native_byte(
3424 LATIN1_TO_NATIVE((U8) i),
3429 for (i = range_min; i <= range_max; i++) {
3430 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3436 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3438 /* Here, no conversions are necessary, which means that the
3439 * first character in the range is already in 'd' and
3440 * valid, so we can skip overwriting it */
3444 for (i = range_min + 1; i <= range_max; i++) {
3445 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3451 assert(range_min + 1 <= range_max);
3452 for (i = range_min + 1; i < range_max; i++) {
3454 /* In this case on EBCDIC, we haven't calculated
3455 * the variants. Do it here, as we go along */
3456 if (! UVCHR_IS_INVARIANT(i)) {
3457 utf8_variant_count++;
3463 /* The range_max is done outside the loop so as to
3464 * avoid having to special case not incrementing
3465 * 'utf8_variant_count' on EBCDIC (it's already been
3466 * counted when originally parsed) */
3467 *d++ = (char) range_max;
3472 /* If the original range extended above 255, add in that
3474 if (real_range_max) {
3475 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3476 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3477 if (real_range_max > 0x100) {
3478 if (real_range_max > 0x101) {
3479 *d++ = (char) RANGE_INDICATOR;
3481 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3487 /* mark the range as done, and continue */
3491 non_portable_endpoint = 0;
3495 } /* End of is a range */
3496 } /* End of transliteration. Joins main code after these else's */
3497 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3500 while (s1 >= start && *s1-- == '\\')
3503 in_charclass = TRUE;
3505 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3508 while (s1 >= start && *s1-- == '\\')
3511 in_charclass = FALSE;
3513 /* skip for regexp comments /(?#comment)/, except for the last
3514 * char, which will be done separately. Stop on (?{..}) and
3516 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3519 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3521 while (s + len < send && *s != ')') {
3522 Copy(s, d, len, U8);
3525 len = UTF8_SAFE_SKIP(s, send);
3528 else while (s+1 < send && *s != ')') {
3532 else if (!PL_lex_casemods
3533 && ( s[2] == '{' /* This should match regcomp.c */
3534 || (s[2] == '?' && s[3] == '{')))
3539 /* likewise skip #-initiated comments in //x patterns */
3543 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3545 while (s < send && *s != '\n')
3548 /* no further processing of single-quoted regex */
3549 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3550 goto default_action;
3552 /* check for embedded arrays
3553 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3555 else if (*s == '@' && s[1]) {
3557 ? isIDFIRST_utf8_safe(s+1, send)
3558 : isWORDCHAR_A(s[1]))
3562 if (memCHRs(":'{$", s[1]))
3564 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3565 break; /* in regexp, neither @+ nor @- are interpolated */
3567 /* check for embedded scalars. only stop if we're sure it's a
3569 else if (*s == '$') {
3570 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3572 if (s + 1 < send && !memCHRs("()| \r\n\t", s[1])) {
3574 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3575 "Possible unintended interpolation of $\\ in regex");
3577 break; /* in regexp, $ might be tail anchor */
3581 /* End of else if chain - OP_TRANS rejoin rest */
3583 if (UNLIKELY(s >= send)) {
3589 if (*s == '\\' && s+1 < send) {
3590 char* e; /* Can be used for ending '}', etc. */
3594 /* warn on \1 - \9 in substitution replacements, but note that \11
3595 * is an octal; and \19 is \1 followed by '9' */
3596 if (PL_lex_inwhat == OP_SUBST
3602 /* diag_listed_as: \%d better written as $%d */
3603 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3608 /* string-change backslash escapes */
3609 if (PL_lex_inwhat != OP_TRANS && *s && memCHRs("lLuUEQF", *s)) {
3613 /* In a pattern, process \N, but skip any other backslash escapes.
3614 * This is because we don't want to translate an escape sequence
3615 * into a meta symbol and have the regex compiler use the meta
3616 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3617 * in spite of this, we do have to process \N here while the proper
3618 * charnames handler is in scope. See bugs #56444 and #62056.
3620 * There is a complication because \N in a pattern may also stand
3621 * for 'match a non-nl', and not mean a charname, in which case its
3622 * processing should be deferred to the regex compiler. To be a
3623 * charname it must be followed immediately by a '{', and not look
3624 * like \N followed by a curly quantifier, i.e., not something like
3625 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3627 else if (PL_lex_inpat
3630 || regcurly(s + 1)))
3633 goto default_action;
3639 if ((isALPHANUMERIC(*s)))
3640 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3641 "Unrecognized escape \\%c passed through",
3643 /* default action is to copy the quoted character */
3644 goto default_action;
3647 /* eg. \132 indicates the octal constant 0132 */
3648 case '0': case '1': case '2': case '3':
3649 case '4': case '5': case '6': case '7':
3651 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3652 | PERL_SCAN_NOTIFY_ILLDIGIT;
3654 uv = grok_oct(s, &len, &flags, NULL);
3656 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
3658 && isDIGIT(*s) /* like \08, \178 */
3659 && ckWARN(WARN_MISC))
3661 Perl_warner(aTHX_ packWARN(WARN_MISC), "%s",
3662 form_alien_digit_msg(8, len, s, send, UTF, FALSE));
3665 goto NUM_ESCAPE_INSERT;
3667 /* eg. \o{24} indicates the octal constant \024 */
3672 if (! grok_bslash_o(&s, send,
3675 FALSE, /* Not strict */
3676 FALSE, /* No illegal cp's */
3680 uv = 0; /* drop through to ensure range ends are set */
3682 goto NUM_ESCAPE_INSERT;
3685 /* eg. \x24 indicates the hex constant 0x24 */
3690 if (! grok_bslash_x(&s, send,
3693 FALSE, /* Not strict */
3694 FALSE, /* No illegal cp's */
3698 uv = 0; /* drop through to ensure range ends are set */
3703 /* Insert oct or hex escaped character. */
3705 /* Here uv is the ordinal of the next character being added */
3706 if (UVCHR_IS_INVARIANT(uv)) {
3710 if (!d_is_utf8 && uv > 255) {
3712 /* Here, 'uv' won't fit unless we convert to UTF-8.
3713 * If we've only seen invariants so far, all we have to
3714 * do is turn on the flag */
3715 if (utf8_variant_count == 0) {
3719 SvCUR_set(sv, d - SvPVX_const(sv));
3723 sv_utf8_upgrade_flags_grow(
3725 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3727 /* Since we're having to grow here,
3728 * make sure we have enough room for
3729 * this escape and a NUL, so the
3730 * code immediately below won't have
3731 * to actually grow again */
3733 + (STRLEN)(send - s) + 1);
3734 d = SvPVX(sv) + SvCUR(sv);
3737 has_above_latin1 = TRUE;
3743 utf8_variant_count++;
3746 /* Usually, there will already be enough room in 'sv'
3747 * since such escapes are likely longer than any UTF-8
3748 * sequence they can end up as. This isn't the case on
3749 * EBCDIC where \x{40000000} contains 12 bytes, and the
3750 * UTF-8 for it contains 14. And, we have to allow for
3751 * a trailing NUL. It probably can't happen on ASCII
3752 * platforms, but be safe. See Note on sizing above. */
3753 const STRLEN needed = d - SvPVX(sv)
3757 if (UNLIKELY(needed > SvLEN(sv))) {
3758 SvCUR_set(sv, d - SvPVX_const(sv));
3759 d = SvCUR(sv) + SvGROW(sv, needed);
3762 d = (char*) uvchr_to_utf8_flags((U8*)d, uv,
3763 (ckWARN(WARN_PORTABLE))
3764 ? UNICODE_WARN_PERL_EXTENDED
3769 non_portable_endpoint++;
3774 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3775 * named character, like \N{LATIN SMALL LETTER A}, or a named
3776 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3777 * GRAVE} (except y/// can't handle the latter, croaking). For
3778 * convenience all three forms are referred to as "named
3779 * characters" below.
3781 * For patterns, \N also can mean to match a non-newline. Code
3782 * before this 'switch' statement should already have handled
3783 * this situation, and hence this code only has to deal with
3784 * the named character cases.
3786 * For non-patterns, the named characters are converted to
3787 * their string equivalents. In patterns, named characters are
3788 * not converted to their ultimate forms for the same reasons
3789 * that other escapes aren't (mainly that the ultimate
3790 * character could be considered a meta-symbol by the regex
3791 * compiler). Instead, they are converted to the \N{U+...}
3792 * form to get the value from the charnames that is in effect
3793 * right now, while preserving the fact that it was a named
3794 * character, so that the regex compiler knows this.
3796 * The structure of this section of code (besides checking for
3797 * errors and upgrading to utf8) is:
3798 * If the named character is of the form \N{U+...}, pass it
3799 * through if a pattern; otherwise convert the code point
3801 * Otherwise must be some \N{NAME}: convert to
3802 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3804 * Transliteration is an exception. The conversion to utf8 is
3805 * only done if the code point requires it to be representable.
3807 * Here, 's' points to the 'N'; the test below is guaranteed to
3808 * succeed if we are being called on a pattern, as we already
3809 * know from a test above that the next character is a '{'. A
3810 * non-pattern \N must mean 'named character', which requires
3814 yyerror("Missing braces on \\N{}");
3820 /* If there is no matching '}', it is an error. */
3821 if (! (e = (char *) memchr(s, '}', send - s))) {
3822 if (! PL_lex_inpat) {
3823 yyerror("Missing right brace on \\N{}");
3825 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3827 yyquit(); /* Have exhausted the input. */
3830 /* Here it looks like a named character */
3832 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3833 s += 2; /* Skip to next char after the 'U+' */
3836 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3837 /* Check the syntax. */
3840 if (!isXDIGIT(*s)) {
3843 "Invalid hexadecimal number in \\N{U+...}"
3852 else if ((*s == '.' || *s == '_')
3858 /* Pass everything through unchanged.
3859 * +1 is for the '}' */
3860 Copy(orig_s, d, e - orig_s + 1, char);
3861 d += e - orig_s + 1;
3863 else { /* Not a pattern: convert the hex to string */
3864 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3865 | PERL_SCAN_SILENT_ILLDIGIT
3866 | PERL_SCAN_SILENT_OVERFLOW
3867 | PERL_SCAN_DISALLOW_PREFIX;
3870 uv = grok_hex(s, &len, &flags, NULL);
3871 if (len == 0 || (len != (STRLEN)(e - s)))
3874 if ( uv > MAX_LEGAL_CP
3875 || (flags & PERL_SCAN_GREATER_THAN_UV_MAX))
3877 yyerror(form_cp_too_large_msg(16, s, len, 0));
3878 uv = 0; /* drop through to ensure range ends are
3882 /* For non-tr///, if the destination is not in utf8,
3883 * unconditionally recode it to be so. This is
3884 * because \N{} implies Unicode semantics, and scalars
3885 * have to be in utf8 to guarantee those semantics.
3886 * tr/// doesn't care about Unicode rules, so no need
3887 * there to upgrade to UTF-8 for small enough code
3889 if (! d_is_utf8 && ( uv > 0xFF
3890 || PL_lex_inwhat != OP_TRANS))
3892 /* See Note on sizing above. */
3893 const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
3895 SvCUR_set(sv, d - SvPVX_const(sv));
3899 if (utf8_variant_count == 0) {
3901 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3904 sv_utf8_upgrade_flags_grow(
3906 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3908 d = SvPVX(sv) + SvCUR(sv);
3912 has_above_latin1 = TRUE;
3915 /* Add the (Unicode) code point to the output. */
3916 if (! d_is_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3917 *d++ = (char) LATIN1_TO_NATIVE(uv);
3920 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv,
3921 (ckWARN(WARN_PORTABLE))
3922 ? UNICODE_WARN_PERL_EXTENDED
3927 else /* Here is \N{NAME} but not \N{U+...}. */
3928 if (! (res = get_and_check_backslash_N_name_wrapper(s, e)))
3929 { /* Failed. We should die eventually, but for now use a NUL
3933 else { /* Successfully evaluated the name */
3935 const char *str = SvPV_const(res, len);
3938 if (! len) { /* The name resolved to an empty string */
3939 const char empty_N[] = "\\N{_}";
3940 Copy(empty_N, d, sizeof(empty_N) - 1, char);
3941 d += sizeof(empty_N) - 1;
3944 /* In order to not lose information for the regex
3945 * compiler, pass the result in the specially made
3946 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3947 * the code points in hex of each character
3948 * returned by charnames */
3950 const char *str_end = str + len;
3951 const STRLEN off = d - SvPVX_const(sv);
3953 if (! SvUTF8(res)) {
3954 /* For the non-UTF-8 case, we can determine the
3955 * exact length needed without having to parse
3956 * through the string. Each character takes up
3957 * 2 hex digits plus either a trailing dot or
3959 const char initial_text[] = "\\N{U+";
3960 const STRLEN initial_len = sizeof(initial_text)
3962 d = off + SvGROW(sv, off
3965 /* +1 for trailing NUL */
3968 + (STRLEN)(send - e));
3969 Copy(initial_text, d, initial_len, char);
3971 while (str < str_end) {
3974 my_snprintf(hex_string,
3978 /* The regex compiler is
3979 * expecting Unicode, not
3981 NATIVE_TO_LATIN1(*str));
3982 PERL_MY_SNPRINTF_POST_GUARD(len,
3983 sizeof(hex_string));
3984 Copy(hex_string, d, 3, char);
3988 d--; /* Below, we will overwrite the final
3989 dot with a right brace */
3992 STRLEN char_length; /* cur char's byte length */
3994 /* and the number of bytes after this is
3995 * translated into hex digits */
3996 STRLEN output_length;
3998 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3999 * for max('U+', '.'); and 1 for NUL */
4000 char hex_string[2 * UTF8_MAXBYTES + 5];
4002 /* Get the first character of the result. */
4003 U32 uv = utf8n_to_uvchr((U8 *) str,
4007 /* Convert first code point to Unicode hex,
4008 * including the boiler plate before it. */
4010 my_snprintf(hex_string, sizeof(hex_string),
4012 (unsigned int) NATIVE_TO_UNI(uv));
4014 /* Make sure there is enough space to hold it */
4015 d = off + SvGROW(sv, off
4017 + (STRLEN)(send - e)
4018 + 2); /* '}' + NUL */
4020 Copy(hex_string, d, output_length, char);
4023 /* For each subsequent character, append dot and
4024 * its Unicode code point in hex */
4025 while ((str += char_length) < str_end) {
4026 const STRLEN off = d - SvPVX_const(sv);
4027 U32 uv = utf8n_to_uvchr((U8 *) str,
4032 my_snprintf(hex_string,
4035 (unsigned int) NATIVE_TO_UNI(uv));
4037 d = off + SvGROW(sv, off
4039 + (STRLEN)(send - e)
4040 + 2); /* '}' + NUL */
4041 Copy(hex_string, d, output_length, char);
4046 *d++ = '}'; /* Done. Add the trailing brace */
4049 else { /* Here, not in a pattern. Convert the name to a
4052 if (PL_lex_inwhat == OP_TRANS) {
4053 str = SvPV_const(res, len);
4054 if (len > ((SvUTF8(res))
4058 yyerror(Perl_form(aTHX_
4059 "%.*s must not be a named sequence"
4060 " in transliteration operator",
4061 /* +1 to include the "}" */
4062 (int) (e + 1 - start), start));
4064 goto end_backslash_N;
4067 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
4068 has_above_latin1 = TRUE;
4072 else if (! SvUTF8(res)) {
4073 /* Make sure \N{} return is UTF-8. This is because
4074 * \N{} implies Unicode semantics, and scalars have
4075 * to be in utf8 to guarantee those semantics; but
4076 * not needed in tr/// */
4077 sv_utf8_upgrade_flags(res, 0);
4078 str = SvPV_const(res, len);
4081 /* Upgrade destination to be utf8 if this new
4083 if (! d_is_utf8 && SvUTF8(res)) {
4084 /* See Note on sizing above. */
4085 const STRLEN extra = len + (send - s) + 1;
4087 SvCUR_set(sv, d - SvPVX_const(sv));
4091 if (utf8_variant_count == 0) {
4093 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
4096 sv_utf8_upgrade_flags_grow(sv,
4097 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4099 d = SvPVX(sv) + SvCUR(sv);
4102 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
4104 /* See Note on sizing above. (NOTE: SvCUR() is not
4105 * set correctly here). */
4106 const STRLEN extra = len + (send - e) + 1;
4107 const STRLEN off = d - SvPVX_const(sv);
4108 d = off + SvGROW(sv, off + extra);
4110 Copy(str, d, len, char);
4116 } /* End \N{NAME} */
4120 backslash_N++; /* \N{} is defined to be Unicode */
4122 s = e + 1; /* Point to just after the '}' */
4125 /* \c is a control character */
4129 const char * message;
4131 if (! grok_bslash_c(*s, (U8 *) d, &message, NULL)) {
4133 yyquit(); /* Have always immediately croaked on
4139 yyerror("Missing control char name in \\c");
4140 yyquit(); /* Are at end of input, no sense continuing */
4143 non_portable_endpoint++;
4147 /* printf-style backslashes, formfeeds, newlines, etc */
4173 } /* end if (backslash) */
4176 /* Just copy the input to the output, though we may have to convert
4179 * If the input has the same representation in UTF-8 as not, it will be
4180 * a single byte, and we don't care about UTF8ness; just copy the byte */
4181 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
4184 else if (! s_is_utf8 && ! d_is_utf8) {
4185 /* If neither source nor output is UTF-8, is also a single byte,
4186 * just copy it; but this byte counts should we later have to
4187 * convert to UTF-8 */
4189 utf8_variant_count++;
4191 else if (s_is_utf8 && d_is_utf8) { /* Both UTF-8, can just copy */
4192 const STRLEN len = UTF8SKIP(s);
4194 /* We expect the source to have already been checked for
4196 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
4198 Copy(s, d, len, U8);
4202 else if (s_is_utf8) { /* UTF8ness matters: convert output to utf8 */
4203 STRLEN need = send - s + 1; /* See Note on sizing above. */
4205 SvCUR_set(sv, d - SvPVX_const(sv));
4209 if (utf8_variant_count == 0) {
4211 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
4214 sv_utf8_upgrade_flags_grow(sv,
4215 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4217 d = SvPVX(sv) + SvCUR(sv);
4220 goto default_action; /* Redo, having upgraded so both are UTF-8 */
4222 else { /* UTF8ness matters: convert this non-UTF8 source char to
4223 UTF-8 for output. It will occupy 2 bytes, but don't include
4224 the input byte since we haven't incremented 's' yet. See
4225 Note on sizing above. */
4226 const STRLEN off = d - SvPVX(sv);
4227 const STRLEN extra = 2 + (send - s - 1) + 1;
4228 if (off + extra > SvLEN(sv)) {
4229 d = off + SvGROW(sv, off + extra);
4231 *d++ = UTF8_EIGHT_BIT_HI(*s);
4232 *d++ = UTF8_EIGHT_BIT_LO(*s);
4235 } /* while loop to process each character */
4238 const STRLEN off = d - SvPVX(sv);
4240 /* See if room for the terminating NUL */
4241 if (UNLIKELY(off >= SvLEN(sv))) {
4245 if (off > SvLEN(sv))
4247 Perl_croak(aTHX_ "panic: constant overflowed allocated space,"
4248 " %" UVuf " >= %" UVuf, (UV)off, (UV)SvLEN(sv));
4250 /* Whew! Here we don't have room for the terminating NUL, but
4251 * everything else so far has fit. It's not too late to grow
4252 * to fit the NUL and continue on. But it is a bug, as the code
4253 * above was supposed to have made room for this, so under
4254 * DEBUGGING builds, we panic anyway. */
4255 d = off + SvGROW(sv, off + 1);
4259 /* terminate the string and set up the sv */
4261 SvCUR_set(sv, d - SvPVX_const(sv));
4268 /* shrink the sv if we allocated more than we used */
4269 if (SvCUR(sv) + 5 < SvLEN(sv)) {
4270 SvPV_shrink_to_cur(sv);
4273 /* return the substring (via pl_yylval) only if we parsed anything */
4276 for (; s2 < s; s2++) {
4278 COPLINE_INC_WITH_HERELINES;
4280 SvREFCNT_inc_simple_void_NN(sv);
4281 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
4282 && ! PL_parser->lex_re_reparsing)
4284 const char *const key = PL_lex_inpat ? "qr" : "q";
4285 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
4289 if (PL_lex_inwhat == OP_TRANS) {
4292 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
4295 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
4303 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
4304 type, typelen, NULL);
4306 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
4308 LEAVE_with_name("scan_const");
4313 * Returns TRUE if there's more to the expression (e.g., a subscript),
4316 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
4318 * ->[ and ->{ return TRUE
4319 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
4320 * { and [ outside a pattern are always subscripts, so return TRUE
4321 * if we're outside a pattern and it's not { or [, then return FALSE
4322 * if we're in a pattern and the first char is a {
4323 * {4,5} (any digits around the comma) returns FALSE
4324 * if we're in a pattern and the first char is a [
4326 * [SOMETHING] has a funky algorithm to decide whether it's a
4327 * character class or not. It has to deal with things like
4328 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4329 * anything else returns TRUE
4332 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4335 S_intuit_more(pTHX_ char *s, char *e)
4337 PERL_ARGS_ASSERT_INTUIT_MORE;
4339 if (PL_lex_brackets)
4341 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4343 if (*s == '-' && s[1] == '>'
4344 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4345 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4346 ||(s[2] == '@' && memCHRs("*[{",s[3])) ))
4348 if (*s != '{' && *s != '[')
4350 PL_parser->sub_no_recover = TRUE;
4354 /* In a pattern, so maybe we have {n,m}. */
4362 /* On the other hand, maybe we have a character class */
4365 if (*s == ']' || *s == '^')
4368 /* this is terrifying, and it works */
4371 const char * const send = (char *) memchr(s, ']', e - s);
4372 unsigned char un_char, last_un_char;
4373 char tmpbuf[sizeof PL_tokenbuf * 4];
4375 if (!send) /* has to be an expression */
4377 weight = 2; /* let's weigh the evidence */
4381 else if (isDIGIT(*s)) {
4383 if (isDIGIT(s[1]) && s[2] == ']')
4389 Zero(seen,256,char);
4391 for (; s < send; s++) {
4392 last_un_char = un_char;
4393 un_char = (unsigned char)*s;
4398 weight -= seen[un_char] * 10;
4399 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4401 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4402 len = (int)strlen(tmpbuf);
4403 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4404 UTF ? SVf_UTF8 : 0, SVt_PV))
4411 && memCHRs("[#!%*<>()-=",s[1]))
4413 if (/*{*/ memCHRs("])} =",s[2]))
4422 if (memCHRs("wds]",s[1]))
4424 else if (seen[(U8)'\''] || seen[(U8)'"'])
4426 else if (memCHRs("rnftbxcav",s[1]))
4428 else if (isDIGIT(s[1])) {
4430 while (s[1] && isDIGIT(s[1]))
4440 if (memCHRs("aA01! ",last_un_char))
4442 if (memCHRs("zZ79~",s[1]))
4444 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4445 weight -= 5; /* cope with negative subscript */
4448 if (!isWORDCHAR(last_un_char)
4449 && !(last_un_char == '$' || last_un_char == '@'
4450 || last_un_char == '&')
4451 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4455 if (keyword(d, s - d, 0))
4458 if (un_char == last_un_char + 1)
4460 weight -= seen[un_char];
4465 if (weight >= 0) /* probably a character class */
4475 * Does all the checking to disambiguate
4477 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4478 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
4480 * First argument is the stuff after the first token, e.g. "bar".
4482 * Not a method if foo is a filehandle.
4483 * Not a method if foo is a subroutine prototyped to take a filehandle.
4484 * Not a method if it's really "Foo $bar"
4485 * Method if it's "foo $bar"
4486 * Not a method if it's really "print foo $bar"
4487 * Method if it's really "foo package::" (interpreted as package->foo)
4488 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4489 * Not a method if bar is a filehandle or package, but is quoted with
4494 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4496 char *s = start + (*start == '$');
4497 char tmpbuf[sizeof PL_tokenbuf];
4500 /* Mustn't actually add anything to a symbol table.
4501 But also don't want to "initialise" any placeholder
4502 constants that might already be there into full
4503 blown PVGVs with attached PVCV. */
4505 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4507 PERL_ARGS_ASSERT_INTUIT_METHOD;
4509 if (!FEATURE_INDIRECT_IS_ENABLED)
4512 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4514 if (cv && SvPOK(cv)) {
4515 const char *proto = CvPROTO(cv);
4517 while (*proto && (isSPACE(*proto) || *proto == ';'))
4524 if (*start == '$') {
4525 SSize_t start_off = start - SvPVX(PL_linestr);
4526 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4527 || isUPPER(*PL_tokenbuf))
4529 /* this could be $# */
4532 PL_bufptr = SvPVX(PL_linestr) + start_off;
4534 return *s == '(' ? FUNCMETH : METHOD;
4537 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
4538 /* start is the beginning of the possible filehandle/object,
4539 * and s is the end of it
4540 * tmpbuf is a copy of it (but with single quotes as double colons)
4543 if (!keyword(tmpbuf, len, 0)) {
4544 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4549 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4550 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4552 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4553 && (!isGV(indirgv) || GvCVu(indirgv)))
4555 /* filehandle or package name makes it a method */
4556 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4558 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4559 return 0; /* no assumptions -- "=>" quotes bareword */
4561 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4562 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4563 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4565 force_next(BAREWORD);
4567 return *s == '(' ? FUNCMETH : METHOD;
4573 /* Encoded script support. filter_add() effectively inserts a
4574 * 'pre-processing' function into the current source input stream.
4575 * Note that the filter function only applies to the current source file
4576 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4578 * The datasv parameter (which may be NULL) can be used to pass
4579 * private data to this instance of the filter. The filter function
4580 * can recover the SV using the FILTER_DATA macro and use it to
4581 * store private buffers and state information.
4583 * The supplied datasv parameter is upgraded to a PVIO type
4584 * and the IoDIRP/IoANY field is used to store the function pointer,
4585 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4586 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4587 * private use must be set using malloc'd pointers.
4591 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4599 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4600 Perl_croak(aTHX_ "Source filters apply only to byte streams");
4602 if (!PL_rsfp_filters)
4603 PL_rsfp_filters = newAV();
4606 SvUPGRADE(datasv, SVt_PVIO);
4607 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
4608 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
4609 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
4610 FPTR2DPTR(void *, IoANY(datasv)),
4611 SvPV_nolen(datasv)));
4612 av_unshift(PL_rsfp_filters, 1);
4613 av_store(PL_rsfp_filters, 0, datasv) ;
4615 !PL_parser->filtered
4616 && PL_parser->lex_flags & LEX_EVALBYTES
4617 && PL_bufptr < PL_bufend
4619 const char *s = PL_bufptr;
4620 while (s < PL_bufend) {
4622 SV *linestr = PL_parser->linestr;
4623 char *buf = SvPVX(linestr);
4624 STRLEN const bufptr_pos = PL_parser->bufptr - buf;
4625 STRLEN const oldbufptr_pos = PL_parser->oldbufptr - buf;
4626 STRLEN const oldoldbufptr_pos=PL_parser->oldoldbufptr-buf;
4627 STRLEN const linestart_pos = PL_parser->linestart - buf;
4628 STRLEN const last_uni_pos =
4629 PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
4630 STRLEN const last_lop_pos =
4631 PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
4632 av_push(PL_rsfp_filters, linestr);
4633 PL_parser->linestr =
4634 newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
4635 buf = SvPVX(PL_parser->linestr);
4636 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
4637 PL_parser->bufptr = buf + bufptr_pos;
4638 PL_parser->oldbufptr = buf + oldbufptr_pos;
4639 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
4640 PL_parser->linestart = buf + linestart_pos;
4641 if (PL_parser->last_uni)
4642 PL_parser->last_uni = buf + last_uni_pos;
4643 if (PL_parser->last_lop)
4644 PL_parser->last_lop = buf + last_lop_pos;
4645 SvLEN_set(linestr, SvCUR(linestr));
4646 SvCUR_set(linestr, s - SvPVX(linestr));
4647 PL_parser->filtered = 1;
4657 /* Delete most recently added instance of this filter function. */
4659 Perl_filter_del(pTHX_ filter_t funcp)
4663 PERL_ARGS_ASSERT_FILTER_DEL;
4666 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
4667 FPTR2DPTR(void*, funcp)));
4669 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
4671 /* if filter is on top of stack (usual case) just pop it off */
4672 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
4673 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
4674 sv_free(av_pop(PL_rsfp_filters));
4678 /* we need to search for the correct entry and clear it */
4679 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
4683 /* Invoke the idxth filter function for the current rsfp. */
4684 /* maxlen 0 = read one text line */
4686 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
4691 /* This API is bad. It should have been using unsigned int for maxlen.
4692 Not sure if we want to change the API, but if not we should sanity
4693 check the value here. */
4694 unsigned int correct_length = maxlen < 0 ? PERL_INT_MAX : maxlen;
4696 PERL_ARGS_ASSERT_FILTER_READ;
4698 if (!PL_parser || !PL_rsfp_filters)
4700 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
4701 /* Provide a default input filter to make life easy. */
4702 /* Note that we append to the line. This is handy. */
4703 DEBUG_P(PerlIO_printf(Perl_debug_log,
4704 "filter_read %d: from rsfp\n", idx));
4705 if (correct_length) {
4708 const int old_len = SvCUR(buf_sv);
4710 /* ensure buf_sv is large enough */
4711 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
4712 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
4713 correct_length)) <= 0) {
4714 if (PerlIO_error(PL_rsfp))
4715 return -1; /* error */
4717 return 0 ; /* end of file */
4719 SvCUR_set(buf_sv, old_len + len) ;
4720 SvPVX(buf_sv)[old_len + len] = '\0';
4723 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
4724 if (PerlIO_error(PL_rsfp))
4725 return -1; /* error */
4727 return 0 ; /* end of file */
4730 return SvCUR(buf_sv);
4732 /* Skip this filter slot if filter has been deleted */
4733 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
4734 DEBUG_P(PerlIO_printf(Perl_debug_log,
4735 "filter_read %d: skipped (filter deleted)\n",
4737 return FILTER_READ(idx+1, buf_sv, correct_length); /* recurse */
4739 if (SvTYPE(datasv) != SVt_PVIO) {
4740 if (correct_length) {
4742 const STRLEN remainder = SvLEN(datasv) - SvCUR(datasv);
4743 if (!remainder) return 0; /* eof */
4744 if (correct_length > remainder) correct_length = remainder;
4745 sv_catpvn(buf_sv, SvEND(datasv), correct_length);
4746 SvCUR_set(datasv, SvCUR(datasv) + correct_length);
4749 const char *s = SvEND(datasv);
4750 const char *send = SvPVX(datasv) + SvLEN(datasv);
4758 if (s == send) return 0; /* eof */
4759 sv_catpvn(buf_sv, SvEND(datasv), s-SvEND(datasv));
4760 SvCUR_set(datasv, s-SvPVX(datasv));
4762 return SvCUR(buf_sv);
4764 /* Get function pointer hidden within datasv */
4765 funcp = DPTR2FPTR(filter_t, IoANY(datasv));
4766 DEBUG_P(PerlIO_printf(Perl_debug_log,
4767 "filter_read %d: via function %p (%s)\n",
4768 idx, (void*)datasv, SvPV_nolen_const(datasv)));
4769 /* Call function. The function is expected to */
4770 /* call "FILTER_READ(idx+1, buf_sv)" first. */
4771 /* Return: <0:error, =0:eof, >0:not eof */
4773 save_scalar(PL_errgv);
4774 ret = (*funcp)(aTHX_ idx, buf_sv, correct_length);
4780 S_filter_gets(pTHX_ SV *sv, STRLEN append)
4782 PERL_ARGS_ASSERT_FILTER_GETS;
4784 #ifdef PERL_CR_FILTER
4785 if (!PL_rsfp_filters) {
4786 filter_add(S_cr_textfilter,NULL);
4789 if (PL_rsfp_filters) {
4791 SvCUR_set(sv, 0); /* start with empty line */
4792 if (FILTER_READ(0, sv, 0) > 0)
4793 return ( SvPVX(sv) ) ;
4798 return (sv_gets(sv, PL_rsfp, append));
4802 S_find_in_my_stash(pTHX_ const char *pkgname, STRLEN len)
4806 PERL_ARGS_ASSERT_FIND_IN_MY_STASH;
4808 if (memEQs(pkgname, len, "__PACKAGE__"))
4812 && (pkgname[len - 2] == ':' && pkgname[len - 1] == ':')
4813 && (gv = gv_fetchpvn_flags(pkgname,
4815 ( UTF ? SVf_UTF8 : 0 ), SVt_PVHV)))
4817 return GvHV(gv); /* Foo:: */
4820 /* use constant CLASS => 'MyClass' */
4821 gv = gv_fetchpvn_flags(pkgname, len, UTF ? SVf_UTF8 : 0, SVt_PVCV);
4822 if (gv && GvCV(gv)) {
4823 SV * const sv = cv_const_sv(GvCV(gv));
4825 return gv_stashsv(sv, 0);
4828 return gv_stashpvn(pkgname, len, UTF ? SVf_UTF8 : 0);
4833 S_tokenize_use(pTHX_ int is_use, char *s) {
4834 PERL_ARGS_ASSERT_TOKENIZE_USE;
4836 if (PL_expect != XSTATE)
4837 /* diag_listed_as: "use" not allowed in expression */
4838 yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression",
4839 is_use ? "use" : "no"));
4842 if (isDIGIT(*s) || (*s == 'v' && isDIGIT(s[1]))) {
4843 s = force_version(s, TRUE);
4844 if (*s == ';' || *s == '}'
4845 || (s = skipspace(s), (*s == ';' || *s == '}'))) {
4846 NEXTVAL_NEXTTOKE.opval = NULL;
4847 force_next(BAREWORD);
4849 else if (*s == 'v') {
4850 s = force_word(s,BAREWORD,FALSE,TRUE);
4851 s = force_version(s, FALSE);
4855 s = force_word(s,BAREWORD,FALSE,TRUE);
4856 s = force_version(s, FALSE);
4858 pl_yylval.ival = is_use;
4862 static const char* const exp_name[] =
4863 { "OPERATOR", "TERM", "REF", "STATE", "BLOCK", "ATTRBLOCK",
4864 "ATTRTERM", "TERMBLOCK", "XBLOCKTERM", "POSTDEREF",
4865 "SIGVAR", "TERMORDORDOR"
4869 #define word_takes_any_delimiter(p,l) S_word_takes_any_delimiter(p,l)
4871 S_word_takes_any_delimiter(char *p, STRLEN len)
4873 return (len == 1 && memCHRs("msyq", p[0]))
4875 && ((p[0] == 't' && p[1] == 'r')
4876 || (p[0] == 'q' && memCHRs("qwxr", p[1]))));
4880 S_check_scalar_slice(pTHX_ char *s)
4883 while (SPACE_OR_TAB(*s)) s++;
4884 if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2,
4890 while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
4891 || (*s && memCHRs(" \t$#+-'\"", *s)))
4893 s += UTF ? UTF8SKIP(s) : 1;
4895 if (*s == '}' || *s == ']')
4896 pl_yylval.ival = OPpSLICEWARNING;
4899 #define lex_token_boundary() S_lex_token_boundary(aTHX)
4901 S_lex_token_boundary(pTHX)
4903 PL_oldoldbufptr = PL_oldbufptr;
4904 PL_oldbufptr = PL_bufptr;
4907 #define vcs_conflict_marker(s) S_vcs_conflict_marker(aTHX_ s)
4909 S_vcs_conflict_marker(pTHX_ char *s)
4911 lex_token_boundary();
4913 yyerror("Version control conflict marker");
4914 while (s < PL_bufend && *s != '\n')
4920 yyl_sigvar(pTHX_ char *s)
4922 /* we expect the sigil and optional var name part of a
4923 * signature element here. Since a '$' is not necessarily
4924 * followed by a var name, handle it specially here; the general
4925 * yylex code would otherwise try to interpret whatever follows
4926 * as a var; e.g. ($, ...) would be seen as the var '$,'
4933 PL_bufptr = s; /* for error reporting */
4938 /* spot stuff that looks like an prototype */
4939 if (memCHRs("$:@%&*;\\[]", *s)) {
4940 yyerror("Illegal character following sigil in a subroutine signature");
4943 /* '$#' is banned, while '$ # comment' isn't */
4945 yyerror("'#' not allowed immediately following a sigil in a subroutine signature");
4949 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
4950 char *dest = PL_tokenbuf + 1;
4951 /* read var name, including sigil, into PL_tokenbuf */
4952 PL_tokenbuf[0] = sigil;
4953 parse_ident(&s, &dest, dest + sizeof(PL_tokenbuf) - 1,
4954 0, cBOOL(UTF), FALSE, FALSE);
4956 assert(PL_tokenbuf[1]); /* we have a variable name */
4964 /* parse the = for the default ourselves to avoid '+=' etc being accepted here
4965 * as the ASSIGNOP, and exclude other tokens that start with =
4967 if (*s == '=' && (!s[1] || memCHRs("=~>", s[1]) == 0)) {
4968 /* save now to report with the same context as we did when
4969 * all ASSIGNOPS were accepted */
4973 NEXTVAL_NEXTTOKE.ival = 0;
4974 force_next(ASSIGNOP);
4977 else if (*s == ',' || *s == ')') {
4978 PL_expect = XOPERATOR;
4981 /* make sure the context shows the unexpected character and
4982 * hopefully a bit more */
4984 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
4986 PL_bufptr = s; /* for error reporting */
4987 yyerror("Illegal operator following parameter in a subroutine signature");
4991 NEXTVAL_NEXTTOKE.ival = sigil;
4992 force_next('p'); /* force a signature pending identifier */
4999 case ',': /* handle ($a,,$b) */
5004 yyerror("A signature parameter must start with '$', '@' or '%'");
5005 /* very crude error recovery: skip to likely next signature
5007 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
5013 case ',': TOKEN (PERLY_COMMA);
5014 case '$': TOKEN (PERLY_DOLLAR);
5015 case '@': TOKEN (PERLY_SNAIL);
5016 case '%': TOKEN (PERLY_PERCENT_SIGN);
5017 case ')': TOKEN (PERLY_PAREN_CLOSE);
5018 default: TOKEN (sigil);
5023 yyl_dollar(pTHX_ char *s)
5027 if (PL_expect == XPOSTDEREF) {
5030 POSTDEREF(DOLSHARP);
5032 POSTDEREF(PERLY_DOLLAR);
5036 && ( isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF)
5037 || memCHRs("{$:+-@", s[2])))
5039 PL_tokenbuf[0] = '@';
5040 s = scan_ident(s + 1, PL_tokenbuf + 1,
5041 sizeof PL_tokenbuf - 1, FALSE);
5042 if (PL_expect == XOPERATOR) {
5044 if (PL_bufptr > s) {
5046 PL_bufptr = PL_oldbufptr;
5048 no_op("Array length", d);