3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "dquote_static.c"
43 #define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_defer (PL_parser->lex_defer)
56 #define PL_lex_dojoin (PL_parser->lex_dojoin)
57 #define PL_lex_expect (PL_parser->lex_expect)
58 #define PL_lex_formbrack (PL_parser->lex_formbrack)
59 #define PL_lex_inpat (PL_parser->lex_inpat)
60 #define PL_lex_inwhat (PL_parser->lex_inwhat)
61 #define PL_lex_op (PL_parser->lex_op)
62 #define PL_lex_repl (PL_parser->lex_repl)
63 #define PL_lex_starts (PL_parser->lex_starts)
64 #define PL_lex_stuff (PL_parser->lex_stuff)
65 #define PL_multi_start (PL_parser->multi_start)
66 #define PL_multi_open (PL_parser->multi_open)
67 #define PL_multi_close (PL_parser->multi_close)
68 #define PL_preambled (PL_parser->preambled)
69 #define PL_sublex_info (PL_parser->sublex_info)
70 #define PL_linestr (PL_parser->linestr)
71 #define PL_expect (PL_parser->expect)
72 #define PL_copline (PL_parser->copline)
73 #define PL_bufptr (PL_parser->bufptr)
74 #define PL_oldbufptr (PL_parser->oldbufptr)
75 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76 #define PL_linestart (PL_parser->linestart)
77 #define PL_bufend (PL_parser->bufend)
78 #define PL_last_uni (PL_parser->last_uni)
79 #define PL_last_lop (PL_parser->last_lop)
80 #define PL_last_lop_op (PL_parser->last_lop_op)
81 #define PL_lex_state (PL_parser->lex_state)
82 #define PL_rsfp (PL_parser->rsfp)
83 #define PL_rsfp_filters (PL_parser->rsfp_filters)
84 #define PL_in_my (PL_parser->in_my)
85 #define PL_in_my_stash (PL_parser->in_my_stash)
86 #define PL_tokenbuf (PL_parser->tokenbuf)
87 #define PL_multi_end (PL_parser->multi_end)
88 #define PL_error_count (PL_parser->error_count)
90 # define PL_nexttoke (PL_parser->nexttoke)
91 # define PL_nexttype (PL_parser->nexttype)
92 # define PL_nextval (PL_parser->nextval)
94 static const char* const ident_too_long = "Identifier too long";
96 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
98 #define XENUMMASK 0x3f
100 #define XFAKEBRACK 0x80
102 #ifdef USE_UTF8_SCRIPTS
103 # define UTF (!IN_BYTES)
105 # define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
108 /* The maximum number of characters preceding the unrecognized one to display */
109 #define UNRECOGNIZED_PRECEDE_COUNT 10
111 /* In variables named $^X, these are the legal values for X.
112 * 1999-02-27 mjd-perl-patch@plover.com */
113 #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
115 #define SPACE_OR_TAB(c) isBLANK_A(c)
117 #define HEXFP_PEEK(s) \
119 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
120 isALPHA_FOLD_EQ(s[0], 'p'))
122 /* LEX_* are values for PL_lex_state, the state of the lexer.
123 * They are arranged oddly so that the guard on the switch statement
124 * can get by with a single comparison (if the compiler is smart enough).
126 * These values refer to the various states within a sublex parse,
127 * i.e. within a double quotish string
130 /* #define LEX_NOTPARSING 11 is done in perl.h. */
132 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
133 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
134 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
135 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
136 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
138 /* at end of code, eg "$x" followed by: */
139 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
140 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
142 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
143 string or after \E, $foo, etc */
144 #define LEX_INTERPCONST 2 /* NOT USED */
145 #define LEX_FORMLINE 1 /* expecting a format line */
146 #define LEX_KNOWNEXT 0 /* next token known; just return it */
150 static const char* const lex_state_names[] = {
165 #include "keywords.h"
167 /* CLINE is a macro that ensures PL_copline has a sane value */
169 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
171 # define SKIPSPACE0(s) skipspace(s)
172 # define SKIPSPACE1(s) skipspace(s)
173 # define SKIPSPACE2(s,tsv) skipspace(s)
174 # define PEEKSPACE(s) skipspace(s)
177 * Convenience functions to return different tokens and prime the
178 * lexer for the next token. They all take an argument.
180 * TOKEN : generic token (used for '(', DOLSHARP, etc)
181 * OPERATOR : generic operator
182 * AOPERATOR : assignment operator
183 * PREBLOCK : beginning the block after an if, while, foreach, ...
184 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
185 * PREREF : *EXPR where EXPR is not a simple identifier
186 * TERM : expression term
187 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
188 * LOOPX : loop exiting command (goto, last, dump, etc)
189 * FTST : file test operator
190 * FUN0 : zero-argument function
191 * FUN0OP : zero-argument function, with its op created in this file
192 * FUN1 : not used, except for not, which isn't a UNIOP
193 * BOop : bitwise or or xor
195 * SHop : shift operator
196 * PWop : power operator
197 * PMop : pattern-matching operator
198 * Aop : addition-level operator
199 * AopNOASSIGN : addition-level operator that is never part of .=
200 * Mop : multiplication-level operator
201 * Eop : equality-testing operator
202 * Rop : relational operator <= != gt
204 * Also see LOP and lop() below.
207 #ifdef DEBUGGING /* Serve -DT. */
208 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
210 # define REPORT(retval) (retval)
213 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
214 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
215 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
216 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
217 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
218 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
219 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
220 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
221 #define LOOPX(f) return (PL_expect = XOPERATOR, \
222 PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
224 (void)(PL_nexttoke || (PL_expect = XTERM)), \
226 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
227 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
228 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
229 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
230 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
231 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
232 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
233 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
234 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
235 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
236 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
237 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
238 #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
239 #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
241 /* This bit of chicanery makes a unary function followed by
242 * a parenthesis into a function with one argument, highest precedence.
243 * The UNIDOR macro is for unary functions that can be followed by the //
244 * operator (such as C<shift // 0>).
246 #define UNI3(f,x,have_x) { \
247 pl_yylval.ival = f; \
248 if (have_x) PL_expect = x; \
250 PL_last_uni = PL_oldbufptr; \
251 PL_last_lop_op = f; \
253 return REPORT( (int)FUNC1 ); \
255 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
257 #define UNI(f) UNI3(f,XTERM,1)
258 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
259 #define UNIPROTO(f,optional) { \
260 if (optional) PL_last_uni = PL_oldbufptr; \
264 #define UNIBRACK(f) UNI3(f,0,0)
266 /* grandfather return to old style */
269 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
270 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
271 pl_yylval.ival = (f); \
277 #define COPLINE_INC_WITH_HERELINES \
279 CopLINE_inc(PL_curcop); \
280 if (PL_parser->herelines) \
281 CopLINE(PL_curcop) += PL_parser->herelines, \
282 PL_parser->herelines = 0; \
284 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
285 * is no sublex_push to follow. */
286 #define COPLINE_SET_FROM_MULTI_END \
288 CopLINE_set(PL_curcop, PL_multi_end); \
289 if (PL_multi_end != PL_multi_start) \
290 PL_parser->herelines = 0; \
296 /* how to interpret the pl_yylval associated with the token */
300 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
305 static struct debug_tokens {
307 enum token_type type;
309 } const debug_tokens[] =
311 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
312 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
313 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
314 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
315 { ARROW, TOKENTYPE_NONE, "ARROW" },
316 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
317 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
318 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
319 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
320 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
321 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
322 { DO, TOKENTYPE_NONE, "DO" },
323 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
324 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
325 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
326 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
327 { ELSE, TOKENTYPE_NONE, "ELSE" },
328 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
329 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
330 { FOR, TOKENTYPE_IVAL, "FOR" },
331 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
332 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
333 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
334 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
335 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
336 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
337 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
338 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
339 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
340 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
341 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
342 { IF, TOKENTYPE_IVAL, "IF" },
343 { LABEL, TOKENTYPE_PVAL, "LABEL" },
344 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
345 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
346 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
347 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
348 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
349 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
350 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
351 { MY, TOKENTYPE_IVAL, "MY" },
352 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
353 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
354 { OROP, TOKENTYPE_IVAL, "OROP" },
355 { OROR, TOKENTYPE_NONE, "OROR" },
356 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
357 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
358 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
359 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
360 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
361 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
362 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
363 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
364 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
365 { PREINC, TOKENTYPE_NONE, "PREINC" },
366 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
367 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
368 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
369 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
370 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
371 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
372 { SUB, TOKENTYPE_NONE, "SUB" },
373 { THING, TOKENTYPE_OPVAL, "THING" },
374 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
375 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
376 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
377 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
378 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
379 { USE, TOKENTYPE_IVAL, "USE" },
380 { WHEN, TOKENTYPE_IVAL, "WHEN" },
381 { WHILE, TOKENTYPE_IVAL, "WHILE" },
382 { WORD, TOKENTYPE_OPVAL, "WORD" },
383 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
384 { 0, TOKENTYPE_NONE, NULL }
387 /* dump the returned token in rv, plus any optional arg in pl_yylval */
390 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
392 PERL_ARGS_ASSERT_TOKEREPORT;
395 const char *name = NULL;
396 enum token_type type = TOKENTYPE_NONE;
397 const struct debug_tokens *p;
398 SV* const report = newSVpvs("<== ");
400 for (p = debug_tokens; p->token; p++) {
401 if (p->token == (int)rv) {
408 Perl_sv_catpv(aTHX_ report, name);
409 else if ((char)rv > ' ' && (char)rv <= '~')
411 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
413 sv_catpvs(report, " (pending identifier)");
416 sv_catpvs(report, "EOF");
418 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
423 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
425 case TOKENTYPE_OPNUM:
426 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
427 PL_op_name[lvalp->ival]);
430 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
432 case TOKENTYPE_OPVAL:
434 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
435 PL_op_name[lvalp->opval->op_type]);
436 if (lvalp->opval->op_type == OP_CONST) {
437 Perl_sv_catpvf(aTHX_ report, " %s",
438 SvPEEK(cSVOPx_sv(lvalp->opval)));
443 sv_catpvs(report, "(opval=null)");
446 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
452 /* print the buffer with suitable escapes */
455 S_printbuf(pTHX_ const char *const fmt, const char *const s)
457 SV* const tmp = newSVpvs("");
459 PERL_ARGS_ASSERT_PRINTBUF;
461 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
462 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
470 S_deprecate_commaless_var_list(pTHX) {
472 deprecate("comma-less variable list");
473 return REPORT(','); /* grandfather non-comma-format format */
479 * This subroutine looks for an '=' next to the operator that has just been
480 * parsed and turns it into an ASSIGNOP if it finds one.
484 S_ao(pTHX_ int toketype)
486 if (*PL_bufptr == '=') {
488 if (toketype == ANDAND)
489 pl_yylval.ival = OP_ANDASSIGN;
490 else if (toketype == OROR)
491 pl_yylval.ival = OP_ORASSIGN;
492 else if (toketype == DORDOR)
493 pl_yylval.ival = OP_DORASSIGN;
501 * When Perl expects an operator and finds something else, no_op
502 * prints the warning. It always prints "<something> found where
503 * operator expected. It prints "Missing semicolon on previous line?"
504 * if the surprise occurs at the start of the line. "do you need to
505 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
506 * where the compiler doesn't know if foo is a method call or a function.
507 * It prints "Missing operator before end of line" if there's nothing
508 * after the missing operator, or "... before <...>" if there is something
509 * after the missing operator.
513 S_no_op(pTHX_ const char *const what, char *s)
515 char * const oldbp = PL_bufptr;
516 const bool is_first = (PL_oldbufptr == PL_linestart);
518 PERL_ARGS_ASSERT_NO_OP;
524 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
525 if (ckWARN_d(WARN_SYNTAX)) {
527 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
528 "\t(Missing semicolon on previous line?)\n");
529 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
531 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
532 t += UTF ? UTF8SKIP(t) : 1)
534 if (t < PL_bufptr && isSPACE(*t))
535 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
536 "\t(Do you need to predeclare %"UTF8f"?)\n",
537 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
541 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
542 "\t(Missing operator before %"UTF8f"?)\n",
543 UTF8fARG(UTF, s - oldbp, oldbp));
551 * Complain about missing quote/regexp/heredoc terminator.
552 * If it's called with NULL then it cauterizes the line buffer.
553 * If we're in a delimited string and the delimiter is a control
554 * character, it's reformatted into a two-char sequence like ^C.
559 S_missingterm(pTHX_ char *s)
564 char * const nl = strrchr(s,'\n');
568 else if ((U8) PL_multi_close < 32) {
570 tmpbuf[1] = (char)toCTRL(PL_multi_close);
575 *tmpbuf = (char)PL_multi_close;
579 q = strchr(s,'"') ? '\'' : '"';
580 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
586 * Check whether the named feature is enabled.
589 Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
591 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
593 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
595 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
597 if (namelen > MAX_FEATURE_LEN)
599 memcpy(&he_name[8], name, namelen);
601 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
602 REFCOUNTED_HE_EXISTS));
606 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
607 * utf16-to-utf8-reversed.
610 #ifdef PERL_CR_FILTER
614 const char *s = SvPVX_const(sv);
615 const char * const e = s + SvCUR(sv);
617 PERL_ARGS_ASSERT_STRIP_RETURN;
619 /* outer loop optimized to do nothing if there are no CR-LFs */
621 if (*s++ == '\r' && *s == '\n') {
622 /* hit a CR-LF, need to copy the rest */
626 if (*s == '\r' && s[1] == '\n')
637 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
639 const I32 count = FILTER_READ(idx+1, sv, maxlen);
640 if (count > 0 && !maxlen)
647 =for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
649 Creates and initialises a new lexer/parser state object, supplying
650 a context in which to lex and parse from a new source of Perl code.
651 A pointer to the new state object is placed in L</PL_parser>. An entry
652 is made on the save stack so that upon unwinding the new state object
653 will be destroyed and the former value of L</PL_parser> will be restored.
654 Nothing else need be done to clean up the parsing context.
656 The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
657 non-null, provides a string (in SV form) containing code to be parsed.
658 A copy of the string is made, so subsequent modification of I<line>
659 does not affect parsing. I<rsfp>, if non-null, provides an input stream
660 from which code will be read to be parsed. If both are non-null, the
661 code in I<line> comes first and must consist of complete lines of input,
662 and I<rsfp> supplies the remainder of the source.
664 The I<flags> parameter is reserved for future use. Currently it is only
665 used by perl internally, so extensions should always pass zero.
670 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
671 can share filters with the current parser.
672 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
673 caller, hence isn't owned by the parser, so shouldn't be closed on parser
674 destruction. This is used to handle the case of defaulting to reading the
675 script from the standard input because no filename was given on the command
676 line (without getting confused by situation where STDIN has been closed, so
677 the script handle is opened on fd 0) */
680 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
682 const char *s = NULL;
683 yy_parser *parser, *oparser;
684 if (flags && flags & ~LEX_START_FLAGS)
685 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
687 /* create and initialise a parser */
689 Newxz(parser, 1, yy_parser);
690 parser->old_parser = oparser = PL_parser;
693 parser->stack = NULL;
695 parser->stack_size = 0;
697 /* on scope exit, free this parser and restore any outer one */
699 parser->saved_curcop = PL_curcop;
701 /* initialise lexer state */
703 parser->nexttoke = 0;
704 parser->error_count = oparser ? oparser->error_count : 0;
705 parser->copline = parser->preambling = NOLINE;
706 parser->lex_state = LEX_NORMAL;
707 parser->expect = XSTATE;
709 parser->rsfp_filters =
710 !(flags & LEX_START_SAME_FILTER) || !oparser
712 : MUTABLE_AV(SvREFCNT_inc(
713 oparser->rsfp_filters
714 ? oparser->rsfp_filters
715 : (oparser->rsfp_filters = newAV())
718 Newx(parser->lex_brackstack, 120, char);
719 Newx(parser->lex_casestack, 12, char);
720 *parser->lex_casestack = '\0';
721 Newxz(parser->lex_shared, 1, LEXSHARED);
725 s = SvPV_const(line, len);
726 parser->linestr = flags & LEX_START_COPIED
727 ? SvREFCNT_inc_simple_NN(line)
728 : newSVpvn_flags(s, len, SvUTF8(line));
729 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
731 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
733 parser->oldoldbufptr =
736 parser->linestart = SvPVX(parser->linestr);
737 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
738 parser->last_lop = parser->last_uni = NULL;
740 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
741 |LEX_DONT_CLOSE_RSFP));
742 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
743 |LEX_DONT_CLOSE_RSFP));
745 parser->in_pod = parser->filtered = 0;
749 /* delete a parser object */
752 Perl_parser_free(pTHX_ const yy_parser *parser)
754 PERL_ARGS_ASSERT_PARSER_FREE;
756 PL_curcop = parser->saved_curcop;
757 SvREFCNT_dec(parser->linestr);
759 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
760 PerlIO_clearerr(parser->rsfp);
761 else if (parser->rsfp && (!parser->old_parser ||
762 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
763 PerlIO_close(parser->rsfp);
764 SvREFCNT_dec(parser->rsfp_filters);
765 SvREFCNT_dec(parser->lex_stuff);
766 SvREFCNT_dec(parser->sublex_info.repl);
768 Safefree(parser->lex_brackstack);
769 Safefree(parser->lex_casestack);
770 Safefree(parser->lex_shared);
771 PL_parser = parser->old_parser;
776 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
778 I32 nexttoke = parser->nexttoke;
779 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
781 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
782 && parser->nextval[nexttoke].opval
783 && parser->nextval[nexttoke].opval->op_slabbed
784 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
785 op_free(parser->nextval[nexttoke].opval);
786 parser->nextval[nexttoke].opval = NULL;
793 =for apidoc AmxU|SV *|PL_parser-E<gt>linestr
795 Buffer scalar containing the chunk currently under consideration of the
796 text currently being lexed. This is always a plain string scalar (for
797 which C<SvPOK> is true). It is not intended to be used as a scalar by
798 normal scalar means; instead refer to the buffer directly by the pointer
799 variables described below.
801 The lexer maintains various C<char*> pointers to things in the
802 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
803 reallocated, all of these pointers must be updated. Don't attempt to
804 do this manually, but rather use L</lex_grow_linestr> if you need to
805 reallocate the buffer.
807 The content of the text chunk in the buffer is commonly exactly one
808 complete line of input, up to and including a newline terminator,
809 but there are situations where it is otherwise. The octets of the
810 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
811 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
812 flag on this scalar, which may disagree with it.
814 For direct examination of the buffer, the variable
815 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
816 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
817 of these pointers is usually preferable to examination of the scalar
818 through normal scalar means.
820 =for apidoc AmxU|char *|PL_parser-E<gt>bufend
822 Direct pointer to the end of the chunk of text currently being lexed, the
823 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
824 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
825 always located at the end of the buffer, and does not count as part of
826 the buffer's contents.
828 =for apidoc AmxU|char *|PL_parser-E<gt>bufptr
830 Points to the current position of lexing inside the lexer buffer.
831 Characters around this point may be freely examined, within
832 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
833 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
834 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
836 Lexing code (whether in the Perl core or not) moves this pointer past
837 the characters that it consumes. It is also expected to perform some
838 bookkeeping whenever a newline character is consumed. This movement
839 can be more conveniently performed by the function L</lex_read_to>,
840 which handles newlines appropriately.
842 Interpretation of the buffer's octets can be abstracted out by
843 using the slightly higher-level functions L</lex_peek_unichar> and
844 L</lex_read_unichar>.
846 =for apidoc AmxU|char *|PL_parser-E<gt>linestart
848 Points to the start of the current line inside the lexer buffer.
849 This is useful for indicating at which column an error occurred, and
850 not much else. This must be updated by any lexing code that consumes
851 a newline; the function L</lex_read_to> handles this detail.
857 =for apidoc Amx|bool|lex_bufutf8
859 Indicates whether the octets in the lexer buffer
860 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
861 of Unicode characters. If not, they should be interpreted as Latin-1
862 characters. This is analogous to the C<SvUTF8> flag for scalars.
864 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
865 contains valid UTF-8. Lexing code must be robust in the face of invalid
868 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
869 is significant, but not the whole story regarding the input character
870 encoding. Normally, when a file is being read, the scalar contains octets
871 and its C<SvUTF8> flag is off, but the octets should be interpreted as
872 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
873 however, the scalar may have the C<SvUTF8> flag on, and in this case its
874 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
875 is in effect. This logic may change in the future; use this function
876 instead of implementing the logic yourself.
882 Perl_lex_bufutf8(pTHX)
888 =for apidoc Amx|char *|lex_grow_linestr|STRLEN len
890 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
891 at least I<len> octets (including terminating C<NUL>). Returns a
892 pointer to the reallocated buffer. This is necessary before making
893 any direct modification of the buffer that would increase its length.
894 L</lex_stuff_pvn> provides a more convenient way to insert text into
897 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
898 this function updates all of the lexer's variables that point directly
905 Perl_lex_grow_linestr(pTHX_ STRLEN len)
909 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
910 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
911 linestr = PL_parser->linestr;
912 buf = SvPVX(linestr);
913 if (len <= SvLEN(linestr))
915 bufend_pos = PL_parser->bufend - buf;
916 bufptr_pos = PL_parser->bufptr - buf;
917 oldbufptr_pos = PL_parser->oldbufptr - buf;
918 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
919 linestart_pos = PL_parser->linestart - buf;
920 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
921 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
922 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
923 PL_parser->lex_shared->re_eval_start - buf : 0;
925 buf = sv_grow(linestr, len);
927 PL_parser->bufend = buf + bufend_pos;
928 PL_parser->bufptr = buf + bufptr_pos;
929 PL_parser->oldbufptr = buf + oldbufptr_pos;
930 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
931 PL_parser->linestart = buf + linestart_pos;
932 if (PL_parser->last_uni)
933 PL_parser->last_uni = buf + last_uni_pos;
934 if (PL_parser->last_lop)
935 PL_parser->last_lop = buf + last_lop_pos;
936 if (PL_parser->lex_shared->re_eval_start)
937 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
942 =for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
944 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
945 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
946 reallocating the buffer if necessary. This means that lexing code that
947 runs later will see the characters as if they had appeared in the input.
948 It is not recommended to do this as part of normal parsing, and most
949 uses of this facility run the risk of the inserted characters being
950 interpreted in an unintended manner.
952 The string to be inserted is represented by I<len> octets starting
953 at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
954 according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
955 The characters are recoded for the lexer buffer, according to how the
956 buffer is currently being interpreted (L</lex_bufutf8>). If a string
957 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
958 function is more convenient.
964 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
968 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
969 if (flags & ~(LEX_STUFF_UTF8))
970 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
972 if (flags & LEX_STUFF_UTF8) {
975 STRLEN highhalf = 0; /* Count of variants */
976 const char *p, *e = pv+len;
977 for (p = pv; p != e; p++) {
978 if (! UTF8_IS_INVARIANT(*p)) {
984 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
985 bufptr = PL_parser->bufptr;
986 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
987 SvCUR_set(PL_parser->linestr,
988 SvCUR(PL_parser->linestr) + len+highhalf);
989 PL_parser->bufend += len+highhalf;
990 for (p = pv; p != e; p++) {
992 if (! UTF8_IS_INVARIANT(c)) {
993 *bufptr++ = UTF8_TWO_BYTE_HI(c);
994 *bufptr++ = UTF8_TWO_BYTE_LO(c);
1001 if (flags & LEX_STUFF_UTF8) {
1002 STRLEN highhalf = 0;
1003 const char *p, *e = pv+len;
1004 for (p = pv; p != e; p++) {
1006 if (UTF8_IS_ABOVE_LATIN1(c)) {
1007 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1008 "non-Latin-1 character into Latin-1 input");
1009 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1012 } else if (! UTF8_IS_INVARIANT(c)) {
1013 /* malformed UTF-8 */
1015 SAVESPTR(PL_warnhook);
1016 PL_warnhook = PERL_WARNHOOK_FATAL;
1017 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
1023 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1024 bufptr = PL_parser->bufptr;
1025 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1026 SvCUR_set(PL_parser->linestr,
1027 SvCUR(PL_parser->linestr) + len-highhalf);
1028 PL_parser->bufend += len-highhalf;
1031 if (UTF8_IS_INVARIANT(*p)) {
1037 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
1043 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1044 bufptr = PL_parser->bufptr;
1045 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1046 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1047 PL_parser->bufend += len;
1048 Copy(pv, bufptr, len, char);
1054 =for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1056 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1057 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1058 reallocating the buffer if necessary. This means that lexing code that
1059 runs later will see the characters as if they had appeared in the input.
1060 It is not recommended to do this as part of normal parsing, and most
1061 uses of this facility run the risk of the inserted characters being
1062 interpreted in an unintended manner.
1064 The string to be inserted is represented by octets starting at I<pv>
1065 and continuing to the first nul. These octets are interpreted as either
1066 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1067 in I<flags>. The characters are recoded for the lexer buffer, according
1068 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1069 If it is not convenient to nul-terminate a string to be inserted, the
1070 L</lex_stuff_pvn> function is more appropriate.
1076 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1078 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1079 lex_stuff_pvn(pv, strlen(pv), flags);
1083 =for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1085 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1086 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1087 reallocating the buffer if necessary. This means that lexing code that
1088 runs later will see the characters as if they had appeared in the input.
1089 It is not recommended to do this as part of normal parsing, and most
1090 uses of this facility run the risk of the inserted characters being
1091 interpreted in an unintended manner.
1093 The string to be inserted is the string value of I<sv>. The characters
1094 are recoded for the lexer buffer, according to how the buffer is currently
1095 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1096 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1097 need to construct a scalar.
1103 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1107 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1109 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1111 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1115 =for apidoc Amx|void|lex_unstuff|char *ptr
1117 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1118 I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1119 This hides the discarded text from any lexing code that runs later,
1120 as if the text had never appeared.
1122 This is not the normal way to consume lexed text. For that, use
1129 Perl_lex_unstuff(pTHX_ char *ptr)
1133 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1134 buf = PL_parser->bufptr;
1136 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1139 bufend = PL_parser->bufend;
1141 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1142 unstuff_len = ptr - buf;
1143 Move(ptr, buf, bufend+1-ptr, char);
1144 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1145 PL_parser->bufend = bufend - unstuff_len;
1149 =for apidoc Amx|void|lex_read_to|char *ptr
1151 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1152 to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1153 performing the correct bookkeeping whenever a newline character is passed.
1154 This is the normal way to consume lexed text.
1156 Interpretation of the buffer's octets can be abstracted out by
1157 using the slightly higher-level functions L</lex_peek_unichar> and
1158 L</lex_read_unichar>.
1164 Perl_lex_read_to(pTHX_ char *ptr)
1167 PERL_ARGS_ASSERT_LEX_READ_TO;
1168 s = PL_parser->bufptr;
1169 if (ptr < s || ptr > PL_parser->bufend)
1170 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1171 for (; s != ptr; s++)
1173 COPLINE_INC_WITH_HERELINES;
1174 PL_parser->linestart = s+1;
1176 PL_parser->bufptr = ptr;
1180 =for apidoc Amx|void|lex_discard_to|char *ptr
1182 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1183 up to I<ptr>. The remaining content of the buffer will be moved, and
1184 all pointers into the buffer updated appropriately. I<ptr> must not
1185 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1186 it is not permitted to discard text that has yet to be lexed.
1188 Normally it is not necessarily to do this directly, because it suffices to
1189 use the implicit discarding behaviour of L</lex_next_chunk> and things
1190 based on it. However, if a token stretches across multiple lines,
1191 and the lexing code has kept multiple lines of text in the buffer for
1192 that purpose, then after completion of the token it would be wise to
1193 explicitly discard the now-unneeded earlier lines, to avoid future
1194 multi-line tokens growing the buffer without bound.
1200 Perl_lex_discard_to(pTHX_ char *ptr)
1204 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1205 buf = SvPVX(PL_parser->linestr);
1207 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1210 if (ptr > PL_parser->bufptr)
1211 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1212 discard_len = ptr - buf;
1213 if (PL_parser->oldbufptr < ptr)
1214 PL_parser->oldbufptr = ptr;
1215 if (PL_parser->oldoldbufptr < ptr)
1216 PL_parser->oldoldbufptr = ptr;
1217 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1218 PL_parser->last_uni = NULL;
1219 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1220 PL_parser->last_lop = NULL;
1221 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1222 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1223 PL_parser->bufend -= discard_len;
1224 PL_parser->bufptr -= discard_len;
1225 PL_parser->oldbufptr -= discard_len;
1226 PL_parser->oldoldbufptr -= discard_len;
1227 if (PL_parser->last_uni)
1228 PL_parser->last_uni -= discard_len;
1229 if (PL_parser->last_lop)
1230 PL_parser->last_lop -= discard_len;
1234 =for apidoc Amx|bool|lex_next_chunk|U32 flags
1236 Reads in the next chunk of text to be lexed, appending it to
1237 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1238 looked to the end of the current chunk and wants to know more. It is
1239 usual, but not necessary, for lexing to have consumed the entirety of
1240 the current chunk at this time.
1242 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1243 chunk (i.e., the current chunk has been entirely consumed), normally the
1244 current chunk will be discarded at the same time that the new chunk is
1245 read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1246 will not be discarded. If the current chunk has not been entirely
1247 consumed, then it will not be discarded regardless of the flag.
1249 Returns true if some new text was added to the buffer, or false if the
1250 buffer has reached the end of the input text.
1255 #define LEX_FAKE_EOF 0x80000000
1256 #define LEX_NO_TERM 0x40000000
1259 Perl_lex_next_chunk(pTHX_ U32 flags)
1263 STRLEN old_bufend_pos, new_bufend_pos;
1264 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1265 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1266 bool got_some_for_debugger = 0;
1268 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1269 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1270 linestr = PL_parser->linestr;
1271 buf = SvPVX(linestr);
1272 if (!(flags & LEX_KEEP_PREVIOUS) &&
1273 PL_parser->bufptr == PL_parser->bufend) {
1274 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1276 if (PL_parser->last_uni != PL_parser->bufend)
1277 PL_parser->last_uni = NULL;
1278 if (PL_parser->last_lop != PL_parser->bufend)
1279 PL_parser->last_lop = NULL;
1280 last_uni_pos = last_lop_pos = 0;
1284 old_bufend_pos = PL_parser->bufend - buf;
1285 bufptr_pos = PL_parser->bufptr - buf;
1286 oldbufptr_pos = PL_parser->oldbufptr - buf;
1287 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1288 linestart_pos = PL_parser->linestart - buf;
1289 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1290 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1292 if (flags & LEX_FAKE_EOF) {
1294 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1296 } else if (filter_gets(linestr, old_bufend_pos)) {
1298 got_some_for_debugger = 1;
1299 } else if (flags & LEX_NO_TERM) {
1302 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1303 sv_setpvs(linestr, "");
1305 /* End of real input. Close filehandle (unless it was STDIN),
1306 * then add implicit termination.
1308 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1309 PerlIO_clearerr(PL_parser->rsfp);
1310 else if (PL_parser->rsfp)
1311 (void)PerlIO_close(PL_parser->rsfp);
1312 PL_parser->rsfp = NULL;
1313 PL_parser->in_pod = PL_parser->filtered = 0;
1314 if (!PL_in_eval && PL_minus_p) {
1316 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1317 PL_minus_n = PL_minus_p = 0;
1318 } else if (!PL_in_eval && PL_minus_n) {
1319 sv_catpvs(linestr, /*{*/";}");
1322 sv_catpvs(linestr, ";");
1325 buf = SvPVX(linestr);
1326 new_bufend_pos = SvCUR(linestr);
1327 PL_parser->bufend = buf + new_bufend_pos;
1328 PL_parser->bufptr = buf + bufptr_pos;
1329 PL_parser->oldbufptr = buf + oldbufptr_pos;
1330 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1331 PL_parser->linestart = buf + linestart_pos;
1332 if (PL_parser->last_uni)
1333 PL_parser->last_uni = buf + last_uni_pos;
1334 if (PL_parser->last_lop)
1335 PL_parser->last_lop = buf + last_lop_pos;
1336 if (PL_parser->preambling != NOLINE) {
1337 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1338 PL_parser->preambling = NOLINE;
1340 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
1341 PL_curstash != PL_debstash) {
1342 /* debugger active and we're not compiling the debugger code,
1343 * so store the line into the debugger's array of lines
1345 update_debugger_info(NULL, buf+old_bufend_pos,
1346 new_bufend_pos-old_bufend_pos);
1352 =for apidoc Amx|I32|lex_peek_unichar|U32 flags
1354 Looks ahead one (Unicode) character in the text currently being lexed.
1355 Returns the codepoint (unsigned integer value) of the next character,
1356 or -1 if lexing has reached the end of the input text. To consume the
1357 peeked character, use L</lex_read_unichar>.
1359 If the next character is in (or extends into) the next chunk of input
1360 text, the next chunk will be read in. Normally the current chunk will be
1361 discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1362 then the current chunk will not be discarded.
1364 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1365 is encountered, an exception is generated.
1371 Perl_lex_peek_unichar(pTHX_ U32 flags)
1375 if (flags & ~(LEX_KEEP_PREVIOUS))
1376 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1377 s = PL_parser->bufptr;
1378 bufend = PL_parser->bufend;
1384 if (!lex_next_chunk(flags))
1386 s = PL_parser->bufptr;
1387 bufend = PL_parser->bufend;
1390 if (UTF8_IS_INVARIANT(head))
1392 if (UTF8_IS_START(head)) {
1393 len = UTF8SKIP(&head);
1394 while ((STRLEN)(bufend-s) < len) {
1395 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1397 s = PL_parser->bufptr;
1398 bufend = PL_parser->bufend;
1401 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1402 if (retlen == (STRLEN)-1) {
1403 /* malformed UTF-8 */
1405 SAVESPTR(PL_warnhook);
1406 PL_warnhook = PERL_WARNHOOK_FATAL;
1407 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
1413 if (!lex_next_chunk(flags))
1415 s = PL_parser->bufptr;
1422 =for apidoc Amx|I32|lex_read_unichar|U32 flags
1424 Reads the next (Unicode) character in the text currently being lexed.
1425 Returns the codepoint (unsigned integer value) of the character read,
1426 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1427 if lexing has reached the end of the input text. To non-destructively
1428 examine the next character, use L</lex_peek_unichar> instead.
1430 If the next character is in (or extends into) the next chunk of input
1431 text, the next chunk will be read in. Normally the current chunk will be
1432 discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1433 then the current chunk will not be discarded.
1435 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1436 is encountered, an exception is generated.
1442 Perl_lex_read_unichar(pTHX_ U32 flags)
1445 if (flags & ~(LEX_KEEP_PREVIOUS))
1446 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1447 c = lex_peek_unichar(flags);
1450 COPLINE_INC_WITH_HERELINES;
1452 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1454 ++(PL_parser->bufptr);
1460 =for apidoc Amx|void|lex_read_space|U32 flags
1462 Reads optional spaces, in Perl style, in the text currently being
1463 lexed. The spaces may include ordinary whitespace characters and
1464 Perl-style comments. C<#line> directives are processed if encountered.
1465 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1466 at a non-space character (or the end of the input text).
1468 If spaces extend into the next chunk of input text, the next chunk will
1469 be read in. Normally the current chunk will be discarded at the same
1470 time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1471 chunk will not be discarded.
1476 #define LEX_NO_INCLINE 0x40000000
1477 #define LEX_NO_NEXT_CHUNK 0x80000000
1480 Perl_lex_read_space(pTHX_ U32 flags)
1483 const bool can_incline = !(flags & LEX_NO_INCLINE);
1484 bool need_incline = 0;
1485 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1486 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1487 s = PL_parser->bufptr;
1488 bufend = PL_parser->bufend;
1494 } while (!(c == '\n' || (c == 0 && s == bufend)));
1495 } else if (c == '\n') {
1498 PL_parser->linestart = s;
1504 } else if (isSPACE(c)) {
1506 } else if (c == 0 && s == bufend) {
1509 if (flags & LEX_NO_NEXT_CHUNK)
1511 PL_parser->bufptr = s;
1512 l = CopLINE(PL_curcop);
1513 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1514 got_more = lex_next_chunk(flags);
1515 CopLINE_set(PL_curcop, l);
1516 s = PL_parser->bufptr;
1517 bufend = PL_parser->bufend;
1520 if (can_incline && need_incline && PL_parser->rsfp) {
1528 PL_parser->bufptr = s;
1533 =for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1535 This function performs syntax checking on a prototype, C<proto>.
1536 If C<warn> is true, any illegal characters or mismatched brackets
1537 will trigger illegalproto warnings, declaring that they were
1538 detected in the prototype for C<name>.
1540 The return value is C<true> if this is a valid prototype, and
1541 C<false> if it is not, regardless of whether C<warn> was C<true> or
1544 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1551 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1553 STRLEN len, origlen;
1554 char *p = proto ? SvPV(proto, len) : NULL;
1555 bool bad_proto = FALSE;
1556 bool in_brackets = FALSE;
1557 bool after_slash = FALSE;
1558 char greedy_proto = ' ';
1559 bool proto_after_greedy_proto = FALSE;
1560 bool must_be_last = FALSE;
1561 bool underscore = FALSE;
1562 bool bad_proto_after_underscore = FALSE;
1564 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1570 for (; len--; p++) {
1573 proto_after_greedy_proto = TRUE;
1575 if (!strchr(";@%", *p))
1576 bad_proto_after_underscore = TRUE;
1579 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1586 in_brackets = FALSE;
1587 else if ((*p == '@' || *p == '%') &&
1590 must_be_last = TRUE;
1599 after_slash = FALSE;
1604 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1607 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1608 origlen, UNI_DISPLAY_ISPRINT)
1609 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1611 if (proto_after_greedy_proto)
1612 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1613 "Prototype after '%c' for %"SVf" : %s",
1614 greedy_proto, SVfARG(name), p);
1616 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1617 "Missing ']' in prototype for %"SVf" : %s",
1620 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1621 "Illegal character in prototype for %"SVf" : %s",
1623 if (bad_proto_after_underscore)
1624 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1625 "Illegal character after '_' in prototype for %"SVf" : %s",
1629 return (! (proto_after_greedy_proto || bad_proto) );
1634 * This subroutine has nothing to do with tilting, whether at windmills
1635 * or pinball tables. Its name is short for "increment line". It
1636 * increments the current line number in CopLINE(PL_curcop) and checks
1637 * to see whether the line starts with a comment of the form
1638 * # line 500 "foo.pm"
1639 * If so, it sets the current line number and file to the values in the comment.
1643 S_incline(pTHX_ const char *s)
1650 PERL_ARGS_ASSERT_INCLINE;
1652 COPLINE_INC_WITH_HERELINES;
1653 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1654 && s+1 == PL_bufend && *s == ';') {
1655 /* fake newline in string eval */
1656 CopLINE_dec(PL_curcop);
1661 while (SPACE_OR_TAB(*s))
1663 if (strnEQ(s, "line", 4))
1667 if (SPACE_OR_TAB(*s))
1671 while (SPACE_OR_TAB(*s))
1679 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1681 while (SPACE_OR_TAB(*s))
1683 if (*s == '"' && (t = strchr(s+1, '"'))) {
1689 while (!isSPACE(*t))
1693 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1695 if (*e != '\n' && *e != '\0')
1696 return; /* false alarm */
1698 line_num = grok_atou(n, &e) - 1;
1701 const STRLEN len = t - s;
1703 if (!PL_rsfp && !PL_parser->filtered) {
1704 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1705 * to *{"::_<newfilename"} */
1706 /* However, the long form of evals is only turned on by the
1707 debugger - usually they're "(eval %lu)" */
1708 GV * const cfgv = CopFILEGV(PL_curcop);
1711 STRLEN tmplen2 = len;
1715 if (tmplen2 + 2 <= sizeof smallbuf)
1718 Newx(tmpbuf2, tmplen2 + 2, char);
1723 memcpy(tmpbuf2 + 2, s, tmplen2);
1726 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1728 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1729 /* adjust ${"::_<newfilename"} to store the new file name */
1730 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1731 /* The line number may differ. If that is the case,
1732 alias the saved lines that are in the array.
1733 Otherwise alias the whole array. */
1734 if (CopLINE(PL_curcop) == line_num) {
1735 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1736 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1738 else if (GvAV(cfgv)) {
1739 AV * const av = GvAV(cfgv);
1740 const I32 start = CopLINE(PL_curcop)+1;
1741 I32 items = AvFILLp(av) - start;
1743 AV * const av2 = GvAVn(gv2);
1744 SV **svp = AvARRAY(av) + start;
1745 I32 l = (I32)line_num+1;
1747 av_store(av2, l++, SvREFCNT_inc(*svp++));
1752 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1755 CopFILE_free(PL_curcop);
1756 CopFILE_setn(PL_curcop, s, len);
1758 CopLINE_set(PL_curcop, line_num);
1761 #define skipspace(s) skipspace_flags(s, 0)
1765 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1767 AV *av = CopFILEAVx(PL_curcop);
1770 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1772 sv = *av_fetch(av, 0, 1);
1773 SvUPGRADE(sv, SVt_PVMG);
1775 if (!SvPOK(sv)) sv_setpvs(sv,"");
1777 sv_catsv(sv, orig_sv);
1779 sv_catpvn(sv, buf, len);
1784 if (PL_parser->preambling == NOLINE)
1785 av_store(av, CopLINE(PL_curcop), sv);
1791 * Called to gobble the appropriate amount and type of whitespace.
1792 * Skips comments as well.
1796 S_skipspace_flags(pTHX_ char *s, U32 flags)
1798 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1799 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1800 while (s < PL_bufend && SPACE_OR_TAB(*s))
1803 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1805 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1806 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1807 LEX_NO_NEXT_CHUNK : 0));
1809 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1810 if (PL_linestart > PL_bufptr)
1811 PL_bufptr = PL_linestart;
1819 * Check the unary operators to ensure there's no ambiguity in how they're
1820 * used. An ambiguous piece of code would be:
1822 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1823 * the +5 is its argument.
1832 if (PL_oldoldbufptr != PL_last_uni)
1834 while (isSPACE(*PL_last_uni))
1837 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
1839 if ((t = strchr(s, '(')) && t < PL_bufptr)
1842 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1843 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1844 (int)(s - PL_last_uni), PL_last_uni);
1848 * LOP : macro to build a list operator. Its behaviour has been replaced
1849 * with a subroutine, S_lop() for which LOP is just another name.
1852 #define LOP(f,x) return lop(f,x,s)
1856 * Build a list operator (or something that might be one). The rules:
1857 * - if we have a next token, then it's a list operator (no parens) for
1858 * which the next token has already been parsed; e.g.,
1861 * - if the next thing is an opening paren, then it's a function
1862 * - else it's a list operator
1866 S_lop(pTHX_ I32 f, int x, char *s)
1868 PERL_ARGS_ASSERT_LOP;
1873 PL_last_lop = PL_oldbufptr;
1874 PL_last_lop_op = (OPCODE)f;
1879 return REPORT(FUNC);
1882 return REPORT(FUNC);
1885 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1886 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
1887 return REPORT(LSTOP);
1893 * When the lexer realizes it knows the next token (for instance,
1894 * it is reordering tokens for the parser) then it can call S_force_next
1895 * to know what token to return the next time the lexer is called. Caller
1896 * will need to set PL_nextval[] and possibly PL_expect to ensure
1897 * the lexer handles the token correctly.
1901 S_force_next(pTHX_ I32 type)
1905 PerlIO_printf(Perl_debug_log, "### forced token:\n");
1906 tokereport(type, &NEXTVAL_NEXTTOKE);
1909 PL_nexttype[PL_nexttoke] = type;
1911 if (PL_lex_state != LEX_KNOWNEXT) {
1912 PL_lex_defer = PL_lex_state;
1913 PL_lex_expect = PL_expect;
1914 PL_lex_state = LEX_KNOWNEXT;
1921 * This subroutine handles postfix deref syntax after the arrow has already
1922 * been emitted. @* $* etc. are emitted as two separate token right here.
1923 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1924 * only the first, leaving yylex to find the next.
1928 S_postderef(pTHX_ int const funny, char const next)
1930 assert(funny == DOLSHARP || strchr("$@%&*", funny));
1931 assert(strchr("*[{", next));
1933 PL_expect = XOPERATOR;
1934 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
1935 assert('@' == funny || '$' == funny || DOLSHARP == funny);
1936 PL_lex_state = LEX_INTERPEND;
1937 force_next(POSTJOIN);
1943 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1944 && !PL_lex_brackets)
1946 PL_expect = XOPERATOR;
1955 int yyc = PL_parser->yychar;
1956 if (yyc != YYEMPTY) {
1958 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1959 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
1960 PL_lex_allbrackets--;
1962 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1963 } else if (yyc == '('/*)*/) {
1964 PL_lex_allbrackets--;
1969 PL_parser->yychar = YYEMPTY;
1974 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
1976 SV * const sv = newSVpvn_utf8(start, len,
1979 && !is_ascii_string((const U8*)start, len)
1980 && is_utf8_string((const U8*)start, len));
1986 * When the lexer knows the next thing is a word (for instance, it has
1987 * just seen -> and it knows that the next char is a word char, then
1988 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1992 * char *start : buffer position (must be within PL_linestr)
1993 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
1994 * int check_keyword : if true, Perl checks to make sure the word isn't
1995 * a keyword (do this if the word is a label, e.g. goto FOO)
1996 * int allow_pack : if true, : characters will also be allowed (require,
1997 * use, etc. do this)
1998 * int allow_initial_tick : used by the "sub" lexer only.
2002 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2007 PERL_ARGS_ASSERT_FORCE_WORD;
2009 start = SKIPSPACE1(start);
2011 if (isIDFIRST_lazy_if(s,UTF) ||
2012 (allow_pack && *s == ':') )
2014 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2015 if (check_keyword) {
2016 char *s2 = PL_tokenbuf;
2017 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2019 if (keyword(s2, len, 0))
2022 if (token == METHOD) {
2027 PL_expect = XOPERATOR;
2030 NEXTVAL_NEXTTOKE.opval
2031 = (OP*)newSVOP(OP_CONST,0,
2032 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2033 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2041 * Called when the lexer wants $foo *foo &foo etc, but the program
2042 * text only contains the "foo" portion. The first argument is a pointer
2043 * to the "foo", and the second argument is the type symbol to prefix.
2044 * Forces the next token to be a "WORD".
2045 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2049 S_force_ident(pTHX_ const char *s, int kind)
2051 PERL_ARGS_ASSERT_FORCE_IDENT;
2054 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2055 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2056 UTF ? SVf_UTF8 : 0));
2057 NEXTVAL_NEXTTOKE.opval = o;
2060 o->op_private = OPpCONST_ENTERED;
2061 /* XXX see note in pp_entereval() for why we forgo typo
2062 warnings if the symbol must be introduced in an eval.
2064 gv_fetchpvn_flags(s, len,
2065 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2066 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2067 kind == '$' ? SVt_PV :
2068 kind == '@' ? SVt_PVAV :
2069 kind == '%' ? SVt_PVHV :
2077 S_force_ident_maybe_lex(pTHX_ char pit)
2079 NEXTVAL_NEXTTOKE.ival = pit;
2084 Perl_str_to_version(pTHX_ SV *sv)
2089 const char *start = SvPV_const(sv,len);
2090 const char * const end = start + len;
2091 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
2093 PERL_ARGS_ASSERT_STR_TO_VERSION;
2095 while (start < end) {
2099 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2104 retval += ((NV)n)/nshift;
2113 * Forces the next token to be a version number.
2114 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2115 * and if "guessing" is TRUE, then no new token is created (and the caller
2116 * must use an alternative parsing method).
2120 S_force_version(pTHX_ char *s, int guessing)
2125 PERL_ARGS_ASSERT_FORCE_VERSION;
2133 while (isDIGIT(*d) || *d == '_' || *d == '.')
2135 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2137 s = scan_num(s, &pl_yylval);
2138 version = pl_yylval.opval;
2139 ver = cSVOPx(version)->op_sv;
2140 if (SvPOK(ver) && !SvNIOK(ver)) {
2141 SvUPGRADE(ver, SVt_PVNV);
2142 SvNV_set(ver, str_to_version(ver));
2143 SvNOK_on(ver); /* hint that it is a version */
2146 else if (guessing) {
2151 /* NOTE: The parser sees the package name and the VERSION swapped */
2152 NEXTVAL_NEXTTOKE.opval = version;
2159 * S_force_strict_version
2160 * Forces the next token to be a version number using strict syntax rules.
2164 S_force_strict_version(pTHX_ char *s)
2167 const char *errstr = NULL;
2169 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2171 while (isSPACE(*s)) /* leading whitespace */
2174 if (is_STRICT_VERSION(s,&errstr)) {
2176 s = (char *)scan_version(s, ver, 0);
2177 version = newSVOP(OP_CONST, 0, ver);
2179 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2180 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2184 yyerror(errstr); /* version required */
2188 /* NOTE: The parser sees the package name and the VERSION swapped */
2189 NEXTVAL_NEXTTOKE.opval = version;
2197 * Tokenize a quoted string passed in as an SV. It finds the next
2198 * chunk, up to end of string or a backslash. It may make a new
2199 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2204 S_tokeq(pTHX_ SV *sv)
2211 PERL_ARGS_ASSERT_TOKEQ;
2215 assert (!SvIsCOW(sv));
2216 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2220 /* This is relying on the SV being "well formed" with a trailing '\0' */
2221 while (s < send && !(*s == '\\' && s[1] == '\\'))
2226 if ( PL_hints & HINT_NEW_STRING ) {
2227 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2228 SVs_TEMP | SvUTF8(sv));
2232 if (s + 1 < send && (s[1] == '\\'))
2233 s++; /* all that, just for this */
2238 SvCUR_set(sv, d - SvPVX_const(sv));
2240 if ( PL_hints & HINT_NEW_STRING )
2241 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
2246 * Now come three functions related to double-quote context,
2247 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2248 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2249 * interact with PL_lex_state, and create fake ( ... ) argument lists
2250 * to handle functions and concatenation.
2254 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2259 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2261 * Pattern matching will set PL_lex_op to the pattern-matching op to
2262 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2264 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2266 * Everything else becomes a FUNC.
2268 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2269 * had an OP_CONST or OP_READLINE). This just sets us up for a
2270 * call to S_sublex_push().
2274 S_sublex_start(pTHX)
2276 const I32 op_type = pl_yylval.ival;
2278 if (op_type == OP_NULL) {
2279 pl_yylval.opval = PL_lex_op;
2283 if (op_type == OP_CONST) {
2284 SV *sv = tokeq(PL_lex_stuff);
2286 if (SvTYPE(sv) == SVt_PVIV) {
2287 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2289 const char * const p = SvPV_const(sv, len);
2290 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2294 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
2295 PL_lex_stuff = NULL;
2299 PL_sublex_info.super_state = PL_lex_state;
2300 PL_sublex_info.sub_inwhat = (U16)op_type;
2301 PL_sublex_info.sub_op = PL_lex_op;
2302 PL_lex_state = LEX_INTERPPUSH;
2306 pl_yylval.opval = PL_lex_op;
2316 * Create a new scope to save the lexing state. The scope will be
2317 * ended in S_sublex_done. Returns a '(', starting the function arguments
2318 * to the uc, lc, etc. found before.
2319 * Sets PL_lex_state to LEX_INTERPCONCAT.
2326 const bool is_heredoc = PL_multi_close == '<';
2329 PL_lex_state = PL_sublex_info.super_state;
2330 SAVEI8(PL_lex_dojoin);
2331 SAVEI32(PL_lex_brackets);
2332 SAVEI32(PL_lex_allbrackets);
2333 SAVEI32(PL_lex_formbrack);
2334 SAVEI8(PL_lex_fakeeof);
2335 SAVEI32(PL_lex_casemods);
2336 SAVEI32(PL_lex_starts);
2337 SAVEI8(PL_lex_state);
2338 SAVESPTR(PL_lex_repl);
2339 SAVEVPTR(PL_lex_inpat);
2340 SAVEI16(PL_lex_inwhat);
2343 SAVECOPLINE(PL_curcop);
2344 SAVEI32(PL_multi_end);
2345 SAVEI32(PL_parser->herelines);
2346 PL_parser->herelines = 0;
2348 SAVEI8(PL_multi_close);
2349 SAVEPPTR(PL_bufptr);
2350 SAVEPPTR(PL_bufend);
2351 SAVEPPTR(PL_oldbufptr);
2352 SAVEPPTR(PL_oldoldbufptr);
2353 SAVEPPTR(PL_last_lop);
2354 SAVEPPTR(PL_last_uni);
2355 SAVEPPTR(PL_linestart);
2356 SAVESPTR(PL_linestr);
2357 SAVEGENERICPV(PL_lex_brackstack);
2358 SAVEGENERICPV(PL_lex_casestack);
2359 SAVEGENERICPV(PL_parser->lex_shared);
2360 SAVEBOOL(PL_parser->lex_re_reparsing);
2361 SAVEI32(PL_copline);
2363 /* The here-doc parser needs to be able to peek into outer lexing
2364 scopes to find the body of the here-doc. So we put PL_linestr and
2365 PL_bufptr into lex_shared, to ‘share’ those values.
2367 PL_parser->lex_shared->ls_linestr = PL_linestr;
2368 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2370 PL_linestr = PL_lex_stuff;
2371 PL_lex_repl = PL_sublex_info.repl;
2372 PL_lex_stuff = NULL;
2373 PL_sublex_info.repl = NULL;
2375 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2376 = SvPVX(PL_linestr);
2377 PL_bufend += SvCUR(PL_linestr);
2378 PL_last_lop = PL_last_uni = NULL;
2379 SAVEFREESV(PL_linestr);
2380 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2382 PL_lex_dojoin = FALSE;
2383 PL_lex_brackets = PL_lex_formbrack = 0;
2384 PL_lex_allbrackets = 0;
2385 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2386 Newx(PL_lex_brackstack, 120, char);
2387 Newx(PL_lex_casestack, 12, char);
2388 PL_lex_casemods = 0;
2389 *PL_lex_casestack = '\0';
2391 PL_lex_state = LEX_INTERPCONCAT;
2393 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2394 PL_copline = NOLINE;
2396 Newxz(shared, 1, LEXSHARED);
2397 shared->ls_prev = PL_parser->lex_shared;
2398 PL_parser->lex_shared = shared;
2400 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
2401 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2402 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2403 PL_lex_inpat = PL_sublex_info.sub_op;
2405 PL_lex_inpat = NULL;
2407 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2408 PL_in_eval &= ~EVAL_RE_REPARSING;
2415 * Restores lexer state after a S_sublex_push.
2421 if (!PL_lex_starts++) {
2422 SV * const sv = newSVpvs("");
2423 if (SvUTF8(PL_linestr))
2425 PL_expect = XOPERATOR;
2426 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
2430 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2431 PL_lex_state = LEX_INTERPCASEMOD;
2435 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2436 assert(PL_lex_inwhat != OP_TRANSR);
2438 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2439 PL_linestr = PL_lex_repl;
2441 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2442 PL_bufend += SvCUR(PL_linestr);
2443 PL_last_lop = PL_last_uni = NULL;
2444 PL_lex_dojoin = FALSE;
2445 PL_lex_brackets = 0;
2446 PL_lex_allbrackets = 0;
2447 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2448 PL_lex_casemods = 0;
2449 *PL_lex_casestack = '\0';
2451 if (SvEVALED(PL_lex_repl)) {
2452 PL_lex_state = LEX_INTERPNORMAL;
2454 /* we don't clear PL_lex_repl here, so that we can check later
2455 whether this is an evalled subst; that means we rely on the
2456 logic to ensure sublex_done() is called again only via the
2457 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2460 PL_lex_state = LEX_INTERPCONCAT;
2463 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2464 CopLINE(PL_curcop) +=
2465 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
2466 + PL_parser->herelines;
2467 PL_parser->herelines = 0;
2472 const line_t l = CopLINE(PL_curcop);
2474 if (PL_multi_close == '<')
2475 PL_parser->herelines += l - PL_multi_end;
2476 PL_bufend = SvPVX(PL_linestr);
2477 PL_bufend += SvCUR(PL_linestr);
2478 PL_expect = XOPERATOR;
2479 PL_sublex_info.sub_inwhat = 0;
2484 PERL_STATIC_INLINE SV*
2485 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2487 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2488 * interior, hence to the "}". Finds what the name resolves to, returning
2489 * an SV* containing it; NULL if no valid one found */
2491 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
2498 const U8* first_bad_char_loc;
2499 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2501 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2503 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2505 &first_bad_char_loc))
2507 /* If warnings are on, this will print a more detailed analysis of what
2508 * is wrong than the error message below */
2509 utf8n_to_uvchr(first_bad_char_loc,
2510 e - ((char *) first_bad_char_loc),
2513 /* We deliberately don't try to print the malformed character, which
2514 * might not print very well; it also may be just the first of many
2515 * malformations, so don't print what comes after it */
2516 yyerror(Perl_form(aTHX_
2517 "Malformed UTF-8 character immediately after '%.*s'",
2518 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2522 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2523 /* include the <}> */
2524 e - backslash_ptr + 1);
2526 SvREFCNT_dec_NN(res);
2530 /* See if the charnames handler is the Perl core's, and if so, we can skip
2531 * the validation needed for a user-supplied one, as Perl's does its own
2533 table = GvHV(PL_hintgv); /* ^H */
2534 cvp = hv_fetchs(table, "charnames", FALSE);
2535 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2536 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2538 const char * const name = HvNAME(stash);
2539 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2540 && strEQ(name, "_charnames")) {
2545 /* Here, it isn't Perl's charname handler. We can't rely on a
2546 * user-supplied handler to validate the input name. For non-ut8 input,
2547 * look to see that the first character is legal. Then loop through the
2548 * rest checking that each is a continuation */
2550 /* This code makes the reasonable assumption that the only Latin1-range
2551 * characters that begin a character name alias are alphabetic, otherwise
2552 * would have to create a isCHARNAME_BEGIN macro */
2555 if (! isALPHAU(*s)) {
2560 if (! isCHARNAME_CONT(*s)) {
2563 if (*s == ' ' && *(s-1) == ' ') {
2566 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2567 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2568 "NO-BREAK SPACE in a charnames "
2569 "alias definition is deprecated");
2575 /* Similarly for utf8. For invariants can check directly; for other
2576 * Latin1, can calculate their code point and check; otherwise use a
2578 if (UTF8_IS_INVARIANT(*s)) {
2579 if (! isALPHAU(*s)) {
2583 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2584 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
2590 if (! PL_utf8_charname_begin) {
2591 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2592 PL_utf8_charname_begin = _core_swash_init("utf8",
2593 "_Perl_Charname_Begin",
2595 1, 0, NULL, &flags);
2597 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2604 if (UTF8_IS_INVARIANT(*s)) {
2605 if (! isCHARNAME_CONT(*s)) {
2608 if (*s == ' ' && *(s-1) == ' ') {
2613 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2614 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
2618 if (*s == *NBSP_UTF8
2619 && *(s+1) == *(NBSP_UTF8+1)
2620 && ckWARN_d(WARN_DEPRECATED))
2622 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2623 "NO-BREAK SPACE in a charnames "
2624 "alias definition is deprecated");
2629 if (! PL_utf8_charname_continue) {
2630 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2631 PL_utf8_charname_continue = _core_swash_init("utf8",
2632 "_Perl_Charname_Continue",
2634 1, 0, NULL, &flags);
2636 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2643 if (*(s-1) == ' ') {
2646 "charnames alias definitions may not contain trailing "
2647 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2648 (int)(s - backslash_ptr + 1), backslash_ptr,
2649 (int)(e - s + 1), s + 1
2651 UTF ? SVf_UTF8 : 0);
2655 if (SvUTF8(res)) { /* Don't accept malformed input */
2656 const U8* first_bad_char_loc;
2658 const char* const str = SvPV_const(res, len);
2659 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2660 /* If warnings are on, this will print a more detailed analysis of
2661 * what is wrong than the error message below */
2662 utf8n_to_uvchr(first_bad_char_loc,
2663 (char *) first_bad_char_loc - str,
2666 /* We deliberately don't try to print the malformed character,
2667 * which might not print very well; it also may be just the first
2668 * of many malformations, so don't print what comes after it */
2671 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2672 (int) (e - backslash_ptr + 1), backslash_ptr,
2673 (int) ((char *) first_bad_char_loc - str), str
2684 /* The final %.*s makes sure that should the trailing NUL be missing
2685 * that this print won't run off the end of the string */
2688 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2689 (int)(s - backslash_ptr + 1), backslash_ptr,
2690 (int)(e - s + 1), s + 1
2692 UTF ? SVf_UTF8 : 0);
2699 "charnames alias definitions may not contain a sequence of "
2700 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2701 (int)(s - backslash_ptr + 1), backslash_ptr,
2702 (int)(e - s + 1), s + 1
2704 UTF ? SVf_UTF8 : 0);
2711 Extracts the next constant part of a pattern, double-quoted string,
2712 or transliteration. This is terrifying code.
2714 For example, in parsing the double-quoted string "ab\x63$d", it would
2715 stop at the '$' and return an OP_CONST containing 'abc'.
2717 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2718 processing a pattern (PL_lex_inpat is true), a transliteration
2719 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2721 Returns a pointer to the character scanned up to. If this is
2722 advanced from the start pointer supplied (i.e. if anything was
2723 successfully parsed), will leave an OP_CONST for the substring scanned
2724 in pl_yylval. Caller must intuit reason for not parsing further
2725 by looking at the next characters herself.
2729 \N{FOO} => \N{U+hex_for_character_FOO}
2730 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2733 all other \-char, including \N and \N{ apart from \N{ABC}
2736 @ and $ where it appears to be a var, but not for $ as tail anchor
2741 In transliterations:
2742 characters are VERY literal, except for - not at the start or end
2743 of the string, which indicates a range. If the range is in bytes,
2744 scan_const expands the range to the full set of intermediate
2745 characters. If the range is in utf8, the hyphen is replaced with
2746 a certain range mark which will be handled by pmtrans() in op.c.
2748 In double-quoted strings:
2750 double-quoted style: \r and \n
2751 constants: \x31, etc.
2752 deprecated backrefs: \1 (in substitution replacements)
2753 case and quoting: \U \Q \E
2756 scan_const does *not* construct ops to handle interpolated strings.
2757 It stops processing as soon as it finds an embedded $ or @ variable
2758 and leaves it to the caller to work out what's going on.
2760 embedded arrays (whether in pattern or not) could be:
2761 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2763 $ in double-quoted strings must be the symbol of an embedded scalar.
2765 $ in pattern could be $foo or could be tail anchor. Assumption:
2766 it's a tail anchor if $ is the last thing in the string, or if it's
2767 followed by one of "()| \r\n\t"
2769 \1 (backreferences) are turned into $1 in substitutions
2771 The structure of the code is
2772 while (there's a character to process) {
2773 handle transliteration ranges
2774 skip regexp comments /(?#comment)/ and codes /(?{code})/
2775 skip #-initiated comments in //x patterns
2776 check for embedded arrays
2777 check for embedded scalars
2779 deprecate \1 in substitution replacements
2780 handle string-changing backslashes \l \U \Q \E, etc.
2781 switch (what was escaped) {
2782 handle \- in a transliteration (becomes a literal -)
2783 if a pattern and not \N{, go treat as regular character
2784 handle \132 (octal characters)
2785 handle \x15 and \x{1234} (hex characters)
2786 handle \N{name} (named characters, also \N{3,5} in a pattern)
2787 handle \cV (control characters)
2788 handle printf-style backslashes (\f, \r, \n, etc)
2791 } (end if backslash)
2792 handle regular character
2793 } (end while character to read)
2798 S_scan_const(pTHX_ char *start)
2800 char *send = PL_bufend; /* end of the constant */
2801 SV *sv = newSV(send - start); /* sv for the constant. See note below
2803 char *s = start; /* start of the constant */
2804 char *d = SvPVX(sv); /* destination for copies */
2805 bool dorange = FALSE; /* are we in a translit range? */
2806 bool didrange = FALSE; /* did we just finish a range? */
2807 bool in_charclass = FALSE; /* within /[...]/ */
2808 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2809 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2810 UTF8? But, this can show as true
2811 when the source isn't utf8, as for
2812 example when it is entirely composed
2814 SV *res; /* result from charnames */
2816 /* Note on sizing: The scanned constant is placed into sv, which is
2817 * initialized by newSV() assuming one byte of output for every byte of
2818 * input. This routine expects newSV() to allocate an extra byte for a
2819 * trailing NUL, which this routine will append if it gets to the end of
2820 * the input. There may be more bytes of input than output (eg., \N{LATIN
2821 * CAPITAL LETTER A}), or more output than input if the constant ends up
2822 * recoded to utf8, but each time a construct is found that might increase
2823 * the needed size, SvGROW() is called. Its size parameter each time is
2824 * based on the best guess estimate at the time, namely the length used so
2825 * far, plus the length the current construct will occupy, plus room for
2826 * the trailing NUL, plus one byte for every input byte still unscanned */
2828 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2831 UV literal_endpoint = 0;
2832 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
2835 PERL_ARGS_ASSERT_SCAN_CONST;
2837 assert(PL_lex_inwhat != OP_TRANSR);
2838 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2839 /* If we are doing a trans and we know we want UTF8 set expectation */
2840 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2841 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2844 /* Protect sv from errors and fatal warnings. */
2845 ENTER_with_name("scan_const");
2848 while (s < send || dorange) {
2850 /* get transliterations out of the way (they're most literal) */
2851 if (PL_lex_inwhat == OP_TRANS) {
2852 /* expand a range A-Z to the full set of characters. AIE! */
2854 I32 i; /* current expanded character */
2855 I32 min; /* first character in range */
2856 I32 max; /* last character in range */
2867 char * const c = (char*)utf8_hop((U8*)d, -1);
2871 *c = (char) ILLEGAL_UTF8_BYTE;
2872 /* mark the range as done, and continue */
2878 i = d - SvPVX_const(sv); /* remember current offset */
2881 SvLEN(sv) + ((has_utf8)
2882 ? (512 - UTF_CONTINUATION_MARK
2885 /* How many two-byte within 0..255: 128 in UTF-8,
2886 * 96 in UTF-8-mod. */
2888 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
2890 d = SvPVX(sv) + i; /* refresh d after realloc */
2894 for (j = 0; j <= 1; j++) {
2895 char * const c = (char*)utf8_hop((U8*)d, -1);
2896 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2902 max = (U8)0xff; /* only to \xff */
2903 uvmax = uv; /* \x{100} to uvmax */
2905 d = c; /* eat endpoint chars */
2910 d -= 2; /* eat the first char and the - */
2911 min = (U8)*d; /* first char in range */
2912 max = (U8)d[1]; /* last char in range */
2919 "Invalid range \"%c-%c\" in transliteration operator",
2920 (char)min, (char)max);
2924 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2925 * any subsets of these ranges into individual characters */
2926 if (literal_endpoint == 2 &&
2927 ((isLOWER_A(min) && isLOWER_A(max)) ||
2928 (isUPPER_A(min) && isUPPER_A(max))))
2930 for (i = min; i <= max; i++) {
2937 for (i = min; i <= max; i++)
2940 append_utf8_from_native_byte(i, &d);
2948 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2950 *d++ = (char) ILLEGAL_UTF8_BYTE;
2952 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2956 /* mark the range as done, and continue */
2960 literal_endpoint = 0;
2965 /* range begins (ignore - as first or last char) */
2966 else if (*s == '-' && s+1 < send && s != start) {
2968 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
2975 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
2985 literal_endpoint = 0;
2986 native_range = TRUE;
2991 /* if we get here, we're not doing a transliteration */
2993 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2996 while (s1 >= start && *s1-- == '\\')
2999 in_charclass = TRUE;
3002 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3005 while (s1 >= start && *s1-- == '\\')
3008 in_charclass = FALSE;
3011 /* skip for regexp comments /(?#comment)/, except for the last
3012 * char, which will be done separately.
3013 * Stop on (?{..}) and friends */
3015 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3017 while (s+1 < send && *s != ')')
3020 else if (!PL_lex_casemods &&
3021 ( s[2] == '{' /* This should match regcomp.c */
3022 || (s[2] == '?' && s[3] == '{')))
3028 /* likewise skip #-initiated comments in //x patterns */
3029 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
3030 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
3031 while (s+1 < send && *s != '\n')
3035 /* no further processing of single-quoted regex */
3036 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3037 goto default_action;
3039 /* check for embedded arrays
3040 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3042 else if (*s == '@' && s[1]) {
3043 if (isWORDCHAR_lazy_if(s+1,UTF))
3045 if (strchr(":'{$", s[1]))
3047 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3048 break; /* in regexp, neither @+ nor @- are interpolated */
3051 /* check for embedded scalars. only stop if we're sure it's a
3054 else if (*s == '$') {
3055 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3057 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
3059 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3060 "Possible unintended interpolation of $\\ in regex");
3062 break; /* in regexp, $ might be tail anchor */
3066 /* End of else if chain - OP_TRANS rejoin rest */
3069 if (*s == '\\' && s+1 < send) {
3070 char* e; /* Can be used for ending '}', etc. */
3074 /* warn on \1 - \9 in substitution replacements, but note that \11
3075 * is an octal; and \19 is \1 followed by '9' */
3076 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
3077 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
3079 /* diag_listed_as: \%d better written as $%d */
3080 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3085 /* string-change backslash escapes */
3086 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
3090 /* In a pattern, process \N, but skip any other backslash escapes.
3091 * This is because we don't want to translate an escape sequence
3092 * into a meta symbol and have the regex compiler use the meta
3093 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3094 * in spite of this, we do have to process \N here while the proper
3095 * charnames handler is in scope. See bugs #56444 and #62056.
3096 * There is a complication because \N in a pattern may also stand
3097 * for 'match a non-nl', and not mean a charname, in which case its
3098 * processing should be deferred to the regex compiler. To be a
3099 * charname it must be followed immediately by a '{', and not look
3100 * like \N followed by a curly quantifier, i.e., not something like
3101 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3103 else if (PL_lex_inpat
3106 || regcurly(s + 1)))
3109 goto default_action;
3114 /* quoted - in transliterations */
3116 if (PL_lex_inwhat == OP_TRANS) {
3123 if ((isALPHANUMERIC(*s)))
3124 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3125 "Unrecognized escape \\%c passed through",
3127 /* default action is to copy the quoted character */
3128 goto default_action;
3131 /* eg. \132 indicates the octal constant 0132 */
3132 case '0': case '1': case '2': case '3':
3133 case '4': case '5': case '6': case '7':
3135 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
3137 uv = grok_oct(s, &len, &flags, NULL);
3139 if (len < 3 && s < send && isDIGIT(*s)
3140 && ckWARN(WARN_MISC))
3142 Perl_warner(aTHX_ packWARN(WARN_MISC),
3143 "%s", form_short_octal_warning(s, len));
3146 goto NUM_ESCAPE_INSERT;
3148 /* eg. \o{24} indicates the octal constant \024 */
3153 bool valid = grok_bslash_o(&s, &uv, &error,
3154 TRUE, /* Output warning */
3155 FALSE, /* Not strict */
3156 TRUE, /* Output warnings for
3163 goto NUM_ESCAPE_INSERT;
3166 /* eg. \x24 indicates the hex constant 0x24 */
3171 bool valid = grok_bslash_x(&s, &uv, &error,
3172 TRUE, /* Output warning */
3173 FALSE, /* Not strict */
3174 TRUE, /* Output warnings for
3184 /* Insert oct or hex escaped character. There will always be
3185 * enough room in sv since such escapes will be longer than any
3186 * UTF-8 sequence they can end up as, except if they force us
3187 * to recode the rest of the string into utf8 */
3189 /* Here uv is the ordinal of the next character being added */
3190 if (!UVCHR_IS_INVARIANT(uv)) {
3191 if (!has_utf8 && uv > 255) {
3192 /* Might need to recode whatever we have accumulated so
3193 * far if it contains any chars variant in utf8 or
3196 SvCUR_set(sv, d - SvPVX_const(sv));
3199 /* See Note on sizing above. */
3200 sv_utf8_upgrade_flags_grow(sv,
3201 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3202 UNISKIP(uv) + (STRLEN)(send - s) + 1);
3203 d = SvPVX(sv) + SvCUR(sv);
3208 d = (char*)uvchr_to_utf8((U8*)d, uv);
3209 if (PL_lex_inwhat == OP_TRANS &&
3210 PL_sublex_info.sub_op) {
3211 PL_sublex_info.sub_op->op_private |=
3212 (PL_lex_repl ? OPpTRANS_FROM_UTF
3216 if (uv > 255 && !dorange)
3217 native_range = FALSE;
3230 /* In a non-pattern \N must be a named character, like \N{LATIN
3231 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3232 * mean to match a non-newline. For non-patterns, named
3233 * characters are converted to their string equivalents. In
3234 * patterns, named characters are not converted to their
3235 * ultimate forms for the same reasons that other escapes
3236 * aren't. Instead, they are converted to the \N{U+...} form
3237 * to get the value from the charnames that is in effect right
3238 * now, while preserving the fact that it was a named character
3239 * so that the regex compiler knows this */
3241 /* The structure of this section of code (besides checking for
3242 * errors and upgrading to utf8) is:
3243 * Further disambiguate between the two meanings of \N, and if
3244 * not a charname, go process it elsewhere
3245 * If of form \N{U+...}, pass it through if a pattern;
3246 * otherwise convert to utf8
3247 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3248 * pattern; otherwise convert to utf8 */
3250 /* Here, s points to the 'N'; the test below is guaranteed to
3251 * succeed if we are being called on a pattern as we already
3252 * know from a test above that the next character is a '{'.
3253 * On a non-pattern \N must mean 'named sequence, which
3254 * requires braces */
3257 yyerror("Missing braces on \\N{}");
3262 /* If there is no matching '}', it is an error. */
3263 if (! (e = strchr(s, '}'))) {
3264 if (! PL_lex_inpat) {
3265 yyerror("Missing right brace on \\N{}");
3267 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3272 /* Here it looks like a named character */
3274 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3275 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3276 | PERL_SCAN_DISALLOW_PREFIX;
3279 /* For \N{U+...}, the '...' is a unicode value even on
3280 * EBCDIC machines */
3281 s += 2; /* Skip to next char after the 'U+' */
3283 uv = grok_hex(s, &len, &flags, NULL);
3284 if (len == 0 || len != (STRLEN)(e - s)) {
3285 yyerror("Invalid hexadecimal number in \\N{U+...}");
3292 /* On non-EBCDIC platforms, pass through to the regex
3293 * compiler unchanged. The reason we evaluated the
3294 * number above is to make sure there wasn't a syntax
3295 * error. But on EBCDIC we convert to native so
3296 * downstream code can continue to assume it's native
3298 s -= 5; /* Include the '\N{U+' */
3300 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3303 (unsigned int) UNI_TO_NATIVE(uv));
3305 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3309 else { /* Not a pattern: convert the hex to string */
3311 /* If destination is not in utf8, unconditionally
3312 * recode it to be so. This is because \N{} implies
3313 * Unicode semantics, and scalars have to be in utf8
3314 * to guarantee those semantics */
3316 SvCUR_set(sv, d - SvPVX_const(sv));
3319 /* See Note on sizing above. */
3320 sv_utf8_upgrade_flags_grow(
3322 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3323 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3324 d = SvPVX(sv) + SvCUR(sv);
3328 /* Add the (Unicode) code point to the output. */
3329 if (UNI_IS_INVARIANT(uv)) {
3330 *d++ = (char) LATIN1_TO_NATIVE(uv);
3333 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3337 else /* Here is \N{NAME} but not \N{U+...}. */
3338 if ((res = get_and_check_backslash_N_name(s, e)))
3341 const char *str = SvPV_const(res, len);
3344 if (! len) { /* The name resolved to an empty string */
3345 Copy("\\N{}", d, 4, char);
3349 /* In order to not lose information for the regex
3350 * compiler, pass the result in the specially made
3351 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3352 * the code points in hex of each character
3353 * returned by charnames */
3355 const char *str_end = str + len;
3356 const STRLEN off = d - SvPVX_const(sv);
3358 if (! SvUTF8(res)) {
3359 /* For the non-UTF-8 case, we can determine the
3360 * exact length needed without having to parse
3361 * through the string. Each character takes up
3362 * 2 hex digits plus either a trailing dot or
3364 d = off + SvGROW(sv, off
3366 + 6 /* For the "\N{U+", and
3368 + (STRLEN)(send - e));
3369 Copy("\\N{U+", d, 5, char);
3371 while (str < str_end) {
3374 my_snprintf(hex_string,
3376 "%02X.", (U8) *str);
3377 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
3378 Copy(hex_string, d, 3, char);
3382 d--; /* We will overwrite below the final
3383 dot with a right brace */
3386 STRLEN char_length; /* cur char's byte length */
3388 /* and the number of bytes after this is
3389 * translated into hex digits */
3390 STRLEN output_length;
3392 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3393 * for max('U+', '.'); and 1 for NUL */
3394 char hex_string[2 * UTF8_MAXBYTES + 5];
3396 /* Get the first character of the result. */
3397 U32 uv = utf8n_to_uvchr((U8 *) str,
3401 /* Convert first code point to hex, including
3402 * the boiler plate before it. */
3404 my_snprintf(hex_string, sizeof(hex_string),
3408 /* Make sure there is enough space to hold it */
3409 d = off + SvGROW(sv, off
3411 + (STRLEN)(send - e)
3412 + 2); /* '}' + NUL */
3414 Copy(hex_string, d, output_length, char);
3417 /* For each subsequent character, append dot and
3418 * its ordinal in hex */
3419 while ((str += char_length) < str_end) {
3420 const STRLEN off = d - SvPVX_const(sv);
3421 U32 uv = utf8n_to_uvchr((U8 *) str,
3426 my_snprintf(hex_string,
3431 d = off + SvGROW(sv, off
3433 + (STRLEN)(send - e)
3434 + 2); /* '}' + NUL */
3435 Copy(hex_string, d, output_length, char);
3440 *d++ = '}'; /* Done. Add the trailing brace */
3443 else { /* Here, not in a pattern. Convert the name to a
3446 /* If destination is not in utf8, unconditionally
3447 * recode it to be so. This is because \N{} implies
3448 * Unicode semantics, and scalars have to be in utf8
3449 * to guarantee those semantics */
3451 SvCUR_set(sv, d - SvPVX_const(sv));
3454 /* See Note on sizing above. */
3455 sv_utf8_upgrade_flags_grow(sv,
3456 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3457 len + (STRLEN)(send - s) + 1);
3458 d = SvPVX(sv) + SvCUR(sv);
3460 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3462 /* See Note on sizing above. (NOTE: SvCUR() is not
3463 * set correctly here). */
3464 const STRLEN off = d - SvPVX_const(sv);
3465 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3467 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3468 sv_utf8_upgrade(res);
3469 str = SvPV_const(res, len);
3471 Copy(str, d, len, char);
3477 } /* End \N{NAME} */
3480 native_range = FALSE; /* \N{} is defined to be Unicode */
3482 s = e + 1; /* Point to just after the '}' */
3485 /* \c is a control character */
3489 *d++ = grok_bslash_c(*s++, 1);
3492 yyerror("Missing control char name in \\c");
3496 /* printf-style backslashes, formfeeds, newlines, etc */
3522 } /* end if (backslash) */
3529 /* If we started with encoded form, or already know we want it,
3530 then encode the next character */
3531 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
3535 /* One might think that it is wasted effort in the case of the
3536 * source being utf8 (this_utf8 == TRUE) to take the next character
3537 * in the source, convert it to an unsigned value, and then convert
3538 * it back again. But the source has not been validated here. The
3539 * routine that does the conversion checks for errors like
3542 const UV nextuv = (this_utf8)
3543 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3545 const STRLEN need = UNISKIP(nextuv);
3547 SvCUR_set(sv, d - SvPVX_const(sv));
3550 /* See Note on sizing above. */
3551 sv_utf8_upgrade_flags_grow(sv,
3552 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3553 need + (STRLEN)(send - s) + 1);
3554 d = SvPVX(sv) + SvCUR(sv);
3556 } else if (need > len) {
3557 /* encoded value larger than old, may need extra space (NOTE:
3558 * SvCUR() is not set correctly here). See Note on sizing
3560 const STRLEN off = d - SvPVX_const(sv);
3561 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
3565 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
3567 if (uv > 255 && !dorange)
3568 native_range = FALSE;
3574 } /* while loop to process each character */
3576 /* terminate the string and set up the sv */
3578 SvCUR_set(sv, d - SvPVX_const(sv));
3579 if (SvCUR(sv) >= SvLEN(sv))
3580 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3581 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
3584 if (PL_encoding && !has_utf8) {
3585 sv_recode_to_utf8(sv, PL_encoding);
3591 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
3592 PL_sublex_info.sub_op->op_private |=
3593 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3597 /* shrink the sv if we allocated more than we used */
3598 if (SvCUR(sv) + 5 < SvLEN(sv)) {
3599 SvPV_shrink_to_cur(sv);
3602 /* return the substring (via pl_yylval) only if we parsed anything */
3605 for (; s2 < s; s2++) {
3607 COPLINE_INC_WITH_HERELINES;
3609 SvREFCNT_inc_simple_void_NN(sv);
3610 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3611 && ! PL_parser->lex_re_reparsing)
3613 const char *const key = PL_lex_inpat ? "qr" : "q";
3614 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3618 if (PL_lex_inwhat == OP_TRANS) {
3621 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3624 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3632 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3635 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
3637 LEAVE_with_name("scan_const");
3642 * Returns TRUE if there's more to the expression (e.g., a subscript),
3645 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3647 * ->[ and ->{ return TRUE
3648 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
3649 * { and [ outside a pattern are always subscripts, so return TRUE
3650 * if we're outside a pattern and it's not { or [, then return FALSE
3651 * if we're in a pattern and the first char is a {
3652 * {4,5} (any digits around the comma) returns FALSE
3653 * if we're in a pattern and the first char is a [
3655 * [SOMETHING] has a funky algorithm to decide whether it's a
3656 * character class or not. It has to deal with things like
3657 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3658 * anything else returns TRUE
3661 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
3664 S_intuit_more(pTHX_ char *s)
3666 PERL_ARGS_ASSERT_INTUIT_MORE;
3668 if (PL_lex_brackets)
3670 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3672 if (*s == '-' && s[1] == '>'
3673 && FEATURE_POSTDEREF_QQ_IS_ENABLED
3674 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
3675 ||(s[2] == '@' && strchr("*[{",s[3])) ))
3677 if (*s != '{' && *s != '[')
3682 /* In a pattern, so maybe we have {n,m}. */
3690 /* On the other hand, maybe we have a character class */
3693 if (*s == ']' || *s == '^')
3696 /* this is terrifying, and it works */
3699 const char * const send = strchr(s,']');
3700 unsigned char un_char, last_un_char;
3701 char tmpbuf[sizeof PL_tokenbuf * 4];
3703 if (!send) /* has to be an expression */
3705 weight = 2; /* let's weigh the evidence */
3709 else if (isDIGIT(*s)) {
3711 if (isDIGIT(s[1]) && s[2] == ']')
3717 Zero(seen,256,char);
3719 for (; s < send; s++) {
3720 last_un_char = un_char;
3721 un_char = (unsigned char)*s;
3726 weight -= seen[un_char] * 10;
3727 if (isWORDCHAR_lazy_if(s+1,UTF)) {
3729 char *tmp = PL_bufend;
3730 PL_bufend = (char*)send;
3731 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
3733 len = (int)strlen(tmpbuf);
3734 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3735 UTF ? SVf_UTF8 : 0, SVt_PV))
3740 else if (*s == '$' && s[1] &&
3741 strchr("[#!%*<>()-=",s[1])) {
3742 if (/*{*/ strchr("])} =",s[2]))
3751 if (strchr("wds]",s[1]))
3753 else if (seen[(U8)'\''] || seen[(U8)'"'])
3755 else if (strchr("rnftbxcav",s[1]))
3757 else if (isDIGIT(s[1])) {
3759 while (s[1] && isDIGIT(s[1]))
3769 if (strchr("aA01! ",last_un_char))
3771 if (strchr("zZ79~",s[1]))
3773 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3774 weight -= 5; /* cope with negative subscript */
3777 if (!isWORDCHAR(last_un_char)
3778 && !(last_un_char == '$' || last_un_char == '@'
3779 || last_un_char == '&')
3780 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
3785 if (keyword(tmpbuf, d - tmpbuf, 0))
3788 if (un_char == last_un_char + 1)
3790 weight -= seen[un_char];
3795 if (weight >= 0) /* probably a character class */
3805 * Does all the checking to disambiguate
3807 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3808 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3810 * First argument is the stuff after the first token, e.g. "bar".
3812 * Not a method if foo is a filehandle.
3813 * Not a method if foo is a subroutine prototyped to take a filehandle.
3814 * Not a method if it's really "Foo $bar"
3815 * Method if it's "foo $bar"
3816 * Not a method if it's really "print foo $bar"
3817 * Method if it's really "foo package::" (interpreted as package->foo)
3818 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3819 * Not a method if bar is a filehandle or package, but is quoted with
3824 S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
3826 char *s = start + (*start == '$');
3827 char tmpbuf[sizeof PL_tokenbuf];
3831 PERL_ARGS_ASSERT_INTUIT_METHOD;
3833 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
3835 if (cv && SvPOK(cv)) {
3836 const char *proto = CvPROTO(cv);
3838 while (*proto && (isSPACE(*proto) || *proto == ';'))
3845 if (*start == '$') {
3846 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3847 isUPPER(*PL_tokenbuf))
3852 return *s == '(' ? FUNCMETH : METHOD;
3855 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
3856 /* start is the beginning of the possible filehandle/object,
3857 * and s is the end of it
3858 * tmpbuf is a copy of it (but with single quotes as double colons)
3861 if (!keyword(tmpbuf, len, 0)) {
3862 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3867 indirgv = gv_fetchpvn_flags(tmpbuf, len, ( UTF ? SVf_UTF8 : 0 ), SVt_PVCV);
3868 if (indirgv && GvCVu(indirgv))
3870 /* filehandle or package name makes it a method */
3871 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
3873 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
3874 return 0; /* no assumptions -- "=>" quotes bareword */
3876 NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
3877 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
3878 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
3882 return *s == '(' ? FUNCMETH : METHOD;
3888 /* Encoded script support. filter_add() effectively inserts a
3889 * 'pre-processing' function into the current source input stream.
3890 * Note that the filter function only applies to the current source file
3891 * (e.g., it will not affect files 'require'd or 'use'd by this one).
3893 * The datasv parameter (which may be NULL) can be used to pass
3894 * private data to this instance of the filter. The filter function
3895 * can recover the SV using the FILTER_DATA macro and use it to
3896 * store private buffers and state information.
3898 * The supplied datasv parameter is upgraded to a PVIO type
3899 * and the IoDIRP/IoANY field is used to store the function pointer,
3900 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
3901 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
3902 * private use must be set using malloc'd pointers.
3906 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
3914 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
3915 Perl_croak(aTHX_ "Source filters apply only to byte streams");
3917 if (!PL_rsfp_filters)
3918 PL_rsfp_filters = newAV();
3921 SvUPGRADE(datasv, SVt_PVIO);
3922 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
3923 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
3924 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
3925 FPTR2DPTR(void *, IoANY(datasv)),
3926 SvPV_nolen(datasv)));
3927 av_unshift(PL_rsfp_filters, 1);
3928 av_store(PL_rsfp_filters, 0, datasv) ;
3930 !PL_parser->filtered
3931 && PL_parser->lex_flags & LEX_EVALBYTES
3932 && PL_bufptr < PL_bufend
3934 const char *s = PL_bufptr;
3935 while (s < PL_bufend) {
3937 SV *linestr = PL_parser->linestr;
3938 char *buf = SvPVX(linestr);
3939 STRLEN const bufptr_pos = PL_parser->bufptr - buf;
3940 STRLEN const oldbufptr_pos = PL_parser->oldbufptr - buf;
3941 STRLEN const oldoldbufptr_pos=PL_parser->oldoldbufptr-buf;
3942 STRLEN const linestart_pos = PL_parser->linestart - buf;
3943 STRLEN const last_uni_pos =
3944 PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
3945 STRLEN const last_lop_pos =
3946 PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3947 av_push(PL_rsfp_filters, linestr);
3948 PL_parser->linestr =
3949 newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
3950 buf = SvPVX(PL_parser->linestr);
3951 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
3952 PL_parser->bufptr = buf + bufptr_pos;
3953 PL_parser->oldbufptr = buf + oldbufptr_pos;
3954 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
3955 PL_parser->linestart = buf + linestart_pos;
3956 if (PL_parser->last_uni)
3957 PL_parser->last_uni = buf + last_uni_pos;
3958 if (PL_parser->last_lop)
3959 PL_parser->last_lop = buf + last_lop_pos;
3960 SvLEN(linestr) = SvCUR(linestr);
3961 SvCUR(linestr) = s-SvPVX(linestr);
3962 PL_parser->filtered = 1;
3972 /* Delete most recently added instance of this filter function. */
3974 Perl_filter_del(pTHX_ filter_t funcp)
3978 PERL_ARGS_ASSERT_FILTER_DEL;
3981 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
3982 FPTR2DPTR(void*, funcp)));
3984 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
3986 /* if filter is on top of stack (usual case) just pop it off */
3987 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
3988 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
3989 sv_free(av_pop(PL_rsfp_filters));
3993 /* we need to search for the correct entry and clear it */
3994 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
3998 /* Invoke the idxth filter function for the current rsfp. */
3999 /* maxlen 0 = read one text line */
4001 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
4005 /* This API is bad. It should have been using unsigned int for maxlen.
4006 Not sure if we want to change the API, but if not we should sanity
4007 check the value here. */
4008 unsigned int correct_length = maxlen < 0 ? PERL_INT_MAX : maxlen;
4010 PERL_ARGS_ASSERT_FILTER_READ;
4012 if (!PL_parser || !PL_rsfp_filters)
4014 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
4015 /* Provide a default input filter to make life easy. */
4016 /* Note that we append to the line. This is handy. */
4017 DEBUG_P(PerlIO_printf(Perl_debug_log,
4018 "filter_read %d: from rsfp\n", idx));
4019 if (correct_length) {
4022 const int old_len = SvCUR(buf_sv);
4024 /* ensure buf_sv is large enough */
4025 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
4026 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
4027 correct_length)) <= 0) {
4028 if (PerlIO_error(PL_rsfp))
4029 return -1; /* error */
4031 return 0 ; /* end of file */
4033 SvCUR_set(buf_sv, old_len + len) ;
4034 SvPVX(buf_sv)[old_len + len] = '\0';
4037 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
4038 if (PerlIO_error(PL_rsfp))
4039 return -1; /* error */
4041 return 0 ; /* end of file */
4044 return SvCUR(buf_sv);
4046 /* Skip this filter slot if filter has been deleted */
4047 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
4048 DEBUG_P(PerlIO_printf(Perl_debug_log,
4049 "filter_read %d: skipped (filter deleted)\n",
4051 return FILTER_READ(idx+1, buf_sv, correct_length); /* recurse */
4053 if (SvTYPE(datasv) != SVt_PVIO) {
4054 if (correct_length) {
4056 const STRLEN remainder = SvLEN(datasv) - SvCUR(datasv);
4057 if (!remainder) return 0; /* eof */
4058 if (correct_length > remainder) correct_length = remainder;
4059 sv_catpvn(buf_sv, SvEND(datasv), correct_length);
4060 SvCUR_set(datasv, SvCUR(datasv) + correct_length);
4063 const char *s = SvEND(datasv);
4064 const char *send = SvPVX(datasv) + SvLEN(datasv);
4072 if (s == send) return 0; /* eof */
4073 sv_catpvn(buf_sv, SvEND(datasv), s-SvEND(datasv));
4074 SvCUR_set(datasv, s-SvPVX(datasv));
4076 return SvCUR(buf_sv);
4078 /* Get function pointer hidden within datasv */
4079 funcp = DPTR2FPTR(filter_t, IoANY(datasv));
4080 DEBUG_P(PerlIO_printf(Perl_debug_log,
4081 "filter_read %d: via function %p (%s)\n",
4082 idx, (void*)datasv, SvPV_nolen_const(datasv)));
4083 /* Call function. The function is expected to */
4084 /* call "FILTER_READ(idx+1, buf_sv)" first. */
4085 /* Return: <0:error, =0:eof, >0:not eof */
4086 return (*funcp)(aTHX_ idx, buf_sv, correct_length);
4090 S_filter_gets(pTHX_ SV *sv, STRLEN append)
4092 PERL_ARGS_ASSERT_FILTER_GETS;
4094 #ifdef PERL_CR_FILTER
4095 if (!PL_rsfp_filters) {
4096 filter_add(S_cr_textfilter,NULL);
4099 if (PL_rsfp_filters) {
4101 SvCUR_set(sv, 0); /* start with empty line */
4102 if (FILTER_READ(0, sv, 0) > 0)
4103 return ( SvPVX(sv) ) ;
4108 return (sv_gets(sv, PL_rsfp, append));
4112 S_find_in_my_stash(pTHX_ const char *pkgname, STRLEN len)
4116 PERL_ARGS_ASSERT_FIND_IN_MY_STASH;
4118 if (len == 11 && *pkgname == '_' && strEQ(pkgname, "__PACKAGE__"))
4122 (pkgname[len - 2] == ':' && pkgname[len - 1] == ':') &&
4123 (gv = gv_fetchpvn_flags(pkgname, len, ( UTF ? SVf_UTF8 : 0 ), SVt_PVHV)))
4125 return GvHV(gv); /* Foo:: */
4128 /* use constant CLASS => 'MyClass' */
4129 gv = gv_fetchpvn_flags(pkgname, len, UTF ? SVf_UTF8 : 0, SVt_PVCV);
4130 if (gv && GvCV(gv)) {
4131 SV * const sv = cv_const_sv(GvCV(gv));
4133 pkgname = SvPV_const(sv, len);
4136 return gv_stashpvn(pkgname, len, UTF ? SVf_UTF8 : 0);
4141 S_tokenize_use(pTHX_ int is_use, char *s) {
4142 PERL_ARGS_ASSERT_TOKENIZE_USE;
4144 if (PL_expect != XSTATE)
4145 yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression",
4146 is_use ? "use" : "no"));
4149 if (isDIGIT(*s) || (*s == 'v' && isDIGIT(s[1]))) {
4150 s = force_version(s, TRUE);
4151 if (*s == ';' || *s == '}'
4152 || (s = SKIPSPACE1(s), (*s == ';' || *s == '}'))) {
4153 NEXTVAL_NEXTTOKE.opval = NULL;
4156 else if (*s == 'v') {
4157 s = force_word(s,WORD,FALSE,TRUE);
4158 s = force_version(s, FALSE);
4162 s = force_word(s,WORD,FALSE,TRUE);
4163 s = force_version(s, FALSE);
4165 pl_yylval.ival = is_use;
4169 static const char* const exp_name[] =
4170 { "OPERATOR", "TERM", "REF", "STATE", "BLOCK", "ATTRBLOCK",
4171 "ATTRTERM", "TERMBLOCK", "XBLOCKTERM", "POSTDEREF",
4176 #define word_takes_any_delimeter(p,l) S_word_takes_any_delimeter(p,l)
4178 S_word_takes_any_delimeter(char *p, STRLEN len)
4180 return (len == 1 && strchr("msyq", p[0])) ||
4182 (p[0] == 't' && p[1] == 'r') ||
4183 (p[0] == 'q' && strchr("qwxr", p[1]))));
4187 S_check_scalar_slice(pTHX_ char *s)
4190 while (*s == ' ' || *s == '\t') s++;
4191 if (*s == 'q' && s[1] == 'w'
4192 && !isWORDCHAR_lazy_if(s+2,UTF))
4194 while (*s && (isWORDCHAR_lazy_if(s,UTF) || strchr(" \t$#+-'\"", *s)))
4195 s += UTF ? UTF8SKIP(s) : 1;
4196 if (*s == '}' || *s == ']')
4197 pl_yylval.ival = OPpSLICEWARNING;
4203 Works out what to call the token just pulled out of the input
4204 stream. The yacc parser takes care of taking the ops we return and
4205 stitching them into a tree.
4208 The type of the next token
4211 Switch based on the current state:
4212 - if we already built the token before, use it
4213 - if we have a case modifier in a string, deal with that
4214 - handle other cases of interpolation inside a string
4215 - scan the next line if we are inside a format
4216 In the normal state switch on the next character:
4218 if alphabetic, go to key lookup
4219 unrecoginized character - croak
4220 - 0/4/26: handle end-of-line or EOF
4221 - cases for whitespace
4222 - \n and #: handle comments and line numbers
4223 - various operators, brackets and sigils
4226 - 'v': vstrings (or go to key lookup)
4227 - 'x' repetition operator (or go to key lookup)
4228 - other ASCII alphanumerics (key lookup begins here):
4231 scan built-in keyword (but do nothing with it yet)
4232 check for statement label
4233 check for lexical subs
4234 goto just_a_word if there is one
4235 see whether built-in keyword is overridden
4236 switch on keyword number:
4237 - default: just_a_word:
4238 not a built-in keyword; handle bareword lookup
4239 disambiguate between method and sub call
4240 fall back to bareword
4241 - cases for built-in keywords
4249 char *s = PL_bufptr;
4253 const bool saw_infix_sigil = cBOOL(PL_parser->saw_infix_sigil);
4257 /* orig_keyword, gvp, and gv are initialized here because
4258 * jump to the label just_a_word_zero can bypass their
4259 * initialization later. */
4260 I32 orig_keyword = 0;
4265 SV* tmp = newSVpvs("");
4266 PerlIO_printf(Perl_debug_log, "### %"IVdf":LEX_%s/X%s %s\n",
4267 (IV)CopLINE(PL_curcop),
4268 lex_state_names[PL_lex_state],
4269 exp_name[PL_expect],
4270 pv_display(tmp, s, strlen(s), 0, 60));
4274 switch (PL_lex_state) {
4276 case LEX_INTERPNORMAL:
4279 /* when we've already built the next token, just pull it out of the queue */
4282 pl_yylval = PL_nextval[PL_nexttoke];
4284 PL_lex_state = PL_lex_defer;
4285 PL_lex_defer = LEX_NORMAL;
4289 next_type = PL_nexttype[PL_nexttoke];
4290 if (next_type & (7<<24)) {
4291 if (next_type & (1<<24)) {
4292 if (PL_lex_brackets > 100)
4293 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
4294 PL_lex_brackstack[PL_lex_brackets++] =
4295 (char) ((next_type >> 16) & 0xff);
4297 if (next_type & (2<<24))
4298 PL_lex_allbrackets++;
4299 if (next_type & (4<<24))
4300 PL_lex_allbrackets--;
4301 next_type &= 0xffff;
4303 return REPORT(next_type == 'p' ? pending_ident() : next_type);
4306 /* interpolated case modifiers like \L \U, including \Q and \E.
4307 when we get here, PL_bufptr is at the \
4309 case LEX_INTERPCASEMOD:
4311 if (PL_bufptr != PL_bufend && *PL_bufptr != '\\')
4313 "panic: INTERPCASEMOD bufptr=%p, bufend=%p, *bufptr=%u",
4314 PL_bufptr, PL_bufend, *PL_bufptr);
4316 /* handle \E or end of string */
4317 if (PL_bufptr == PL_bufend || PL_bufptr[1] == 'E') {
4319 if (PL_lex_casemods) {
4320 const char oldmod = PL_lex_casestack[--PL_lex_casemods];
4321 PL_lex_casestack[PL_lex_casemods] = '\0';
4323 if (PL_bufptr != PL_bufend
4324 && (oldmod == 'L' || oldmod == 'U' || oldmod == 'Q'
4325 || oldmod == 'F')) {
4327 PL_lex_state = LEX_INTERPCONCAT;
4329 PL_lex_allbrackets--;
4332 else if ( PL_bufptr != PL_bufend && PL_bufptr[1] == 'E' ) {
4333 /* Got an unpaired \E */
4334 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
4335 "Useless use of \\E");
4337 if (PL_bufptr != PL_bufend)
4339 PL_lex_state = LEX_INTERPCONCAT;
4343 DEBUG_T({ PerlIO_printf(Perl_debug_log,
4344 "### Saw case modifier\n"); });
4346 if (s[1] == '\\' && s[2] == 'E') {
4348 PL_lex_state = LEX_INTERPCONCAT;
4353 if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3))
4354 tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */
4355 if ((*s == 'L' || *s == 'U' || *s == 'F') &&
4356 (strchr(PL_lex_casestack, 'L')
4357 || strchr(PL_lex_casestack, 'U')
4358 || strchr(PL_lex_casestack, 'F'))) {
4359 PL_lex_casestack[--PL_lex_casemods] = '\0';
4360 PL_lex_allbrackets--;
4363 if (PL_lex_casemods > 10)
4364 Renew(PL_lex_casestack, PL_lex_casemods + 2, char);
4365 PL_lex_casestack[PL_lex_casemods++] = *s;
4366 PL_lex_casestack[PL_lex_casemods] = '\0';
4367 PL_lex_state = LEX_INTERPCONCAT;
4368 NEXTVAL_NEXTTOKE.ival = 0;
4369 force_next((2<<24)|'(');
4371 NEXTVAL_NEXTTOKE.ival = OP_LCFIRST;
4373 NEXTVAL_NEXTTOKE.ival = OP_UCFIRST;
4375 NEXTVAL_NEXTTOKE.ival = OP_LC;
4377 NEXTVAL_NEXTTOKE.ival = OP_UC;
4379 NEXTVAL_NEXTTOKE.ival = OP_QUOTEMETA;
4381 NEXTVAL_NEXTTOKE.ival = OP_FC;
4383 Perl_croak(aTHX_ "panic: yylex, *s=%u", *s);