3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "dquote_inline.h"
43 #define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
98 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
100 #define XENUMMASK 0x3f
101 #define XFAKEEOF 0x40
102 #define XFAKEBRACK 0x80
104 #ifdef USE_UTF8_SCRIPTS
105 # define UTF cBOOL(!IN_BYTES)
107 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
110 /* The maximum number of characters preceding the unrecognized one to display */
111 #define UNRECOGNIZED_PRECEDE_COUNT 10
113 /* In variables named $^X, these are the legal values for X.
114 * 1999-02-27 mjd-perl-patch@plover.com */
115 #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
117 #define SPACE_OR_TAB(c) isBLANK_A(c)
119 #define HEXFP_PEEK(s) \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
124 /* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
126 * can get by with a single comparison (if the compiler is smart enough).
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
132 /* #define LEX_NOTPARSING 11 is done in perl.h. */
134 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
135 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
140 /* at end of code, eg "$x" followed by: */
141 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146 #define LEX_INTERPCONST 2 /* NOT USED */
147 #define LEX_FORMLINE 1 /* expecting a format line */
151 static const char* const lex_state_names[] = {
166 #include "keywords.h"
168 /* CLINE is a macro that ensures PL_copline has a sane value */
170 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
173 * Convenience functions to return different tokens and prime the
174 * lexer for the next token. They all take an argument.
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
187 * FUN0OP : zero-argument function, with its op created in this file
188 * FUN1 : not used, except for not, which isn't a UNIOP
189 * BOop : bitwise or or xor
191 * BCop : bitwise complement
192 * SHop : shift operator
193 * PWop : power operator
194 * PMop : pattern-matching operator
195 * Aop : addition-level operator
196 * AopNOASSIGN : addition-level operator that is never part of .=
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
199 * Rop : relational operator <= != gt
201 * Also see LOP and lop() below.
204 #ifdef DEBUGGING /* Serve -DT. */
205 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
207 # define REPORT(retval) (retval)
210 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
212 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
213 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
217 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
218 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
222 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
224 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
225 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
226 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
228 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
232 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
233 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
234 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
235 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
236 #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237 #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
239 /* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
244 #define UNI3(f,x,have_x) { \
245 pl_yylval.ival = f; \
246 if (have_x) PL_expect = x; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
251 return REPORT( (int)FUNC1 ); \
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 #define UNI(f) UNI3(f,XTERM,1)
256 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
257 #define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
262 #define UNIBRACK(f) UNI3(f,0,0)
264 /* grandfather return to old style */
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
275 #define COPLINE_INC_WITH_HERELINES \
277 CopLINE_inc(PL_curcop); \
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
282 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284 #define COPLINE_SET_FROM_MULTI_END \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
288 PL_parser->herelines = 0; \
294 /* how to interpret the pl_yylval associated with the token */
298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
303 static struct debug_tokens {
305 enum token_type type;
307 } const debug_tokens[] =
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
382 { 0, TOKENTYPE_NONE, NULL }
385 /* dump the returned token in rv, plus any optional arg in pl_yylval */
388 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
390 PERL_ARGS_ASSERT_TOKEREPORT;
393 const char *name = NULL;
394 enum token_type type = TOKENTYPE_NONE;
395 const struct debug_tokens *p;
396 SV* const report = newSVpvs("<== ");
398 for (p = debug_tokens; p->token; p++) {
399 if (p->token == (int)rv) {
406 Perl_sv_catpv(aTHX_ report, name);
407 else if (isGRAPH(rv))
409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
411 sv_catpvs(report, " (pending identifier)");
414 sv_catpvs(report, "EOF");
416 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
421 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
425 PL_op_name[lvalp->ival]);
428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
430 case TOKENTYPE_OPVAL:
432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
435 Perl_sv_catpvf(aTHX_ report, " %s",
436 SvPEEK(cSVOPx_sv(lvalp->opval)));
441 sv_catpvs(report, "(opval=null)");
444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
450 /* print the buffer with suitable escapes */
453 S_printbuf(pTHX_ const char *const fmt, const char *const s)
455 SV* const tmp = newSVpvs("");
457 PERL_ARGS_ASSERT_PRINTBUF;
459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
470 * This subroutine looks for an '=' next to the operator that has just been
471 * parsed and turns it into an ASSIGNOP if it finds one.
475 S_ao(pTHX_ int toketype)
477 if (*PL_bufptr == '=') {
479 if (toketype == ANDAND)
480 pl_yylval.ival = OP_ANDASSIGN;
481 else if (toketype == OROR)
482 pl_yylval.ival = OP_ORASSIGN;
483 else if (toketype == DORDOR)
484 pl_yylval.ival = OP_DORASSIGN;
487 return REPORT(toketype);
492 * When Perl expects an operator and finds something else, no_op
493 * prints the warning. It always prints "<something> found where
494 * operator expected. It prints "Missing semicolon on previous line?"
495 * if the surprise occurs at the start of the line. "do you need to
496 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
497 * where the compiler doesn't know if foo is a method call or a function.
498 * It prints "Missing operator before end of line" if there's nothing
499 * after the missing operator, or "... before <...>" if there is something
500 * after the missing operator.
502 * PL_bufptr is expected to point to the start of the thing that was found,
503 * and s after the next token or partial token.
507 S_no_op(pTHX_ const char *const what, char *s)
509 char * const oldbp = PL_bufptr;
510 const bool is_first = (PL_oldbufptr == PL_linestart);
512 PERL_ARGS_ASSERT_NO_OP;
518 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
519 if (ckWARN_d(WARN_SYNTAX)) {
521 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
522 "\t(Missing semicolon on previous line?)\n");
523 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
528 for (t = PL_oldoldbufptr;
529 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
530 t += UTF ? UTF8SKIP(t) : 1)
534 if (t < PL_bufptr && isSPACE(*t))
535 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
536 "\t(Do you need to predeclare %" UTF8f "?)\n",
537 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
541 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
542 "\t(Missing operator before %" UTF8f "?)\n",
543 UTF8fARG(UTF, s - oldbp, oldbp));
551 * Complain about missing quote/regexp/heredoc terminator.
552 * If it's called with NULL then it cauterizes the line buffer.
553 * If we're in a delimited string and the delimiter is a control
554 * character, it's reformatted into a two-char sequence like ^C.
559 S_missingterm(pTHX_ char *s)
561 char tmpbuf[UTF8_MAXBYTES + 1];
566 char * const nl = strrchr(s,'\n');
571 else if (PL_multi_close < 32) {
573 tmpbuf[1] = (char)toCTRL(PL_multi_close);
578 if (LIKELY(PL_multi_close < 256)) {
579 *tmpbuf = (char)PL_multi_close;
584 *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
588 q = strchr(s,'"') ? '\'' : '"';
589 sv = sv_2mortal(newSVpv(s,0));
592 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf
593 "%c anywhere before EOF",q,SVfARG(sv),q);
599 * Check whether the named feature is enabled.
602 Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
604 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
606 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
608 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
610 if (namelen > MAX_FEATURE_LEN)
612 memcpy(&he_name[8], name, namelen);
614 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
615 REFCOUNTED_HE_EXISTS));
619 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
620 * utf16-to-utf8-reversed.
623 #ifdef PERL_CR_FILTER
627 const char *s = SvPVX_const(sv);
628 const char * const e = s + SvCUR(sv);
630 PERL_ARGS_ASSERT_STRIP_RETURN;
632 /* outer loop optimized to do nothing if there are no CR-LFs */
634 if (*s++ == '\r' && *s == '\n') {
635 /* hit a CR-LF, need to copy the rest */
639 if (*s == '\r' && s[1] == '\n')
650 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
652 const I32 count = FILTER_READ(idx+1, sv, maxlen);
653 if (count > 0 && !maxlen)
660 =for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
662 Creates and initialises a new lexer/parser state object, supplying
663 a context in which to lex and parse from a new source of Perl code.
664 A pointer to the new state object is placed in L</PL_parser>. An entry
665 is made on the save stack so that upon unwinding, the new state object
666 will be destroyed and the former value of L</PL_parser> will be restored.
667 Nothing else need be done to clean up the parsing context.
669 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
670 non-null, provides a string (in SV form) containing code to be parsed.
671 A copy of the string is made, so subsequent modification of C<line>
672 does not affect parsing. C<rsfp>, if non-null, provides an input stream
673 from which code will be read to be parsed. If both are non-null, the
674 code in C<line> comes first and must consist of complete lines of input,
675 and C<rsfp> supplies the remainder of the source.
677 The C<flags> parameter is reserved for future use. Currently it is only
678 used by perl internally, so extensions should always pass zero.
683 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
684 can share filters with the current parser.
685 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
686 caller, hence isn't owned by the parser, so shouldn't be closed on parser
687 destruction. This is used to handle the case of defaulting to reading the
688 script from the standard input because no filename was given on the command
689 line (without getting confused by situation where STDIN has been closed, so
690 the script handle is opened on fd 0) */
693 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
695 const char *s = NULL;
696 yy_parser *parser, *oparser;
698 if (flags && flags & ~LEX_START_FLAGS)
699 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
701 /* create and initialise a parser */
703 Newxz(parser, 1, yy_parser);
704 parser->old_parser = oparser = PL_parser;
707 parser->stack = NULL;
708 parser->stack_max1 = NULL;
711 /* on scope exit, free this parser and restore any outer one */
713 parser->saved_curcop = PL_curcop;
715 /* initialise lexer state */
717 parser->nexttoke = 0;
718 parser->error_count = oparser ? oparser->error_count : 0;
719 parser->copline = parser->preambling = NOLINE;
720 parser->lex_state = LEX_NORMAL;
721 parser->expect = XSTATE;
723 parser->recheck_utf8_validity = FALSE;
724 parser->rsfp_filters =
725 !(flags & LEX_START_SAME_FILTER) || !oparser
727 : MUTABLE_AV(SvREFCNT_inc(
728 oparser->rsfp_filters
729 ? oparser->rsfp_filters
730 : (oparser->rsfp_filters = newAV())
733 Newx(parser->lex_brackstack, 120, char);
734 Newx(parser->lex_casestack, 12, char);
735 *parser->lex_casestack = '\0';
736 Newxz(parser->lex_shared, 1, LEXSHARED);
740 const U8* first_bad_char_loc;
742 s = SvPV_const(line, len);
745 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
747 &first_bad_char_loc)))
749 _force_out_malformed_utf8_message(first_bad_char_loc,
750 (U8 *) s + SvCUR(line),
752 1 /* 1 means die */ );
753 NOT_REACHED; /* NOTREACHED */
756 parser->linestr = flags & LEX_START_COPIED
757 ? SvREFCNT_inc_simple_NN(line)
758 : newSVpvn_flags(s, len, SvUTF8(line));
760 sv_catpvs(parser->linestr, "\n;");
762 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
765 parser->oldoldbufptr =
768 parser->linestart = SvPVX(parser->linestr);
769 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
770 parser->last_lop = parser->last_uni = NULL;
772 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
773 |LEX_DONT_CLOSE_RSFP));
774 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
775 |LEX_DONT_CLOSE_RSFP));
777 parser->in_pod = parser->filtered = 0;
781 /* delete a parser object */
784 Perl_parser_free(pTHX_ const yy_parser *parser)
786 PERL_ARGS_ASSERT_PARSER_FREE;
788 PL_curcop = parser->saved_curcop;
789 SvREFCNT_dec(parser->linestr);
791 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
792 PerlIO_clearerr(parser->rsfp);
793 else if (parser->rsfp && (!parser->old_parser
794 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
795 PerlIO_close(parser->rsfp);
796 SvREFCNT_dec(parser->rsfp_filters);
797 SvREFCNT_dec(parser->lex_stuff);
798 SvREFCNT_dec(parser->lex_sub_repl);
800 Safefree(parser->lex_brackstack);
801 Safefree(parser->lex_casestack);
802 Safefree(parser->lex_shared);
803 PL_parser = parser->old_parser;
808 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
810 I32 nexttoke = parser->nexttoke;
811 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
813 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
814 && parser->nextval[nexttoke].opval
815 && parser->nextval[nexttoke].opval->op_slabbed
816 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
817 op_free(parser->nextval[nexttoke].opval);
818 parser->nextval[nexttoke].opval = NULL;
825 =for apidoc AmxU|SV *|PL_parser-E<gt>linestr
827 Buffer scalar containing the chunk currently under consideration of the
828 text currently being lexed. This is always a plain string scalar (for
829 which C<SvPOK> is true). It is not intended to be used as a scalar by
830 normal scalar means; instead refer to the buffer directly by the pointer
831 variables described below.
833 The lexer maintains various C<char*> pointers to things in the
834 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
835 reallocated, all of these pointers must be updated. Don't attempt to
836 do this manually, but rather use L</lex_grow_linestr> if you need to
837 reallocate the buffer.
839 The content of the text chunk in the buffer is commonly exactly one
840 complete line of input, up to and including a newline terminator,
841 but there are situations where it is otherwise. The octets of the
842 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
843 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
844 flag on this scalar, which may disagree with it.
846 For direct examination of the buffer, the variable
847 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
848 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
849 of these pointers is usually preferable to examination of the scalar
850 through normal scalar means.
852 =for apidoc AmxU|char *|PL_parser-E<gt>bufend
854 Direct pointer to the end of the chunk of text currently being lexed, the
855 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
856 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
857 always located at the end of the buffer, and does not count as part of
858 the buffer's contents.
860 =for apidoc AmxU|char *|PL_parser-E<gt>bufptr
862 Points to the current position of lexing inside the lexer buffer.
863 Characters around this point may be freely examined, within
864 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
865 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
866 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
868 Lexing code (whether in the Perl core or not) moves this pointer past
869 the characters that it consumes. It is also expected to perform some
870 bookkeeping whenever a newline character is consumed. This movement
871 can be more conveniently performed by the function L</lex_read_to>,
872 which handles newlines appropriately.
874 Interpretation of the buffer's octets can be abstracted out by
875 using the slightly higher-level functions L</lex_peek_unichar> and
876 L</lex_read_unichar>.
878 =for apidoc AmxU|char *|PL_parser-E<gt>linestart
880 Points to the start of the current line inside the lexer buffer.
881 This is useful for indicating at which column an error occurred, and
882 not much else. This must be updated by any lexing code that consumes
883 a newline; the function L</lex_read_to> handles this detail.
889 =for apidoc Amx|bool|lex_bufutf8
891 Indicates whether the octets in the lexer buffer
892 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
893 of Unicode characters. If not, they should be interpreted as Latin-1
894 characters. This is analogous to the C<SvUTF8> flag for scalars.
896 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
897 contains valid UTF-8. Lexing code must be robust in the face of invalid
900 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
901 is significant, but not the whole story regarding the input character
902 encoding. Normally, when a file is being read, the scalar contains octets
903 and its C<SvUTF8> flag is off, but the octets should be interpreted as
904 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
905 however, the scalar may have the C<SvUTF8> flag on, and in this case its
906 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
907 is in effect. This logic may change in the future; use this function
908 instead of implementing the logic yourself.
914 Perl_lex_bufutf8(pTHX)
920 =for apidoc Amx|char *|lex_grow_linestr|STRLEN len
922 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
923 at least C<len> octets (including terminating C<NUL>). Returns a
924 pointer to the reallocated buffer. This is necessary before making
925 any direct modification of the buffer that would increase its length.
926 L</lex_stuff_pvn> provides a more convenient way to insert text into
929 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
930 this function updates all of the lexer's variables that point directly
937 Perl_lex_grow_linestr(pTHX_ STRLEN len)
941 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
942 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
945 linestr = PL_parser->linestr;
946 buf = SvPVX(linestr);
947 if (len <= SvLEN(linestr))
950 /* Is the lex_shared linestr SV the same as the current linestr SV?
951 * Only in this case does re_eval_start need adjusting, since it
952 * points within lex_shared->ls_linestr's buffer */
953 current = ( !PL_parser->lex_shared->ls_linestr
954 || linestr == PL_parser->lex_shared->ls_linestr);
956 bufend_pos = PL_parser->bufend - buf;
957 bufptr_pos = PL_parser->bufptr - buf;
958 oldbufptr_pos = PL_parser->oldbufptr - buf;
959 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
960 linestart_pos = PL_parser->linestart - buf;
961 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
962 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
963 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
964 PL_parser->lex_shared->re_eval_start - buf : 0;
966 buf = sv_grow(linestr, len);
968 PL_parser->bufend = buf + bufend_pos;
969 PL_parser->bufptr = buf + bufptr_pos;
970 PL_parser->oldbufptr = buf + oldbufptr_pos;
971 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
972 PL_parser->linestart = buf + linestart_pos;
973 if (PL_parser->last_uni)
974 PL_parser->last_uni = buf + last_uni_pos;
975 if (PL_parser->last_lop)
976 PL_parser->last_lop = buf + last_lop_pos;
977 if (current && PL_parser->lex_shared->re_eval_start)
978 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
983 =for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
985 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
986 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
987 reallocating the buffer if necessary. This means that lexing code that
988 runs later will see the characters as if they had appeared in the input.
989 It is not recommended to do this as part of normal parsing, and most
990 uses of this facility run the risk of the inserted characters being
991 interpreted in an unintended manner.
993 The string to be inserted is represented by C<len> octets starting
994 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
995 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
996 The characters are recoded for the lexer buffer, according to how the
997 buffer is currently being interpreted (L</lex_bufutf8>). If a string
998 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
999 function is more convenient.
1005 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
1009 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1010 if (flags & ~(LEX_STUFF_UTF8))
1011 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1013 if (flags & LEX_STUFF_UTF8) {
1016 STRLEN highhalf = 0; /* Count of variants */
1017 const char *p, *e = pv+len;
1018 for (p = pv; p != e; p++) {
1019 if (! UTF8_IS_INVARIANT(*p)) {
1025 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1026 bufptr = PL_parser->bufptr;
1027 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1028 SvCUR_set(PL_parser->linestr,
1029 SvCUR(PL_parser->linestr) + len+highhalf);
1030 PL_parser->bufend += len+highhalf;
1031 for (p = pv; p != e; p++) {
1033 if (! UTF8_IS_INVARIANT(c)) {
1034 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1035 *bufptr++ = UTF8_TWO_BYTE_LO(c);
1037 *bufptr++ = (char)c;
1042 if (flags & LEX_STUFF_UTF8) {
1043 STRLEN highhalf = 0;
1044 const char *p, *e = pv+len;
1045 for (p = pv; p != e; p++) {
1047 if (UTF8_IS_ABOVE_LATIN1(c)) {
1048 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1049 "non-Latin-1 character into Latin-1 input");
1050 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1053 } else assert(UTF8_IS_INVARIANT(c));
1057 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1058 bufptr = PL_parser->bufptr;
1059 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1060 SvCUR_set(PL_parser->linestr,
1061 SvCUR(PL_parser->linestr) + len-highhalf);
1062 PL_parser->bufend += len-highhalf;
1065 if (UTF8_IS_INVARIANT(*p)) {
1071 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1077 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1078 bufptr = PL_parser->bufptr;
1079 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1080 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1081 PL_parser->bufend += len;
1082 Copy(pv, bufptr, len, char);
1088 =for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1090 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1091 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1092 reallocating the buffer if necessary. This means that lexing code that
1093 runs later will see the characters as if they had appeared in the input.
1094 It is not recommended to do this as part of normal parsing, and most
1095 uses of this facility run the risk of the inserted characters being
1096 interpreted in an unintended manner.
1098 The string to be inserted is represented by octets starting at C<pv>
1099 and continuing to the first nul. These octets are interpreted as either
1100 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1101 in C<flags>. The characters are recoded for the lexer buffer, according
1102 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1103 If it is not convenient to nul-terminate a string to be inserted, the
1104 L</lex_stuff_pvn> function is more appropriate.
1110 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1112 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1113 lex_stuff_pvn(pv, strlen(pv), flags);
1117 =for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1119 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1120 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1121 reallocating the buffer if necessary. This means that lexing code that
1122 runs later will see the characters as if they had appeared in the input.
1123 It is not recommended to do this as part of normal parsing, and most
1124 uses of this facility run the risk of the inserted characters being
1125 interpreted in an unintended manner.
1127 The string to be inserted is the string value of C<sv>. The characters
1128 are recoded for the lexer buffer, according to how the buffer is currently
1129 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1130 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1131 need to construct a scalar.
1137 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1141 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1143 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1145 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1149 =for apidoc Amx|void|lex_unstuff|char *ptr
1151 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1152 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1153 This hides the discarded text from any lexing code that runs later,
1154 as if the text had never appeared.
1156 This is not the normal way to consume lexed text. For that, use
1163 Perl_lex_unstuff(pTHX_ char *ptr)
1167 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1168 buf = PL_parser->bufptr;
1170 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1173 bufend = PL_parser->bufend;
1175 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1176 unstuff_len = ptr - buf;
1177 Move(ptr, buf, bufend+1-ptr, char);
1178 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1179 PL_parser->bufend = bufend - unstuff_len;
1183 =for apidoc Amx|void|lex_read_to|char *ptr
1185 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1186 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1187 performing the correct bookkeeping whenever a newline character is passed.
1188 This is the normal way to consume lexed text.
1190 Interpretation of the buffer's octets can be abstracted out by
1191 using the slightly higher-level functions L</lex_peek_unichar> and
1192 L</lex_read_unichar>.
1198 Perl_lex_read_to(pTHX_ char *ptr)
1201 PERL_ARGS_ASSERT_LEX_READ_TO;
1202 s = PL_parser->bufptr;
1203 if (ptr < s || ptr > PL_parser->bufend)
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1205 for (; s != ptr; s++)
1207 COPLINE_INC_WITH_HERELINES;
1208 PL_parser->linestart = s+1;
1210 PL_parser->bufptr = ptr;
1214 =for apidoc Amx|void|lex_discard_to|char *ptr
1216 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1217 up to C<ptr>. The remaining content of the buffer will be moved, and
1218 all pointers into the buffer updated appropriately. C<ptr> must not
1219 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1220 it is not permitted to discard text that has yet to be lexed.
1222 Normally it is not necessarily to do this directly, because it suffices to
1223 use the implicit discarding behaviour of L</lex_next_chunk> and things
1224 based on it. However, if a token stretches across multiple lines,
1225 and the lexing code has kept multiple lines of text in the buffer for
1226 that purpose, then after completion of the token it would be wise to
1227 explicitly discard the now-unneeded earlier lines, to avoid future
1228 multi-line tokens growing the buffer without bound.
1234 Perl_lex_discard_to(pTHX_ char *ptr)
1238 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1239 buf = SvPVX(PL_parser->linestr);
1241 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1244 if (ptr > PL_parser->bufptr)
1245 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1246 discard_len = ptr - buf;
1247 if (PL_parser->oldbufptr < ptr)
1248 PL_parser->oldbufptr = ptr;
1249 if (PL_parser->oldoldbufptr < ptr)
1250 PL_parser->oldoldbufptr = ptr;
1251 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1252 PL_parser->last_uni = NULL;
1253 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1254 PL_parser->last_lop = NULL;
1255 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1256 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1257 PL_parser->bufend -= discard_len;
1258 PL_parser->bufptr -= discard_len;
1259 PL_parser->oldbufptr -= discard_len;
1260 PL_parser->oldoldbufptr -= discard_len;
1261 if (PL_parser->last_uni)
1262 PL_parser->last_uni -= discard_len;
1263 if (PL_parser->last_lop)
1264 PL_parser->last_lop -= discard_len;
1268 Perl_notify_parser_that_changed_to_utf8(pTHX)
1270 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1271 * off to on. At compile time, this has the effect of entering a 'use
1272 * utf8' section. This means that any input was not previously checked for
1273 * UTF-8 (because it was off), but now we do need to check it, or our
1274 * assumptions about the input being sane could be wrong, and we could
1275 * segfault. This routine just sets a flag so that the next time we look
1276 * at the input we do the well-formed UTF-8 check. If we aren't in the
1277 * proper phase, there may not be a parser object, but if there is, setting
1278 * the flag is harmless */
1281 PL_parser->recheck_utf8_validity = TRUE;
1286 =for apidoc Amx|bool|lex_next_chunk|U32 flags
1288 Reads in the next chunk of text to be lexed, appending it to
1289 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1290 looked to the end of the current chunk and wants to know more. It is
1291 usual, but not necessary, for lexing to have consumed the entirety of
1292 the current chunk at this time.
1294 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1295 chunk (i.e., the current chunk has been entirely consumed), normally the
1296 current chunk will be discarded at the same time that the new chunk is
1297 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1298 will not be discarded. If the current chunk has not been entirely
1299 consumed, then it will not be discarded regardless of the flag.
1301 Returns true if some new text was added to the buffer, or false if the
1302 buffer has reached the end of the input text.
1307 #define LEX_FAKE_EOF 0x80000000
1308 #define LEX_NO_TERM 0x40000000 /* here-doc */
1311 Perl_lex_next_chunk(pTHX_ U32 flags)
1315 STRLEN old_bufend_pos, new_bufend_pos;
1316 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1317 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1318 bool got_some_for_debugger = 0;
1321 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1322 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1323 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1325 linestr = PL_parser->linestr;
1326 buf = SvPVX(linestr);
1327 if (!(flags & LEX_KEEP_PREVIOUS)
1328 && PL_parser->bufptr == PL_parser->bufend)
1330 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1332 if (PL_parser->last_uni != PL_parser->bufend)
1333 PL_parser->last_uni = NULL;
1334 if (PL_parser->last_lop != PL_parser->bufend)
1335 PL_parser->last_lop = NULL;
1336 last_uni_pos = last_lop_pos = 0;
1340 old_bufend_pos = PL_parser->bufend - buf;
1341 bufptr_pos = PL_parser->bufptr - buf;
1342 oldbufptr_pos = PL_parser->oldbufptr - buf;
1343 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1344 linestart_pos = PL_parser->linestart - buf;
1345 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1346 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1348 if (flags & LEX_FAKE_EOF) {
1350 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1352 } else if (filter_gets(linestr, old_bufend_pos)) {
1354 got_some_for_debugger = 1;
1355 } else if (flags & LEX_NO_TERM) {
1358 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1361 /* End of real input. Close filehandle (unless it was STDIN),
1362 * then add implicit termination.
1364 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1365 PerlIO_clearerr(PL_parser->rsfp);
1366 else if (PL_parser->rsfp)
1367 (void)PerlIO_close(PL_parser->rsfp);
1368 PL_parser->rsfp = NULL;
1369 PL_parser->in_pod = PL_parser->filtered = 0;
1370 if (!PL_in_eval && PL_minus_p) {
1372 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1373 PL_minus_n = PL_minus_p = 0;
1374 } else if (!PL_in_eval && PL_minus_n) {
1375 sv_catpvs(linestr, /*{*/";}");
1378 sv_catpvs(linestr, ";");
1381 buf = SvPVX(linestr);
1382 new_bufend_pos = SvCUR(linestr);
1383 PL_parser->bufend = buf + new_bufend_pos;
1384 PL_parser->bufptr = buf + bufptr_pos;
1387 const U8* first_bad_char_loc;
1388 if (UNLIKELY(! is_utf8_string_loc(
1389 (U8 *) PL_parser->bufptr,
1390 PL_parser->bufend - PL_parser->bufptr,
1391 &first_bad_char_loc)))
1393 _force_out_malformed_utf8_message(first_bad_char_loc,
1394 (U8 *) PL_parser->bufend,
1396 1 /* 1 means die */ );
1397 NOT_REACHED; /* NOTREACHED */
1401 PL_parser->oldbufptr = buf + oldbufptr_pos;
1402 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1403 PL_parser->linestart = buf + linestart_pos;
1404 if (PL_parser->last_uni)
1405 PL_parser->last_uni = buf + last_uni_pos;
1406 if (PL_parser->last_lop)
1407 PL_parser->last_lop = buf + last_lop_pos;
1408 if (PL_parser->preambling != NOLINE) {
1409 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1410 PL_parser->preambling = NOLINE;
1412 if ( got_some_for_debugger
1413 && PERLDB_LINE_OR_SAVESRC
1414 && PL_curstash != PL_debstash)
1416 /* debugger active and we're not compiling the debugger code,
1417 * so store the line into the debugger's array of lines
1419 update_debugger_info(NULL, buf+old_bufend_pos,
1420 new_bufend_pos-old_bufend_pos);
1426 =for apidoc Amx|I32|lex_peek_unichar|U32 flags
1428 Looks ahead one (Unicode) character in the text currently being lexed.
1429 Returns the codepoint (unsigned integer value) of the next character,
1430 or -1 if lexing has reached the end of the input text. To consume the
1431 peeked character, use L</lex_read_unichar>.
1433 If the next character is in (or extends into) the next chunk of input
1434 text, the next chunk will be read in. Normally the current chunk will be
1435 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1436 bit set, then the current chunk will not be discarded.
1438 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1439 is encountered, an exception is generated.
1445 Perl_lex_peek_unichar(pTHX_ U32 flags)
1449 if (flags & ~(LEX_KEEP_PREVIOUS))
1450 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1451 s = PL_parser->bufptr;
1452 bufend = PL_parser->bufend;
1458 if (!lex_next_chunk(flags))
1460 s = PL_parser->bufptr;
1461 bufend = PL_parser->bufend;
1464 if (UTF8_IS_INVARIANT(head))
1466 if (UTF8_IS_START(head)) {
1467 len = UTF8SKIP(&head);
1468 while ((STRLEN)(bufend-s) < len) {
1469 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1471 s = PL_parser->bufptr;
1472 bufend = PL_parser->bufend;
1475 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1476 if (retlen == (STRLEN)-1) {
1477 _force_out_malformed_utf8_message((U8 *) s,
1480 1 /* 1 means die */ );
1481 NOT_REACHED; /* NOTREACHED */
1486 if (!lex_next_chunk(flags))
1488 s = PL_parser->bufptr;
1495 =for apidoc Amx|I32|lex_read_unichar|U32 flags
1497 Reads the next (Unicode) character in the text currently being lexed.
1498 Returns the codepoint (unsigned integer value) of the character read,
1499 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1500 if lexing has reached the end of the input text. To non-destructively
1501 examine the next character, use L</lex_peek_unichar> instead.
1503 If the next character is in (or extends into) the next chunk of input
1504 text, the next chunk will be read in. Normally the current chunk will be
1505 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1506 bit set, then the current chunk will not be discarded.
1508 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1509 is encountered, an exception is generated.
1515 Perl_lex_read_unichar(pTHX_ U32 flags)
1518 if (flags & ~(LEX_KEEP_PREVIOUS))
1519 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1520 c = lex_peek_unichar(flags);
1523 COPLINE_INC_WITH_HERELINES;
1525 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1527 ++(PL_parser->bufptr);
1533 =for apidoc Amx|void|lex_read_space|U32 flags
1535 Reads optional spaces, in Perl style, in the text currently being
1536 lexed. The spaces may include ordinary whitespace characters and
1537 Perl-style comments. C<#line> directives are processed if encountered.
1538 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1539 at a non-space character (or the end of the input text).
1541 If spaces extend into the next chunk of input text, the next chunk will
1542 be read in. Normally the current chunk will be discarded at the same
1543 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1544 chunk will not be discarded.
1549 #define LEX_NO_INCLINE 0x40000000
1550 #define LEX_NO_NEXT_CHUNK 0x80000000
1553 Perl_lex_read_space(pTHX_ U32 flags)
1556 const bool can_incline = !(flags & LEX_NO_INCLINE);
1557 bool need_incline = 0;
1558 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1559 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1560 s = PL_parser->bufptr;
1561 bufend = PL_parser->bufend;
1567 } while (!(c == '\n' || (c == 0 && s == bufend)));
1568 } else if (c == '\n') {
1571 PL_parser->linestart = s;
1577 } else if (isSPACE(c)) {
1579 } else if (c == 0 && s == bufend) {
1582 if (flags & LEX_NO_NEXT_CHUNK)
1584 PL_parser->bufptr = s;
1585 l = CopLINE(PL_curcop);
1586 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1587 got_more = lex_next_chunk(flags);
1588 CopLINE_set(PL_curcop, l);
1589 s = PL_parser->bufptr;
1590 bufend = PL_parser->bufend;
1593 if (can_incline && need_incline && PL_parser->rsfp) {
1603 PL_parser->bufptr = s;
1608 =for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1610 This function performs syntax checking on a prototype, C<proto>.
1611 If C<warn> is true, any illegal characters or mismatched brackets
1612 will trigger illegalproto warnings, declaring that they were
1613 detected in the prototype for C<name>.
1615 The return value is C<true> if this is a valid prototype, and
1616 C<false> if it is not, regardless of whether C<warn> was C<true> or
1619 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1626 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1628 STRLEN len, origlen;
1630 bool bad_proto = FALSE;
1631 bool in_brackets = FALSE;
1632 bool after_slash = FALSE;
1633 char greedy_proto = ' ';
1634 bool proto_after_greedy_proto = FALSE;
1635 bool must_be_last = FALSE;
1636 bool underscore = FALSE;
1637 bool bad_proto_after_underscore = FALSE;
1639 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1644 p = SvPV(proto, len);
1646 for (; len--; p++) {
1649 proto_after_greedy_proto = TRUE;
1651 if (!strchr(";@%", *p))
1652 bad_proto_after_underscore = TRUE;
1655 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1662 in_brackets = FALSE;
1663 else if ((*p == '@' || *p == '%')
1667 must_be_last = TRUE;
1676 after_slash = FALSE;
1681 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1684 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1685 origlen, UNI_DISPLAY_ISPRINT)
1686 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1688 if (proto_after_greedy_proto)
1689 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1690 "Prototype after '%c' for %" SVf " : %s",
1691 greedy_proto, SVfARG(name), p);
1693 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1694 "Missing ']' in prototype for %" SVf " : %s",
1697 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1698 "Illegal character in prototype for %" SVf " : %s",
1700 if (bad_proto_after_underscore)
1701 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1702 "Illegal character after '_' in prototype for %" SVf " : %s",
1706 return (! (proto_after_greedy_proto || bad_proto) );
1711 * This subroutine has nothing to do with tilting, whether at windmills
1712 * or pinball tables. Its name is short for "increment line". It
1713 * increments the current line number in CopLINE(PL_curcop) and checks
1714 * to see whether the line starts with a comment of the form
1715 * # line 500 "foo.pm"
1716 * If so, it sets the current line number and file to the values in the comment.
1720 S_incline(pTHX_ const char *s)
1728 PERL_ARGS_ASSERT_INCLINE;
1730 COPLINE_INC_WITH_HERELINES;
1731 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1732 && s+1 == PL_bufend && *s == ';') {
1733 /* fake newline in string eval */
1734 CopLINE_dec(PL_curcop);
1739 while (SPACE_OR_TAB(*s))
1741 if (strEQs(s, "line"))
1745 if (SPACE_OR_TAB(*s))
1749 while (SPACE_OR_TAB(*s))
1757 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1759 while (SPACE_OR_TAB(*s))
1761 if (*s == '"' && (t = strchr(s+1, '"'))) {
1767 while (*t && !isSPACE(*t))
1771 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1773 if (*e != '\n' && *e != '\0')
1774 return; /* false alarm */
1776 if (!grok_atoUV(n, &uv, &e))
1778 line_num = ((line_t)uv) - 1;
1781 const STRLEN len = t - s;
1783 if (!PL_rsfp && !PL_parser->filtered) {
1784 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1785 * to *{"::_<newfilename"} */
1786 /* However, the long form of evals is only turned on by the
1787 debugger - usually they're "(eval %lu)" */
1788 GV * const cfgv = CopFILEGV(PL_curcop);
1791 STRLEN tmplen2 = len;
1795 if (tmplen2 + 2 <= sizeof smallbuf)
1798 Newx(tmpbuf2, tmplen2 + 2, char);
1803 memcpy(tmpbuf2 + 2, s, tmplen2);
1806 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1808 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1809 /* adjust ${"::_<newfilename"} to store the new file name */
1810 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1811 /* The line number may differ. If that is the case,
1812 alias the saved lines that are in the array.
1813 Otherwise alias the whole array. */
1814 if (CopLINE(PL_curcop) == line_num) {
1815 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1816 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1818 else if (GvAV(cfgv)) {
1819 AV * const av = GvAV(cfgv);
1820 const I32 start = CopLINE(PL_curcop)+1;
1821 I32 items = AvFILLp(av) - start;
1823 AV * const av2 = GvAVn(gv2);
1824 SV **svp = AvARRAY(av) + start;
1825 I32 l = (I32)line_num+1;
1827 av_store(av2, l++, SvREFCNT_inc(*svp++));
1832 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1835 CopFILE_free(PL_curcop);
1836 CopFILE_setn(PL_curcop, s, len);
1838 CopLINE_set(PL_curcop, line_num);
1842 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1844 AV *av = CopFILEAVx(PL_curcop);
1847 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1849 sv = *av_fetch(av, 0, 1);
1850 SvUPGRADE(sv, SVt_PVMG);
1852 if (!SvPOK(sv)) SvPVCLEAR(sv);
1854 sv_catsv(sv, orig_sv);
1856 sv_catpvn(sv, buf, len);
1861 if (PL_parser->preambling == NOLINE)
1862 av_store(av, CopLINE(PL_curcop), sv);
1868 * Called to gobble the appropriate amount and type of whitespace.
1869 * Skips comments as well.
1870 * Returns the next character after the whitespace that is skipped.
1873 * Same thing, but look ahead without incrementing line numbers or
1874 * adjusting PL_linestart.
1877 #define skipspace(s) skipspace_flags(s, 0)
1878 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1881 S_skipspace_flags(pTHX_ char *s, U32 flags)
1883 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1884 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1885 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1888 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1890 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1891 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1892 LEX_NO_NEXT_CHUNK : 0));
1894 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1895 if (PL_linestart > PL_bufptr)
1896 PL_bufptr = PL_linestart;
1904 * Check the unary operators to ensure there's no ambiguity in how they're
1905 * used. An ambiguous piece of code would be:
1907 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1908 * the +5 is its argument.
1917 if (PL_oldoldbufptr != PL_last_uni)
1919 while (isSPACE(*PL_last_uni))
1922 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
1923 s += UTF ? UTF8SKIP(s) : 1;
1924 if ((t = strchr(s, '(')) && t < PL_bufptr)
1927 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1928 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
1929 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1933 * LOP : macro to build a list operator. Its behaviour has been replaced
1934 * with a subroutine, S_lop() for which LOP is just another name.
1937 #define LOP(f,x) return lop(f,x,s)
1941 * Build a list operator (or something that might be one). The rules:
1942 * - if we have a next token, then it's a list operator (no parens) for
1943 * which the next token has already been parsed; e.g.,
1946 * - if the next thing is an opening paren, then it's a function
1947 * - else it's a list operator
1951 S_lop(pTHX_ I32 f, U8 x, char *s)
1953 PERL_ARGS_ASSERT_LOP;
1958 PL_last_lop = PL_oldbufptr;
1959 PL_last_lop_op = (OPCODE)f;
1964 return REPORT(FUNC);
1967 return REPORT(FUNC);
1970 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1971 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
1972 return REPORT(LSTOP);
1978 * When the lexer realizes it knows the next token (for instance,
1979 * it is reordering tokens for the parser) then it can call S_force_next
1980 * to know what token to return the next time the lexer is called. Caller
1981 * will need to set PL_nextval[] and possibly PL_expect to ensure
1982 * the lexer handles the token correctly.
1986 S_force_next(pTHX_ I32 type)
1990 PerlIO_printf(Perl_debug_log, "### forced token:\n");
1991 tokereport(type, &NEXTVAL_NEXTTOKE);
1994 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
1995 PL_nexttype[PL_nexttoke] = type;
2002 * This subroutine handles postfix deref syntax after the arrow has already
2003 * been emitted. @* $* etc. are emitted as two separate token right here.
2004 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2005 * only the first, leaving yylex to find the next.
2009 S_postderef(pTHX_ int const funny, char const next)
2011 assert(funny == DOLSHARP || strchr("$@%&*", funny));
2013 PL_expect = XOPERATOR;
2014 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
2015 assert('@' == funny || '$' == funny || DOLSHARP == funny);
2016 PL_lex_state = LEX_INTERPEND;
2018 force_next(POSTJOIN);
2024 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2025 && !PL_lex_brackets)
2027 PL_expect = XOPERATOR;
2036 int yyc = PL_parser->yychar;
2037 if (yyc != YYEMPTY) {
2039 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2040 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
2041 PL_lex_allbrackets--;
2043 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2044 } else if (yyc == '('/*)*/) {
2045 PL_lex_allbrackets--;
2050 PL_parser->yychar = YYEMPTY;
2055 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2057 SV * const sv = newSVpvn_utf8(start, len,
2060 && !is_utf8_invariant_string((const U8*)start, len)
2061 && is_utf8_string((const U8*)start, len));
2067 * When the lexer knows the next thing is a word (for instance, it has
2068 * just seen -> and it knows that the next char is a word char, then
2069 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2073 * char *start : buffer position (must be within PL_linestr)
2074 * int token : PL_next* will be this type of bare word
2075 * (e.g., METHOD,BAREWORD)
2076 * int check_keyword : if true, Perl checks to make sure the word isn't
2077 * a keyword (do this if the word is a label, e.g. goto FOO)
2078 * int allow_pack : if true, : characters will also be allowed (require,
2079 * use, etc. do this)
2083 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2088 PERL_ARGS_ASSERT_FORCE_WORD;
2090 start = skipspace(start);
2092 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2093 || (allow_pack && *s == ':' && s[1] == ':') )
2095 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2096 if (check_keyword) {
2097 char *s2 = PL_tokenbuf;
2099 if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
2101 if (keyword(s2, len2, 0))
2104 if (token == METHOD) {
2109 PL_expect = XOPERATOR;
2112 NEXTVAL_NEXTTOKE.opval
2113 = newSVOP(OP_CONST,0,
2114 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2115 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2123 * Called when the lexer wants $foo *foo &foo etc, but the program
2124 * text only contains the "foo" portion. The first argument is a pointer
2125 * to the "foo", and the second argument is the type symbol to prefix.
2126 * Forces the next token to be a "BAREWORD".
2127 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2131 S_force_ident(pTHX_ const char *s, int kind)
2133 PERL_ARGS_ASSERT_FORCE_IDENT;
2136 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2137 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2138 UTF ? SVf_UTF8 : 0));
2139 NEXTVAL_NEXTTOKE.opval = o;
2140 force_next(BAREWORD);
2142 o->op_private = OPpCONST_ENTERED;
2143 /* XXX see note in pp_entereval() for why we forgo typo
2144 warnings if the symbol must be introduced in an eval.
2146 gv_fetchpvn_flags(s, len,
2147 (PL_in_eval ? GV_ADDMULTI
2148 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2149 kind == '$' ? SVt_PV :
2150 kind == '@' ? SVt_PVAV :
2151 kind == '%' ? SVt_PVHV :
2159 S_force_ident_maybe_lex(pTHX_ char pit)
2161 NEXTVAL_NEXTTOKE.ival = pit;
2166 Perl_str_to_version(pTHX_ SV *sv)
2171 const char *start = SvPV_const(sv,len);
2172 const char * const end = start + len;
2173 const bool utf = cBOOL(SvUTF8(sv));
2175 PERL_ARGS_ASSERT_STR_TO_VERSION;
2177 while (start < end) {
2181 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2186 retval += ((NV)n)/nshift;
2195 * Forces the next token to be a version number.
2196 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2197 * and if "guessing" is TRUE, then no new token is created (and the caller
2198 * must use an alternative parsing method).
2202 S_force_version(pTHX_ char *s, int guessing)
2207 PERL_ARGS_ASSERT_FORCE_VERSION;
2215 while (isDIGIT(*d) || *d == '_' || *d == '.')
2217 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2219 s = scan_num(s, &pl_yylval);
2220 version = pl_yylval.opval;
2221 ver = cSVOPx(version)->op_sv;
2222 if (SvPOK(ver) && !SvNIOK(ver)) {
2223 SvUPGRADE(ver, SVt_PVNV);
2224 SvNV_set(ver, str_to_version(ver));
2225 SvNOK_on(ver); /* hint that it is a version */
2228 else if (guessing) {
2233 /* NOTE: The parser sees the package name and the VERSION swapped */
2234 NEXTVAL_NEXTTOKE.opval = version;
2235 force_next(BAREWORD);
2241 * S_force_strict_version
2242 * Forces the next token to be a version number using strict syntax rules.
2246 S_force_strict_version(pTHX_ char *s)
2249 const char *errstr = NULL;
2251 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2253 while (isSPACE(*s)) /* leading whitespace */
2256 if (is_STRICT_VERSION(s,&errstr)) {
2258 s = (char *)scan_version(s, ver, 0);
2259 version = newSVOP(OP_CONST, 0, ver);
2261 else if ((*s != ';' && *s != '{' && *s != '}' )
2262 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2266 yyerror(errstr); /* version required */
2270 /* NOTE: The parser sees the package name and the VERSION swapped */
2271 NEXTVAL_NEXTTOKE.opval = version;
2272 force_next(BAREWORD);
2279 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2280 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2281 * unchanged, and a new SV containing the modified input is returned.
2285 S_tokeq(pTHX_ SV *sv)
2292 PERL_ARGS_ASSERT_TOKEQ;
2296 assert (!SvIsCOW(sv));
2297 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2301 /* This is relying on the SV being "well formed" with a trailing '\0' */
2302 while (s < send && !(*s == '\\' && s[1] == '\\'))
2307 if ( PL_hints & HINT_NEW_STRING ) {
2308 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2309 SVs_TEMP | SvUTF8(sv));
2313 if (s + 1 < send && (s[1] == '\\'))
2314 s++; /* all that, just for this */
2319 SvCUR_set(sv, d - SvPVX_const(sv));
2321 if ( PL_hints & HINT_NEW_STRING )
2322 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
2327 * Now come three functions related to double-quote context,
2328 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2329 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2330 * interact with PL_lex_state, and create fake ( ... ) argument lists
2331 * to handle functions and concatenation.
2335 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2340 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2342 * Pattern matching will set PL_lex_op to the pattern-matching op to
2343 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2345 * OP_CONST is easy--just make the new op and return.
2347 * Everything else becomes a FUNC.
2349 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2350 * had an OP_CONST. This just sets us up for a
2351 * call to S_sublex_push().
2355 S_sublex_start(pTHX)
2357 const I32 op_type = pl_yylval.ival;
2359 if (op_type == OP_NULL) {
2360 pl_yylval.opval = PL_lex_op;
2364 if (op_type == OP_CONST) {
2365 SV *sv = PL_lex_stuff;
2366 PL_lex_stuff = NULL;
2369 if (SvTYPE(sv) == SVt_PVIV) {
2370 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2372 const char * const p = SvPV_const(sv, len);
2373 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2377 pl_yylval.opval = newSVOP(op_type, 0, sv);
2381 PL_parser->lex_super_state = PL_lex_state;
2382 PL_parser->lex_sub_inwhat = (U16)op_type;
2383 PL_parser->lex_sub_op = PL_lex_op;
2384 PL_lex_state = LEX_INTERPPUSH;
2388 pl_yylval.opval = PL_lex_op;
2398 * Create a new scope to save the lexing state. The scope will be
2399 * ended in S_sublex_done. Returns a '(', starting the function arguments
2400 * to the uc, lc, etc. found before.
2401 * Sets PL_lex_state to LEX_INTERPCONCAT.
2408 const bool is_heredoc = PL_multi_close == '<';
2411 PL_lex_state = PL_parser->lex_super_state;
2412 SAVEI8(PL_lex_dojoin);
2413 SAVEI32(PL_lex_brackets);
2414 SAVEI32(PL_lex_allbrackets);
2415 SAVEI32(PL_lex_formbrack);
2416 SAVEI8(PL_lex_fakeeof);
2417 SAVEI32(PL_lex_casemods);
2418 SAVEI32(PL_lex_starts);
2419 SAVEI8(PL_lex_state);
2420 SAVESPTR(PL_lex_repl);
2421 SAVEVPTR(PL_lex_inpat);
2422 SAVEI16(PL_lex_inwhat);
2425 SAVECOPLINE(PL_curcop);
2426 SAVEI32(PL_multi_end);
2427 SAVEI32(PL_parser->herelines);
2428 PL_parser->herelines = 0;
2430 SAVEIV(PL_multi_close);
2431 SAVEPPTR(PL_bufptr);
2432 SAVEPPTR(PL_bufend);
2433 SAVEPPTR(PL_oldbufptr);
2434 SAVEPPTR(PL_oldoldbufptr);
2435 SAVEPPTR(PL_last_lop);
2436 SAVEPPTR(PL_last_uni);
2437 SAVEPPTR(PL_linestart);
2438 SAVESPTR(PL_linestr);
2439 SAVEGENERICPV(PL_lex_brackstack);
2440 SAVEGENERICPV(PL_lex_casestack);
2441 SAVEGENERICPV(PL_parser->lex_shared);
2442 SAVEBOOL(PL_parser->lex_re_reparsing);
2443 SAVEI32(PL_copline);
2445 /* The here-doc parser needs to be able to peek into outer lexing
2446 scopes to find the body of the here-doc. So we put PL_linestr and
2447 PL_bufptr into lex_shared, to ‘share’ those values.
2449 PL_parser->lex_shared->ls_linestr = PL_linestr;
2450 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2452 PL_linestr = PL_lex_stuff;
2453 PL_lex_repl = PL_parser->lex_sub_repl;
2454 PL_lex_stuff = NULL;
2455 PL_parser->lex_sub_repl = NULL;
2457 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2458 set for an inner quote-like operator and then an error causes scope-
2459 popping. We must not have a PL_lex_stuff value left dangling, as
2460 that breaks assumptions elsewhere. See bug #123617. */
2461 SAVEGENERICSV(PL_lex_stuff);
2462 SAVEGENERICSV(PL_parser->lex_sub_repl);
2464 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2465 = SvPVX(PL_linestr);
2466 PL_bufend += SvCUR(PL_linestr);
2467 PL_last_lop = PL_last_uni = NULL;
2468 SAVEFREESV(PL_linestr);
2469 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2471 PL_lex_dojoin = FALSE;
2472 PL_lex_brackets = PL_lex_formbrack = 0;
2473 PL_lex_allbrackets = 0;
2474 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2475 Newx(PL_lex_brackstack, 120, char);
2476 Newx(PL_lex_casestack, 12, char);
2477 PL_lex_casemods = 0;
2478 *PL_lex_casestack = '\0';
2480 PL_lex_state = LEX_INTERPCONCAT;
2482 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2483 PL_copline = NOLINE;
2485 Newxz(shared, 1, LEXSHARED);
2486 shared->ls_prev = PL_parser->lex_shared;
2487 PL_parser->lex_shared = shared;
2489 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2490 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2491 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2492 PL_lex_inpat = PL_parser->lex_sub_op;
2494 PL_lex_inpat = NULL;
2496 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2497 PL_in_eval &= ~EVAL_RE_REPARSING;
2504 * Restores lexer state after a S_sublex_push.
2510 if (!PL_lex_starts++) {
2511 SV * const sv = newSVpvs("");
2512 if (SvUTF8(PL_linestr))
2514 PL_expect = XOPERATOR;
2515 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2519 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2520 PL_lex_state = LEX_INTERPCASEMOD;
2524 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2525 assert(PL_lex_inwhat != OP_TRANSR);
2527 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2528 PL_linestr = PL_lex_repl;
2530 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2531 PL_bufend += SvCUR(PL_linestr);
2532 PL_last_lop = PL_last_uni = NULL;
2533 PL_lex_dojoin = FALSE;
2534 PL_lex_brackets = 0;
2535 PL_lex_allbrackets = 0;
2536 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2537 PL_lex_casemods = 0;
2538 *PL_lex_casestack = '\0';
2540 if (SvEVALED(PL_lex_repl)) {
2541 PL_lex_state = LEX_INTERPNORMAL;
2543 /* we don't clear PL_lex_repl here, so that we can check later
2544 whether this is an evalled subst; that means we rely on the
2545 logic to ensure sublex_done() is called again only via the
2546 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2549 PL_lex_state = LEX_INTERPCONCAT;
2552 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2553 CopLINE(PL_curcop) +=
2554 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2555 + PL_parser->herelines;
2556 PL_parser->herelines = 0;
2561 const line_t l = CopLINE(PL_curcop);
2563 if (PL_multi_close == '<')
2564 PL_parser->herelines += l - PL_multi_end;
2565 PL_bufend = SvPVX(PL_linestr);
2566 PL_bufend += SvCUR(PL_linestr);
2567 PL_expect = XOPERATOR;
2573 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2575 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2576 * interior, hence to the "}". Finds what the name resolves to, returning
2577 * an SV* containing it; NULL if no valid one found */
2579 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
2586 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2588 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2591 deprecate_fatal_in("5.28", "Unknown charname '' is deprecated");
2595 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2596 /* include the <}> */
2597 e - backslash_ptr + 1);
2599 SvREFCNT_dec_NN(res);
2603 /* See if the charnames handler is the Perl core's, and if so, we can skip
2604 * the validation needed for a user-supplied one, as Perl's does its own
2606 table = GvHV(PL_hintgv); /* ^H */
2607 cvp = hv_fetchs(table, "charnames", FALSE);
2608 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2609 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2611 const char * const name = HvNAME(stash);
2612 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2613 && strEQ(name, "_charnames")) {
2618 /* Here, it isn't Perl's charname handler. We can't rely on a
2619 * user-supplied handler to validate the input name. For non-ut8 input,
2620 * look to see that the first character is legal. Then loop through the
2621 * rest checking that each is a continuation */
2623 /* This code makes the reasonable assumption that the only Latin1-range
2624 * characters that begin a character name alias are alphabetic, otherwise
2625 * would have to create a isCHARNAME_BEGIN macro */
2628 if (! isALPHAU(*s)) {
2633 if (! isCHARNAME_CONT(*s)) {
2636 if (*s == ' ' && *(s-1) == ' ') {
2643 /* Similarly for utf8. For invariants can check directly; for other
2644 * Latin1, can calculate their code point and check; otherwise use a
2646 if (UTF8_IS_INVARIANT(*s)) {
2647 if (! isALPHAU(*s)) {
2651 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2652 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2658 if (! PL_utf8_charname_begin) {
2659 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2660 PL_utf8_charname_begin = _core_swash_init("utf8",
2661 "_Perl_Charname_Begin",
2663 1, 0, NULL, &flags);
2665 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2672 if (UTF8_IS_INVARIANT(*s)) {
2673 if (! isCHARNAME_CONT(*s)) {
2676 if (*s == ' ' && *(s-1) == ' ') {
2681 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2682 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2689 if (! PL_utf8_charname_continue) {
2690 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2691 PL_utf8_charname_continue = _core_swash_init("utf8",
2692 "_Perl_Charname_Continue",
2694 1, 0, NULL, &flags);
2696 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2703 if (*(s-1) == ' ') {
2706 "charnames alias definitions may not contain trailing "
2707 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2708 (int)(s - backslash_ptr + 1), backslash_ptr,
2709 (int)(e - s + 1), s + 1
2711 UTF ? SVf_UTF8 : 0);
2715 if (SvUTF8(res)) { /* Don't accept malformed input */
2716 const U8* first_bad_char_loc;
2718 const char* const str = SvPV_const(res, len);
2719 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2720 &first_bad_char_loc)))
2722 _force_out_malformed_utf8_message(first_bad_char_loc,
2723 (U8 *) PL_parser->bufend,
2725 0 /* 0 means don't die */ );
2728 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2729 (int) (e - backslash_ptr + 1), backslash_ptr,
2730 (int) ((char *) first_bad_char_loc - str), str
2741 /* The final %.*s makes sure that should the trailing NUL be missing
2742 * that this print won't run off the end of the string */
2745 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2746 (int)(s - backslash_ptr + 1), backslash_ptr,
2747 (int)(e - s + 1), s + 1
2749 UTF ? SVf_UTF8 : 0);
2756 "charnames alias definitions may not contain a sequence of "
2757 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2758 (int)(s - backslash_ptr + 1), backslash_ptr,
2759 (int)(e - s + 1), s + 1
2761 UTF ? SVf_UTF8 : 0);
2768 Extracts the next constant part of a pattern, double-quoted string,
2769 or transliteration. This is terrifying code.
2771 For example, in parsing the double-quoted string "ab\x63$d", it would
2772 stop at the '$' and return an OP_CONST containing 'abc'.
2774 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2775 processing a pattern (PL_lex_inpat is true), a transliteration
2776 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2778 Returns a pointer to the character scanned up to. If this is
2779 advanced from the start pointer supplied (i.e. if anything was
2780 successfully parsed), will leave an OP_CONST for the substring scanned
2781 in pl_yylval. Caller must intuit reason for not parsing further
2782 by looking at the next characters herself.
2786 \N{FOO} => \N{U+hex_for_character_FOO}
2787 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2790 all other \-char, including \N and \N{ apart from \N{ABC}
2793 @ and $ where it appears to be a var, but not for $ as tail anchor
2797 In transliterations:
2798 characters are VERY literal, except for - not at the start or end
2799 of the string, which indicates a range. However some backslash sequences
2800 are recognized: \r, \n, and the like
2801 \007 \o{}, \x{}, \N{}
2802 If all elements in the transliteration are below 256,
2803 scan_const expands the range to the full set of intermediate
2804 characters. If the range is in utf8, the hyphen is replaced with
2805 a certain range mark which will be handled by pmtrans() in op.c.
2807 In double-quoted strings:
2809 all those recognized in transliterations
2810 deprecated backrefs: \1 (in substitution replacements)
2811 case and quoting: \U \Q \E
2814 scan_const does *not* construct ops to handle interpolated strings.
2815 It stops processing as soon as it finds an embedded $ or @ variable
2816 and leaves it to the caller to work out what's going on.
2818 embedded arrays (whether in pattern or not) could be:
2819 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2821 $ in double-quoted strings must be the symbol of an embedded scalar.
2823 $ in pattern could be $foo or could be tail anchor. Assumption:
2824 it's a tail anchor if $ is the last thing in the string, or if it's
2825 followed by one of "()| \r\n\t"
2827 \1 (backreferences) are turned into $1 in substitutions
2829 The structure of the code is
2830 while (there's a character to process) {
2831 handle transliteration ranges
2832 skip regexp comments /(?#comment)/ and codes /(?{code})/
2833 skip #-initiated comments in //x patterns
2834 check for embedded arrays
2835 check for embedded scalars
2837 deprecate \1 in substitution replacements
2838 handle string-changing backslashes \l \U \Q \E, etc.
2839 switch (what was escaped) {
2840 handle \- in a transliteration (becomes a literal -)
2841 if a pattern and not \N{, go treat as regular character
2842 handle \132 (octal characters)
2843 handle \x15 and \x{1234} (hex characters)
2844 handle \N{name} (named characters, also \N{3,5} in a pattern)
2845 handle \cV (control characters)
2846 handle printf-style backslashes (\f, \r, \n, etc)
2849 } (end if backslash)
2850 handle regular character
2851 } (end while character to read)
2856 S_scan_const(pTHX_ char *start)
2858 char *send = PL_bufend; /* end of the constant */
2859 SV *sv = newSV(send - start); /* sv for the constant. See note below
2861 char *s = start; /* start of the constant */
2862 char *d = SvPVX(sv); /* destination for copies */
2863 bool dorange = FALSE; /* are we in a translit range? */
2864 bool didrange = FALSE; /* did we just finish a range? */
2865 bool in_charclass = FALSE; /* within /[...]/ */
2866 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2867 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2868 UTF8? But, this can show as true
2869 when the source isn't utf8, as for
2870 example when it is entirely composed
2872 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2873 number of characters found so far
2874 that will expand (into 2 bytes)
2875 should we have to convert to
2877 SV *res; /* result from charnames */
2878 STRLEN offset_to_max; /* The offset in the output to where the range
2879 high-end character is temporarily placed */
2881 /* Does something require special handling in tr/// ? This avoids extra
2882 * work in a less likely case. As such, khw didn't feel it was worth
2883 * adding any branches to the more mainline code to handle this, which
2884 * means that this doesn't get set in some circumstances when things like
2885 * \x{100} get expanded out. As a result there needs to be extra testing
2886 * done in the tr code */
2887 bool has_above_latin1 = FALSE;
2889 /* Note on sizing: The scanned constant is placed into sv, which is
2890 * initialized by newSV() assuming one byte of output for every byte of
2891 * input. This routine expects newSV() to allocate an extra byte for a
2892 * trailing NUL, which this routine will append if it gets to the end of
2893 * the input. There may be more bytes of input than output (eg., \N{LATIN
2894 * CAPITAL LETTER A}), or more output than input if the constant ends up
2895 * recoded to utf8, but each time a construct is found that might increase
2896 * the needed size, SvGROW() is called. Its size parameter each time is
2897 * based on the best guess estimate at the time, namely the length used so
2898 * far, plus the length the current construct will occupy, plus room for
2899 * the trailing NUL, plus one byte for every input byte still unscanned */
2901 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2904 int backslash_N = 0; /* ? was the character from \N{} */
2905 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2906 platform-specific like \x65 */
2909 PERL_ARGS_ASSERT_SCAN_CONST;
2911 assert(PL_lex_inwhat != OP_TRANSR);
2912 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2913 /* If we are doing a trans and we know we want UTF8 set expectation */
2914 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2915 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2918 /* Protect sv from errors and fatal warnings. */
2919 ENTER_with_name("scan_const");
2923 || dorange /* Handle tr/// range at right edge of input */
2926 /* get transliterations out of the way (they're most literal) */
2927 if (PL_lex_inwhat == OP_TRANS) {
2929 /* But there isn't any special handling necessary unless there is a
2930 * range, so for most cases we just drop down and handle the value
2931 * as any other. There are two exceptions.
2933 * 1. A hyphen indicates that we are actually going to have a
2934 * range. In this case, skip the '-', set a flag, then drop
2935 * down to handle what should be the end range value.
2936 * 2. After we've handled that value, the next time through, that
2937 * flag is set and we fix up the range.
2939 * Ranges entirely within Latin1 are expanded out entirely, in
2940 * order to make the transliteration a simple table look-up.
2941 * Ranges that extend above Latin1 have to be done differently, so
2942 * there is no advantage to expanding them here, so they are
2943 * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte
2944 * signifies a hyphen without any possible ambiguity. On EBCDIC
2945 * machines, if the range is expressed as Unicode, the Latin1
2946 * portion is expanded out even if the range extends above
2947 * Latin1. This is because each code point in it has to be
2948 * processed here individually to get its native translation */
2952 /* Here, we don't think we're in a range. If the new character
2953 * is not a hyphen; or if it is a hyphen, but it's too close to
2954 * either edge to indicate a range, then it's a regular
2956 if (*s != '-' || s >= send - 1 || s == start) {
2958 /* A regular character. Process like any other, but first
2959 * clear any flags */
2963 non_portable_endpoint = 0;
2966 /* The tests here for being above Latin1 and similar ones
2967 * in the following 'else' suffice to find all such
2968 * occurences in the constant, except those added by a
2969 * backslash escape sequence, like \x{100}. Mostly, those
2970 * set 'has_above_latin1' as appropriate */
2971 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2972 has_above_latin1 = TRUE;
2975 /* Drops down to generic code to process current byte */
2977 else { /* Is a '-' in the context where it means a range */
2978 if (didrange) { /* Something like y/A-C-Z// */
2979 Perl_croak(aTHX_ "Ambiguous range in transliteration"
2985 s++; /* Skip past the hyphen */
2987 /* d now points to where the end-range character will be
2988 * placed. Save it so won't have to go finding it later,
2989 * and drop down to get that character. (Actually we
2990 * instead save the offset, to handle the case where a
2991 * realloc in the meantime could change the actual
2992 * pointer). We'll finish processing the range the next
2993 * time through the loop */
2994 offset_to_max = d - SvPVX_const(sv);
2996 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2997 has_above_latin1 = TRUE;
3000 /* Drops down to generic code to process current byte */
3002 } /* End of not a range */
3004 /* Here we have parsed a range. Now must handle it. At this
3006 * 'sv' is a SV* that contains the output string we are
3007 * constructing. The final two characters in that string
3008 * are the range start and range end, in order.
3009 * 'd' points to just beyond the range end in the 'sv' string,
3010 * where we would next place something
3011 * 'offset_to_max' is the offset in 'sv' at which the character
3012 * (the range's maximum end point) before 'd' begins.
3014 char * max_ptr = SvPVX(sv) + offset_to_max;
3017 IV range_max; /* last character in range */
3019 Size_t offset_to_min = 0;
3022 bool convert_unicode;
3023 IV real_range_max = 0;
3025 /* Get the code point values of the range ends. */
3027 /* We know the utf8 is valid, because we just constructed
3028 * it ourselves in previous loop iterations */
3029 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3030 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3031 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3033 /* This compensates for not all code setting
3034 * 'has_above_latin1', so that we don't skip stuff that
3035 * should be executed */
3036 if (range_max > 255) {
3037 has_above_latin1 = TRUE;
3041 min_ptr = max_ptr - 1;
3042 range_min = * (U8*) min_ptr;
3043 range_max = * (U8*) max_ptr;
3046 /* If the range is just a single code point, like tr/a-a/.../,
3047 * that code point is already in the output, twice. We can
3048 * just back up over the second instance and avoid all the rest
3049 * of the work. But if it is a variant character, it's been
3050 * counted twice, so decrement. (This unlikely scenario is
3051 * special cased, like the one for a range of 2 code points
3052 * below, only because the main-line code below needs a range
3053 * of 3 or more to work without special casing. Might as well
3054 * get it out of the way now.) */
3055 if (UNLIKELY(range_max == range_min)) {
3057 if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3058 utf8_variant_count--;
3064 /* On EBCDIC platforms, we may have to deal with portable
3065 * ranges. These happen if at least one range endpoint is a
3066 * Unicode value (\N{...}), or if the range is a subset of
3067 * [A-Z] or [a-z], and both ends are literal characters,
3068 * like 'A', and not like \x{C1} */
3070 cBOOL(backslash_N) /* \N{} forces Unicode,
3071 hence portable range */
3072 || ( ! non_portable_endpoint
3073 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3074 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3075 if (convert_unicode) {
3077 /* Special handling is needed for these portable ranges.
3078 * They are defined to be in Unicode terms, which includes
3079 * all the Unicode code points between the end points.
3080 * Convert to Unicode to get the Unicode range. Later we
3081 * will convert each code point in the range back to
3083 range_min = NATIVE_TO_UNI(range_min);
3084 range_max = NATIVE_TO_UNI(range_max);
3088 if (range_min > range_max) {
3090 if (convert_unicode) {
3091 /* Need to convert back to native for meaningful
3092 * messages for this platform */
3093 range_min = UNI_TO_NATIVE(range_min);
3094 range_max = UNI_TO_NATIVE(range_max);
3097 /* Use the characters themselves for the error message if
3098 * ASCII printables; otherwise some visible representation
3100 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3102 "Invalid range \"%c-%c\" in transliteration operator",
3103 (char)range_min, (char)range_max);
3106 else if (convert_unicode) {
3107 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3109 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3110 UVXf "}\" in transliteration operator",
3111 range_min, range_max);
3115 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3117 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3118 " in transliteration operator",
3119 range_min, range_max);
3123 /* If the range is exactly two code points long, they are
3124 * already both in the output */
3125 if (UNLIKELY(range_min + 1 == range_max)) {
3129 /* Here the range contains at least 3 code points */
3133 /* If everything in the transliteration is below 256, we
3134 * can avoid special handling later. A translation table
3135 * for each of those bytes is created by op.c. So we
3136 * expand out all ranges to their constituent code points.
3137 * But if we've encountered something above 255, the
3138 * expanding won't help, so skip doing that. But if it's
3139 * EBCDIC, we may have to look at each character below 256
3140 * if we have to convert to/from Unicode values */
3141 if ( has_above_latin1
3143 && (range_min > 255 || ! convert_unicode)
3146 /* Move the high character one byte to the right; then
3147 * insert between it and the range begin, an illegal
3148 * byte which serves to indicate this is a range (using
3149 * a '-' would be ambiguous). */
3151 while (e-- > max_ptr) {
3154 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3158 /* Here, we're going to expand out the range. For EBCDIC
3159 * the range can extend above 255 (not so in ASCII), so
3160 * for EBCDIC, split it into the parts above and below
3163 if (range_max > 255) {
3164 real_range_max = range_max;
3170 /* Here we need to expand out the string to contain each
3171 * character in the range. Grow the output to handle this.
3172 * For non-UTF8, we need a byte for each code point in the
3173 * range, minus the three that we've already allocated for: the
3174 * hyphen, the min, and the max. For UTF-8, we need this
3175 * plus an extra byte for each code point that occupies two
3176 * bytes (is variant) when in UTF-8 (except we've already
3177 * allocated for the end points, including if they are
3178 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3179 * platforms, it's easy to calculate a precise number. To
3180 * start, we count the variants in the range, which we need
3181 * elsewhere in this function anyway. (For the case where it
3182 * isn't easy to calculate, 'extras' has been initialized to 0,
3183 * and the calculation is done in a loop further down.) */
3185 if (convert_unicode)
3188 /* This is executed unconditionally on ASCII, and for
3189 * Unicode ranges on EBCDIC. Under these conditions, all
3190 * code points above a certain value are variant; and none
3191 * under that value are. We just need to find out how much
3192 * of the range is above that value. We don't count the
3193 * end points here, as they will already have been counted
3194 * as they were parsed. */
3195 if (range_min >= UTF_CONTINUATION_MARK) {
3197 /* The whole range is made up of variants */
3198 extras = (range_max - 1) - (range_min + 1) + 1;
3200 else if (range_max >= UTF_CONTINUATION_MARK) {
3202 /* Only the higher portion of the range is variants */
3203 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3206 utf8_variant_count += extras;
3209 /* The base growth is the number of code points in the range,
3210 * not including the endpoints, which have already been sized
3211 * for (and output). We don't subtract for the hyphen, as it
3212 * has been parsed but not output, and the SvGROW below is
3213 * based only on what's been output plus what's left to parse.
3215 grow = (range_max - 1) - (range_min + 1) + 1;
3219 /* In some cases in EBCDIC, we haven't yet calculated a
3220 * precise amount needed for the UTF-8 variants. Just
3221 * assume the worst case, that everything will expand by a
3223 if (! convert_unicode) {
3229 /* Otherwise we know exactly how many variants there
3230 * are in the range. */
3235 /* Grow, but position the output to overwrite the range min end
3236 * point, because in some cases we overwrite that */
3237 SvCUR_set(sv, d - SvPVX_const(sv));
3238 offset_to_min = min_ptr - SvPVX_const(sv);
3240 /* See Note on sizing above. */
3241 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3244 + 1 /* Trailing NUL */ );
3246 /* Now, we can expand out the range. */
3248 if (convert_unicode) {
3251 /* Recall that the min and max are now in Unicode terms, so
3252 * we have to convert each character to its native
3255 for (i = range_min; i <= range_max; i++) {
3256 append_utf8_from_native_byte(
3257 LATIN1_TO_NATIVE((U8) i),
3262 for (i = range_min; i <= range_max; i++) {
3263 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3269 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3271 /* Here, no conversions are necessary, which means that the
3272 * first character in the range is already in 'd' and
3273 * valid, so we can skip overwriting it */
3277 for (i = range_min + 1; i <= range_max; i++) {
3278 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3284 assert(range_min + 1 <= range_max);
3285 for (i = range_min + 1; i < range_max; i++) {
3287 /* In this case on EBCDIC, we haven't calculated
3288 * the variants. Do it here, as we go along */
3289 if (! UVCHR_IS_INVARIANT(i)) {
3290 utf8_variant_count++;
3296 /* The range_max is done outside the loop so as to
3297 * avoid having to special case not incrementing
3298 * 'utf8_variant_count' on EBCDIC (it's already been
3299 * counted when originally parsed) */
3300 *d++ = (char) range_max;
3305 /* If the original range extended above 255, add in that
3307 if (real_range_max) {
3308 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3309 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3310 if (real_range_max > 0x100) {
3311 if (real_range_max > 0x101) {
3312 *d++ = (char) ILLEGAL_UTF8_BYTE;
3314 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3320 /* mark the range as done, and continue */
3324 non_portable_endpoint = 0;
3328 } /* End of is a range */
3329 } /* End of transliteration. Joins main code after these else's */
3330 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3333 while (s1 >= start && *s1-- == '\\')
3336 in_charclass = TRUE;
3338 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3341 while (s1 >= start && *s1-- == '\\')
3344 in_charclass = FALSE;
3346 /* skip for regexp comments /(?#comment)/, except for the last
3347 * char, which will be done separately. Stop on (?{..}) and
3349 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3351 while (s+1 < send && *s != ')')
3354 else if (!PL_lex_casemods
3355 && ( s[2] == '{' /* This should match regcomp.c */
3356 || (s[2] == '?' && s[3] == '{')))
3361 /* likewise skip #-initiated comments in //x patterns */
3365 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3367 while (s < send && *s != '\n')
3370 /* no further processing of single-quoted regex */
3371 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3372 goto default_action;
3374 /* check for embedded arrays
3375 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3377 else if (*s == '@' && s[1]) {
3379 ? isIDFIRST_utf8_safe(s+1, send)
3380 : isWORDCHAR_A(s[1]))
3384 if (strchr(":'{$", s[1]))
3386 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3387 break; /* in regexp, neither @+ nor @- are interpolated */
3389 /* check for embedded scalars. only stop if we're sure it's a
3391 else if (*s == '$') {
3392 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3394 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
3396 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3397 "Possible unintended interpolation of $\\ in regex");
3399 break; /* in regexp, $ might be tail anchor */
3403 /* End of else if chain - OP_TRANS rejoin rest */
3405 if (UNLIKELY(s >= send)) {
3411 if (*s == '\\' && s+1 < send) {
3412 char* e; /* Can be used for ending '}', etc. */
3416 /* warn on \1 - \9 in substitution replacements, but note that \11
3417 * is an octal; and \19 is \1 followed by '9' */
3418 if (PL_lex_inwhat == OP_SUBST
3424 /* diag_listed_as: \%d better written as $%d */
3425 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3430 /* string-change backslash escapes */
3431 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
3435 /* In a pattern, process \N, but skip any other backslash escapes.
3436 * This is because we don't want to translate an escape sequence
3437 * into a meta symbol and have the regex compiler use the meta
3438 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3439 * in spite of this, we do have to process \N here while the proper
3440 * charnames handler is in scope. See bugs #56444 and #62056.
3442 * There is a complication because \N in a pattern may also stand
3443 * for 'match a non-nl', and not mean a charname, in which case its
3444 * processing should be deferred to the regex compiler. To be a
3445 * charname it must be followed immediately by a '{', and not look
3446 * like \N followed by a curly quantifier, i.e., not something like
3447 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3449 else if (PL_lex_inpat
3452 || regcurly(s + 1)))
3455 goto default_action;
3461 if ((isALPHANUMERIC(*s)))
3462 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3463 "Unrecognized escape \\%c passed through",
3465 /* default action is to copy the quoted character */
3466 goto default_action;
3469 /* eg. \132 indicates the octal constant 0132 */
3470 case '0': case '1': case '2': case '3':
3471 case '4': case '5': case '6': case '7':
3473 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
3475 uv = grok_oct(s, &len, &flags, NULL);
3477 if (len < 3 && s < send && isDIGIT(*s)
3478 && ckWARN(WARN_MISC))
3480 Perl_warner(aTHX_ packWARN(WARN_MISC),
3481 "%s", form_short_octal_warning(s, len));
3484 goto NUM_ESCAPE_INSERT;
3486 /* eg. \o{24} indicates the octal constant \024 */
3491 bool valid = grok_bslash_o(&s, &uv, &error,
3492 TRUE, /* Output warning */
3493 FALSE, /* Not strict */
3494 TRUE, /* Output warnings for
3499 uv = 0; /* drop through to ensure range ends are set */
3501 goto NUM_ESCAPE_INSERT;
3504 /* eg. \x24 indicates the hex constant 0x24 */
3509 bool valid = grok_bslash_x(&s, &uv, &error,
3510 TRUE, /* Output warning */
3511 FALSE, /* Not strict */
3512 TRUE, /* Output warnings for
3517 uv = 0; /* drop through to ensure range ends are set */
3522 /* Insert oct or hex escaped character. */
3524 /* Here uv is the ordinal of the next character being added */
3525 if (UVCHR_IS_INVARIANT(uv)) {
3529 if (!has_utf8 && uv > 255) {
3531 /* Here, 'uv' won't fit unless we convert to UTF-8.
3532 * If we've only seen invariants so far, all we have to
3533 * do is turn on the flag */
3534 if (utf8_variant_count == 0) {
3538 SvCUR_set(sv, d - SvPVX_const(sv));
3542 sv_utf8_upgrade_flags_grow(
3544 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3546 /* Since we're having to grow here,
3547 * make sure we have enough room for
3548 * this escape and a NUL, so the
3549 * code immediately below won't have
3550 * to actually grow again */
3552 + (STRLEN)(send - s) + 1);
3553 d = SvPVX(sv) + SvCUR(sv);
3556 has_above_latin1 = TRUE;
3562 utf8_variant_count++;
3565 /* Usually, there will already be enough room in 'sv'
3566 * since such escapes are likely longer than any UTF-8
3567 * sequence they can end up as. This isn't the case on
3568 * EBCDIC where \x{40000000} contains 12 bytes, and the
3569 * UTF-8 for it contains 14. And, we have to allow for
3570 * a trailing NUL. It probably can't happen on ASCII
3571 * platforms, but be safe. See Note on sizing above. */
3572 const STRLEN needed = d - SvPVX(sv)
3576 if (UNLIKELY(needed > SvLEN(sv))) {
3577 SvCUR_set(sv, d - SvPVX_const(sv));
3578 d = SvCUR(sv) + SvGROW(sv, needed);
3581 d = (char*)uvchr_to_utf8((U8*)d, uv);
3582 if (PL_lex_inwhat == OP_TRANS
3583 && PL_parser->lex_sub_op)
3585 PL_parser->lex_sub_op->op_private |=
3586 (PL_lex_repl ? OPpTRANS_FROM_UTF
3592 non_portable_endpoint++;
3597 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3598 * named character, like \N{LATIN SMALL LETTER A}, or a named
3599 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3600 * GRAVE} (except y/// can't handle the latter, croaking). For
3601 * convenience all three forms are referred to as "named
3602 * characters" below.
3604 * For patterns, \N also can mean to match a non-newline. Code
3605 * before this 'switch' statement should already have handled
3606 * this situation, and hence this code only has to deal with
3607 * the named character cases.
3609 * For non-patterns, the named characters are converted to
3610 * their string equivalents. In patterns, named characters are
3611 * not converted to their ultimate forms for the same reasons
3612 * that other escapes aren't. Instead, they are converted to
3613 * the \N{U+...} form to get the value from the charnames that
3614 * is in effect right now, while preserving the fact that it
3615 * was a named character, so that the regex compiler knows
3618 * The structure of this section of code (besides checking for
3619 * errors and upgrading to utf8) is:
3620 * If the named character is of the form \N{U+...}, pass it
3621 * through if a pattern; otherwise convert the code point
3623 * Otherwise must be some \N{NAME}: convert to
3624 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3626 * Transliteration is an exception. The conversion to utf8 is
3627 * only done if the code point requires it to be representable.
3629 * Here, 's' points to the 'N'; the test below is guaranteed to
3630 * succeed if we are being called on a pattern, as we already
3631 * know from a test above that the next character is a '{'. A
3632 * non-pattern \N must mean 'named character', which requires
3636 yyerror("Missing braces on \\N{}");
3642 /* If there is no matching '}', it is an error. */
3643 if (! (e = strchr(s, '}'))) {
3644 if (! PL_lex_inpat) {
3645 yyerror("Missing right brace on \\N{}");
3647 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3649 yyquit(); /* Have exhausted the input. */
3652 /* Here it looks like a named character */
3654 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3655 s += 2; /* Skip to next char after the 'U+' */
3658 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3659 /* Check the syntax. */
3662 if (!isXDIGIT(*s)) {
3665 "Invalid hexadecimal number in \\N{U+...}"
3674 else if ((*s == '.' || *s == '_')
3680 /* Pass everything through unchanged.
3681 * +1 is for the '}' */
3682 Copy(orig_s, d, e - orig_s + 1, char);
3683 d += e - orig_s + 1;
3685 else { /* Not a pattern: convert the hex to string */
3686 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3687 | PERL_SCAN_SILENT_ILLDIGIT
3688 | PERL_SCAN_DISALLOW_PREFIX;
3690 uv = grok_hex(s, &len, &flags, NULL);
3691 if (len == 0 || (len != (STRLEN)(e - s)))
3694 /* For non-tr///, if the destination is not in utf8,
3695 * unconditionally recode it to be so. This is
3696 * because \N{} implies Unicode semantics, and scalars
3697 * have to be in utf8 to guarantee those semantics.
3698 * tr/// doesn't care about Unicode rules, so no need
3699 * there to upgrade to UTF-8 for small enough code
3701 if (! has_utf8 && ( uv > 0xFF
3702 || PL_lex_inwhat != OP_TRANS))
3704 /* See Note on sizing above. */
3705 const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
3707 SvCUR_set(sv, d - SvPVX_const(sv));
3711 if (utf8_variant_count == 0) {
3713 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3716 sv_utf8_upgrade_flags_grow(
3718 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3720 d = SvPVX(sv) + SvCUR(sv);
3724 has_above_latin1 = TRUE;
3727 /* Add the (Unicode) code point to the output. */
3728 if (! has_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3729 *d++ = (char) LATIN1_TO_NATIVE(uv);
3732 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3736 else /* Here is \N{NAME} but not \N{U+...}. */
3737 if ((res = get_and_check_backslash_N_name(s, e)))
3740 const char *str = SvPV_const(res, len);
3743 if (! len) { /* The name resolved to an empty string */
3744 Copy("\\N{}", d, 4, char);
3748 /* In order to not lose information for the regex
3749 * compiler, pass the result in the specially made
3750 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3751 * the code points in hex of each character
3752 * returned by charnames */
3754 const char *str_end = str + len;
3755 const STRLEN off = d - SvPVX_const(sv);
3757 if (! SvUTF8(res)) {
3758 /* For the non-UTF-8 case, we can determine the
3759 * exact length needed without having to parse
3760 * through the string. Each character takes up
3761 * 2 hex digits plus either a trailing dot or
3763 const char initial_text[] = "\\N{U+";
3764 const STRLEN initial_len = sizeof(initial_text)
3766 d = off + SvGROW(sv, off
3769 /* +1 for trailing NUL */
3772 + (STRLEN)(send - e));
3773 Copy(initial_text, d, initial_len, char);
3775 while (str < str_end) {
3778 my_snprintf(hex_string,
3782 /* The regex compiler is
3783 * expecting Unicode, not
3785 NATIVE_TO_LATIN1(*str));
3786 PERL_MY_SNPRINTF_POST_GUARD(len,
3787 sizeof(hex_string));
3788 Copy(hex_string, d, 3, char);
3792 d--; /* Below, we will overwrite the final
3793 dot with a right brace */
3796 STRLEN char_length; /* cur char's byte length */
3798 /* and the number of bytes after this is
3799 * translated into hex digits */
3800 STRLEN output_length;
3802 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3803 * for max('U+', '.'); and 1 for NUL */
3804 char hex_string[2 * UTF8_MAXBYTES + 5];
3806 /* Get the first character of the result. */
3807 U32 uv = utf8n_to_uvchr((U8 *) str,
3811 /* Convert first code point to Unicode hex,
3812 * including the boiler plate before it. */
3814 my_snprintf(hex_string, sizeof(hex_string),
3816 (unsigned int) NATIVE_TO_UNI(uv));
3818 /* Make sure there is enough space to hold it */
3819 d = off + SvGROW(sv, off
3821 + (STRLEN)(send - e)
3822 + 2); /* '}' + NUL */
3824 Copy(hex_string, d, output_length, char);
3827 /* For each subsequent character, append dot and
3828 * its Unicode code point in hex */
3829 while ((str += char_length) < str_end) {
3830 const STRLEN off = d - SvPVX_const(sv);
3831 U32 uv = utf8n_to_uvchr((U8 *) str,
3836 my_snprintf(hex_string,
3839 (unsigned int) NATIVE_TO_UNI(uv));
3841 d = off + SvGROW(sv, off
3843 + (STRLEN)(send - e)
3844 + 2); /* '}' + NUL */
3845 Copy(hex_string, d, output_length, char);
3850 *d++ = '}'; /* Done. Add the trailing brace */
3853 else { /* Here, not in a pattern. Convert the name to a
3856 if (PL_lex_inwhat == OP_TRANS) {
3857 str = SvPV_const(res, len);
3858 if (len > ((SvUTF8(res))
3862 yyerror(Perl_form(aTHX_
3863 "%.*s must not be a named sequence"
3864 " in transliteration operator",
3865 /* +1 to include the "}" */
3866 (int) (e + 1 - start), start));
3868 goto end_backslash_N;
3871 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
3872 has_above_latin1 = TRUE;
3876 else if (! SvUTF8(res)) {
3877 /* Make sure \N{} return is UTF-8. This is because
3878 * \N{} implies Unicode semantics, and scalars have
3879 * to be in utf8 to guarantee those semantics; but
3880 * not needed in tr/// */
3881 sv_utf8_upgrade_flags(res, 0);
3882 str = SvPV_const(res, len);
3885 /* Upgrade destination to be utf8 if this new
3887 if (! has_utf8 && SvUTF8(res)) {
3888 /* See Note on sizing above. */
3889 const STRLEN extra = len + (send - s) + 1;
3891 SvCUR_set(sv, d - SvPVX_const(sv));
3895 if (utf8_variant_count == 0) {
3897 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3900 sv_utf8_upgrade_flags_grow(sv,
3901 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3903 d = SvPVX(sv) + SvCUR(sv);
3906 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3908 /* See Note on sizing above. (NOTE: SvCUR() is not
3909 * set correctly here). */
3910 const STRLEN extra = len + (send - e) + 1;
3911 const STRLEN off = d - SvPVX_const(sv);
3912 d = off + SvGROW(sv, off + extra);
3914 Copy(str, d, len, char);
3920 } /* End \N{NAME} */
3924 backslash_N++; /* \N{} is defined to be Unicode */
3926 s = e + 1; /* Point to just after the '}' */
3929 /* \c is a control character */
3933 *d++ = grok_bslash_c(*s, 1);
3936 yyerror("Missing control char name in \\c");
3937 yyquit(); /* Are at end of input, no sense continuing */
3940 non_portable_endpoint++;
3944 /* printf-style backslashes, formfeeds, newlines, etc */
3970 } /* end if (backslash) */
3973 /* Just copy the input to the output, though we may have to convert
3976 * If the input has the same representation in UTF-8 as not, it will be
3977 * a single byte, and we don't care about UTF8ness; just copy the byte */
3978 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
3981 else if (! this_utf8 && ! has_utf8) {
3982 /* If neither source nor output is UTF-8, is also a single byte,
3983 * just copy it; but this byte counts should we later have to
3984 * convert to UTF-8 */
3986 utf8_variant_count++;
3988 else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */
3989 const STRLEN len = UTF8SKIP(s);
3991 /* We expect the source to have already been checked for
3993 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
3995 Copy(s, d, len, U8);
3999 else { /* UTF8ness matters and doesn't match, need to convert */
4001 const UV nextuv = (this_utf8)
4002 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
4004 STRLEN need = UVCHR_SKIP(nextuv);
4007 SvCUR_set(sv, d - SvPVX_const(sv));
4011 /* See Note on sizing above. */
4012 need += (STRLEN)(send - s) + 1;
4014 if (utf8_variant_count == 0) {
4016 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
4019 sv_utf8_upgrade_flags_grow(sv,
4020 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4022 d = SvPVX(sv) + SvCUR(sv);
4025 } else if (need > len) {
4026 /* encoded value larger than old, may need extra space (NOTE:
4027 * SvCUR() is not set correctly here). See Note on sizing
4029 const STRLEN extra = need + (send - s) + 1;
4030 const STRLEN off = d - SvPVX_const(sv);
4031 d = off + SvGROW(sv, off + extra);
4035 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
4037 } /* while loop to process each character */
4039 /* terminate the string and set up the sv */
4041 SvCUR_set(sv, d - SvPVX_const(sv));
4042 if (SvCUR(sv) >= SvLEN(sv))
4043 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %" UVuf
4044 " >= %" UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
4049 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
4050 PL_parser->lex_sub_op->op_private |=
4051 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
4055 /* shrink the sv if we allocated more than we used */
4056 if (SvCUR(sv) + 5 < SvLEN(sv)) {
4057 SvPV_shrink_to_cur(sv);
4060 /* return the substring (via pl_yylval) only if we parsed anything */
4063 for (; s2 < s; s2++) {
4065 COPLINE_INC_WITH_HERELINES;
4067 SvREFCNT_inc_simple_void_NN(sv);
4068 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
4069 && ! PL_parser->lex_re_reparsing)
4071 const char *const key = PL_lex_inpat ? "qr" : "q";
4072 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
4076 if (PL_lex_inwhat == OP_TRANS) {
4079 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
4082 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
4090 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
4093 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
4095 LEAVE_with_name("scan_const");
4100 * Returns TRUE if there's more to the expression (e.g., a subscript),
4103 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
4105 * ->[ and ->{ return TRUE
4106 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
4107 * { and [ outside a pattern are always subscripts, so return TRUE
4108 * if we're outside a pattern and it's not { or [, then return FALSE
4109 * if we're in a pattern and the first char is a {
4110 * {4,5} (any digits around the comma) returns FALSE
4111 * if we're in a pattern and the first char is a [
4113 * [SOMETHING] has a funky algorithm to decide whether it's a
4114 * character class or not. It has to deal with things like
4115 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4116 * anything else returns TRUE
4119 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4122 S_intuit_more(pTHX_ char *s)
4124 PERL_ARGS_ASSERT_INTUIT_MORE;
4126 if (PL_lex_brackets)
4128 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4130 if (*s == '-' && s[1] == '>'
4131 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4132 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4133 ||(s[2] == '@' && strchr("*[{",s[3])) ))
4135 if (*s != '{' && *s != '[')
4140 /* In a pattern, so maybe we have {n,m}. */
4148 /* On the other hand, maybe we have a character class */
4151 if (*s == ']' || *s == '^')
4154 /* this is terrifying, and it works */
4157 const char * const send = strchr(s,']');
4158 unsigned char un_char, last_un_char;
4159 char tmpbuf[sizeof PL_tokenbuf * 4];
4161 if (!send) /* has to be an expression */
4163 weight = 2; /* let's weigh the evidence */
4167 else if (isDIGIT(*s)) {
4169 if (isDIGIT(s[1]) && s[2] == ']')
4175 Zero(seen,256,char);
4177 for (; s < send; s++) {
4178 last_un_char = un_char;
4179 un_char = (unsigned char)*s;
4184 weight -= seen[un_char] * 10;
4185 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4187 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4188 len = (int)strlen(tmpbuf);
4189 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4190 UTF ? SVf_UTF8 : 0, SVt_PV))
4197 && strchr("[#!%*<>()-=",s[1]))
4199 if (/*{*/ strchr("])} =",s[2]))
4208 if (strchr("wds]",s[1]))
4210 else if (seen[(U8)'\''] || seen[(U8)'"'])
4212 else if (strchr("rnftbxcav",s[1]))
4214 else if (isDIGIT(s[1])) {
4216 while (s[1] && isDIGIT(s[1]))
4226 if (strchr("aA01! ",last_un_char))
4228 if (strchr("zZ79~",s[1]))
4230 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4231 weight -= 5; /* cope with negative subscript */
4234 if (!isWORDCHAR(last_un_char)
4235 && !(last_un_char == '$' || last_un_char == '@'
4236 || last_un_char == '&')
4237 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4241 if (keyword(d, s - d, 0))
4244 if (un_char == last_un_char + 1)
4246 weight -= seen[un_char];
4251 if (weight >= 0) /* probably a character class */
4261 * Does all the checking to disambiguate
4263 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4264 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
4266 * First argument is the stuff after the first token, e.g. "bar".
4268 * Not a method if foo is a filehandle.
4269 * Not a method if foo is a subroutine prototyped to take a filehandle.
4270 * Not a method if it's really "Foo $bar"
4271 * Method if it's "foo $bar"
4272 * Not a method if it's really "print foo $bar"
4273 * Method if it's really "foo package::" (interpreted as package->foo)
4274 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4275 * Not a method if bar is a filehandle or package, but is quoted with
4280 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4282 char *s = start + (*start == '$');
4283 char tmpbuf[sizeof PL_tokenbuf];
4286 /* Mustn't actually add anything to a symbol table.
4287 But also don't want to "initialise" any placeholder
4288 constants that might already be there into full
4289 blown PVGVs with attached PVCV. */
4291 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4293 PERL_ARGS_ASSERT_INTUIT_METHOD;
4295 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4297 if (cv && SvPOK(cv)) {
4298 const char *proto = CvPROTO(cv);
4300 while (*proto && (isSPACE(*proto) || *proto == ';'))
4307 if (*start == '$') {
4308 SSize_t start_off = start - SvPVX(PL_linestr);
4309 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4310 || isUPPER(*PL_tokenbuf))
4312 /* this could be $# */
4315 PL_bufptr = SvPVX(PL_linestr) + start_off;
4317 return *s == '(' ? FUNCMETH : METHOD;
4320 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
4321 /* start is the beginning of the possible filehandle/object,
4322 * and s is the end of it
4323 * tmpbuf is a copy of it (but with single quotes as double colons)
4326 if (!keyword(tmpbuf, len, 0)) {
4327 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4332 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4333 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4335 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4336 && (!isGV(indirgv) || GvCVu(indirgv)))
4338 /* filehandle or package name makes it a method */
4339 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4341 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4342 return 0; /* no assumptions -- "=>" quotes bareword */
4344 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4345 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4346 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4348 force_next(BAREWORD);
4350 return *s == '(' ? FUNCMETH : METHOD;
4356 /* Encoded script support. filter_add() effectively inserts a
4357 * 'pre-processing' function into the current source input stream.
4358 * Note that the filter function only applies to the current source file
4359 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4361 * The datasv parameter (which may be NULL) can be used to pass
4362 * private data to this instance of the filter. The filter function
4363 * can recover the SV using the FILTER_DATA macro and use it to
4364 * store private buffers and state information.
4366 * The supplied datasv parameter is upgraded to a PVIO type
4367 * and the IoDIRP/IoANY field is used to store the function pointer,
4368 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4369 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4370 * private use must be set using malloc'd pointers.
4374 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4382 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4383 Perl_croak(aTHX_ "Source filters apply only to byte streams");
4385 if (!PL_rsfp_filters)
4386 PL_rsfp_filters = newAV();
4389 SvUPGRADE(datasv, SVt_PVIO);
4390 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
4391 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
4392 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
4393 FPTR2DPTR(void *, IoANY(datasv)),
4394 SvPV_nolen(datasv)));
4395 av_unshift(PL_rsfp_filters, 1);
4396 av_store(PL_rsfp_filters, 0, datasv) ;
4398 !PL_parser->filtered
4399 && PL_parser->lex_flags & LEX_EVALBYTES
4400 && PL_bufptr < PL_bufend
4402 const char *s = PL_bufptr;
4403 while (s < PL_bufend) {
4405 SV *linestr = PL_parser->linestr;
4406 char *buf = SvPVX(linestr);
4407 STRLEN const bufptr_pos = PL_parser->bufptr - buf;
4408 STRLEN const oldbufptr_pos = PL_parser->oldbufptr - buf;
4409 STRLEN const oldoldbufptr_pos=PL_parser->oldoldbufptr-buf;
4410 STRLEN const linestart_pos = PL_parser->linestart - buf;
4411 STRLEN const last_uni_pos =
4412 PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
4413 STRLEN const last_lop_pos =
4414 PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
4415 av_push(PL_rsfp_filters, linestr);
4416 PL_parser->linestr =
4417 newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
4418 buf = SvPVX(PL_parser->linestr);
4419 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
4420 PL_parser->bufptr = buf + bufptr_pos;
4421 PL_parser->oldbufptr = buf + oldbufptr_pos;
4422 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
4423 PL_parser->linestart = buf + linestart_pos;
4424 if (PL_parser->last_uni)
4425 PL_parser->last_uni = buf + last_uni_pos;
4426 if (PL_parser->last_lop)
4427 PL_parser->last_lop = buf + last_lop_pos;
4428 SvLEN(linestr) = SvCUR(linestr);
4429 SvCUR(linestr) = s-SvPVX(linestr);
4430 PL_parser->filtered = 1;
4440 /* Delete most recently added instance of this filter function. */
4442 Perl_filter_del(pTHX_ filter_t funcp)
4446 PERL_ARGS_ASSERT_FILTER_DEL;
4449 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
4450 FPTR2DPTR(void*, funcp)));
4452 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
4454 /* if filter is on top of stack (usual case) just pop it off */
4455 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
4456 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
4457 sv_free(av_pop(PL_rsfp_filters));
4461 /* we need to search for the correct entry and clear it */
4462 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
4466 /* Invoke the idxth filter function for the current rsfp. */
4467 /* maxlen 0 = read one text line */
4469 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
4473 /* This API is bad. It should have been using unsigned int for maxlen.
4474 Not sure if we want to change the API, but if not we should sanity
4475 check the value here. */
4476 unsigned int correct_length = maxlen < 0 ? PERL_INT_MAX : maxlen;
4478 PERL_ARGS_ASSERT_FILTER_READ;
4480 if (!PL_parser || !PL_rsfp_filters)
4482 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
4483 /* Provide a default input filter to make life easy. */
4484 /* Note that we append to the line. This is handy. */
4485 DEBUG_P(PerlIO_printf(Perl_debug_log,
4486 "filter_read %d: from rsfp\n", idx));
4487 if (correct_length) {
4490 const int old_len = SvCUR(buf_sv);
4492 /* ensure buf_sv is large enough */
4493 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
4494 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
4495 correct_length)) <= 0) {
4496 if (PerlIO_error(PL_rsfp))
4497 return -1; /* error */
4499 return 0 ; /* end of file */
4501 SvCUR_set(buf_sv, old_len + len) ;
4502 SvPVX(buf_sv)[old_len + len] = '\0';
4505 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
4506 if (PerlIO_error(PL_rsfp))
4507 return -1; /* error */
4509 return 0 ; /* end of file */
4512 return SvCUR(buf_sv);
4514 /* Skip this filter slot if filter has been deleted */
4515 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
4516 DEBUG_P(PerlIO_printf(Perl_debug_log,
4517 "filter_read %d: skipped (filter deleted)\n",
4519 return FILTER_READ(idx+1, buf_sv, correct_length); /* recurse */
4521 if (SvTYPE(datasv) != SVt_PVIO) {
4522 if (correct_length) {
4524 const STRLEN remainder = SvLEN(datasv) - SvCUR(datasv);
4525 if (!remainder) return 0; /* eof */
4526 if (correct_length > remainder) correct_length = remainder;
4527 sv_catpvn(buf_sv, SvEND(datasv), correct_length);
4528 SvCUR_set(datasv, SvCUR(datasv) + correct_length);
4531 const char *s = SvEND(datasv);
4532 const char *send = SvPVX(datasv) + SvLEN(datasv);
4540 if (s == send) return 0; /* eof */
4541 sv_catpvn(buf_sv, SvEND(datasv), s-SvEND(datasv));
4542 SvCUR_set(datasv, s-SvPVX(datasv));
4544 return SvCUR(buf_sv);
4546 /* Get function pointer hidden within datasv */
4547 funcp = DPTR2FPTR(filter_t, IoANY(datasv));
4548 DEBUG_P(PerlIO_printf(Perl_debug_log,
4549 "filter_read %d: via function %p (%s)\n",
4550 idx, (void*)datasv, SvPV_nolen_const(datasv)));
4551 /* Call function. The function is expected to */
4552 /* call "FILTER_READ(idx+1, buf_sv)" first. */
4553 /* Return: <0:error, =0:eof, >0:not eof */
4554 return (*funcp)(aTHX_ idx, buf_sv, correct_length);
4558 S_filter_gets(pTHX_ SV *sv, STRLEN append)
4560 PERL_ARGS_ASSERT_FILTER_GETS;
4562 #ifdef PERL_CR_FILTER
4563 if (!PL_rsfp_filters) {
4564 filter_add(S_cr_textfilter,NULL);
4567 if (PL_rsfp_filters) {
4569 SvCUR_set(sv, 0); /* start with empty line */
4570 if (FILTER_READ(0, sv, 0) > 0)
4571 return ( SvPVX(sv) ) ;
4576 return (sv_gets(sv, PL_rsfp, append));
4580 S_find_in_my_stash(pTHX_ const char *pkgname, STRLEN len)
4584 PERL_ARGS_ASSERT_FIND_IN_MY_STASH;
4586 if (len == 11 && *pkgname == '_' && strEQ(pkgname, "__PACKAGE__"))
4590 && (pkgname[len - 2] == ':' && pkgname[len - 1] == ':')
4591 && (gv = gv_fetchpvn_flags(pkgname,
4593 ( UTF ? SVf_UTF8 : 0 ), SVt_PVHV)))
4595 return GvHV(gv); /* Foo:: */
4598 /* use constant CLASS => 'MyClass' */
4599 gv = gv_fetchpvn_flags(pkgname, len, UTF ? SVf_UTF8 : 0, SVt_PVCV);
4600 if (gv && GvCV(gv)) {
4601 SV * const sv = cv_const_sv(GvCV(gv));
4603 return gv_stashsv(sv, 0);
4606 return gv_stashpvn(pkgname, len, UTF ? SVf_UTF8 : 0);
4611 S_tokenize_use(pTHX_ int is_use, char *s) {
4612 PERL_ARGS_ASSERT_TOKENIZE_USE;
4614 if (PL_expect != XSTATE)
4615 yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression",
4616 is_use ? "use" : "no"));
4619 if (isDIGIT(*s) || (*s == 'v' && isDIGIT(s[1]))) {
4620 s = force_version(s, TRUE);
4621 if (*s == ';' || *s == '}'
4622 || (s = skipspace(s), (*s == ';' || *s == '}'))) {
4623 NEXTVAL_NEXTTOKE.opval = NULL;
4624 force_next(BAREWORD);
4626 else if (*s == 'v') {
4627 s = force_word(s,BAREWORD,FALSE,TRUE);
4628 s = force_version(s, FALSE);
4632 s = force_word(s,BAREWORD,FALSE,TRUE);
4633 s = force_version(s, FALSE);
4635 pl_yylval.ival = is_use;
4639 static const char* const exp_name[] =
4640 { "OPERATOR", "TERM", "REF", "STATE", "BLOCK", "ATTRBLOCK",
4641 "ATTRTERM", "TERMBLOCK", "XBLOCKTERM", "POSTDEREF",
4642 "SIGVAR", "TERMORDORDOR"
4646 #define word_takes_any_delimiter(p,l) S_word_takes_any_delimiter(p,l)
4648 S_word_takes_any_delimiter(char *p, STRLEN len)
4650 return (len == 1 && strchr("msyq", p[0]))
4652 && ((p[0] == 't' && p[1] == 'r')
4653 || (p[0] == 'q' && strchr("qwxr", p[1]))));
4657 S_check_scalar_slice(pTHX_ char *s)
4660 while (SPACE_OR_TAB(*s)) s++;
4661 if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2,
4667 while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
4668 || (*s && strchr(" \t$#+-'\"", *s)))
4670 s += UTF ? UTF8SKIP(s) : 1;
4672 if (*s == '}' || *s == ']')
4673 pl_yylval.ival = OPpSLICEWARNING;
4676 #define lex_token_boundary() S_lex_token_boundary(aTHX)
4678 S_lex_token_boundary(pTHX)
4680 PL_oldoldbufptr = PL_oldbufptr;
4681 PL_oldbufptr = PL_bufptr;
4684 #define vcs_conflict_marker(s) S_vcs_conflict_marker(aTHX_ s)
4686 S_vcs_conflict_marker(pTHX_ char *s)
4688 lex_token_boundary();
4690 yyerror("Version control conflict marker");
4691 while (s < PL_bufend && *s != '\n')
4699 Works out what to call the token just pulled out of the input
4700 stream. The yacc parser takes care of taking the ops we return and
4701 stitching them into a tree.
4704 The type of the next token
4707 Check if we have already built the token; if so, use it.
4708 Switch based on the current state:
4709 - if we have a case modifier in a string, deal with that
4710 - handle other cases of interpolation inside a string
4711 - scan the next line if we are inside a format
4712 In the normal state, switch on the next character:
4714 if alphabetic, go to key lookup
4715 unrecognized character - croak
4716 - 0/4/26: handle end-of-line or EOF
4717 - cases for whitespace
4718 - \n and #: handle comments and line numbers
4719 - various operators, brackets and sigils
4722 - 'v': vstrings (or go to key lookup)
4723 - 'x' repetition operator (or go to key lookup)
4724 - other ASCII alphanumerics (key lookup begins here):
4727 scan built-in keyword (but do nothing with it yet)
4728 check for statement label
4729 check for lexical subs
4730 goto just_a_word if there is one
4731 see whether built-in keyword is overridden
4732 switch on keyword number:
4733 - default: just_a_word:
4734 not a built-in keyword; handle bareword lookup
4735 disambiguate between method and sub call
4736 fall back to bareword
4737 - cases for built-in keywords
4745 char *s = PL_bufptr;
4749 const bool saw_infix_sigil = cBOOL(PL_parser->saw_infix_sigil);
4753 /* orig_keyword, gvp, and gv are initialized here because
4754 * jump to the label just_a_word_zero can bypass their
4755 * initialization later. */
4756 I32 orig_keyword = 0;
4760 if (UNLIKELY(PL_parser->recheck_utf8_validity)) {
4761 const U8* first_bad_char_loc;
4762 if (UTF && UNLIKELY(! is_utf8_string_loc((U8 *) PL_bufptr,
4763 PL_bufend - PL_bufptr,
4764 &first_bad_char_loc)))
4766 _force_out_malformed_utf8_message(first_bad_char_loc,
4769 1 /* 1 means die */ );
4770 NOT_REACHED; /* NOTREACHED */
4772 PL_parser->recheck_utf8_validity = FALSE;
4775 SV* tmp = newSVpvs("");
4776 PerlIO_printf(Perl_debug_log, "### %" IVdf ":LEX_%s/X%s %s\n",
4777 (IV)CopLINE(PL_curcop),
4778 lex_state_names[PL_lex_state],
4779 exp_name[PL_expect],
4780 pv_display(tmp, s, strlen(s), 0, 60));
4784 /* when we've already built the next token, just pull it out of the queue */
4787 pl_yylval = PL_nextval[PL_nexttoke];
4790 next_type = PL_nexttype[PL_nexttoke];
4791 if (next_type & (7<<24)) {
4792 if (next_type & (1<<24)) {
4793 if (PL_lex_brackets > 100)
4794 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
4795 PL_lex_brackstack[PL_lex_brackets++] =
4796 (char) ((next_type >> 16) & 0xff);
4798 if (next_type & (2<<24))
4799 PL_lex_allbrackets++;
4800 if (next_type & (4<<24))
4801 PL_lex_allbrackets--;
4802 next_type &= 0xffff;
4804 return REPORT(next_type == 'p' ? pending_ident() : next_type);
4808 switch (PL_lex_state) {
4810 case LEX_INTERPNORMAL:
4813 /* interpolated case modifiers like \L \U, including \Q and \E.
4814 when we get here, PL_bufptr is at the \
4816 case LEX_INTERPCASEMOD:
4818 if (PL_bufptr != PL_bufend && *PL_bufptr != '\\')
4820 "panic: INTERPCASEMOD bufptr=%p, bufend=%p, *bufptr=%u",
4821 PL_bufptr, PL_bufend, *PL_bufptr);
4823 /* handle \E or end of string */
4824 if (PL_bufptr == PL_bufend || PL_bufptr[1] == 'E') {
4826 if (PL_lex_casemods) {
4827 const char oldmod = PL_lex_casestack[--PL_lex_casemods];
4828 PL_lex_casestack[PL_lex_casemods] = '\0';
4830 if (PL_bufptr != PL_bufend
4831 && (oldmod == 'L' || oldmod == 'U' || oldmod == 'Q'
4832 || oldmod == 'F')) {
4834 PL_lex_state = LEX_INTERPCONCAT;
4836 PL_lex_allbrackets--;
4839 else if ( PL_bufptr != PL_bufend && PL_bufptr[1] == 'E' ) {
4840 /* Got an unpaired \E */
4841 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
4842 "Useless use of \\E");
4844 if (PL_bufptr != PL_bufend)
4846 PL_lex_state = LEX_INTERPCONCAT;
4850 DEBUG_T({ PerlIO_printf(Perl_debug_log,
4851 "### Saw case modifier\n"); });
4853 if (s[1] == '\\' && s[2] == 'E') {
4855 PL_lex_state = LEX_INTERPCONCAT;
4860 if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3))
4861 tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */
4862 if ((*s == 'L' || *s == 'U' || *s == 'F')
4863 && (strpbrk(PL_lex_casestack, "LUF")))
4865 PL_lex_casestack[--PL_lex_casemods] = '\0';
4866 PL_lex_allbrackets--;
4869 if (PL_lex_casemods > 10)
4870 Renew(PL_lex_casestack, PL_lex_casemods + 2, char);
4871 PL_lex_casestack[PL_lex_casemods++] = *s;
4872 PL_lex_casestack[PL_lex_casemods] = '\0';
4873 PL_lex_state = LEX_INTERPCONCAT;
4874 NEXTVAL_NEXTTOKE.ival = 0;
4875 force_next((2<<24)|'(');
4877 NEXTVAL_NEXTTOKE.ival = OP_LCFIRST;
4879 NEXTVAL_NEXTTOKE.ival = OP_UCFIRST;
4881 NEXTVAL_NEXTTOKE.ival = OP_LC;
4883 NEXTVAL_NEXTTOKE.ival = OP_UC;
4885 NEXTVAL_NEXTTOKE.ival = OP_QUOTEMETA;
4887 NEXTVAL_NEXTTOKE.ival = OP_FC;
4889 Perl_croak(aTHX_ "panic: yylex, *s=%u", *s);
4893 if (PL_lex_starts) {
4896 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
4897 if (PL_lex_casemods == 1 && PL_lex_inpat)
4900 AopNOASSIGN(OP_CONCAT);
4906 case LEX_INTERPPUSH:
4907 return REPORT(sublex_push());
4909 case LEX_INTERPSTART:
4910 if (PL_bufptr == PL_bufend)
4911 return REPORT(sublex_done());
4912 DEBUG_T({ if(*PL_bufptr != '(') PerlIO_printf(Perl_debug_log,
4913 "### Interpolated variable\n"); });
4915 /* for /@a/, we leave the joining for the regex engine to do
4916 * (unless we're within \Q etc) */
4917 PL_lex_dojoin = (*PL_bufptr == '@'
4918 && (!PL_lex_inpat || PL_lex_casemods));
4919 PL_lex_state = LEX_INTERPNORMAL;
4920 if (PL_lex_dojoin) {
4921 NEXTVAL_NEXTTOKE.ival = 0;
4923 force_ident("\"", '$');
4924 NEXTVAL_NEXTTOKE.ival = 0;
4926 NEXTVAL_NEXTTOKE.ival = 0;
4927 force_next((2<<24)|'(');
4928 NEXTVAL_NEXTTOKE.ival = OP_JOIN; /* emulate join($", ...) */
4931 /* Convert (?{...}) and friends to 'do {...}' */
4932 if (PL_lex_inpat && *PL_bufptr == '(') {
4933 PL_parser->lex_shared->re_eval_start = PL_bufptr;
4935 if (*PL_bufptr != '{')
4937 PL_expect = XTERMBLOCK;
4941 if (PL_lex_starts++) {
4943 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
4944 if (!PL_lex_casemods && PL_lex_inpat)
4947 AopNOASSIGN(OP_CONCAT);
4951 case LEX_INTERPENDMAYBE:
4952 if (intuit_more(PL_bufptr)) {
4953 PL_lex_state = LEX_INTERPNORMAL; /* false alarm, more expr */
4959 if (PL_lex_dojoin) {
4960 const U8 dojoin_was = PL_lex_dojoin;
4961 PL_lex_dojoin = FALSE;
4962 PL_lex_state = LEX_INTERPCONCAT;
4963 PL_lex_allbrackets--;
4964 return REPORT(dojoin_was == 1 ? (int)')' : (int)POSTJOIN);
4966 if (PL_lex_inwhat == OP_SUBST && PL_linestr == PL_lex_repl
4967 && SvEVALED(PL_lex_repl))
4969 if (PL_bufptr != PL_bufend)
4970 Perl_croak(aTHX_ "Bad evalled substitution pattern");
4973 /* Paranoia. re_eval_start is adjusted when S_scan_heredoc sets
4974 re_eval_str. If the here-doc body’s length equals the previous
4975 value of re_eval_start, re_eval_start will now be null. So
4976 check re_eval_str as well. */
4977 if (PL_parser->lex_shared->re_eval_start
4978 || PL_parser->lex_shared->re_eval_str) {
4980 if (*PL_bufptr != ')')
4981 Perl_croak(aTHX_ "Sequence (?{...}) not terminated with ')'");
4983 /* having compiled a (?{..}) expression, return the original
4984 * text too, as a const */
4985 if (PL_parser->lex_shared->re_eval_str) {
4986 sv = PL_parser->lex_shared->re_eval_str;
4987 PL_parser->lex_shared->re_eval_str = NULL;
4989 PL_bufptr - PL_parser->lex_shared->re_eval_start);
4990 SvPV_shrink_to_cur(sv);
4992 else sv = newSVpvn(PL_parser->lex_shared->re_eval_start,
4993 PL_bufptr - PL_parser->lex_shared->re_eval_start);
4994 NEXTVAL_NEXTTOKE.opval =
4995 newSVOP(OP_CONST, 0,
4998 PL_parser->lex_shared->re_eval_start = NULL;
5004 case LEX_INTERPCONCAT:
5006 if (PL_lex_brackets)
5007 Perl_croak(aTHX_ "panic: INTERPCONCAT, lex_brackets=%ld",
5008 (long) PL_lex_brackets);
5010 if (PL_bufptr == PL_bufend)
5011 return REPORT(sublex_done());
5013 /* m'foo' still needs to be parsed for possible (?{...}) */
5014 if (SvIVX(PL_linestr) == '\'' && !PL_lex_inpat) {
5015 SV *sv = newSVsv(PL_linestr);
5017 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
5021 int save_error_count = PL_error_count;
5023 s = scan_const(PL_bufptr);
5025 /* Set flag if this was a pattern and there were errors. op.c will
5026 * refuse to compile a pattern with this flag set. Otherwise, we
5027 * could get segfaults, etc. */
5028 if (PL_lex_inpat && PL_error_count > save_error_count) {
5029 ((PMOP*)PL_lex_inpat)->op_pmflags |= PMf_HAS_ERROR;
5032 PL_lex_state = LEX_INTERPCASEMOD;
5034 PL_lex_state = LEX_INTERPSTART;
5037 if (s != PL_bufptr) {
5038 NEXTVAL_NEXTTOKE = pl_yylval;
5041 if (PL_lex_starts++) {
5042 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
5043 if (!PL_lex_casemods && PL_lex_inpat)
5046 AopNOASSIGN(OP_CONCAT);
5056 s = scan_formline(PL_bufptr);
5057 if (!PL_lex_formbrack)
5066 /* We really do *not* want PL_linestr ever becoming a COW. */
5067 assert (!SvIsCOW(PL_linestr));
5069 PL_oldoldbufptr = PL_oldbufptr;
5071 PL_parser->saw_infix_sigil = 0;
5073 if (PL_in_my == KEY_sigvar) {
5074 /* we expect the sigil and optional var name part of a
5075 * signature element here. Since a '$' is not necessarily
5076 * followed by a var name, handle it specially here; the general
5077 * yylex code would otherwise try to interpret whatever follows
5078 * as a var; e.g. ($, ...) would be seen as the var '$,'
5085 PL_bufptr = s; /* for error reporting */
5090 /* spot stuff that looks like an prototype */
5091 if (strchr("$:@%&*;\\[]", *s)) {
5092 yyerror("Illegal character following sigil in a subroutine signature");
5095 /* '$#' is banned, while '$ # comment' isn't */
5097 yyerror("'#' not allowed immediately following a sigil in a subroutine signature");
5101 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5102 char *dest = PL_tokenbuf + 1;
5103 /* read var name, including sigil, into PL_tokenbuf */
5104 PL_tokenbuf[0] = sigil;
5105 parse_ident(&s, &dest, dest + sizeof(PL_tokenbuf) - 1,
5106 0, cBOOL(UTF), FALSE);
5108 assert(PL_tokenbuf[1]); /* we have a variable name */
5109 NEXTVAL_NEXTTOKE.ival = sigil;
5110 force_next('p'); /* force a signature pending identifier */
5114 PL_expect = XOPERATOR;
5120 case ',': /* handle ($a,,$b) */
5125 yyerror("A signature parameter must start with '$', '@' or '%'");
5126 /* very crude error recovery: skip to likely next signature
5128 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
5139 if (isIDFIRST_utf8_safe(s, PL_bufend)) {
5143 else if (isALNUMC(*s)) {
5147 SV *dsv = newSVpvs_flags("", SVs_TEMP);
5150 STRLEN skiplen = UTF8SKIP(s);
5151 STRLEN stravail = PL_bufend - s;
5152 c = sv_uni_display(dsv, newSVpvn_flags(s,
5153 skiplen > stravail ? stravail : skiplen,
5154 SVs_TEMP | SVf_UTF8),
5155 10, UNI_DISPLAY_ISPRINT);
5158 c = Perl_form(aTHX_ "\\x%02X", (unsigned char)*s);
5160 len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart);
5161 if (len > UNRECOGNIZED_PRECEDE_COUNT) {
5162 d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)PL_linestart) : s - UNRECOGNIZED_PRECEDE_COUNT;
5166 Perl_croak(aTHX_ "Unrecognized character %s; marked by <-- HERE after %" UTF8f "<-- HERE near column %d", c,
5167 UTF8fARG(UTF, (s - d), d),
5172 goto fake_eof; /* emulate EOF on ^D or ^Z */
5174 if ((!PL_rsfp || PL_lex_inwhat)
5175 && (!PL_parser->filtered || s+1 < PL_bufend)) {
5179 && PL_lex_brackstack[PL_lex_brackets-1] != XFAKEEOF)
5181 yyerror((const char *)
5183 ? "Format not terminated"
5184 : "Missing right curly or square bracket"));
5186 DEBUG_T( { PerlIO_printf(Perl_debug_log,
5187 "### Tokener got EOF\n");
5191 if (s++ < PL_bufend)
5192 goto retry; /* ignore stray nulls */
5195 if (!PL_in_eval && !PL_preambled) {
5196 PL_preambled = TRUE;
5198 /* Generate a string of Perl code to load the debugger.
5199 * If PERL5DB is set, it will return the contents of that,
5200 * otherwise a compile-time require of perl5db.pl. */
5202 const char * const pdb = PerlEnv_getenv("PERL5DB");
5205 sv_setpv(PL_linestr, pdb);
5206 sv_catpvs(PL_linestr,";");
5208 SETERRNO(0,SS_NORMAL);
5209 sv_setpvs(PL_linestr, "BEGIN { require 'perl5db.pl' };");
5211 PL_parser->preambling = CopLINE(PL_curcop);
5213 SvPVCLEAR(PL_linestr);
5214 if (PL_preambleav) {
5215 SV **svp = AvARRAY(PL_preambleav);
5216 SV **const end = svp + AvFILLp(PL_preambleav);
5218 sv_catsv(PL_linestr, *svp);
5220 sv_catpvs(PL_linestr, ";");
5222 sv_free(MUTABLE_SV(PL_preambleav));
5223 PL_preambleav = NULL;
5226 sv_catpvs(PL_linestr,
5227 "use feature ':5." STRINGIFY(PERL_VERSION) "';");
5228 if (PL_minus_n || PL_minus_p) {
5229 sv_catpvs(PL_linestr, "LINE: while (<>) {"/*}*/);
5231 sv_catpvs(PL_linestr,"chomp;");
5234 if ((*PL_splitstr == '/' || *PL_splitstr == '\''
5235 || *PL_splitstr == '"')
5236 && strchr(PL_splitstr + 1, *PL_splitstr))
5237 Perl_sv_catpvf(aTHX_ PL_linestr, "our @F=split(%s);", PL_splitstr);
5239 /* "q\0${splitstr}\0" is legal perl. Yes, even NUL
5240 bytes can be used as quoting characters. :-) */
5241 const char *splits = PL_splitstr;
5242 sv_catpvs(PL_linestr, "our @F=split(q\0");
5245 if (*splits == '\\')
5246 sv_catpvn(PL_linestr, splits, 1);
5247 sv_catpvn(PL_linestr, splits, 1);
5248 } while (*splits++);
5249 /* This loop will embed the trailing NUL of
5250 PL_linestr as the last thing it does before
5252 sv_catpvs(PL_linestr, ");");
5256 sv_catpvs(PL_linestr,"our @F=split(' ');");
5259 sv_catpvs(PL_linestr, "\n");
5260 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
5261 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5262 PL_last_lop = PL_last_uni = NULL;
5263 if (PERLDB_LINE_OR_SAVESRC && PL_curstash != PL_debstash)
5264 update_debugger_info(PL_linestr, NULL, 0);
5269 bof = cBOOL(PL_rsfp);
5272 fake_eof = LEX_FAKE_EOF;
5274 PL_bufptr = PL_bufend;
5275 COPLINE_INC_WITH_HERELINES;
5276 if (!lex_next_chunk(fake_eof)) {
5277 CopLINE_dec(PL_curcop);
5279 TOKEN(';'); /* not infinite loop because rsfp is NULL now */
5281 CopLINE_dec(PL_curcop);
5283 /* If it looks like the start of a BOM or raw UTF-16,
5284 * check if it in fact is. */
5287 || *(U8*)s == BOM_UTF8_FIRST_BYTE
5291 Off_t offset = (IV)PerlIO_tell(PL_rsfp);
5292 bof = (offset == (Off_t)SvCUR(PL_linestr));
5293 #if defined(PERLIO_USING_CRLF) && defined(PERL_TEXTMODE_SCRIPTS)
5294 /* offset may include swallowed CR */
5296 bof = (offset == (Off_t)SvCUR(PL_linestr)+1);
5299 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5300 s = swallow_bom((U8*)s);
5303 if (PL_parser->in_pod) {
5304 /* Incest with pod. */
5305 if (*s == '=' && strEQs(s, "=cut") && !isALPHA(s[4])) {
5306 SvPVCLEAR(PL_linestr);
5307 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
5308 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5309 PL_last_lop = PL_last_uni = NULL;
5310 PL_parser->in_pod = 0;
5313 if (PL_rsfp || PL_parser->filtered)
5315 } while (PL_parser->in_pod);
5316 PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s;
5317 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5318 PL_last_lop = PL_last_uni = NULL;
5319 if (CopLINE(PL_curcop) == 1) {
5320 while (s < PL_bufend && isSPACE(*s))
5322 if (*s == ':' && s[1] != ':') /* for csh execing sh scripts */
5326 if (*s == '#' && *(s+1) == '!')
5328 #ifdef ALTERNATE_SHEBANG
5330 static char const as[] = ALTERNATE_SHEBANG;
5331 if (*s == as[0] && strnEQ(s, as, sizeof(as) - 1))
5332 d = s + (sizeof(as) - 1);
5334 #endif /* ALTERNATE_SHEBANG */
5343 while (*d && !isSPACE(*d))
5347 #ifdef ARG_ZERO_IS_SCRIPT
5348 if (ipathend > ipath) {
5350 * HP-UX (at least) sets argv[0] to the script name,
5351 * which makes $^X incorrect. And Digital UNIX and Linux,
5352 * at least, set argv[0] to the basename of the Perl
5353 * interpreter. So, having found "#!", we'll set it right.
5355 SV* copfilesv = CopFILESV(PL_curcop);
5358 GvSV(gv_fetchpvs("\030", GV_ADD|GV_NOTQUAL,
5360 assert(SvPOK(x) || SvGMAGICAL(x));
5361 if (sv_eq(x, copfilesv)) {
5362 sv_setpvn(x, ipath, ipathend - ipath);
5368 const char *bstart = SvPV_const(copfilesv, blen);
5369 const char * const lstart = SvPV_const(x, llen);
5371 bstart += blen - llen;
5372 if (strnEQ(bstart, lstart, llen) && bstart[-1] == '/') {
5373 sv_setpvn(x, ipath, ipathend - ipath);
5380 /* Anything to do if no copfilesv? */
5382 TAINT_NOT; /* $^X is always tainted, but that's OK */
5384 #endif /* ARG_ZERO_IS_SCRIPT */
5389 d = instr(s,"perl -");
5391 d = instr(s,"perl");
5393 /* avoid getting into infinite loops when shebang
5394 * line contains "Perl" rather than "perl" */
5396 for (d = ipathend-4; d >= ipath; --d) {
5397 if (isALPHA_FOLD_EQ(*d, 'p')
5398 && !ibcmp(d, "perl", 4))
5408 #ifdef ALTERNATE_SHEBANG
5410 * If the ALTERNATE_SHEBANG on this system starts with a
5411 * character that can be part of a Perl expression, then if
5412 * we see it but not "perl", we're probably looking at the
5413 * start of Perl code, not a request to hand off to some
5414 * other interpreter. Similarly, if "perl" is there, but
5415 * not in the first 'word' of the line, we assume the line
5416 * contains the start of the Perl program.
5418 if (d && *s != '#') {
5419 const char *c = ipath;
5420 while (*c && !strchr("; \t\r\n\f\v#", *c))
5423 d = NULL; /* "perl" not in first word; ignore */
5425 *s = '#'; /* Don't try to parse shebang line */
5427 #endif /* ALTERNATE_SHEBANG */
5432 && !instr(s,"indir")
5433 && instr(PL_origargv[0],"perl"))
5440 while (s < PL_bufend && isSPACE(*s))
5442 if (s < PL_bufend) {
5443 Newx(newargv,PL_origargc+3,char*);
5445 while (s < PL_bufend && !isSPACE(*s))
5448 Copy(PL_origargv+1, newargv+2, PL_origargc+1, char*);
5451 newargv = PL_origargv;
5454 PerlProc_execv(ipath, EXEC_ARGV_CAST(newargv));
5456 Perl_croak(aTHX_ "Can't exec %s", ipath);
5459 while (*d && !isSPACE(*d))
5461 while (SPACE_OR_TAB(*d))
5465 const bool switches_done = PL_doswitches;
5466 const U32 oldpdb = PL_perldb;
5467 const bool oldn = PL_minus_n;
5468 const bool oldp = PL_minus_p;
5472 bool baduni = FALSE;
5474 const char *d2 = d1 + 1;
5475 if (parse_unicode_opts((const char **)&d2)
5479 if (baduni || isALPHA_FOLD_EQ(*d1, 'M')) {
5480 const char * const m = d1;
5481 while (*d1 && !isSPACE(*d1))
5483 Perl_croak(aTHX_ "Too late for \"-%.*s\" option",
5486 d1 = moreswitches(d1);
5488 if (PL_doswitches && !switches_done) {
5489 int argc = PL_origargc;
5490 char **argv = PL_origargv;
5493 } while (argc && argv[0][0] == '-' && argv[0][1]);
5494 init_argv_symbols(argc,argv);
5496 if ( (PERLDB_LINE_OR_SAVESRC && !oldpdb)
5497 || ((PL_minus_n || PL_minus_p) && !(oldn || oldp)))
5498 /* if we have already added "LINE: while (<>) {",
5499 we must not do it again */
5501 SvPVCLEAR(PL_linestr);
5502 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
5503 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5504 PL_last_lop = PL_last_uni = NULL;
5505 PL_preambled = FALSE;
5506 if (PERLDB_LINE_OR_SAVESRC)
5507 (void)gv_fetchfile(PL_origfilename);
5514 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
5515 PL_lex_state = LEX_FORMLINE;
5516 force_next(FORMRBRACK);
5521 #ifdef PERL_STRICT_CR
5522 Perl_warn(aTHX_ "Illegal character \\%03o (carriage return)", '\r');
5524 "\t(Maybe you didn't strip carriage returns after a network transfer?)\n");
5526 case ' ': case '\t': case '\f': case '\v':
5531 if (PL_lex_state != LEX_NORMAL
5532 || (PL_in_eval && !PL_rsfp && !PL_parser->filtered))
5534 const bool in_comment = *s == '#';
5535 if (*s == '#' && s == PL_linestart && PL_in_eval
5536 && !PL_rsfp && !PL_parser->filtered) {
5537 /* handle eval qq[#line 1 "foo"\n ...] */
5538 CopLINE_dec(PL_curcop);
5542 while (d < PL_bufend && *d != '\n')
5546 else if (d > PL_bufend)
5547 /* Found by Ilya: feed random input to Perl. */
5548 Perl_croak(aTHX_ "panic: input overflow, %p > %p",
5551 if (in_comment && d == PL_bufend
5552 && PL_lex_state == LEX_INTERPNORMAL
5553 && PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
5554 && SvEVALED(PL_lex_repl) && d[-1] == '}') s--;
5557 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
5558 PL_lex_state = LEX_FORMLINE;
5559 force_next(FORMRBRACK);
5564 while (s < PL_bufend && *s != '\n')
5572 else if (s > PL_bufend)
5573 /* Found by Ilya: feed random input to Perl. */
5574 Perl_croak(aTHX_ "panic: input overflow");
5578 if (s[1] && isALPHA(s[1]) && !isWORDCHAR(s[2])) {
5586 while (s < PL_bufend && SPACE_OR_TAB(*s))
5589 if (strEQs(s,"=>")) {
5590 s = force_word(PL_bufptr,BAREWORD,FALSE,FALSE);
5591 DEBUG_T( { printbuf("### Saw unary minus before =>, forcing word %s\n", s); } );
5592 OPERATOR('-'); /* unary minus */
5595 case 'r': ftst = OP_FTEREAD; break;
5596 case 'w': ftst = OP_FTEWRITE; break;
5597 case 'x': ftst = OP_FTEEXEC; break;
5598 case 'o': ftst = OP_FTEOWNED; break;
5599 case 'R': ftst = OP_FTRREAD; break;
5600 case 'W': ftst = OP_FTRWRITE; break;
5601 case 'X': ftst = OP_FTREXEC; break;
5602 case 'O': ftst = OP_FTROWNED; break;
5603 case 'e': ftst = OP_FTIS; break;
5604 case 'z': ftst = OP_FTZERO; break;
5605 case 's': ftst = OP_FTSIZE; break;
5606 case 'f': ftst = OP_FTFILE; break;
5607 case 'd': ftst = OP_FTDIR; break;
5608 case 'l': ftst = OP_FTLINK; break;
5609 case 'p': ftst = OP_FTPIPE; break;
5610 case 'S': ftst = OP_FTSOCK; break;
5611 case 'u': ftst = OP_FTSUID; break;
5612 case 'g': ftst = OP_FTSGID; break;
5613 case 'k': ftst = OP_FTSVTX; break;
5614 case 'b': ftst = OP_FTBLK; break;
5615 case 'c': ftst = OP_FTCHR; break;
5616 case 't': ftst = OP_FTTTY; break;
5617 case 'T': ftst = OP_FTTEXT; break;
5618 case 'B': ftst = OP_FTBINARY; break;
5619 case 'M': case 'A': case 'C':
5620 gv_fetchpvs("\024", GV_ADD|GV_NOTQUAL, SVt_PV);
5622 case 'M': ftst = OP_FTMTIME; break;
5623 case 'A': ftst = OP_FTATIME; break;
5624 case 'C': ftst = OP_FTCTIME; break;
5632 PL_last_uni = PL_oldbufptr;
5633 PL_last_lop_op = (OPCODE)ftst;
5634 DEBUG_T( { PerlIO_printf(Perl_debug_log,
5635 "### Saw file test %c\n", (int)tmp);
5640 /* Assume it was a minus followed by a one-letter named
5641 * subroutine call (or a -bareword), then. */
5642 DEBUG_T( { PerlIO_printf(Perl_debug_log,
5643 "### '-%c' looked like a file test but was not\n",
5650 const char tmp = *s++;
5653 if (PL_expect == XOPERATOR)
5658 else if (*s == '>') {
5661 if (((*s == '$' || *s == '&') && s[1] == '*')
5662 ||(*s == '$' && s[1] == '#' && s[2] == '*')
5663 ||((*s == '@' || *s == '%') && strchr("*[{", s[1]))
5664 ||(*s == '*' && (s[1] == '*' || s[1] == '{'))
5667 PL_expect = XPOSTDEREF;
5670 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5671 s = force_word(s,METHOD,FALSE,TRUE);
5679 if (PL_expect == XOPERATOR) {
5681 && !PL_lex_allbrackets
5682 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5690 if (isSPACE(*s) || !isSPACE(*PL_bufptr))
5692 OPERATOR('-'); /* unary minus */
5698 const char tmp = *s++;
5701 if (PL_expect == XOPERATOR)
5706 if (PL_expect == XOPERATOR) {
5708 && !PL_lex_allbrackets
5709 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5717 if (isSPACE(*s) || !isSPACE(*PL_bufptr))
5724 if (PL_expect == XPOSTDEREF) POSTDEREF('*');
5725 if (PL_expect != XOPERATOR) {
5726 s = scan_ident(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE);
5727 PL_expect = XOPERATOR;
5728 force_ident(PL_tokenbuf, '*');
5736 if (*s == '=' && !PL_lex_allbrackets
5737 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5745 && !PL_lex_allbrackets
5746 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5751 PL_parser->saw_infix_sigil = 1;
5756 if (PL_expect == XOPERATOR) {
5758 && !PL_lex_allbrackets
5759 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5764 PL_parser->saw_infix_sigil = 1;
5767 else if (PL_expect == XPOSTDEREF) POSTDEREF('%');
5768 PL_tokenbuf[0] = '%';
5769 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
5771 if (!PL_tokenbuf[1]) {
5774 if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) && intuit_more(s)) {
5776 PL_tokenbuf[0] = '@';
5778 PL_expect = XOPERATOR;
5779 force_ident_maybe_lex('%');
5784 bof = FEATURE_BITWISE_IS_ENABLED;
5785 if (bof && s[1] == '.')
5787 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
5788 (s[1] == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE))
5794 BOop(bof ? d == s-2 ? OP_SBIT_XOR : OP_NBIT_XOR : OP_BIT_XOR);
5796 if (PL_lex_brackets > 100)
5797 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
5798 PL_lex_brackstack[PL_lex_brackets++] = 0;
5799 PL_lex_allbrackets++;
5801 const char tmp = *s++;
5806 && (PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR))
5808 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
5811 Perl_ck_warner_d(aTHX_
5812 packWARN(WARN_EXPERIMENTAL__SMARTMATCH),
5813 "Smartmatch is experimental");
5817 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.') {
5819 BCop(OP_SCOMPLEMENT);
5821 BCop(bof ? OP_NCOMPLEMENT : OP_COMPLEMENT);
5823 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMMA)
5830 goto just_a_word_zero_gv;
5836 switch (PL_expect) {
5838 if (!PL_in_my || PL_lex_state != LEX_NORMAL)
5840 PL_bufptr = s; /* update in case we back off */
5843 "Use of := for an empty attribute list is not allowed");
5850 PL_expect = XTERMBLOCK;
5854 while (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5857 d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
5858 if (isLOWER(*s) && (tmp = keyword(PL_tokenbuf, len, 0))) {
5859 if (tmp < 0) tmp = -tmp;
5874 sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0);
5876 d = scan_str(d,TRUE,TRUE,FALSE,NULL);
5881 Perl_croak(aTHX_ "Unterminated attribute parameter in attribute list");
5883 COPLINE_SET_FROM_MULTI_END;
5886 sv_catsv(sv, PL_lex_stuff);
5887 attrs = op_append_elem(OP_LIST, attrs,
5888 newSVOP(OP_CONST, 0, sv));
5889 SvREFCNT_dec_NN(PL_lex_stuff);
5890 PL_lex_stuff = NULL;
5893 /* NOTE: any CV attrs applied here need to be part of
5894 the CVf_BUILTIN_ATTRS define in cv.h! */
5895 if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "lvalue", len)) {
5897 CvLVALUE_on(PL_compcv);
5899 else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) {
5901 CvMETHOD_on(PL_compcv);
5903 else if (!PL_in_my && len == 5
5904 && strnEQ(SvPVX(sv), "const", len))
5907 Perl_ck_warner_d(aTHX_
5908 packWARN(WARN_EXPERIMENTAL__CONST_ATTR),
5909 ":const is experimental"
5911 CvANONCONST_on(PL_compcv);
5912 if (!CvANON(PL_compcv))
5913 yyerror(":const is not permitted on named "
5916 /* After we've set the flags, it could be argued that
5917 we don't need to do the attributes.pm-based setting
5918 process, and shouldn't bother appending recognized
5919 flags. To experiment with that, uncomment the
5920 following "else". (Note that's already been
5921 uncommented. That keeps the above-applied built-in
5922 attributes from being intercepted (and possibly
5923 rejected) by a package's attribute routines, but is
5924 justified by the performance win for the common case
5925 of applying only built-in attributes.) */
5927 attrs = op_append_elem(OP_LIST, attrs,
5928 newSVOP(OP_CONST, 0,
5932 if (*s == ':' && s[1] != ':')
5935 break; /* require real whitespace or :'s */
5936 /* XXX losing whitespace on sequential attributes here */
5941 && !(PL_expect == XOPERATOR
5942 ? (*s == '=' || *s == ')')
5943 : (*s == '{' || *s == '(')))
5945 const char q = ((*s == '\'') ? '"' : '\'');
5946 /* If here for an expression, and parsed no attrs, back
5948 if (PL_expect == XOPERATOR && !attrs) {
5952 /* MUST advance bufptr here to avoid bogus "at end of line"
5953 context messages from yyerror().
5956 yyerror( (const char *)
5958 ? Perl_form(aTHX_ "Invalid separator character "
5959 "%c%c%c in attribute list", q, *s, q)
5960 : "Unterminated attribute list" ) );
5968 NEXTVAL_NEXTTOKE.opval = attrs;
5974 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_CLOSING) {
5978 PL_lex_allbrackets--;
5982 if (PL_last_lop == PL_oldoldbufptr || PL_last_uni == PL_oldoldbufptr)
5983 PL_oldbufptr = PL_oldoldbufptr; /* allow print(STDOUT 123) */
5987 PL_lex_allbrackets++;
5990 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
5997 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_CLOSING)
6000 PL_lex_allbrackets--;
6006 if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF)
6009 if (PL_lex_brackets <= 0)
6010 /* diag_listed_as: Unmatched right %s bracket */
6011 yyerror("Unmatched right square bracket");
6014 PL_lex_allbrackets--;
6015 if (PL_lex_state == LEX_INTERPNORMAL) {
6016 if (PL_lex_brackets == 0) {
6017 if (*s == '-' && s[1] == '>')
6018 PL_lex_state = LEX_INTERPENDMAYBE;
6019 else if (*s != '[' && *s != '{')
6020 PL_lex_state = LEX_INTERPEND;
6027 if (PL_lex_brackets > 100) {
6028 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
6030 switch (PL_expect) {
6033 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6034 PL_lex_allbrackets++;
6035 OPERATOR(HASHBRACK);
6037 while (s < PL_bufend && SPACE_OR_TAB(*s))
6040 PL_tokenbuf[0] = '\0';
6041 if (d < PL_bufend && *d == '-') {
6042 PL_tokenbuf[0] = '-';
6044 while (d < PL_bufend && SPACE_OR_TAB(*d))
6047 if (d < PL_bufend && isIDFIRST_lazy_if_safe(d, PL_bufend, UTF)) {
6048 d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
6050 while (d < PL_bufend && SPACE_OR_TAB(*d))
6053 const char minus = (PL_tokenbuf[0] == '-');
6054 s = force_word(s + minus, BAREWORD, FALSE, TRUE);
6062 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6063 PL_lex_allbrackets++;
6068 PL_lex_brackstack[PL_lex_brackets++] = XSTATE;
6069 PL_lex_allbrackets++;
6073 PL_lex_brackstack[PL_lex_brackets++] = XTERM;
6074 PL_lex_allbrackets++;
6079 if (PL_oldoldbufptr == PL_last_lop)
6080 PL_lex_brackstack[PL_lex_brackets++] = XTERM;
6082 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6083 PL_lex_allbrackets++;
6086 if (PL_expect == XREF && PL_lex_state == LEX_INTERPNORMAL) {
6088 /* This hack is to get the ${} in the message. */
6090 yyerror("syntax error");
6093 OPERATOR(HASHBRACK);
6095 if (PL_expect == XREF && PL_oldoldbufptr != PL_last_lop) {
6096 /* ${...} or @{...} etc., but not print {...}
6097 * Skip the disambiguation and treat this as a block.
6099 goto block_expectation;
6101 /* This hack serves to disambiguate a pair of curlies
6102 * as being a block or an anon hash. Normally, expectation
6103 * determines that, but in cases where we're not in a
6104 * position to expect anything in particular (like inside
6105 * eval"") we have to resolve the ambiguity. This code
6106 * covers the case where the first term in the curlies is a
6107 * quoted string. Most other cases need to be explicitly
6108 * disambiguated by prepending a "+" before the opening
6109 * curly in order to force resolution as an anon hash.
6111 * XXX should probably propagate the outer expectation
6112 * into eval"" to rely less on this hack, but that could
6113 * potentially break current behavior of eval"".
6117 if (*s == '\'' || *s == '"' || *s == '`') {
6118 /* common case: get past first string, handling escapes */
6119 for (t++; t < PL_bufend && *t != *s;)
6124 else if (*s == 'q') {
6127 || ((*t == 'q' || *t == 'x') && ++t < PL_bufend
6128 && !isWORDCHAR(*t))))
6130 /* skip q//-like construct */
6132 char open, close, term;
6135 while (t < PL_bufend && isSPACE(*t))
6137 /* check for q => */
6138 if (t+1 < PL_bufend && t[0] == '=' && t[1] == '>') {
6139 OPERATOR(HASHBRACK);
6143 if (term && (tmps = strchr("([{< )]}> )]}>",term)))
6147 for (t++; t < PL_bufend; t++) {
6148 if (*t == '\\' && t+1 < PL_bufend && open != '\\')
6150 else if (*t == open)
6154 for (t++; t < PL_bufend; t++) {
6155 if (*t == '\\' && t+1 < PL_bufend)
6157 else if (*t == close && --brackets <= 0)
6159 else if (*t == open)
6166 /* skip plain q word */
6167 while ( t < PL_bufend
6168 && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
6170 t += UTF ? UTF8SKIP(t) : 1;
6173 else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) {
6174 t += UTF ? UTF8SKIP(t) : 1;
6175 while ( t < PL_bufend
6176 && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
6178 t += UTF ? UTF8SKIP(t) : 1;
6181 while (t < PL_bufend && isSPACE(*t))
6183 /* if comma follows first term, call it an anon hash */
6184 /* XXX it could be a comma expression with loop modifiers */
6185 if (t < PL_bufend && ((*t == ',' && (*s == 'q' || !isLOWER(*s)))
6186 || (*t == '=' && t[1] == '>')))
6187 OPERATOR(HASHBRACK);
6188 if (PL_expect == XREF)
6191 /* If there is an opening brace or 'sub:', treat it
6192 as a term to make ${{...}}{k} and &{sub:attr...}
6193 dwim. Otherwise, treat it as a statement, so
6194 map {no strict; ...} works.
6201 if (strEQs(s, "sub")) {
6212 PL_lex_brackstack[PL_lex_brackets-1] = XSTATE;
6218 pl_yylval.ival = CopLINE(PL_curcop);
6219 PL_copline = NOLINE; /* invalidate current command line number */
6220 TOKEN(formbrack ? '=' : '{');
6222 if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF)
6226 if (PL_lex_brackets <= 0)
6227 /* diag_listed_as: Unmatched right %s bracket */
6228 yyerror("Unmatched right curly bracket");
6230 PL_expect = (expectation)PL_lex_brackstack[--PL_lex_brackets];
6231 PL_lex_allbrackets--;
6232 if (PL_lex_state == LEX_INTERPNORMAL) {
6233 if (PL_lex_brackets == 0) {
6234 if (PL_expect & XFAKEBRACK) {
6235 PL_expect &= XENUMMASK;
6236 PL_lex_state = LEX_INTERPEND;
6238 return yylex(); /* ignore fake brackets */
6240 if (PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
6241 && SvEVALED(PL_lex_repl))
6242 PL_lex_state = LEX_INTERPEND;
6243 else if (*s == '-' && s[1] == '>')
6244 PL_lex_state = LEX_INTERPENDMAYBE;
6245 else if (*s != '[' && *s != '{')
6246 PL_lex_state = LEX_INTERPEND;
6249 if (PL_expect & XFAKEBRACK) {
6250 PL_expect &= XENUMMASK;
6252 return yylex(); /* ignore fake brackets */
6254 force_next(formbrack ? '.' : '}');
6255 if (formbrack) LEAVE;
6256 if (formbrack == 2) { /* means . where arguments were expected */
6262 if (PL_expect == XPOSTDEREF) POSTDEREF('&');
6265 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6266 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC)) {
6273 if (PL_expect == XOPERATOR) {
6274 if ( PL_bufptr == PL_linestart
6275 && ckWARN(WARN_SEMICOLON)
6276 && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
6278 CopLINE_dec(PL_curcop);
6279 Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
6280 CopLINE_inc(PL_curcop);
6283 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.')
6285 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6286 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE)) {
6292 PL_parser->saw_infix_sigil = 1;
6293 BAop(bof ? OP_NBIT_AND : OP_BIT_AND);
6299 PL_tokenbuf[0] = '&';
6300 s = scan_ident(s - 1, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, TRUE);
6301 pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
6302 if (PL_tokenbuf[1]) {
6303 force_ident_maybe_lex('&');
6312 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6313 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC)) {
6321 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.')
6323 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6324 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE)) {
6328 BOop(bof ? s == d ? OP_NBIT_OR : OP_SBIT_OR : OP_BIT_OR);
6332 const char tmp = *s++;
6334 if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, "=====")) {
6335 s = vcs_conflict_marker(s + 5);
6338 if (!PL_lex_allbrackets
6339 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6347 if (!PL_lex_allbrackets
6348 && PL_lex_fakeeof >= LEX_FAKEEOF_COMMA)
6357 if (tmp && isSPACE(*s) && ckWARN(WARN_SYNTAX)
6358 && strchr("+-*/%.^&|<",tmp))
6359 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6360 "Reversed %c= operator",(int)tmp);
6362 if (PL_expect == XSTATE
6364 && (s == PL_linestart+1 || s[-2] == '\n') )
6366 if ((PL_in_eval && !PL_rsfp && !PL_parser->filtered)
6367 || PL_lex_state != LEX_NORMAL) {
6372 if (strEQs(s,"=cut")) {
6386 PL_parser->in_pod = 1;
6390 if (PL_expect == XBLOCK) {
6392 #ifdef PERL_STRICT_CR
6393 while (SPACE_OR_TAB(*t))
6395 while (SPACE_OR_TAB(*t) || *t == '\r')
6398 if (*t == '\n' || *t == '#') {
6401 SAVEI8(PL_parser->form_lex_state);
6402 SAVEI32(PL_lex_formbrack);
6403 PL_parser->form_lex_state = PL_lex_state;
6404 PL_lex_formbrack = PL_lex_brackets + 1;
6408 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN) {
6417 const char tmp = *s++;
6419 /* was this !=~ where !~ was meant?
6420 * warn on m:!=~\s+([/?]|[msy]\W|tr\W): */
6422 if (*s == '~' && ckWARN(WARN_SYNTAX)) {
6423 const char *t = s+1;
6425 while (t < PL_bufend && isSPACE(*t))
6428 if (*t == '/' || *t == '?'
6429 || ((*t == 'm' || *t == 's' || *t == 'y')
6430 && !isWORDCHAR(t[1]))
6431 || (*t == 't' && t[1] == 'r' && !isWORDCHAR(t[2])))
6432 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6433 "!=~ should be !~");
6435 if (!PL_lex_allbrackets
6436 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6449 if (PL_expect != XOPERATOR) {
6450 if (s[1] != '<' && !strchr(s,'>'))
6452 if (s[1] == '<' && s[2] != '>') {
6453 if ((s == PL_linestart || s[-1] == '\n') && strEQs(s+2, "<<<<<")) {
6454 s = vcs_conflict_marker(s + 7);
6457 s = scan_heredoc(s);
6460 s = scan_inputsymbol(s);
6461 PL_expect = XOPERATOR;
6462 TOKEN(sublex_start());
6468 if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, "<<<<<")) {
6469 s = vcs_conflict_marker(s + 5);
6472 if (*s == '=' && !PL_lex_allbrackets
6473 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6478 SHop(OP_LEFT_SHIFT);
6483 if (!PL_lex_allbrackets
6484 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6492 if (!PL_lex_allbrackets
6493 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6502 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6510 const char tmp = *s++;
6512 if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, ">>>>>")) {
6513 s = vcs_conflict_marker(s + 5);
6516 if (*s == '=' && !PL_lex_allbrackets
6517 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6522 SHop(OP_RIGHT_SHIFT);
6524 else if (tmp == '=') {
6525 if (!PL_lex_allbrackets
6526 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6535 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6544 if (PL_expect == XPOSTDEREF) {
6547 POSTDEREF(DOLSHARP);
6553 && ( isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF)
6554 || strchr("{$:+-@", s[2])))
6556 PL_tokenbuf[0] = '@';
6557 s = scan_ident(s + 1, PL_tokenbuf + 1,
6558 sizeof PL_tokenbuf - 1, FALSE);
6559 if (PL_expect == XOPERATOR) {
6561 if (PL_bufptr > s) {
6563 PL_bufptr = PL_oldbufptr;
6565 no_op("Array length", d);
6567 if (!PL_tokenbuf[1])
6569 PL_expect = XOPERATOR;
6570 force_ident_maybe_lex('#');
6574 PL_tokenbuf[0] = '$';
6575 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
6576 if (PL_expect == XOPERATOR) {
6578 if (PL_bufptr > s) {
6580 PL_bufptr = PL_oldbufptr;
6584 if (!PL_tokenbuf[1]) {
6586 yyerror("Final $ should be \\$ or $name");
6592 const char tmp = *s;
6593 if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
6596 if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
6597 && intuit_more(s)) {
6599 PL_tokenbuf[0] = '@';
6600 if (ckWARN(WARN_SYNTAX)) {
6604 || isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)
6607 t += UTF ? UTF8SKIP(t) : 1;
6610 PL_bufptr = skipspace(PL_bufptr); /* XXX can realloc */
6611 while (t < PL_bufend && *t != ']')
6613 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6614 "Multidimensional syntax %" UTF8f " not supported",
6615 UTF8fARG(UTF,(int)((t - PL_bufptr) + 1), PL_bufptr));
6619 else if (*s == '{') {
6621 PL_tokenbuf[0] = '%';
6622 if (strEQ(PL_tokenbuf+1, "SIG") && ckWARN(WARN_SYNTAX)
6623 && (t = strchr(s, '}')) && (t = strchr(t, '=')))
6625 char tmpbuf[sizeof PL_tokenbuf];
6628 } while (isSPACE(*t));
6629 if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) {
6631 t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE,
6636 && get_cvn_flags(tmpbuf, len, UTF
6640 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6641 "You need to quote \"%" UTF8f "\"",
6642 UTF8fARG(UTF, len, tmpbuf));
6649 PL_expect = XOPERATOR;
6650 if (PL_lex_state == LEX_NORMAL && isSPACE((char)tmp)) {
6651 const bool islop = (PL_last_lop == PL_oldoldbufptr);
6652 if (!islop || PL_last_lop_op == OP_GREPSTART)
6653 PL_expect = XOPERATOR;
6654 else if (strchr("$@\"'`q", *s))
6655 PL_expect = XTERM; /* e.g. print $fh "foo" */
6656 else if ( strchr("&*<%", *s)
6657 && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF))
6659 PL_expect = XTERM; /* e.g. print $fh &sub */
6661 else if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
6662 char tmpbuf[sizeof PL_tokenbuf];
6664 scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
6665 if ((t2 = keyword(tmpbuf, len, 0))) {
6666 /* binary operators exclude handle interpretations */
6678 PL_expect = XTERM; /* e.g. print $fh length() */
6683 PL_expect = XTERM; /* e.g. print $fh subr() */
6686 else if (isDIGIT(*s))
6687 PL_expect = XTERM; /* e.g. print $fh 3 */
6688 else if (*s == '.' && isDIGIT(s[1]))
6689 PL_expect = XTERM; /* e.g. print $fh .3 */
6690 else if ((*s == '?' || *s == '-' || *s == '+')
6691 && !isSPACE(s[1]) && s[1] != '=')
6692 PL_expect = XTERM; /* e.g. print $fh -1 */
6693 else if (*s == '/' && !isSPACE(s[1]) && s[1] != '='
6695 PL_expect = XTERM; /* e.g. print $fh /.../
6696 XXX except DORDOR operator
6698 else if (*s == '<' && s[1] == '<' && !isSPACE(s[2])
6700 PL_expect = XTERM; /* print $fh <<"EOF" */
6703 force_ident_maybe_lex('$');
6707 if (PL_expect == XPOSTDEREF)
6709 PL_tokenbuf[0] = '@';
6710 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
6711 if (PL_expect == XOPERATOR) {
6713 if (PL_bufptr > s) {
6715 PL_bufptr = PL_oldbufptr;
6720 if (!PL_tokenbuf[1]) {
6723 if (PL_lex_state == LEX_NORMAL)
6725 if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) && intuit_more(s)) {
6727 PL_tokenbuf[0] = '%';
6729 /* Warn about @ where they meant $. */
6730 if (*s == '[' || *s == '{') {
6731 if (ckWARN(WARN_SYNTAX)) {
6732 S_check_scalar_slice(aTHX_ s);
6736 PL_expect = XOPERATOR;
6737 force_ident_maybe_lex('@');
6740 case '/': /* may be division, defined-or, or pattern */
6741 if ((PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR) && s[1] == '/') {
6742 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6743 (s[2] == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC))
6748 else if (PL_expect == XOPERATOR) {
6750 if (*s == '=' && !PL_lex_allbrackets
6751 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6759 /* Disable warning on "study /blah/" */
6760 if ( PL_oldoldbufptr == PL_last_uni
6761 && ( *PL_last_uni != 's' || s - PL_last_uni < 5
6762 || memNE(PL_last_uni, "study", 5)
6763 || isWORDCHAR_lazy_if_safe(PL_last_uni+5, PL_bufend, UTF)
6766 s = scan_pat(s,OP_MATCH);
6767 TERM(sublex_start());
6770 case '?': /* conditional */
6772 if (!PL_lex_allbrackets
6773 && PL_lex_fakeeof >= LEX_FAKEEOF_IFELSE)
6778 PL_lex_allbrackets++;
6782 if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack
6783 #ifdef PERL_STRICT_CR
6786 && (s[1] == '\n' || (s[1] == '\r' && s[2] == '\n'))
6788 && (s == PL_linestart || s[-1] == '\n') )
6791 formbrack = 2; /* dot seen where arguments expected */
6794 if (PL_expect == XSTATE && s[1] == '.' && s[2] == '.') {
6798 if (PL_expect == XOPERATOR || !isDIGIT(s[1])) {
6801 if (!PL_lex_allbrackets
6802 && PL_lex_fakeeof >= LEX_FAKEEOF_RANGE)
6810 pl_yylval.ival = OPf_SPECIAL;
6816 if (*s == '=' && !PL_lex_allbrackets
6817 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6825 case '0': case '1': case '2': case '3': case '4':
6826 case '5': case '6': case '7': case '8': case '9':
6827 s = scan_num(s, &pl_yylval);
6828 DEBUG_T( { printbuf("### Saw number in %s\n", s); } );
6829 if (PL_expect == XOPERATOR)
6834 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6837 COPLINE_SET_FROM_MULTI_END;
6838 DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
6839 if (PL_expect == XOPERATOR) {
6842 pl_yylval.ival = OP_CONST;
6843 TERM(sublex_start());
6846 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6849 printbuf("### Saw string before %s\n", s);
6851 PerlIO_printf(Perl_debug_log,
6852 "### Saw unterminated string\n");
6854 if (PL_expect == XOPERATOR) {
6859 pl_yylval.ival = OP_CONST;
6860 /* FIXME. I think that this can be const if char *d is replaced by
6861 more localised variables. */
6862 for (d = SvPV(PL_lex_stuff, len); len; len--, d++) {
6863 if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) {
6864 pl_yylval.ival = OP_STRINGIFY;
6868 if (pl_yylval.ival == OP_CONST)
6869 COPLINE_SET_FROM_MULTI_END;
6870 TERM(sublex_start());
6873 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6876 printbuf("### Saw backtick string before %s\n", s);
6878 PerlIO_printf(Perl_debug_log,
6879 "### Saw unterminated backtick string\n");
6881 if (PL_expect == XOPERATOR)
6882 no_op("Backticks",s);
6885 pl_yylval.ival = OP_BACKTICK;
6886 TERM(sublex_start());
6890 if (PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
6892 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),"Can't use \\%c to mean $%c in expression",
6894 if (PL_expect == XOPERATOR)
6895 no_op("Backslash",s);
6899 if (isDIGIT(s[1]) && PL_expect != XOPERATOR) {
6900 char *start = s + 2;
6901 while (isDIGIT(*start) || *start == '_')
6903 if (*start == '.' && isDIGIT(start[1])) {
6904 s = scan_num(s, &pl_yylval);
6907 else if ((*start == ':' && start[1] == ':')
6908 || (PL_expect == XSTATE && *start == ':'))
6910 else if (PL_expect == XSTATE) {
6912 while (d < PL_bufend && isSPACE(*d)) d++;
6913 if (*d == ':') goto keylookup;
6915 /* avoid v123abc() or $h{v1}, allow C<print v10;> */
6916 if (!isALPHA(*start) && (PL_expect == XTERM
6917 || PL_expect == XREF || PL_expect == XSTATE
6918 || PL_expect == XTERMORDORDOR)) {
6919 GV *const gv = gv_fetchpvn_flags(s, start - s,
6920 UTF ? SVf_UTF8 : 0, SVt_PVCV);
6922 s = scan_num(s, &pl_yylval);
6929 if (isDIGIT(s[1]) && PL_expect == XOPERATOR) {
6982 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
6984 /* Some keywords can be followed by any delimiter, including ':' */
6985 anydelim = word_takes_any_delimiter(PL_tokenbuf, len);
6987 /* x::* is just a word, unless x is "CORE" */
6988 if (!anydelim && *s == ':' && s[1] == ':') {
6989 if (strEQ(PL_tokenbuf, "CORE")) goto case_KEY_CORE;
6994 while (d < PL_bufend && isSPACE(*d))
6995 d++; /* no comments skipped here, or s### is misparsed */
6997 /* Is this a word before a => operator? */
6998 if (*d == '=' && d[1] == '>') {
7002 = newSVOP(OP_CONST, 0,
7003 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
7004 pl_yylval.opval->op_private = OPpCONST_BARE;
7008 /* Check for plugged-in keyword */
7012 char *saved_bufptr = PL_bufptr;
7014 result = PL_keyword_plugin(aTHX_ PL_tokenbuf, len, &o);
7016 if (result == KEYWORD_PLUGIN_DECLINE) {
7017 /* not a plugged-in keyword */
7018 PL_bufptr = saved_bufptr;
7019 } else if (result == KEYWORD_PLUGIN_STMT) {
7020 pl_yylval.opval = o;
7022 if (!PL_nexttoke) PL_expect = XSTATE;
7023 return REPORT(PLUGSTMT);
7024 } else if (result == KEYWORD_PLUGIN_EXPR) {
7025 pl_yylval.opval = o;
7027 if (!PL_nexttoke) PL_expect = XOPERATOR;
7028 return REPORT(PLUGEXPR);
7030 Perl_croak(aTHX_ "Bad plugin affecting keyword '%s'",
7035 /* Check for built-in keyword */
7036 tmp = keyword(PL_tokenbuf, len, 0);
7038 /* Is this a label? */
7039 if (!anydelim && PL_expect == XSTATE
7040 && d < PL_bufend && *d == ':' && *(d + 1) != ':') {
7042 pl_yylval.pval = savepvn(PL_tokenbuf, len+1);
7043 pl_yylval.pval[len] = '\0';
7044 pl_yylval.pval[len+1] = UTF ? 1 : 0;
7049 /* Check for lexical sub */
7050 if (PL_expect != XOPERATOR) {
7051 char tmpbuf[sizeof PL_tokenbuf + 1];
7053 Copy(PL_tokenbuf, tmpbuf+1, len, char);
7054 off = pad_findmy_pvn(tmpbuf, len+1, 0);
7055 if (off != NOT_IN_PAD) {
7056 assert(off); /* we assume this is boolean-true below */
7057 if (PAD_COMPNAME_FLAGS_isOUR(off)) {
7058 HV * const stash = PAD_COMPNAME_OURSTASH(off);
7059 HEK * const stashname = HvNAME_HEK(stash);
7060 sv = newSVhek(stashname);
7061 sv_catpvs(sv, "::");
7062 sv_catpvn_flags(sv, PL_tokenbuf, len,
7063 (UTF ? SV_CATUTF8 : SV_CATBYTES));
7064 gv = gv_fetchsv(sv, GV_NOADD_NOINIT | SvUTF8(sv),
7074 rv2cv_op = newOP(OP_PADANY, 0);
7075 rv2cv_op->op_targ = off;
7076 cv = find_lexical_cv(off);
7084 if (tmp < 0) { /* second-class keyword? */
7085 GV *ogv = NULL; /* override (winner) */
7086 GV *hgv = NULL; /* hidden (loser) */
7087 if (PL_expect != XOPERATOR && (*s != ':' || s[1] != ':')) {
7089 if ((gv = gv_fetchpvn_flags(PL_tokenbuf, len,
7090 (UTF ? SVf_UTF8 : 0)|GV_NOTQUAL,
7092 && (cv = GvCVu(gv)))
7094 if (GvIMPORTED_CV(gv))
7096 else if (! CvMETHOD(cv))
7100 && (gvp = (GV**)hv_fetch(PL_globalstash, PL_tokenbuf,
7103 && (isGV_with_GP(gv)
7104 ? GvCVu(gv) && GvIMPORTED_CV(gv)
7105 : SvPCS_IMPORTED(gv)
7106 && (gv_init(gv, PL_globalstash, PL_tokenbuf,
7114 tmp = 0; /* overridden by import or by GLOBAL */
7117 && -tmp==KEY_lock /* XXX generalizable kludge */
7120 tmp = 0; /* any sub overrides "weak" keyword */
7122 else { /* no override */
7124 if (tmp == KEY_dump) {
7125 Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
7126 "dump() better written as CORE::dump(). "
7127 "dump() will no longer be available "
7132 if (hgv && tmp != KEY_x) /* never ambiguous */
7133 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
7134 "Ambiguous call resolved as CORE::%s(), "
7135 "qualify as such or use &",
7140 if (tmp && tmp != KEY___DATA__ && tmp != KEY___END__
7141 && (!anydelim || *s != '#')) {
7142 /* no override, and not s### either; skipspace is safe here
7143 * check for => on following line */
7145 STRLEN bufoff = PL_bufptr - SvPVX(PL_linestr);
7146 STRLEN soff = s - SvPVX(PL_linestr);
7148 arrow = *s == '=' && s[1] == '>';
7149 PL_bufptr = SvPVX(PL_linestr) + bufoff;
7150 s = SvPVX(PL_linestr) + soff;
7158 /* Trade off - by using this evil construction we can pull the
7159 variable gv into the block labelled keylookup. If not, then
7160 we have to give it function scope so that the goto from the
7161 earlier ':' case doesn't bypass the initialisation. */
7162 just_a_word_zero_gv:
7171 default: /* not a keyword */
7174 const char lastchar = (PL_bufptr == PL_oldoldbufptr ? 0 : PL_bufptr[-1]);
7178 /* Get the rest if it looks like a package qualifier */
7180 if (*s == '\'' || (*s == ':' && s[1] == ':')) {
7182 s = scan_word(s, PL_tokenbuf + len, sizeof PL_tokenbuf - len,
7185 Perl_croak(aTHX_ "Bad name after %" UTF8f "%s",
7186 UTF8fARG(UTF, len, PL_tokenbuf),
7187 *s == '\'' ? "'" : "::");
7192 if (PL_expect == XOPERATOR) {
7193 if (PL_bufptr == PL_linestart) {
7194 CopLINE_dec(PL_curcop);
7195 Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
7196 CopLINE_inc(PL_curcop);
7199 no_op("Bareword",s);
7202 /* See if the name is "Foo::",
7203 in which case Foo is a bareword
7204 (and a package name). */
7207 && PL_tokenbuf[len - 2] == ':'
7208 && PL_tokenbuf[len - 1] == ':')
7210 if (ckWARN(WARN_BAREWORD)
7211 && ! gv_fetchpvn_flags(PL_tokenbuf, len, UTF ? SVf_UTF8 : 0, SVt_PVHV))
7212 Perl_warner(aTHX_ packWARN(WARN_BAREWORD),
7213 "Bareword \"%" UTF8f
7214 "\" refers to nonexistent package",
7215 UTF8fARG(UTF, len, PL_tokenbuf));
7217 PL_tokenbuf[len] = '\0';
7226 /* if we saw a global override before, get the right name */
7229 sv = S_newSV_maybe_utf8(aTHX_ PL_tokenbuf,
7232 SV * const tmp_sv = sv;
7233 sv = newSVpvs("CORE::GLOBAL::");
7234 sv_catsv(sv, tmp_sv);
7235 SvREFCNT_dec(tmp_sv);
7239 /* Presume this is going to be a bareword of some sort. */
7241 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
7242 pl_yylval.opval->op_private = OPpCONST_BARE;
7244 /* And if "Foo::", then that's what it certainly is. */
7250 OP *const_op = newSVOP(OP_CONST, 0, SvREFCNT_inc_NN(sv));
7251 const_op->op_private = OPpCONST_BARE;
7253 newCVREF(OPpMAY_RETURN_CONSTANT<<8, const_op);
7257 : SvROK(gv) && SvTYPE(SvRV(gv)) == SVt_PVCV
7260 : rv2cv_op_cv(rv2cv_op, RV2CVOPCV_RETURN_STUB);
7263 /* Use this var to track whether intuit_method has been
7264 called. intuit_method returns 0 or > 255. */
7267 /* See if it's the indirect object for a list operator. */
7270 && PL_oldoldbufptr < PL_bufptr
7271 && (PL_oldoldbufptr == PL_last_lop
7272 || PL_oldoldbufptr == PL_last_uni)
7273 && /* NO SKIPSPACE BEFORE HERE! */
7275 || ((PL_opargs[PL_last_lop_op] >> OASHIFT)& 7)
7278 bool immediate_paren = *s == '(';
7281 /* (Now we can afford to cross potential line boundary.) */
7284 /* intuit_method() can indirectly call lex_next_chunk(),
7287 s_off = s - SvPVX(PL_linestr);
7288 /* Two barewords in a row may indicate method call. */
7289 if ( ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
7291 && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
7293 /* the code at method: doesn't use s */
7296 s = SvPVX(PL_linestr) + s_off;
7298 /* If not a declared subroutine, it's an indirect object. */
7299 /* (But it's an indir obj regardless for sort.) */
7300 /* Also, if "_" follows a filetest operator, it's a bareword */
7303 ( !immediate_paren && (PL_last_lop_op == OP_SORT
7305 && (PL_last_lop_op != OP_MAPSTART
7306 && PL_last_lop_op != OP_GREPSTART))))
7307 || (PL_tokenbuf[0] == '_' && PL_tokenbuf[1] == '\0'
7308 && ((PL_opargs[PL_last_lop_op] & OA_CLASS_MASK)
7312 PL_expect = (PL_last_lop == PL_oldoldbufptr) ? XTERM : XOPERATOR;
7317 PL_expect = XOPERATOR;
7320 /* Is this a word before a => operator? */
7321 if (*s == '=' && s[1] == '>' && !pkgname) {
7324 if (gvp || (lex && !off)) {
7325 assert (cSVOPx(pl_yylval.opval)->op_sv == sv);
7326 /* This is our own scalar, created a few lines
7327 above, so this is safe. */
7329 sv_setpv(sv, PL_tokenbuf);
7330 if (UTF && !IN_BYTES
7331 && is_utf8_string((U8*)PL_tokenbuf, len))
7338 /* If followed by a paren, it's certainly a subroutine. */
7343 while (SPACE_OR_TAB(*d))
7345 if (*d == ')' && (sv = cv_const_sv_or_av(cv))) {
7350 NEXTVAL_NEXTTOKE.opval =
7351 off ? rv2cv_op : pl_yylval.opval;
7353 op_free(pl_yylval.opval), force_next(PRIVATEREF);
7354 else op_free(rv2cv_op), force_next(BAREWORD);
7359 /* If followed by var or block, call it a method (unless sub) */
7361 if ((*s == '$' || *s == '{') && !cv) {
7363 PL_last_lop = PL_oldbufptr;
7364 PL_last_lop_op = OP_METHOD;
7365 if (!PL_lex_allbrackets
7366 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7368 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7370 PL_expect = XBLOCKTERM;
7372 return REPORT(METHOD);
7375 /* If followed by a bareword, see if it looks like indir obj. */
7379 && (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || *s == '$')
7380 && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
7384 assert(cSVOPx(pl_yylval.opval)->op_sv == sv);
7386 sv_setpvn(sv, PL_tokenbuf, len);
7387 if (UTF && !IN_BYTES
7388 && is_utf8_string((U8*)PL_tokenbuf, len))
7390 else SvUTF8_off(sv);
7393 if (tmp == METHOD && !PL_lex_allbrackets
7394 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7396 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7401 /* Not a method, so call it a subroutine (if defined) */
7404 /* Check for a constant sub */
7405 if ((sv = cv_const_sv_or_av(cv))) {
7408 SvREFCNT_dec(((SVOP*)pl_yylval.opval)->op_sv);
7409 ((SVOP*)pl_yylval.opval)->op_sv = SvREFCNT_inc_simple(sv);
7410 if (SvTYPE(sv) == SVt_PVAV)
7411 pl_yylval.opval = newUNOP(OP_RV2AV, OPf_PARENS,
7414 pl_yylval.opval->op_private = 0;
7415 pl_yylval.opval->op_folded = 1;
7416 pl_yylval.opval->op_flags |= OPf_SPECIAL;
7421 op_free(pl_yylval.opval);
7423 off ? newCVREF(0, rv2cv_op) : rv2cv_op;
7424 pl_yylval.opval->op_private |= OPpENTERSUB_NOPAREN;
7425 PL_last_lop = PL_oldbufptr;
7426 PL_last_lop_op = OP_ENTERSUB;
7427 /* Is there a prototype? */
7431 STRLEN protolen = CvPROTOLEN(cv);
7432 const char *proto = CvPROTO(cv);
7434 proto = S_strip_spaces(aTHX_ proto, &protolen);
7437 if ((optional = *proto == ';'))
7440 while (*proto == ';');
7444 *proto == '$' || *proto == '_'
7445 || *proto == '*' || *proto == '+'
7450 *proto == '\\' && proto[1] && proto[2] == '\0'
7453 UNIPROTO(UNIOPSUB,optional);
7454 if (*proto == '\\' && proto[1] == '[') {
7455 const char *p = proto + 2;
7456 while(*p && *p != ']')
7458 if(*p == ']' && !p[1])
7459 UNIPROTO(UNIOPSUB,optional);
7461 if (*proto == '&' && *s == '{') {
7463 sv_setpvs(PL_subname, "__ANON__");
7465 sv_setpvs(PL_subname, "__ANON__::__ANON__");
7466 if (!PL_lex_allbrackets
7467 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7469 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7474 NEXTVAL_NEXTTOKE.opval = pl_yylval.opval;
7476 force_next(off ? PRIVATEREF : BAREWORD);
7477 if (!PL_lex_allbrackets
7478 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7480 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7485 /* Call it a bare word */
7487 if (PL_hints & HINT_STRICT_SUBS)
7488 pl_yylval.opval->op_private |= OPpCONST_STRICT;
7491 /* after "print" and similar functions (corresponding to
7492 * "F? L" in opcode.pl), whatever wasn't already parsed as
7493 * a filehandle should be subject to "strict subs".
7494 * Likewise for the optional indirect-object argument to system
7495 * or exec, which can't be a bareword */
7496 if ((PL_last_lop_op == OP_PRINT
7497 || PL_last_lop_op == OP_PRTF
7498 || PL_last_lop_op == OP_SAY
7499 || PL_last_lop_op == OP_SYSTEM
7500 || PL_last_lop_op == OP_EXEC)
7501 && (PL_hints & HINT_STRICT_SUBS))
7502 pl_yylval.opval->op_private |= OPpCONST_STRICT;
7503 if (lastchar != '-') {
7504 if (ckWARN(WARN_RESERVED)) {
7508 if (!*d && !gv_stashpv(PL_tokenbuf, UTF ? SVf_UTF8 : 0))
7510 /* PL_warn_reserved is constant */
7511 GCC_DIAG_IGNORE(-Wformat-nonliteral);
7512 Perl_warner(aTHX_ packWARN(WARN_RESERVED), PL_warn_reserved,
7522 if ((lastchar == '*' || lastchar == '%' || lastchar == '&')
7523 && saw_infix_sigil) {
7524 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
7525 "Operator or semicolon missing before %c%" UTF8f,
7527 UTF8fARG(UTF, strlen(PL_tokenbuf),
7529 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
7530 "Ambiguous use of %c resolved as operator %c",
7531 lastchar, lastchar);
7538 newSVOP(OP_CONST, 0, newSVpv(CopFILE(PL_curcop),0))
7543 newSVOP(OP_CONST, 0,
7544 Perl_newSVpvf(aTHX_ "%" IVdf, (IV)CopLINE(PL_curcop)))
7547 case KEY___PACKAGE__:
7549 newSVOP(OP_CONST, 0,
7551 ? newSVhek(HvNAME_HEK(PL_curstash))
7558 if (PL_rsfp && (!PL_in_eval || PL_tokenbuf[2] == 'D')) {
7559 HV * const stash = PL_tokenbuf[2] == 'D' && PL_curstash
7562 gv = (GV *)*hv_fetchs(stash, "DATA", 1);
7564 gv_init(gv,stash,"DATA",4,0);
7567 GvIOp(gv) = newIO();
7568 IoIFP(GvIOp(gv)) = PL_rsfp;
7569 #if defined(HAS_FCNTL) && defined(F_SETFD) && defined(FD_CLOEXEC)
7571 const int fd = PerlIO_fileno(PL_rsfp);
7573 fcntl(fd,F_SETFD, FD_CLOEXEC);
7577 /* Mark this internal pseudo-handle as clean */
7578 IoFLAGS(GvIOp(gv)) |= IOf_UNTAINT;
7579 if ((PerlIO*)PL_rsfp == PerlIO_stdin())
7580 IoTYPE(GvIOp(gv)) = IoTYPE_STD;
7582 IoTYPE(GvIOp(gv)) = IoTYPE_RDONLY;
7583 #if defined(WIN32) && !defined(PERL_TEXTMODE_SCRIPTS)
7584 /* if the script was opened in binmode, we need to revert
7585 * it to text mode for compatibility; but only iff it has CRs
7586 * XXX this is a questionable hack at best. */
7587 if (PL_bufend-PL_bufptr > 2
7588 && PL_bufend[-1] == '\n' && PL_bufend[-2] == '\r')
7591 if (IoTYPE(GvIOp(gv)) == IoTYPE_RDONLY) {
7592 loc = PerlIO_tell(PL_rsfp);
7593 (void)PerlIO_seek(PL_rsfp, 0L, 0);
7596 if (PerlLIO_setmode(PL_rsfp, O_TEXT) != -1) {
7598 if (PerlLIO_setmode(PerlIO_fileno(PL_rsfp), O_TEXT) != -1) {
7599 #endif /* NETWARE */
7601 PerlIO_seek(PL_rsfp, loc, 0);
7605 #ifdef PERLIO_LAYERS
7608 PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8");
7617 FUN0OP(CvCLONE(PL_compcv)
7618 ? newOP(OP_RUNCV, 0)
7619 : newPVOP(OP_RUNCV,0,NULL));
7628 if (PL_expect == XSTATE) {
7639 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
7640 if ((*s == ':' && s[1] == ':')
7641 || (!(tmp = keyword(PL_tokenbuf, len, 1)) && *s == '\''))
7645 Copy(PL_bufptr, PL_tokenbuf, olen, char);
7649 Perl_croak(aTHX_ "CORE::%" UTF8f " is not a keyword",
7650 UTF8fARG(UTF, len, PL_tokenbuf));
7653 else if (tmp == KEY_require || tmp == KEY_do
7655 /* that's a way to remember we saw "CORE::" */
7667 LOP(OP_ACCEPT,XTERM);
7670 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
7675 LOP(OP_ATAN2,XTERM);
7681 LOP(OP_BINMODE,XTERM);
7684 LOP(OP_BLESS,XTERM);
7693 /* We have to disambiguate the two senses of
7694 "continue". If the next token is a '{' then
7695 treat it as the start of a continue block;
7696 otherwise treat it as a control operator.
7706 (void)gv_fetchpvs("ENV", GV_ADD|GV_NOTQUAL, SVt_PVHV);
7716 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7725 if (!PL_cryptseen) {
7726 PL_cryptseen = TRUE;
7730 LOP(OP_CRYPT,XTERM);
7733 LOP(OP_CHMOD,XTERM);
7736 LOP(OP_CHOWN,XTERM);
7739 LOP(OP_CONNECT,XTERM);
7759 d = scan_word(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
7761 if (len && (len != 4 || strNE(PL_tokenbuf+1, "CORE"))
7762 && !keyword(PL_tokenbuf + 1, len, 0)) {
7763 SSize_t off = s-SvPVX(PL_linestr);
7765 s = SvPVX(PL_linestr)+off;
7767 force_ident_maybe_lex('&');
7772 if (orig_keyword == KEY_do) {
7781 PL_hints |= HINT_BLOCK_SCOPE;
7791 Perl_populate_isa(aTHX_ STR_WITH_LEN("AnyDBM_File::ISA"),
7792 STR_WITH_LEN("NDBM_File::"),
7793 STR_WITH_LEN("DB_File::"),
7794 STR_WITH_LEN("GDBM_File::"),
7795 STR_WITH_LEN("SDBM_File::"),
7796 STR_WITH_LEN("ODBM_File::"),
7798 LOP(OP_DBMOPEN,XTERM);
7810 pl_yylval.ival = CopLINE(PL_curcop);
7814 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7826 if (*s == '{') { /* block eval */
7827 PL_expect = XTERMBLOCK;
7828 UNIBRACK(OP_ENTERTRY);
7830 else { /* string eval */
7832 UNIBRACK(OP_ENTEREVAL);
7837 UNIBRACK(-OP_ENTEREVAL);
7851 case KEY_endhostent:
7857 case KEY_endservent:
7860 case KEY_endprotoent:
7871 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
7873 pl_yylval.ival = CopLINE(PL_curcop);
7875 if ( PL_expect == XSTATE
7876 && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
7879 SSize_t s_off = s - SvPVX(PL_linestr);
7881 if ((PL_bufend - p) >= 3
7882 && strEQs(p, "my") && isSPACE(*(p + 2)))
7886 else if ((PL_bufend - p) >= 4
7887 && strEQs(p, "our") && isSPACE(*(p + 3)))
7890 /* skip optional package name, as in "for my abc $x (..)" */
7891 if (isIDFIRST_lazy_if_safe(p, PL_bufend, UTF)) {
7892 p = scan_word(p, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
7895 if (*p != '$' && *p != '\\')
7896 Perl_croak(aTHX_ "Missing $ on loop variable");
7898 /* The buffer may have been reallocated, update s */
7899 s = SvPVX(PL_linestr) + s_off;
7904 LOP(OP_FORMLINE,XTERM);
7913 LOP(OP_FCNTL,XTERM);
7919 LOP(OP_FLOCK,XTERM);
7922 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7927 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7932 LOP(OP_GREPSTART, XREF);
7949 case KEY_getpriority:
7950 LOP(OP_GETPRIORITY,XTERM);
7952 case KEY_getprotobyname:
7955 case KEY_getprotobynumber:
7956 LOP(OP_GPBYNUMBER,XTERM);
7958 case KEY_getprotoent:
7970 case KEY_getpeername:
7971 UNI(OP_GETPEERNAME);
7973 case KEY_gethostbyname:
7976 case KEY_gethostbyaddr:
7977 LOP(OP_GHBYADDR,XTERM);
7979 case KEY_gethostent:
7982 case KEY_getnetbyname:
7985 case KEY_getnetbyaddr:
7986 LOP(OP_GNBYADDR,XTERM);
7991 case KEY_getservbyname:
7992 LOP(OP_GSBYNAME,XTERM);
7994 case KEY_getservbyport:
7995 LOP(OP_GSBYPORT,XTERM);
7997 case KEY_getservent:
8000 case KEY_getsockname:
8001 UNI(OP_GETSOCKNAME);
8003 case KEY_getsockopt:
8004 LOP(OP_GSOCKOPT,XTERM);
8019 pl_yylval.ival = CopLINE(PL_curcop);
8020 Perl_ck_warner_d(aTHX_
8021 packWARN(WARN_EXPERIMENTAL__SMARTMATCH),
8022 "given is experimental");
8027 orig_keyword==KEY_glob ? -OP_GLOB : OP_GLOB,
8035 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8037 pl_yylval.ival = CopLINE(PL_curcop);
8041 LOP(OP_INDEX,XTERM);
8047 LOP(OP_IOCTL,XTERM);
8074 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8079 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8093 LOP(OP_LISTEN,XTERM);
8102 s = scan_pat(s,OP_MATCH);
8103 TERM(sublex_start());
8106 LOP(OP_MAPSTART, XREF);
8109 LOP(OP_MKDIR,XTERM);
8112 LOP(OP_MSGCTL,XTERM);
8115 LOP(OP_MSGGET,XTERM);
8118 LOP(OP_MSGRCV,XTERM);
8121 LOP(OP_MSGSND,XTERM);
8128 yyerror(Perl_form(aTHX_
8129 "Can't redeclare \"%s\" in \"%s\"",
8130 tmp == KEY_my ? "my" :
8131 tmp == KEY_state ? "state" : "our",
8132 PL_in_my == KEY_my ? "my" :
8133 PL_in_my == KEY_state ? "state" : "our"));
8135 PL_in_my = (U16)tmp;
8137 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
8138 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
8139 if (len == 3 && strEQs(PL_tokenbuf, "sub"))
8141 PL_in_my_stash = find_in_my_stash(PL_tokenbuf, len);
8142 if (!PL_in_my_stash) {
8146 len = my_snprintf(tmpbuf, sizeof(tmpbuf), "No such class %.1000s", PL_tokenbuf);
8147 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(tmpbuf));
8148 yyerror_pv(tmpbuf, UTF ? SVf_UTF8 : 0);
8151 else if (*s == '\\') {
8152 if (!FEATURE_MYREF_IS_ENABLED)
8153 Perl_croak(aTHX_ "The experimental declared_refs "
8154 "feature is not enabled");
8155 Perl_ck_warner_d(aTHX_
8156 packWARN(WARN_EXPERIMENTAL__DECLARED_REFS),
8157 "Declaring references is experimental");
8165 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8170 s = tokenize_use(0, s);
8174 if (*s == '(' || (s = skipspace(s), *s == '('))
8177 if (!PL_lex_allbrackets
8178 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
8180 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
8187 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
8189 d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE,
8191 for (t=d; isSPACE(*t);)
8193 if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE)
8195 && !(t[0] == '=' && t[1] == '>')
8196 && !(t[0] == ':' && t[1] == ':')
8197 && !keyword(s, d-s, 0)
8199 Perl_warner(aTHX_ packWARN(WARN_PRECEDENCE),
8200 "Precedence problem: open %" UTF8f " should be open(%" UTF8f ")",
8201 UTF8fARG(UTF, d-s, s), UTF8fARG(UTF, d-s, s));
8207 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
8209 pl_yylval.ival = OP_OR;
8219 LOP(OP_OPEN_DIR,XTERM);
8222 checkcomma(s,PL_tokenbuf,"filehandle");
8226 checkcomma(s,PL_tokenbuf,"filehandle");
8245 s = force_word(s,BAREWORD,FALSE,TRUE);
8247 s = force_strict_version(s);
8251 LOP(OP_PIPE_OP,XTERM);
8254 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8257 COPLINE_SET_FROM_MULTI_END;
8258 pl_yylval.ival = OP_CONST;
8259 TERM(sublex_start());
8266 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8269 COPLINE_SET_FROM_MULTI_END;
8270 PL_expect = XOPERATOR;
8271 if (SvCUR(PL_lex_stuff)) {
8272 int warned_comma = !ckWARN(WARN_QW);
8273 int warned_comment = warned_comma;
8274 d = SvPV_force(PL_lex_stuff, len);
8276 for (; isSPACE(*d) && len; --len, ++d)
8281 if (!warned_comma || !warned_comment) {
8282 for (; !isSPACE(*d) && len; --len, ++d) {
8283 if (!warned_comma && *d == ',') {
8284 Perl_warner(aTHX_ packWARN(WARN_QW),
8285 "Possible attempt to separate words with commas");
8288 else if (!warned_comment && *d == '#') {
8289 Perl_warner(aTHX_ packWARN(WARN_QW),
8290 "Possible attempt to put comments in qw() list");
8296 for (; !isSPACE(*d) && len; --len, ++d)
8299 sv = newSVpvn_utf8(b, d-b, DO_UTF8(PL_lex_stuff));
8300 words = op_append_elem(OP_LIST, words,
8301 newSVOP(OP_CONST, 0, tokeq(sv)));
8306 words = newNULLLIST();
8307 SvREFCNT_dec_NN(PL_lex_stuff);
8308 PL_lex_stuff = NULL;
8309 PL_expect = XOPERATOR;
8310 pl_yylval.opval = sawparens(words);
8315 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8318 pl_yylval.ival = OP_STRINGIFY;
8319 if (SvIVX(PL_lex_stuff) == '\'')
8320 SvIV_set(PL_lex_stuff, 0); /* qq'$foo' should interpolate */
8321 TERM(sublex_start());
8324 s = scan_pat(s,OP_QR);
8325 TERM(sublex_start());
8328 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8331 pl_yylval.ival = OP_BACKTICK;
8332 TERM(sublex_start());
8340 s = force_version(s, FALSE);
8342 else if (*s != 'v' || !isDIGIT(s[1])
8343 || (s = force_version(s, TRUE), *s == 'v'))
8345 *PL_tokenbuf = '\0';
8346 s = force_word(s,BAREWORD,TRUE,TRUE);
8347 if (isIDFIRST_lazy_if_safe(PL_tokenbuf,
8348 PL_tokenbuf + sizeof(PL_tokenbuf),
8351 gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf),
8352 GV_ADD | (UTF ? SVf_UTF8 : 0));
8355 yyerror("<> at require-statement should be quotes");
8357 if (orig_keyword == KEY_require) {
8363 PL_expect = PL_nexttoke ? XOPERATOR : XTERM;
8365 PL_last_uni = PL_oldbufptr;
8366 PL_last_lop_op = OP_REQUIRE;
8368 return REPORT( (int)REQUIRE );
8377 LOP(OP_RENAME,XTERM);
8386 LOP(OP_RINDEX,XTERM);
8395 UNIDOR(OP_READLINE);
8398 UNIDOR(OP_BACKTICK);
8407 LOP(OP_REVERSE,XTERM);
8410 UNIDOR(OP_READLINK);
8417 if (pl_yylval.opval)
8418 TERM(sublex_start());
8420 TOKEN(1); /* force error */
8423 checkcomma(s,PL_tokenbuf,"filehandle");
8433 LOP(OP_SELECT,XTERM);
8439 LOP(OP_SEMCTL,XTERM);
8442 LOP(OP_SEMGET,XTERM);
8445 LOP(OP_SEMOP,XTERM);
8451 LOP(OP_SETPGRP,XTERM);
8453 case KEY_setpriority:
8454 LOP(OP_SETPRIORITY,XTERM);
8456 case KEY_sethostent:
8462 case KEY_setservent:
8465 case KEY_setprotoent:
8475 LOP(OP_SEEKDIR,XTERM);
8477 case KEY_setsockopt:
8478 LOP(OP_SSOCKOPT,XTERM);
8484 LOP(OP_SHMCTL,XTERM);
8487 LOP(OP_SHMGET,XTERM);
8490 LOP(OP_SHMREAD,XTERM);
8493 LOP(OP_SHMWRITE,XTERM);
8496 LOP(OP_SHUTDOWN,XTERM);
8505 LOP(OP_SOCKET,XTERM);
8507 case KEY_socketpair:
8508 LOP(OP_SOCKPAIR,XTERM);
8511 checkcomma(s,PL_tokenbuf,"subroutine name");
8514 s = force_word(s,BAREWORD,TRUE,TRUE);
8518 LOP(OP_SPLIT,XTERM);
8521 LOP(OP_SPRINTF,XTERM);
8524 LOP(OP_SPLICE,XTERM);
8539 LOP(OP_SUBSTR,XTERM);
8545 char * const tmpbuf = PL_tokenbuf + 1;
8546 expectation attrful;
8547 bool have_name, have_proto;
8548 const int key = tmp;
8549 SV *format_name = NULL;
8551 SSize_t off = s-SvPVX(PL_linestr);
8553 d = SvPVX(PL_linestr)+off;
8555 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
8557 || (*s == ':' && s[1] == ':'))
8561 attrful = XATTRBLOCK;
8562 d = scan_word(s, tmpbuf, sizeof PL_tokenbuf - 1, TRUE,
8564 if (key == KEY_format)
8565 format_name = S_newSV_maybe_utf8(aTHX_ s, d - s);
8567 if (memchr(tmpbuf, ':', len) || key != KEY_sub
8569 PL_tokenbuf, len + 1, 0
8571 sv_setpvn(PL_subname, tmpbuf, len);
8573 sv_setsv(PL_subname,PL_curstname);
8574 sv_catpvs(PL_subname,"::");
8575 sv_catpvn(PL_subname,tmpbuf,len);
8577 if (SvUTF8(PL_linestr))
8578 SvUTF8_on(PL_subname);
8585 if (key == KEY_my || key == KEY_our || key==KEY_state)
8588 /* diag_listed_as: Missing name in "%s sub" */
8590 "Missing name in \"%s\"", PL_bufptr);
8592 PL_expect = XTERMBLOCK;
8593 attrful = XATTRTERM;
8594 sv_setpvs(PL_subname,"?");
8598 if (key == KEY_format) {
8600 NEXTVAL_NEXTTOKE.opval
8601 = newSVOP(OP_CONST,0, format_name);
8602 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
8603 force_next(BAREWORD);
8608 /* Look for a prototype */
8609 if (*s == '(' && !FEATURE_SIGNATURES_IS_ENABLED) {
8610 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8611 COPLINE_SET_FROM_MULTI_END;
8613 Perl_croak(aTHX_ "Prototype not terminated");
8614 (void)validate_proto(PL_subname, PL_lex_stuff, ckWARN(WARN_ILLEGALPROTO));
8622 if (*s == ':' && s[1] != ':')
8623 PL_expect = attrful;
8624 else if ((*s != '{' && *s != '(') && key != KEY_format) {
8625 assert(key == KEY_sub || key == KEY_AUTOLOAD ||
8626 key == KEY_DESTROY || key == KEY_BEGIN ||
8627 key == KEY_UNITCHECK || key == KEY_CHECK ||
8628 key == KEY_INIT || key == KEY_END ||
8629 key == KEY_my || key == KEY_state ||
8632 Perl_croak(aTHX_ "Illegal declaration of anonymous subroutine");
8633 else if (*s != ';' && *s != '}')
8634 Perl_croak(aTHX_ "Illegal declaration of subroutine %" SVf, SVfARG(PL_subname));
8638 NEXTVAL_NEXTTOKE.opval =
8639 newSVOP(OP_CONST, 0, PL_lex_stuff);
8640 PL_lex_stuff = NULL;
8645 sv_setpvs(PL_subname, "__ANON__");
8647 sv_setpvs(PL_subname, "__ANON__::__ANON__");
8650 force_ident_maybe_lex('&');
8655 LOP(OP_SYSTEM,XREF);
8658 LOP(OP_SYMLINK,XTERM);
8661 LOP(OP_SYSCALL,XTERM);
8664 LOP(OP_SYSOPEN,XTERM);
8667 LOP(OP_SYSSEEK,XTERM);
8670 LOP(OP_SYSREAD,XTERM);
8673 LOP(OP_SYSWRITE,XTERM);
8678 TERM(sublex_start());
8699 LOP(OP_TRUNCATE,XTERM);
8711 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8713 pl_yylval.ival = CopLINE(PL_curcop);
8717 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8719 pl_yylval.ival = CopLINE(PL_curcop);
8723 LOP(OP_UNLINK,XTERM);
8729 LOP(OP_UNPACK,XTERM);
8732 LOP(OP_UTIME,XTERM);
8738 LOP(OP_UNSHIFT,XTERM);
8741 s = tokenize_use(1, s);
8751 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8753 pl_yylval.ival = CopLINE(PL_curcop);
8754 Perl_ck_warner_d(aTHX_
8755 packWARN(WARN_EXPERIMENTAL__SMARTMATCH),
8756 "when is experimental");
8760 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8762 pl_yylval.ival = CopLINE(PL_curcop);
8766 PL_hints |= HINT_BLOCK_SCOPE;
8773 LOP(OP_WAITPID,XTERM);
8779 /* Make sure $^L is defined. 0x0C is CTRL-L on ASCII platforms, and
8780 * we use the same number on EBCDIC */
8781 gv_fetchpvs("\x0C", GV_ADD|GV_NOTQUAL, SVt_PV);
8785 if (PL_expect == XOPERATOR) {
8786 if (*s == '=' && !PL_lex_allbrackets
8787 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
8797 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
8799 pl_yylval.ival = OP_XOR;
8808 Looks up an identifier in the pad or in a package
8810 is_sig indicates that this is a subroutine signature variable
8811 rather than a plain pad var.
8814 PRIVATEREF if this is a lexical name.
8815 BAREWORD if this belongs to a package.
8818 if we're in a my declaration
8819 croak if they tried to say my($foo::bar)
8820 build the ops for a my() declaration
8821 if it's an access to a my() variable
8822 build ops for access to a my() variable
8823 if in a dq string, and they've said @foo and we can't find @foo
8825 build ops for a bareword
8829 S_pending_ident(pTHX)
8832 const char pit = (char)pl_yylval.ival;
8833 const STRLEN tokenbuf_len = strlen(PL_tokenbuf);
8834 /* All routes through this function want to know if there is a colon. */
8835 const char *const has_colon = (const char*) memchr (PL_tokenbuf, ':', tokenbuf_len);
8837 DEBUG_T({ PerlIO_printf(Perl_debug_log,
8838 "### Pending identifier '%s'\n", PL_tokenbuf); });
8840 /* if we're in a my(), we can't allow dynamics here.
8841 $foo'bar has already been turned into $foo::bar, so
8842 just check for colons.
8844 if it's a legal name, the OP is a PADANY.
8847 if (PL_in_my == KEY_our) { /* "our" is merely analogous to "my" */
8849 yyerror_pv(Perl_form(aTHX_ "No package name allowed for "
8850 "variable %s in \"our\"",
8851 PL_tokenbuf), UTF ? SVf_UTF8 : 0);
8852 tmp = allocmy(PL_tokenbuf, tokenbuf_len, UTF ? SVf_UTF8 : 0);
8857 /* "my" variable %s can't be in a package */
8858 /* PL_no_myglob is constant */
8859 GCC_DIAG_IGNORE(-Wformat-nonliteral);
8860 yyerror_pv(Perl_form(aTHX_ PL_no_myglob,
8861 PL_in_my == KEY_my ? "my" : "state",
8862 *PL_tokenbuf == '&' ? "subroutin" : "variabl",
8864 UTF ? SVf_UTF8 : 0);
8868 if (PL_in_my == KEY_sigvar) {
8869 /* A signature 'padop' needs in addition, an op_first to
8870 * point to a child sigdefelem, and an extra field to hold
8871 * the signature index. We can achieve both by using an
8872 * UNOP_AUX and (ab)using the op_aux field to hold the
8873 * index. If we ever need more fields, use a real malloced
8874 * aux strut instead.
8876 o = newUNOP_AUX(OP_ARGELEM, 0, NULL,
8877 INT2PTR(UNOP_AUX_item *,
8878 (PL_parser->sig_elems)));
8879 o->op_private |= ( PL_tokenbuf[0] == '$' ? OPpARGELEM_SV
8880 : PL_tokenbuf[0] == '@' ? OPpARGELEM_AV
8884 o = newOP(OP_PADANY, 0);
8885 o->op_targ = allocmy(PL_tokenbuf, tokenbuf_len,
8886 UTF ? SVf_UTF8 : 0);
8887 if (PL_in_my == KEY_sigvar)
8890 pl_yylval.opval = o;
8896 build the ops for accesses to a my() variable.
8901 tmp = pad_findmy_pvn(PL_tokenbuf, tokenbuf_len,
8903 if (tmp != NOT_IN_PAD) {
8904 /* might be an "our" variable" */
8905 if (PAD_COMPNAME_FLAGS_isOUR(tmp)) {
8906 /* build ops for a bareword */
8907 HV * const stash = PAD_COMPNAME_OURSTASH(tmp);
8908 HEK * const stashname = HvNAME_HEK(stash);
8909 SV * const sym = newSVhek(stashname);
8910 sv_catpvs(sym, "::");
8911 sv_catpvn_flags(sym, PL_tokenbuf+1, tokenbuf_len - 1, (UTF ? SV_CATUTF8 : SV_CATBYTES ));
8912 pl_yylval.opval = newSVOP(OP_CONST, 0, sym);
8913 pl_yylval.opval->op_private = OPpCONST_ENTERED;
8917 ((PL_tokenbuf[0] == '$') ? SVt_PV
8918 : (PL_tokenbuf[0] == '@') ? SVt_PVAV
8923 pl_yylval.opval = newOP(OP_PADANY, 0);
8924 pl_yylval.opval->op_targ = tmp;
8930 Whine if they've said @foo or @foo{key} in a doublequoted string,
8931 and @foo (or %foo) isn't a variable we can find in the symbol
8934 if (ckWARN(WARN_AMBIGUOUS)
8936 && PL_lex_state != LEX_NORMAL
8937 && !PL_lex_brackets)
8939 GV *const gv = gv_fetchpvn_flags(PL_tokenbuf + 1, tokenbuf_len - 1,
8940 ( UTF ? SVf_UTF8 : 0 ) | GV_ADDMG,
8942 if ((!gv || ((PL_tokenbuf[0] == '@') ? !GvAV(gv) : !GvHV(gv)))
8945 /* Downgraded from fatal to warning 20000522 mjd */
8946 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
8947 "Possible unintended interpolation of %" UTF8f
8949 UTF8fARG(UTF, tokenbuf_len, PL_tokenbuf));
8953 /* build ops for a bareword */
8954 pl_yylval.opval = newSVOP(OP_CONST, 0,
8955 newSVpvn_flags(PL_tokenbuf + 1,
8957 UTF ? SVf_UTF8 : 0 ));
8958 pl_yylval.opval->op_private = OPpCONST_ENTERED;
8960 gv_fetchpvn_flags(PL_tokenbuf+1, tokenbuf_len - 1,
8961 (PL_in_eval ? GV_ADDMULTI : GV_ADD)
8962 | ( UTF ? SVf_UTF8 : 0 ),
8963 ((PL_tokenbuf[0] == '$') ? SVt_PV
8964 : (PL_tokenbuf[0] == '@') ? SVt_PVAV
8970 S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
8972 PERL_ARGS_ASSERT_CHECKCOMMA;
8974 if (*s == ' ' && s[1] == '(') { /* XXX gotta be a better way */
8975 if (ckWARN(WARN_SYNTAX)) {
8978 for (w = s+2; *w && level; w++) {
8986 /* the list of chars below is for end of statements or
8987 * block / parens, boolean operators (&&, ||, //) and branch
8988 * constructs (or, and, if, until, unless, while, err, for).
8989 * Not a very solid hack... */
8990 if (!*w || !strchr(";&/|})]oaiuwef!=", *w))
8991 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
8992 "%s (...) interpreted as function",name);
8995 while (s < PL_bufend && isSPACE(*s))
8999 while (s < PL_bufend && isSPACE(*s))
9001 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
9002 const char * const w = s;
9003 s += UTF ? UTF8SKIP(s) : 1;
9004 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
9005 s += UTF ? UTF8SKIP(s) : 1;
9006 while (s < PL_bufend && isSPACE(*s))
9010 if (keyword(w, s - w, 0))
9013 gv = gv_fetchpvn_flags(w, s - w, ( UTF ? SVf_UTF8 : 0 ), SVt_PVCV);
9014 if (gv && GvCVu(gv))
9019 Copy(w, tmpbuf+1, s - w, char);
9021 off = pad_findmy_pvn(tmpbuf, s-w+1, 0);
9022 if (off != NOT_IN_PAD) return;
9024 Perl_croak(aTHX_ "No comma allowed after %s", what);
9029 /* S_new_constant(): do any overload::constant lookup.
9031 Either returns sv, or mortalizes/frees sv and returns a new SV*.
9032 Best used as sv=new_constant(..., sv, ...).
9033 If s, pv are NULL, calls subroutine with one argument,
9034 and <type> is used with error messages only.
9035 <type> is assumed to be well formed UTF-8 */
9038 S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
9039 SV *sv, SV *pv, const char *type, STRLEN typelen)
9042 HV * table = GvHV(PL_hintgv); /* ^H */
9047 const char *why1 = "", *why2 = "", *why3 = "";
9049 PERL_ARGS_ASSERT_NEW_CONSTANT;
9050 /* We assume that this is true: */
9051 if (*key == 'c') { assert (strEQ(key, "charnames")); }
9054 /* charnames doesn't work well if there have been errors found */
9055 if (PL_error_count > 0 && *key == 'c')
9057 SvREFCNT_dec_NN(sv);
9058 return &PL_sv_undef;
9061 sv_2mortal(sv); /* Parent created it permanently */
9063 || ! (PL_hints & HINT_LOCALIZE_HH)
9064 || ! (cvp = hv_fetch(table, key, keylen, FALSE))
9069 /* Here haven't found what we're looking for. If it is charnames,
9070 * perhaps it needs to be loaded. Try doing that before giving up */
9072 Perl_load_module(aTHX_
9074 newSVpvs("_charnames"),
9075 /* version parameter; no need to specify it, as if
9076 * we get too early a version, will fail anyway,
9077 * not being able to find '_charnames' */
9082 assert(sp == PL_stack_sp);
9083 table = GvHV(PL_hintgv);
9085 && (PL_hints & HINT_LOCALIZE_HH)
9086 && (cvp = hv_fetch(table, key, keylen, FALSE))
9092 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
9093 msg = Perl_form(aTHX_
9094 "Constant(%.*s) unknown",
9095 (int)(type ? typelen : len),
9101 why3 = "} is not defined";
9104 msg = Perl_form(aTHX_
9105 /* The +3 is for '\N{'; -4 for that, plus '}' */
9106 "Unknown charname '%.*s'", (int)typelen - 4, type + 3
9110 msg = Perl_form(aTHX_ "Constant(%.*s): %s%s%s",
9111 (int)(type ? typelen : len),
9112 (type ? type: s), why1, why2, why3);
9115 yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
9116 return SvREFCNT_inc_simple_NN(sv);
9121 pv = newSVpvn_flags(s, len, SVs_TEMP);
9123 typesv = newSVpvn_flags(type, typelen, SVs_TEMP);
9125 typesv = &PL_sv_undef;
9127 PUSHSTACKi(PERLSI_OVERLOAD);
9139 call_sv(cv, G_SCALAR | ( PL_in_eval ? 0 : G_EVAL));
9143 /* Check the eval first */
9144 if (!PL_in_eval && ((errsv = ERRSV), SvTRUE_NN(errsv))) {
9146 const char * errstr;
9147 sv_catpvs(errsv, "Propagated");
9148 errstr = SvPV_const(errsv, errlen);
9149 yyerror_pvn(errstr, errlen, 0); /* Duplicates the message inside eval */
9151 res = SvREFCNT_inc_simple_NN(sv);
9155 SvREFCNT_inc_simple_void_NN(res);
9164 why1 = "Call to &{$^H{";
9166 why3 = "}} did not return a defined value";
9168 (void)sv_2mortal(sv);
9175 PERL_STATIC_INLINE void
9176 S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
9177 bool is_utf8, bool check_dollar)
9179 PERL_ARGS_ASSERT_PARSE_IDENT;
9181 while (*s < PL_bufend) {
9183 Perl_croak(aTHX_ "%s", ident_too_long);
9184 if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) {
9185 /* The UTF-8 case must come first, otherwise things
9186 * like c\N{COMBINING TILDE} would start failing, as the
9187 * isWORDCHAR_A case below would gobble the 'c' up.
9190 char *t = *s + UTF8SKIP(*s);
9191 while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) {
9194 if (*d + (t - *s) > e)
9195 Perl_croak(aTHX_ "%s", ident_too_long);
9196 Copy(*s, *d, t - *s, char);
9200 else if ( isWORDCHAR_A(**s) ) {
9203 } while (isWORDCHAR_A(**s) && *d < e);
9205 else if ( allow_package
9207 && isIDFIRST_lazy_if_safe((*s)+1, PL_bufend, is_utf8))
9213 else if (allow_package && **s == ':' && (*s)[1] == ':'
9214 /* Disallow things like Foo::$bar. For the curious, this is
9215 * the code path that triggers the "Bad name after" warning
9216 * when looking for barewords.
9218 && !(check_dollar && (*s)[2] == '$')) {
9228 /* Returns a NUL terminated string, with the length of the string written to
9232 S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
9235 char * const e = d + destlen - 3; /* two-character token, ending NUL */
9236 bool is_utf8 = cBOOL(UTF);
9238 PERL_ARGS_ASSERT_SCAN_WORD;
9240 parse_ident(&s, &d, e, allow_package, is_utf8, TRUE);
9246 /* Is the byte 'd' a legal single character identifier name? 'u' is true
9247 * iff Unicode semantics are to be used. The legal ones are any of:
9248 * a) all ASCII characters except:
9249 * 1) control and space-type ones, like NUL, SOH, \t, and SPACE;
9251 * The final case currently doesn't get this far in the program, so we
9252 * don't test for it. If that were to change, it would be ok to allow it.
9253 * b) When not under Unicode rules, any upper Latin1 character
9254 * c) Otherwise, when unicode rules are used, all XIDS characters.
9256 * Because all ASCII characters have the same representation whether
9257 * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
9258 * '{' without knowing if is UTF-8 or not. */
9259 #define VALID_LEN_ONE_IDENT(s, e, is_utf8) \
9260 (isGRAPH_A(*(s)) || ((is_utf8) \
9261 ? isIDFIRST_utf8_safe(s, e) \
9263 && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
9266 S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
9268 I32 herelines = PL_parser->herelines;
9269 SSize_t bracket = -1;
9272 char * const e = d + destlen - 3; /* two-character token, ending NUL */
9273 bool is_utf8 = cBOOL(UTF);
9274 I32 orig_copline = 0, tmp_copline = 0;
9276 PERL_ARGS_ASSERT_SCAN_IDENT;
9278 if (isSPACE(*s) || !*s)
9281 while (isDIGIT(*s)) {
9283 Perl_croak(aTHX_ "%s", ident_too_long);
9287 else { /* See if it is a "normal" identifier */
9288 parse_ident(&s, &d, e, 1, is_utf8, FALSE);
9293 /* Either a digit variable, or parse_ident() found an identifier
9294 (anything valid as a bareword), so job done and return. */
9295 if (PL_lex_state != LEX_NORMAL)
9296 PL_lex_state = LEX_INTERPENDMAYBE;
9300 /* Here, it is not a run-of-the-mill identifier name */
9302 if (*s == '$' && s[1]
9303 && ( isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8)
9304 || isDIGIT_A((U8)s[1])
9307 || strEQs(s+1,"::")) )
9309 /* Dereferencing a value in a scalar variable.
9310 The alternatives are different syntaxes for a scalar variable.
9311 Using ' as a leading package separator isn't allowed. :: is. */
9314 /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} */
9316 bracket = s - SvPVX(PL_linestr);
9318 orig_copline = CopLINE(PL_curcop);
9319 if (s < PL_bufend && isSPACE(*s)) {
9323 if ((s <= PL_bufend - (is_utf8)
9326 && VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
9329 const STRLEN skip = UTF8SKIP(s);
9332 for ( i = 0; i < skip; i++ )
9340 /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
9341 if (*d == '^' && *s && isCONTROLVAR(*s)) {
9345 /* Warn about ambiguous code after unary operators if {...} notation isn't
9346 used. There's no difference in ambiguity; it's merely a heuristic
9347 about when not to warn. */
9348 else if (ck_uni && bracket == -1)
9350 if (bracket != -1) {
9353 /* If we were processing {...} notation then... */
9354 if (isIDFIRST_lazy_if_safe(d, e, is_utf8)) {
9355 /* if it starts as a valid identifier, assume that it is one.
9356 (the later check for } being at the expected point will trap
9357 cases where this doesn't pan out.) */
9358 d += is_utf8 ? UTF8SKIP(d) : 1;
9359 parse_ident(&s, &d, e, 1, is_utf8, TRUE);
9361 tmp_copline = CopLINE(PL_curcop);
9362 if (s < PL_bufend && isSPACE(*s)) {
9365 if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) {
9366 /* ${foo[0]} and ${foo{bar}} notation. */
9367 if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) {
9368 const char * const brack =
9370 ((*s == '[') ? "[...]" : "{...}");
9371 orig_copline = CopLINE(PL_curcop);
9372 CopLINE_set(PL_curcop, tmp_copline);
9373 /* diag_listed_as: Ambiguous use of %c{%s[...]} resolved to %c%s[...] */
9374 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
9375 "Ambiguous use of %c{%s%s} resolved to %c%s%s",
9376 funny, dest, brack, funny, dest, brack);
9377 CopLINE_set(PL_curcop, orig_copline);
9380 PL_lex_brackstack[PL_lex_brackets++] = (char)(XOPERATOR | XFAKEBRACK);
9381 PL_lex_allbrackets++;
9385 /* Handle extended ${^Foo} variables
9386 * 1999-02-27 mjd-perl-patch@plover.com */
9387 else if (! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */
9391 while (isWORDCHAR(*s) && d < e) {
9395 Perl_croak(aTHX_ "%s", ident_too_long);
9400 tmp_copline = CopLINE(PL_curcop);
9401 if ((skip = s < PL_bufend && isSPACE(*s)))
9402 /* Avoid incrementing line numbers or resetting PL_linestart,
9403 in case we have to back up. */
9408 /* Expect to find a closing } after consuming any trailing whitespace.
9411 /* Now increment line numbers if applicable. */
9415 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
9416 PL_lex_state = LEX_INTERPEND;
9419 if (PL_lex_state == LEX_NORMAL) {
9420 if (ckWARN(WARN_AMBIGUOUS)
9421 && (keyword(dest, d - dest, 0)
9422 || get_cvn_flags(dest, d - dest, is_utf8
9426 SV *tmp = newSVpvn_flags( dest, d - dest,
9427 SVs_TEMP | (is_utf8 ? SVf_UTF8 : 0) );
9430 orig_copline = CopLINE(PL_curcop);
9431 CopLINE_set(PL_curcop, tmp_copline);
9432 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
9433 "Ambiguous use of %c{%" SVf "} resolved to %c%" SVf,
9434 funny, SVfARG(tmp), funny, SVfARG(tmp));
9435 CopLINE_set(PL_curcop, orig_copline);
9440 /* Didn't find the closing } at the point we expected, so restore
9441 state such that the next thing to process is the opening { and */
9442 s = SvPVX(PL_linestr) + bracket; /* let the parser handle it */
9443 CopLINE_set(PL_curcop, orig_copline);
9444 PL_parser->herelines = herelines;
9448 else if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets && !intuit_more(s))
9449 PL_lex_state = LEX_INTERPEND;
9454 S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset, unsigned int * x_mod_count) {
9456 /* Adds, subtracts to/from 'pmfl' based on the next regex modifier flag
9457 * found in the parse starting at 's', based on the subset that are valid
9458 * in this context input to this routine in 'valid_flags'. Advances s.
9459 * Returns TRUE if the input should be treated as a valid flag, so the next
9460 * char may be as well; otherwise FALSE. 'charset' should point to a NUL
9461 * upon first call on the current regex. This routine will set it to any
9462 * charset modifier found. The caller shouldn't change it. This way,
9463 * another charset modifier encountered in the parse can be detected as an
9464 * error, as we have decided to allow only one */
9467 STRLEN charlen = UTF ? UTF8SKIP(*s) : 1;
9469 if ( charlen != 1 || ! strchr(valid_flags, c) ) {
9470 if (isWORDCHAR_lazy_if_safe( *s, PL_bufend, UTF)) {
9471 yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", (int)charlen, *s),
9472 UTF ? SVf_UTF8 : 0);
9474 /* Pretend that it worked, so will continue processing before
9483 CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, *x_mod_count);
9484 case GLOBAL_PAT_MOD: *pmfl |= PMf_GLOBAL; break;
9485 case CONTINUE_PAT_MOD: *pmfl |= PMf_CONTINUE; break;
9486 case ONCE_PAT_MOD: *pmfl |= PMf_KEEP; break;
9487 case KEEPCOPY_PAT_MOD: *pmfl |= RXf_PMf_KEEPCOPY; break;
9488 case NONDESTRUCT_PAT_MOD: *pmfl |= PMf_NONDESTRUCT; break;
9489 case LOCALE_PAT_MOD:
9491 goto multiple_charsets;
9493 set_regex_charset(pmfl, REGEX_LOCALE_CHARSET);
9496 case UNICODE_PAT_MOD:
9498 goto multiple_charsets;
9500 set_regex_charset(pmfl, REGEX_UNICODE_CHARSET);
9503 case ASCII_RESTRICT_PAT_MOD:
9505 set_regex_charset(pmfl, REGEX_ASCII_RESTRICTED_CHARSET);
9509 /* Error if previous modifier wasn't an 'a', but if it was, see
9510 * if, and accept, a second occurrence (only) */
9512 || get_regex_charset(*pmfl)
9513 != REGEX_ASCII_RESTRICTED_CHARSET)
9515 goto multiple_charsets;
9517 set_regex_charset(pmfl, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
9521 case DEPENDS_PAT_MOD:
9523 goto multiple_charsets;
9525 set_regex_charset(pmfl, REGEX_DEPENDS_CHARSET);
9534 if (*charset != c) {
9535 yyerror(Perl_form(aTHX_ "Regexp modifiers \"/%c\" and \"/%c\" are mutually exclusive", *charset, c));
9537 else if (c == 'a') {
9538 /* diag_listed_as: Regexp modifier "/%c" may appear a maximum of twice */
9539 yyerror("Regexp modifier \"/a\" may appear a maximum of twice");
9542 yyerror(Perl_form(aTHX_ "Regexp modifier \"/%c\" may not appear twice", c));
9545 /* Pretend that it worked, so will continue processing before dieing */
9551 S_scan_pat(pTHX_ char *start, I32 type)
9555 const char * const valid_flags =
9556 (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS);
9557 char charset = '\0'; /* character set modifier */
9558 unsigned int x_mod_count = 0;
9560 PERL_ARGS_ASSERT_SCAN_PAT;
9562 s = scan_str(start,TRUE,FALSE, (PL_in_eval & EVAL_RE_REPARSING), NULL);
9564 Perl_croak(aTHX_ "Search pattern not terminated");
9566 pm = (PMOP*)newPMOP(type, 0);
9567 if (PL_multi_open == '?') {
9568 /* This is the only point in the code that sets PMf_ONCE: */
9569 pm->op_pmflags |= PMf_ONCE;
9571 /* Hence it's safe to do this bit of PMOP book-keeping here, which
9572 allows us to restrict the list needed by reset to just the ??
9574 assert(type != OP_TRANS);
9576 MAGIC *mg = mg_find((const SV *)PL_curstash, PERL_MAGIC_symtab);
9579 mg = sv_magicext(MUTABLE_SV(PL_curstash), 0, PERL_MAGIC_symtab, 0, 0,
9582 elements = mg->mg_len / sizeof(PMOP**);
9583 Renewc(mg->mg_ptr, elements + 1, PMOP*, char);
9584 ((PMOP**)mg->mg_ptr) [elements++] = pm;
9585 mg->mg_len = elements * sizeof(PMOP**);
9586 PmopSTASH_set(pm,PL_curstash);
9590 /* if qr/...(?{..}).../, then need to parse the pattern within a new
9591 * anon CV. False positives like qr/[(?{]/ are harmless */
9593 if (type == OP_QR) {
9595 char *e, *p = SvPV(PL_lex_stuff, len);
9597 for (; p < e; p++) {
9598 if (p[0] == '(' && p[1] == '?'
9599 && (p[2] == '{' || (p[2] == '?' && p[3] == '{')))
9601 pm->op_pmflags |= PMf_HAS_CV;
9605 pm->op_pmflags |= PMf_IS_QR;
9608 while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags),
9609 &s, &charset, &x_mod_count))
9611 /* issue a warning if /c is specified,but /g is not */
9612 if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL))
9614 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
9615 "Use of /c modifier is meaningless without /g" );
9618 PL_lex_op = (OP*)pm;
9619 pl_yylval.ival = OP_MATCH;
9624 S_scan_subst(pTHX_ char *start)
9630 line_t linediff = 0;
9632 char charset = '\0'; /* character set modifier */
9633 unsigned int x_mod_count = 0;
9636 PERL_ARGS_ASSERT_SCAN_SUBST;
9638 pl_yylval.ival = OP_NULL;
9640 s = scan_str(start, TRUE, FALSE, FALSE, &t);
9643 Perl_croak(aTHX_ "Substitution pattern not terminated");
9647 first_start = PL_multi_start;
9648 first_line = CopLINE(PL_curcop);
9649 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
9651 SvREFCNT_dec_NN(PL_lex_stuff);
9652 PL_lex_stuff = NULL;
9653 Perl_croak(aTHX_ "Substitution replacement not terminated");
9655 PL_multi_start = first_start; /* so whole substitution is taken together */
9657 pm = (PMOP*)newPMOP(OP_SUBST, 0);
9661 if (*s == EXEC_PAT_MOD) {
9665 else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags),
9666 &s, &charset, &x_mod_count))
9672 if ((pm->op_pmflags & PMf_CONTINUE)) {
9673 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
9677 SV * const repl = newSVpvs("");
9680 pm->op_pmflags |= PMf_EVAL;
9683 sv_catpvs(repl, "eval ");
9685 sv_catpvs(repl, "do ");
9687 sv_catpvs(repl, "{");
9688 sv_catsv(repl, PL_parser->lex_sub_repl);
9689 sv_catpvs(repl, "}");
9690 SvREFCNT_dec(PL_parser->lex_sub_repl);
9691 PL_parser->lex_sub_repl = repl;
9696 linediff = CopLINE(PL_curcop) - first_line;
9698 CopLINE_set(PL_curcop, first_line);
9700 if (linediff || es) {
9701 /* the IVX field indicates that the replacement string is a s///e;
9702 * the NVX field indicates how many src code lines the replacement
9704 sv_upgrade(PL_parser->lex_sub_repl, SVt_PVNV);
9705 ((XPVNV*)SvANY(PL_parser->lex_sub_repl))->xnv_u.xnv_lines = 0;
9706 ((XPVIV*)SvANY(PL_parser->lex_sub_repl))->xiv_u.xivu_eval_seen =
9710 PL_lex_op = (OP*)pm;
9711 pl_yylval.ival = OP_SUBST;
9716 S_scan_trans(pTHX_ char *start)
9723 bool nondestruct = 0;
9726 PERL_ARGS_ASSERT_SCAN_TRANS;
9728 pl_yylval.ival = OP_NULL;
9730 s = scan_str(start,FALSE,FALSE,FALSE,&t);
9732 Perl_croak(aTHX_ "Transliteration pattern not terminated");
9736 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
9738 SvREFCNT_dec_NN(PL_lex_stuff);
9739 PL_lex_stuff = NULL;
9740 Perl_croak(aTHX_ "Transliteration replacement not terminated");
9743 complement = del = squash = 0;
9747 complement = OPpTRANS_COMPLEMENT;
9750 del = OPpTRANS_DELETE;
9753 squash = OPpTRANS_SQUASH;
9765 o = newPVOP(nondestruct ? OP_TRANSR : OP_TRANS, 0, (char*)NULL);
9766 o->op_private &= ~OPpTRANS_ALL;
9767 o->op_private |= del|squash|complement|
9768 (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
9769 (DO_UTF8(PL_parser->lex_sub_repl) ? OPpTRANS_TO_UTF : 0);
9772 pl_yylval.ival = nondestruct ? OP_TRANSR : OP_TRANS;
9779 Takes a pointer to the first < in <<FOO.
9780 Returns a pointer to the byte following <<FOO.
9782 This function scans a heredoc, which involves different methods
9783 depending on whether we are in a string eval, quoted construct, etc.
9784 This is because PL_linestr could containing a single line of input, or
9785 a whole string being evalled, or the contents of the current quote-
9788 The two basic methods are:
9789 - Steal lines from the input stream
9790 - Scan the heredoc in PL_linestr and remove it therefrom
9792 In a file scope or filtered eval, the first method is used; in a
9793 string eval, the second.
9795 In a quote-like operator, we have to choose between the two,
9796 depending on where we can find a newline. We peek into outer lex-
9797 ing scopes until we find one with a newline in it. If we reach the
9798 outermost lexing scope and it is a file, we use the stream method.
9799 Otherwise it is treated as an eval.
9803 S_scan_heredoc(pTHX_ char *s)
9805 I32 op_type = OP_SCALAR;
9814 bool indented = FALSE;
9815 const bool infile = PL_rsfp || PL_parser->filtered;
9816 const line_t origline = CopLINE(PL_curcop);
9817 LEXSHARED *shared = PL_parser->lex_shared;
9819 PERL_ARGS_ASSERT_SCAN_HEREDOC;
9822 d = PL_tokenbuf + 1;
9823 e = PL_tokenbuf + sizeof PL_tokenbuf - 1;
9824 *PL_tokenbuf = '\n';
9830 while (SPACE_OR_TAB(*peek))
9832 if (*peek == '`' || *peek == '\'' || *peek =='"') {
9835 s = delimcpy(d, e, s, PL_bufend, term, &len);
9837 Perl_croak(aTHX_ "Unterminated delimiter for here document");
9843 /* <<\FOO is equivalent to <<'FOO' */
9847 if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
9848 deprecate_fatal_in("5.28", "Use of bare << to mean <<\"\" is deprecated");
9851 isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))
9853 peek += UTF ? UTF8SKIP(peek) : 1;
9855 len = (peek - s >= e - d) ? (e - d) : (peek - s);
9856 Copy(s, d, len, char);
9860 if (d >= PL_tokenbuf + sizeof PL_tokenbuf - 1)
9861 Perl_croak(aTHX_ "Delimiter for here document is too long");
9864 len = d - PL_tokenbuf;
9866 #ifndef PERL_STRICT_CR
9867 d = strchr(s, '\r');
9869 char * const olds = s;
9871 while (s < PL_bufend) {
9877 else if (*s == '\n' && s[1] == '\r') { /* \015\013 on a mac? */
9886 SvCUR_set(PL_linestr, PL_bufend - SvPVX_const(PL_linestr));
9891 tmpstr = newSV_type(SVt_PVIV);
9895 SvIV_set(tmpstr, -1);
9897 else if (term == '`') {
9898 op_type = OP_BACKTICK;
9899 SvIV_set(tmpstr, '\\');
9902 PL_multi_start = origline + 1 + PL_parser->herelines;
9903 PL_multi_open = PL_multi_close = '<';
9904 /* inside a string eval or quote-like operator */
9905 if (!infile || PL_lex_inwhat) {
9908 char * const olds = s;
9909 PERL_CONTEXT * const cx = CX_CUR();
9910 /* These two fields are not set until an inner lexing scope is
9911 entered. But we need them set here. */
9912 shared->ls_bufptr = s;
9913 shared->ls_linestr = PL_linestr;
9915 /* Look for a newline. If the current buffer does not have one,
9916 peek into the line buffer of the parent lexing scope, going
9917 up as many levels as necessary to find one with a newline
9920 while (!(s = (char *)memchr(
9921 (void *)shared->ls_bufptr, '\n',
9922 SvEND(shared->ls_linestr)-shared->ls_bufptr
9924 shared = shared->ls_prev;
9925 /* shared is only null if we have gone beyond the outermost
9926 lexing scope. In a file, we will have broken out of the
9927 loop in the previous iteration. In an eval, the string buf-
9928 fer ends with "\n;", so the while condition above will have
9929 evaluated to false. So shared can never be null. Or so you
9930 might think. Odd syntax errors like s;@{<<; can gobble up
9931 the implicit semicolon at the end of a flie, causing the
9932 file handle to be closed even when we are not in a string
9933 eval. So shared may be null in that case.
9934 (Closing '}' here to balance the earlier open brace for
9935 editors that look for matched pairs.) */
9936 if (UNLIKELY(!shared))
9938 /* A LEXSHARED struct with a null ls_prev pointer is the outer-
9939 most lexing scope. In a file, shared->ls_linestr at that
9940 level is just one line, so there is no body to steal. */
9941 if (infile && !shared->ls_prev) {
9946 else { /* eval or we've already hit EOF */
9947 s = (char*)memchr((void*)s, '\n', PL_bufend - s);
9951 linestr = shared->ls_linestr;
9952 bufend = SvEND(linestr);
9957 while (s < bufend - len + 1) {
9959 ++PL_parser->herelines;
9961 if (memEQ(s, PL_tokenbuf + 1, len - 1)) {
9965 /* Only valid if it's preceded by whitespace only */
9966 while (backup != myolds && --backup >= myolds) {
9967 if (! SPACE_OR_TAB(*backup)) {
9974 /* No whitespace or all! */
9975 if (backup == s || *backup == '\n') {
9976 Newxz(indent, indent_len + 1, char);
9977 memcpy(indent, backup + 1, indent_len);
9978 s--; /* before our delimiter */
9979 PL_parser->herelines--; /* this line doesn't count */
9985 while (s < bufend - len + 1
9986 && memNE(s,PL_tokenbuf,len) )
9989 ++PL_parser->herelines;
9993 if (s >= bufend - len + 1) {
9996 sv_setpvn(tmpstr,d+1,s-d);
9998 /* the preceding stmt passes a newline */
9999 PL_parser->herelines++;
10001 /* s now points to the newline after the heredoc terminator.
10002 d points to the newline before the body of the heredoc.
10005 /* We are going to modify linestr in place here, so set
10006 aside copies of the string if necessary for re-evals or
10008 /* See the Paranoia note in case LEX_INTERPEND in yylex, for why we
10009 check shared->re_eval_str. */
10010 if (shared->re_eval_start || shared->re_eval_str) {
10011 /* Set aside the rest of the regexp */
10012 if (!shared->re_eval_str)
10013 shared->re_eval_str =
10014 newSVpvn(shared->re_eval_start,
10015 bufend - shared->re_eval_start);
10016 shared->re_eval_start -= s-d;
10018 if (cxstack_ix >= 0
10019 && CxTYPE(cx) == CXt_EVAL
10020 && CxOLD_OP_TYPE(cx) == OP_ENTEREVAL
10021 && cx->blk_eval.cur_text == linestr)
10023 cx->blk_eval.cur_text = newSVsv(linestr);
10024 cx->blk_u16 |= 0x40; /* indicate cur_text is ref counted */
10026 /* Copy everything from s onwards back to d. */
10027 Move(s,d,bufend-s + 1,char);
10028 SvCUR_set(linestr, SvCUR(linestr) - (s-d));
10029 /* Setting PL_bufend only applies when we have not dug deeper
10030 into other scopes, because sublex_done sets PL_bufend to
10031 SvEND(PL_linestr). */
10032 if (shared == PL_parser->lex_shared) PL_bufend = SvEND(linestr);
10038 char *oldbufptr_save;
10039 char *oldoldbufptr_save;
10041 SvPVCLEAR(tmpstr); /* avoid "uninitialized" warning */
10042 term = PL_tokenbuf[1];
10044 linestr_save = PL_linestr; /* must restore this afterwards */
10045 d = s; /* and this */
10046 oldbufptr_save = PL_oldbufptr;
10047 oldoldbufptr_save = PL_oldoldbufptr;
10048 PL_linestr = newSVpvs("");
10049 PL_bufend = SvPVX(PL_linestr);
10051 PL_bufptr = PL_bufend;
10052 CopLINE_set(PL_curcop,
10053 origline + 1 + PL_parser->herelines);
10054 if (!lex_next_chunk(LEX_NO_TERM)
10055 && (!SvCUR(tmpstr) || SvEND(tmpstr)[-1] != '\n')) {
10056 /* Simply freeing linestr_save might seem simpler here, as it
10057 does not matter what PL_linestr points to, since we are
10058 about to croak; but in a quote-like op, linestr_save
10059 will have been prospectively freed already, via
10060 SAVEFREESV(PL_linestr) in sublex_push, so it’s easier to
10061 restore PL_linestr. */
10062 SvREFCNT_dec_NN(PL_linestr);
10063 PL_linestr = linestr_save;
10064 PL_oldbufptr = oldbufptr_save;
10065 PL_oldoldbufptr = oldoldbufptr_save;
10068 CopLINE_set(PL_curcop, origline);
10069 if (!SvCUR(PL_linestr) || PL_bufend[-1] != '\n') {
10070 s = lex_grow_linestr(SvLEN(PL_linestr) + 3);
10071 /* ^That should be enough to avoid this needing to grow: */
10072 sv_catpvs(PL_linestr, "\n\0");
10073 assert(s == SvPVX(PL_linestr));
10074 PL_bufend = SvEND(PL_linestr);
10077 PL_parser->herelines++;
10078 PL_last_lop = PL_last_uni = NULL;
10079 #ifndef PERL_STRICT_CR
10080 if (PL_bufend - PL_linestart >= 2) {
10081 if ( (PL_bufend[-2] == '\r' && PL_bufend[-1] == '\n')
10082 || (PL_bufend[-2] == '\n' && PL_bufend[-1] == '\r'))
10084 PL_bufend[-2] = '\n';
10086 SvCUR_set(PL_linestr, PL_bufend - SvPVX_const(PL_linestr));
10088 else if (PL_bufend[-1] == '\r')
10089 PL_bufend[-1] = '\n';
10091 else if (PL_bufend - PL_linestart == 1 && PL_bufend[-1] == '\r')
10092 PL_bufend[-1] = '\n';
10094 if (indented && (PL_bufend-s) >= len) {
10095 char * found = ninstr(s, PL_bufend, (PL_tokenbuf + 1), (PL_tokenbuf +1 + len));
10098 char *backup = found;
10101 /* Only valid if it's preceded by whitespace only */
10102 while (backup != s && --backup >= s) {
10103 if (! SPACE_OR_TAB(*backup)) {
10109 /* All whitespace or none! */
10110 if (backup == found || SPACE_OR_TAB(*backup)) {
10111 Newxz(indent, indent_len + 1, char);
10112 memcpy(indent, backup, indent_len);
10113 SvREFCNT_dec(PL_linestr);
10114 PL_linestr = linestr_save;
10115 PL_linestart = SvPVX(linestr_save);
10116 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
10117 PL_oldbufptr = oldbufptr_save;
10118 PL_oldoldbufptr = oldoldbufptr_save;
10124 /* Didn't find it */
10125 sv_catsv(tmpstr,PL_linestr);
10127 if (*s == term && PL_bufend-s >= len
10128 && memEQ(s,PL_tokenbuf + 1,len))
10130 SvREFCNT_dec(PL_linestr);
10131 PL_linestr = linestr_save;
10132 PL_linestart = SvPVX(linestr_save);
10133 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
10134 PL_oldbufptr = oldbufptr_save;
10135 PL_oldoldbufptr = oldoldbufptr_save;
10139 sv_catsv(tmpstr,PL_linestr);
10144 PL_multi_end = origline + PL_parser->herelines;
10145 if (indented && indent) {
10146 STRLEN linecount = 1;
10147 STRLEN herelen = SvCUR(tmpstr);
10148 char *ss = SvPVX(tmpstr);
10149 char *se = ss + herelen;
10150 SV *newstr = newSV(herelen+1);
10153 /* Trim leading whitespace */
10155 /* newline only? Copy and move on */
10157 sv_catpv(newstr,"\n");
10161 /* Found our indentation? Strip it */
10162 } else if (se - ss >= indent_len
10163 && memEQ(ss, indent, indent_len))
10169 while ((ss + le) < se && *(ss + le) != '\n')
10172 sv_catpvn(newstr, ss, le);
10176 /* Line doesn't begin with our indentation? Croak */
10179 "Indentation on line %d of here-doc doesn't match delimiter",
10184 /* avoid sv_setsv() as we dont wan't to COW here */
10185 sv_setpvn(tmpstr,SvPVX(newstr),SvCUR(newstr));
10187 SvREFCNT_dec_NN(newstr);
10189 if (SvCUR(tmpstr) + 5 < SvLEN(tmpstr)) {
10190 SvPV_shrink_to_cur(tmpstr);
10193 if (UTF && is_utf8_string((U8*)SvPVX_const(tmpstr), SvCUR(tmpstr)))
10196 PL_lex_stuff = tmpstr;
10197 pl_yylval.ival = op_type;
10201 SvREFCNT_dec(tmpstr);
10202 CopLINE_set(PL_curcop, origline);
10203 missingterm(PL_tokenbuf + 1);
10206 /* scan_inputsymbol
10207 takes: position of first '<' in input buffer
10208 returns: position of first char following the matching '>' in
10210 side-effects: pl_yylval and lex_op are set.
10215 <<>> read from ARGV without magic open
10216 <FH> read from filehandle
10217 <pkg::FH> read from package qualified filehandle
10218 <pkg'FH> read from package qualified filehandle
10219 <$fh> read from filehandle in $fh
10220 <*.h> filename glob
10225 S_scan_inputsymbol(pTHX_ char *start)
10227 char *s = start; /* current position in buffer */
10230 bool nomagicopen = FALSE;
10231 char *d = PL_tokenbuf; /* start of temp holding space */
10232 const char * const e = PL_tokenbuf + sizeof PL_tokenbuf; /* end of temp holding space */
10234 PERL_ARGS_ASSERT_SCAN_INPUTSYMBOL;
10236 end = strchr(s, '\n');
10239 if (s[1] == '<' && s[2] == '>' && s[3] == '>') {
10240 nomagicopen = TRUE;
10246 s = delimcpy(d, e, s + 1, end, '>', &len); /* extract until > */
10248 /* die if we didn't have space for the contents of the <>,
10249 or if it didn't end, or if we see a newline
10252 if (len >= (I32)sizeof PL_tokenbuf)
10253 Perl_croak(aTHX_ "Excessively long <> operator");
10255 Perl_croak(aTHX_ "Unterminated <> operator");
10260 Remember, only scalar variables are interpreted as filehandles by
10261 this code. Anything more complex (e.g., <$fh{$num}>) will be
10262 treated as a glob() call.
10263 This code makes use of the fact that except for the $ at the front,
10264 a scalar variable and a filehandle look the same.
10266 if (*d == '$' && d[1]) d++;
10268 /* allow <Pkg'VALUE> or <Pkg::VALUE> */
10269 while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == '\'' || *d == ':') {
10270 d += UTF ? UTF8SKIP(d) : 1;
10273 /* If we've tried to read what we allow filehandles to look like, and
10274 there's still text left, then it must be a glob() and not a getline.
10275 Use scan_str to pull out the stuff between the <> and treat it
10276 as nothing more than a string.
10279 if (d - PL_tokenbuf != len) {
10280 pl_yylval.ival = OP_GLOB;
10281 s = scan_str(start,FALSE,FALSE,FALSE,NULL);
10283 Perl_croak(aTHX_ "Glob not terminated");
10287 bool readline_overriden = FALSE;
10289 /* we're in a filehandle read situation */
10292 /* turn <> into <ARGV> */
10294 Copy("ARGV",d,5,char);
10296 /* Check whether readline() is overriden */
10297 if ((gv_readline = gv_override("readline",8)))
10298 readline_overriden = TRUE;
10300 /* if <$fh>, create the ops to turn the variable into a
10304 /* try to find it in the pad for this block, otherwise find
10305 add symbol table ops
10307 const PADOFFSET tmp = pad_findmy_pvn(d, len, 0);
10308 if (tmp != NOT_IN_PAD) {
10309 if (PAD_COMPNAME_FLAGS_isOUR(tmp)) {
10310 HV * const stash = PAD_COMPNAME_OURSTASH(tmp);
10311 HEK * const stashname = HvNAME_HEK(stash);
10312 SV * const sym = sv_2mortal(newSVhek(stashname));
10313 sv_catpvs(sym, "::");
10314 sv_catpv(sym, d+1);
10319 OP * const o = newOP(OP_PADSV, 0);
10321 PL_lex_op = readline_overriden
10322 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
10323 op_append_elem(OP_LIST, o,
10324 newCVREF(0, newGVOP(OP_GV,0,gv_readline))))
10325 : newUNOP(OP_READLINE, 0, o);
10333 GV_ADDMULTI | ( UTF ? SVf_UTF8 : 0 ),
10335 PL_lex_op = readline_overriden
10336 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
10337 op_append_elem(OP_LIST,
10338 newUNOP(OP_RV2SV, 0, newGVOP(OP_GV, 0, gv)),
10339 newCVREF(0, newGVOP(OP_GV, 0, gv_readline))))
10340 : newUNOP(OP_READLINE, 0,
10341 newUNOP(OP_RV2SV, 0,
10342 newGVOP(OP_GV, 0, gv)));
10344 /* we created the ops in PL_lex_op, so make pl_yylval.ival a null op */
10345 pl_yylval.ival = OP_NULL;
10348 /* If it's none of the above, it must be a literal filehandle
10349 (<Foo::BAR> or <FOO>) so build a simple readline OP */
10351 GV * const gv = gv_fetchpv(d, GV_ADD | ( UTF ? SVf_UTF8 : 0 ), SVt_PVIO);
10352 PL_lex_op = readline_overriden
10353 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
10354 op_append_elem(OP_LIST,
10355 newGVOP(OP_GV, 0, gv),
10356 newCVREF(0, newGVOP(OP_GV, 0, gv_readline))))
10357 : newUNOP(OP_READLINE, nomagicopen ? OPf_SPECIAL : 0, newGVOP(OP_GV, 0, gv));
10358 pl_yylval.ival = OP_NULL;
10368 start position in buffer
10369 keep_bracketed_quoted preserve \ quoting of embedded delimiters, but
10370 only if they are of the open/close form
10371 keep_delims preserve the delimiters around the string
10372 re_reparse compiling a run-time /(?{})/:
10373 collapse // to /, and skip encoding src
10374 delimp if non-null, this is set to the position of
10375 the closing delimiter, or just after it if
10376 the closing and opening delimiters differ
10377 (i.e., the opening delimiter of a substitu-
10379 returns: position to continue reading from buffer
10380 side-effects: multi_start, multi_close, lex_repl or lex_stuff, and
10381 updates the read buffer.
10383 This subroutine pulls a string out of the input. It is called for:
10384 q single quotes q(literal text)
10385 ' single quotes 'literal text'
10386 qq double quotes qq(interpolate $here please)
10387 " double quotes "interpolate $here please"
10388 qx backticks qx(/bin/ls -l)
10389 ` backticks `/bin/ls -l`
10390 qw quote words @EXPORT_OK = qw( func() $spam )
10391 m// regexp match m/this/
10392 s/// regexp substitute s/this/that/
10393 tr/// string transliterate tr/this/that/
10394 y/// string transliterate y/this/that/
10395 ($*@) sub prototypes sub foo ($)
10396 (stuff) sub attr parameters sub foo : attr(stuff)
10397 <> readline or globs <FOO>, <>, <$fh>, or <*.c>
10399 In most of these cases (all but <>, patterns and transliterate)
10400 yylex() calls scan_str(). m// makes yylex() call scan_pat() which
10401 calls scan_str(). s/// makes yylex() call scan_subst() which calls
10402 scan_str(). tr/// and y/// make yylex() call scan_trans() which
10405 It skips whitespace before the string starts, and treats the first
10406 character as the delimiter. If the delimiter is one of ([{< then
10407 the corresponding "close" character )]}> is used as the closing
10408 delimiter. It allows quoting of delimiters, and if the string has
10409 balanced delimiters ([{<>}]) it allows nesting.
10411 On success, the SV with the resulting string is put into lex_stuff or,
10412 if that is already non-NULL, into lex_repl. The second case occurs only
10413 when parsing the RHS of the special constructs s/// and tr/// (y///).
10414 For convenience, the terminating delimiter character is stuffed into
10419 S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re_reparse,
10423 SV *sv; /* scalar value: string */
10424 const char *tmps; /* temp string, used for delimiter matching */
10425 char *s = start; /* current position in the buffer */
10426 char term; /* terminating character */
10427 char *to; /* current position in the sv's data */
10428 I32 brackets = 1; /* bracket nesting level */
10429 bool has_utf8 = FALSE; /* is there any utf8 content? */
10430 IV termcode; /* terminating char. code */
10431 U8 termstr[UTF8_MAXBYTES]; /* terminating string */
10432 STRLEN termlen; /* length of terminating string */
10435 /* The delimiters that have a mirror-image closing one */
10436 const char * opening_delims = "([{<";
10437 const char * closing_delims = ")]}>";
10439 const char * non_grapheme_msg = "Use of unassigned code point or"
10440 " non-standalone grapheme for a delimiter"
10441 " will be a fatal error starting in Perl"
10443 /* The only non-UTF character that isn't a stand alone grapheme is
10444 * white-space, hence can't be a delimiter. So can skip for non-UTF-8 */
10445 bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED);
10447 PERL_ARGS_ASSERT_SCAN_STR;
10449 /* skip space before the delimiter */
10454 /* mark where we are, in case we need to report errors */
10457 /* after skipping whitespace, the next character is the terminator */
10459 if (!UTF || UTF8_IS_INVARIANT(term)) {
10460 termcode = termstr[0] = term;
10464 termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
10465 if (check_grapheme) {
10466 if ( UNLIKELY(UNICODE_IS_SUPER(termcode))
10467 || UNLIKELY(UNICODE_IS_NONCHAR(termcode)))
10469 /* These are considered graphemes, and since the ending
10470 * delimiter will be the same, we don't have to check the other
10472 check_grapheme = FALSE;
10474 else if (UNLIKELY(! _is_grapheme((U8 *) start,
10479 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), "%s", non_grapheme_msg);
10481 /* Don't have to check the other end, as have already warned at
10483 check_grapheme = FALSE;
10487 Copy(s, termstr, termlen, U8);
10490 /* mark where we are */
10491 PL_multi_start = CopLINE(PL_curcop);
10492 PL_multi_open = termcode;
10493 herelines = PL_parser->herelines;
10495 /* If the delimiter has a mirror-image closing one, get it */
10496 if (term && (tmps = strchr(opening_delims, term))) {
10497 termcode = termstr[0] = term = closing_delims[tmps - opening_delims];
10500 PL_multi_close = termcode;
10502 if (PL_multi_open == PL_multi_close) {
10503 keep_bracketed_quoted = FALSE;
10506 /* create a new SV to hold the contents. 79 is the SV's initial length.
10507 What a random number. */
10508 sv = newSV_type(SVt_PVIV);
10510 SvIV_set(sv, termcode);
10511 (void)SvPOK_only(sv); /* validate pointer */
10513 /* move past delimiter and try to read a complete string */
10515 sv_catpvn(sv, s, termlen);
10518 /* extend sv if need be */
10519 SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1);
10520 /* set 'to' to the next character in the sv's string */
10521 to = SvPVX(sv)+SvCUR(sv);
10523 /* if open delimiter is the close delimiter read unbridle */
10524 if (PL_multi_open == PL_multi_close) {
10525 for (; s < PL_bufend; s++,to++) {
10526 /* embedded newlines increment the current line number */
10527 if (*s == '\n' && !PL_rsfp && !PL_parser->filtered)
10528 COPLINE_INC_WITH_HERELINES;
10529 /* handle quoted delimiters */
10530 if (*s == '\\' && s+1 < PL_bufend && term != '\\') {
10531 if (!keep_bracketed_quoted
10533 || (re_reparse && s[1] == '\\'))
10536 else /* any other quotes are simply copied straight through */
10539 /* terminate when run out of buffer (the for() condition), or
10540 have found the terminator */
10541 else if (*s == term) { /* First byte of terminator matches */
10542 if (termlen == 1) /* If is the only byte, are done */
10545 /* If the remainder of the terminator matches, also are
10546 * done, after checking that is a separate grapheme */
10547 if ( s + termlen <= PL_bufend
10548 && memEQ(s + 1, (char*)termstr + 1, termlen - 1))
10550 if ( check_grapheme
10551 && UNLIKELY(! _is_grapheme((U8 *) start,
10556 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
10557 "%s", non_grapheme_msg);
10562 else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) {
10570 /* if the terminator isn't the same as the start character (e.g.,
10571 matched brackets), we have to allow more in the quoting, and
10572 be prepared for nested brackets.
10575 /* read until we run out of string, or we find the terminator */
10576 for (; s < PL_bufend; s++,to++) {
10577 /* embedded newlines increment the line count */
10578 if (*s == '\n' && !PL_rsfp && !PL_parser->filtered)
10579 COPLINE_INC_WITH_HERELINES;
10580 /* backslashes can escape the open or closing characters */
10581 if (*s == '\\' && s+1 < PL_bufend) {
10582 if (!keep_bracketed_quoted
10583 && ( ((UV)s[1] == PL_multi_open)
10584 || ((UV)s[1] == PL_multi_close) ))
10591 /* allow nested opens and closes */
10592 else if ((UV)*s == PL_multi_close && --brackets <= 0)
10594 else if ((UV)*s == PL_multi_open)
10596 else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
10601 /* terminate the copied string and update the sv's end-of-string */
10603 SvCUR_set(sv, to - SvPVX_const(sv));
10606 * this next chunk reads more into the buffer if we're not done yet
10610 break; /* handle case where we are done yet :-) */
10612 #ifndef PERL_STRICT_CR
10613 if (to - SvPVX_const(sv) >= 2) {
10614 if ( (to[-2] == '\r' && to[-1] == '\n')
10615 || (to[-2] == '\n' && to[-1] == '\r'))
10619 SvCUR_set(sv, to - SvPVX_const(sv));
10621 else if (to[-1] == '\r')
10624 else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r')
10628 /* if we're out of file, or a read fails, bail and reset the current
10629 line marker so we can report where the unterminated string began
10631 COPLINE_INC_WITH_HERELINES;
10632 PL_bufptr = PL_bufend;
10633 if (!lex_next_chunk(0)) {
10635 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
10638 s = start = PL_bufptr;
10641 /* at this point, we have successfully read the delimited string */
10644 sv_catpvn(sv, s, termlen);
10650 PL_multi_end = CopLINE(PL_curcop);
10651 CopLINE_set(PL_curcop, PL_multi_start);
10652 PL_parser->herelines = herelines;
10654 /* if we allocated too much space, give some back */
10655 if (SvCUR(sv) + 5 < SvLEN(sv)) {
10656 SvLEN_set(sv, SvCUR(sv) + 1);
10657 SvPV_renew(sv, SvLEN(sv));
10660 /* decide whether this is the first or second quoted string we've read
10665 PL_parser->lex_sub_repl = sv;
10668 if (delimp) *delimp = PL_multi_open == PL_multi_close ? s-termlen : s;
10674 takes: pointer to position in buffer
10675 returns: pointer to new position in buffer
10676 side-effects: builds ops for the constant in pl_yylval.op
10678 Read a number in any of the formats that Perl accepts:
10680 \d(_?\d)*(\.(\d(_?\d)*)?)?[Ee][\+\-]?(\d(_?\d)*) 12 12.34 12.
10681 \.\d(_?\d)*[Ee][\+\-]?(\d(_?\d)*) .34
10682 0b[01](_?[01])* binary integers
10683 0[0-7](_?[0-7])* octal integers
10684 0x[0-9A-Fa-f](_?[0-9A-Fa-f])* hexadecimal integers
10685 0x[0-9A-Fa-f](_?[0-9A-Fa-f])*(?:\.\d*)?p[+-]?[0-9]+ hexadecimal floats
10687 Like most scan_ routines, it uses the PL_tokenbuf buffer to hold the
10690 If it reads a number without a decimal point or an exponent, it will
10691 try converting the number to an integer and see if it can do so
10692 without loss of precision.
10696 Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
10698 const char *s = start; /* current position in buffer */
10699 char *d; /* destination in temp buffer */
10700 char *e; /* end of temp buffer */
10701 NV nv; /* number read, as a double */
10702 SV *sv = NULL; /* place to put the converted number */
10703 bool floatit; /* boolean: int or float? */
10704 const char *lastub = NULL; /* position of last underbar */
10705 static const char* const number_too_long = "Number too long";
10706 bool warned_about_underscore = 0;
10707 #define WARN_ABOUT_UNDERSCORE() \
10709 if (!warned_about_underscore) { \
10710 warned_about_underscore = 1; \
10711 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \
10712 "Misplaced _ in number"); \
10715 /* Hexadecimal floating point.
10717 * In many places (where we have quads and NV is IEEE 754 double)
10718 * we can fit the mantissa bits of a NV into an unsigned quad.
10719 * (Note that UVs might not be quads even when we have quads.)
10720 * This will not work everywhere, though (either no quads, or
10721 * using long doubles), in which case we have to resort to NV,
10722 * which will probably mean horrible loss of precision due to
10723 * multiple fp operations. */
10724 bool hexfp = FALSE;
10725 int total_bits = 0;
10726 int significant_bits = 0;
10727 #if NVSIZE == 8 && defined(HAS_QUAD) && defined(Uquad_t)
10728 # define HEXFP_UQUAD
10729 Uquad_t hexfp_uquad = 0;
10730 int hexfp_frac_bits = 0;
10735 NV hexfp_mult = 1.0;
10736 UV high_non_zero = 0; /* highest digit */
10737 int non_zero_integer_digits = 0;
10739 PERL_ARGS_ASSERT_SCAN_NUM;
10741 /* We use the first character to decide what type of number this is */
10745 Perl_croak(aTHX_ "panic: scan_num, *s=%d", *s);
10747 /* if it starts with a 0, it could be an octal number, a decimal in
10748 0.13 disguise, or a hexadecimal number, or a binary number. */
10752 u holds the "number so far"
10753 shift the power of 2 of the base
10754 (hex == 4, octal == 3, binary == 1)
10755 overflowed was the number more than we can hold?
10757 Shift is used when we add a digit. It also serves as an "are
10758 we in octal/hex/binary?" indicator to disallow hex characters
10759 when in octal mode.
10764 bool overflowed = FALSE;
10765 bool just_zero = TRUE; /* just plain 0 or binary number? */
10766 static const NV nvshift[5] = { 1.0, 2.0, 4.0, 8.0, 16.0 };
10767 static const char* const bases[5] =
10768 { "", "binary", "", "octal", "hexadecimal" };
10769 static const char* const Bases[5] =
10770 { "", "Binary", "", "Octal", "Hexadecimal" };
10771 static const char* const maxima[5] =
10773 "0b11111111111111111111111111111111",
10777 const char *base, *Base, *max;
10779 /* check for hex */
10780 if (isALPHA_FOLD_EQ(s[1], 'x')) {
10784 } else if (isALPHA_FOLD_EQ(s[1], 'b')) {
10789 /* check for a decimal in disguise */
10790 else if (s[1] == '.' || isALPHA_FOLD_EQ(s[1], 'e'))
10792 /* so it must be octal */
10799 WARN_ABOUT_UNDERSCORE();
10803 base = bases[shift];
10804 Base = Bases[shift];
10805 max = maxima[shift];
10807 /* read the rest of the number */
10809 /* x is used in the overflow test,
10810 b is the digit we're adding on. */
10815 /* if we don't mention it, we're done */
10819 /* _ are ignored -- but warned about if consecutive */
10821 if (lastub && s == lastub + 1)
10822 WARN_ABOUT_UNDERSCORE();
10826 /* 8 and 9 are not octal */
10827 case '8': case '9':
10829 yyerror(Perl_form(aTHX_ "Illegal octal digit '%c'", *s));
10833 case '2': case '3': case '4':
10834 case '5': case '6': case '7':
10836 yyerror(Perl_form(aTHX_ "Illegal binary digit '%c'", *s));
10839 case '0': case '1':
10840 b = *s++ & 15; /* ASCII digit -> value of digit */
10844 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
10845 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
10846 /* make sure they said 0x */
10849 b = (*s++ & 7) + 9;
10851 /* Prepare to put the digit we have onto the end
10852 of the number so far. We check for overflows.
10858 x = u << shift; /* make room for the digit */
10860 total_bits += shift;
10862 if ((x >> shift) != u
10863 && !(PL_hints & HINT_NEW_BINARY)) {
10866 Perl_ck_warner_d(aTHX_ packWARN(WARN_OVERFLOW),
10867 "Integer overflow in %s number",
10870 u = x | b; /* add the digit to the end */
10873 n *= nvshift[shift];
10874 /* If an NV has not enough bits in its
10875 * mantissa to represent an UV this summing of
10876 * small low-order numbers is a waste of time
10877 * (because the NV cannot preserve the
10878 * low-order bits anyway): we could just
10879 * remember when did we overflow and in the
10880 * end just multiply n by the right
10885 if (high_non_zero == 0 && b > 0)
10889 non_zero_integer_digits++;
10891 /* this could be hexfp, but peek ahead
10892 * to avoid matching ".." */
10893 if (UNLIKELY(HEXFP_PEEK(s))) {
10901 /* if we get here, we had success: make a scalar value from
10906 /* final misplaced underbar check */
10908 WARN_ABOUT_UNDERSCORE();
10910 if (UNLIKELY(HEXFP_PEEK(s))) {
10911 /* Do sloppy (on the underbars) but quick detection
10912 * (and value construction) for hexfp, the decimal
10913 * detection will shortly be more thorough with the
10914 * underbar checks. */
10916 significant_bits = non_zero_integer_digits * shift;
10919 #else /* HEXFP_NV */
10922 /* Ignore the leading zero bits of
10923 * the high (first) non-zero digit. */
10924 if (high_non_zero) {
10925 if (high_non_zero < 0x8)
10926 significant_bits--;
10927 if (high_non_zero < 0x4)
10928 significant_bits--;
10929 if (high_non_zero < 0x2)
10930 significant_bits--;
10937 bool accumulate = TRUE;
10938 for (h++; (isXDIGIT(*h) || *h == '_'); h++) {
10939 if (isXDIGIT(*h)) {
10940 U8 b = XDIGIT_VALUE(*h);
10941 significant_bits += shift;
10944 if (significant_bits < NV_MANT_DIG) {
10945 /* We are in the long "run" of xdigits,
10946 * accumulate the full four bits. */
10947 hexfp_uquad <<= shift;
10949 hexfp_frac_bits += shift;
10951 /* We are at a hexdigit either at,
10952 * or straddling, the edge of mantissa.
10953 * We will try grabbing as many as
10954 * possible bits. */
10956 significant_bits - NV_MANT_DIG;
10959 hexfp_uquad <<= tail;
10960 hexfp_uquad |= b >> (shift - tail);
10961 hexfp_frac_bits += tail;
10963 /* Ignore the trailing zero bits
10964 * of the last non-zero xdigit.
10966 * The assumption here is that if
10967 * one has input of e.g. the xdigit
10968 * eight (0x8), there is only one
10969 * bit being input, not the full
10970 * four bits. Conversely, if one
10971 * specifies a zero xdigit, the
10972 * assumption is that one really
10973 * wants all those bits to be zero. */
10975 if ((b & 0x1) == 0x0) {
10976 significant_bits--;
10977 if ((b & 0x2) == 0x0) {
10978 significant_bits--;
10979 if ((b & 0x4) == 0x0) {
10980 significant_bits--;
10986 accumulate = FALSE;
10989 /* Keep skipping the xdigits, and
10990 * accumulating the significant bits,
10991 * but do not shift the uquad
10992 * (which would catastrophically drop
10993 * high-order bits) or accumulate the
10994 * xdigits anymore. */
10996 #else /* HEXFP_NV */
11000 hexfp_nv += b * nv_mult;
11002 accumulate = FALSE;
11006 if (significant_bits >= NV_MANT_DIG)
11007 accumulate = FALSE;
11011 if ((total_bits > 0 || significant_bits > 0) &&
11012 isALPHA_FOLD_EQ(*h, 'p')) {
11013 bool negexp = FALSE;
11017 else if (*h == '-') {
11023 while (isDIGIT(*h) || *h == '_') {
11026 hexfp_exp += *h - '0';
11029 && -hexfp_exp < NV_MIN_EXP - 1) {
11030 /* NOTE: this means that the exponent
11031 * underflow warning happens for
11032 * the IEEE 754 subnormals (denormals),
11033 * because DBL_MIN_EXP etc are the lowest
11034 * possible binary (or, rather, DBL_RADIX-base)
11035 * exponent for normals, not subnormals.
11037 * This may or may not be a good thing. */
11038 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
11039 "Hexadecimal float: exponent underflow");
11045 && hexfp_exp > NV_MAX_EXP - 1) {
11046 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
11047 "Hexadecimal float: exponent overflow");
11055 hexfp_exp = -hexfp_exp;
11057 hexfp_exp -= hexfp_frac_bits;
11059 hexfp_mult = Perl_pow(2.0, hexfp_exp);
11067 if (n > 4294967295.0)
11068 Perl_ck_warner(aTHX_ packWARN(WARN_PORTABLE),
11069 "%s number > %s non-portable",
11075 if (u > 0xffffffff)
11076 Perl_ck_warner(aTHX_ packWARN(WARN_PORTABLE),
11077 "%s number > %s non-portable",
11082 if (just_zero && (PL_hints & HINT_NEW_INTEGER))
11083 sv = new_constant(start, s - start, "integer",
11084 sv, NULL, NULL, 0);
11085 else if (PL_hints & HINT_NEW_BINARY)
11086 sv = new_constant(start, s - start, "binary", sv, NULL, NULL, 0);
11091 handle decimal numbers.
11092 we're also sent here when we read a 0 as the first digit
11094 case '1': case '2': case '3': case '4': case '5':
11095 case '6': case '7': case '8': case '9': case '.':
11098 e = PL_tokenbuf + sizeof PL_tokenbuf - 6; /* room for various punctuation */
11107 /* read next group of digits and _ and copy into d */
11110 || UNLIKELY(hexfp && isXDIGIT(*s)))
11112 /* skip underscores, checking for misplaced ones
11116 if (lastub && s == lastub + 1)
11117 WARN_ABOUT_UNDERSCORE();
11121 /* check for end of fixed-length buffer */
11123 Perl_croak(aTHX_ "%s", number_too_long);
11124 /* if we're ok, copy the character */
11129 /* final misplaced underbar check */
11130 if (lastub && s == lastub + 1)
11131 WARN_ABOUT_UNDERSCORE();
11133 /* read a decimal portion if there is one. avoid
11134 3..5 being interpreted as the number 3. followed
11137 if (*s == '.' && s[1] != '.') {
11142 WARN_ABOUT_UNDERSCORE();
11146 /* copy, ignoring underbars, until we run out of digits.
11150 || UNLIKELY(hexfp && isXDIGIT(*s));
11153 /* fixed length buffer check */
11155 Perl_croak(aTHX_ "%s", number_too_long);
11157 if (lastub && s == lastub + 1)
11158 WARN_ABOUT_UNDERSCORE();
11164 /* fractional part ending in underbar? */
11166 WARN_ABOUT_UNDERSCORE();
11167 if (*s == '.' && isDIGIT(s[1])) {
11168 /* oops, it's really a v-string, but without the "v" */
11174 /* read exponent part, if present */
11175 if ((isALPHA_FOLD_EQ(*s, 'e')
11176 || UNLIKELY(hexfp && isALPHA_FOLD_EQ(*s, 'p')))
11177 && strchr("+-0123456789_", s[1]))
11181 /* regardless of whether user said 3E5 or 3e5, use lower 'e',
11182 ditto for p (hexfloats) */
11183 if ((isALPHA_FOLD_EQ(*s, 'e'))) {
11184 /* At least some Mach atof()s don't grok 'E' */
11187 else if (UNLIKELY(hexfp && (isALPHA_FOLD_EQ(*s, 'p')))) {
11194 /* stray preinitial _ */
11196 WARN_ABOUT_UNDERSCORE();
11200 /* allow positive or negative exponent */
11201 if (*s == '+' || *s == '-')
11204 /* stray initial _ */
11206 WARN_ABOUT_UNDERSCORE();
11210 /* read digits of exponent */
11211 while (isDIGIT(*s) || *s == '_') {
11214 Perl_croak(aTHX_ "%s", number_too_long);
11218 if (((lastub && s == lastub + 1)
11219 || (!isDIGIT(s[1]) && s[1] != '_')))
11220 WARN_ABOUT_UNDERSCORE();
11228 We try to do an integer conversion first if no characters
11229 indicating "float" have been found.
11234 const int flags = grok_number (PL_tokenbuf, d - PL_tokenbuf, &uv);
11236 if (flags == IS_NUMBER_IN_UV) {
11238 sv = newSViv(uv); /* Prefer IVs over UVs. */
11241 } else if (flags == (IS_NUMBER_IN_UV | IS_NUMBER_NEG)) {
11242 if (uv <= (UV) IV_MIN)
11243 sv = newSViv(-(IV)uv);
11250 STORE_LC_NUMERIC_UNDERLYING_SET_STANDARD();
11251 /* terminate the string */
11253 if (UNLIKELY(hexfp)) {
11254 # ifdef NV_MANT_DIG
11255 if (significant_bits > NV_MANT_DIG)
11256 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
11257 "Hexadecimal float: mantissa overflow");
11260 nv = hexfp_uquad * hexfp_mult;
11261 #else /* HEXFP_NV */
11262 nv = hexfp_nv * hexfp_mult;
11265 nv = Atof(PL_tokenbuf);
11267 RESTORE_LC_NUMERIC_UNDERLYING();
11272 ? (PL_hints & HINT_NEW_FLOAT) : (PL_hints & HINT_NEW_INTEGER) ) {
11273 const char *const key = floatit ? "float" : "integer";
11274 const STRLEN keylen = floatit ? 5 : 7;
11275 sv = S_new_constant(aTHX_ PL_tokenbuf, d - PL_tokenbuf,
11276 key, keylen, sv, NULL, NULL, 0);
11280 /* if it starts with a v, it could be a v-string */
11283 sv = newSV(5); /* preallocate storage space */
11284 ENTER_with_name("scan_vstring");
11286 s = scan_vstring(s, PL_bufend, sv);
11287 SvREFCNT_inc_simple_void_NN(sv);
11288 LEAVE_with_name("scan_vstring");
11292 /* make the op for the constant and return */
11295 lvalp->opval = newSVOP(OP_CONST, 0, sv);
11297 lvalp->opval = NULL;
11303 S_scan_formline(pTHX_ char *s)
11305 SV * const stuff = newSVpvs("");
11306 bool needargs = FALSE;
11307 bool eofmt = FALSE;
11309 PERL_ARGS_ASSERT_SCAN_FORMLINE;
11311 while (!needargs) {
11315 #ifdef PERL_STRICT_CR
11316 while (SPACE_OR_TAB(*t))
11319 while (SPACE_OR_TAB(*t) || *t == '\r')
11322 if (*t == '\n' || t == PL_bufend) {
11327 eol = (char *) memchr(s,'\n',PL_bufend-s);
11332 for (t = s; t < eol; t++) {
11333 if (*t == '~' && t[1] == '~' && SvCUR(stuff)) {
11335 goto enough; /* ~~ must be first line in formline */
11337 if (*t == '@' || *t == '^')
11341 sv_catpvn(stuff, s, eol-s);
11342 #ifndef PERL_STRICT_CR
11343 if (eol-s > 1 && eol[-2] == '\r' && eol[-1] == '\n') {
11344 char *end = SvPVX(stuff) + SvCUR(stuff);
11347 SvCUR_set(stuff, SvCUR(stuff) - 1);
11355 if ((PL_rsfp || PL_parser->filtered)
11356 && PL_parser->form_lex_state == LEX_NORMAL) {
11358 PL_bufptr = PL_bufend;
11359 COPLINE_INC_WITH_HERELINES;
11360 got_some = lex_next_chunk(0);
11361 CopLINE_dec(PL_curcop);
11369 if (!SvCUR(stuff) || needargs)
11370 PL_lex_state = PL_parser->form_lex_state;
11371 if (SvCUR(stuff)) {
11372 PL_expect = XSTATE;
11374 const char *s2 = s;
11375 while (isSPACE(*s2) && *s2 != '\n')
11378 PL_expect = XTERMBLOCK;
11379 NEXTVAL_NEXTTOKE.ival = 0;
11382 NEXTVAL_NEXTTOKE.ival = 0;
11383 force_next(FORMLBRACK);
11386 if (UTF && is_utf8_string((U8*)SvPVX_const(stuff), SvCUR(stuff)))
11389 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0, stuff);
11393 SvREFCNT_dec(stuff);
11395 PL_lex_formbrack = 0;
11401 Perl_start_subparse(pTHX_ I32 is_format, U32 flags)
11403 const I32 oldsavestack_ix = PL_savestack_ix;
11404 CV* const outsidecv = PL_compcv;
11406 SAVEI32(PL_subline);
11407 save_item(PL_subname);
11408 SAVESPTR(PL_compcv);
11410 PL_compcv = MUTABLE_CV(newSV_type(is_format ? SVt_PVFM : SVt_PVCV));
11411 CvFLAGS(PL_compcv) |= flags;
11413 PL_subline = CopLINE(PL_curcop);
11414 CvPADLIST(PL_compcv) = pad_new(padnew_SAVE|padnew_SAVESUB);
11415 CvOUTSIDE(PL_compcv) = MUTABLE_CV(SvREFCNT_inc_simple(outsidecv));
11416 CvOUTSIDE_SEQ(PL_compcv) = PL_cop_seqmax;
11417 if (outsidecv && CvPADLIST(outsidecv))
11418 CvPADLIST(PL_compcv)->xpadl_outid = CvPADLIST(outsidecv)->xpadl_id;
11420 return oldsavestack_ix;
11424 S_yywarn(pTHX_ const char *const s, U32 flags)
11426 PERL_ARGS_ASSERT_YYWARN;
11428 PL_in_eval |= EVAL_WARNONLY;
11429 yyerror_pv(s, flags);
11434 Perl_abort_execution(pTHX_ const char * const msg, const char * const name)
11436 PERL_ARGS_ASSERT_ABORT_EXECUTION;
11439 Perl_croak(aTHX_ "%s%s had compilation errors.\n", msg, name);
11442 "%sExecution of %s aborted due to compilation errors.\n", msg, name);
11444 NOT_REACHED; /* NOTREACHED */
11450 /* Called, after at least one error has been found, to abort the parse now,
11451 * instead of trying to forge ahead */
11453 yyerror_pvn(NULL, 0, 0);
11457 Perl_yyerror(pTHX_ const char *const s)
11459 PERL_ARGS_ASSERT_YYERROR;
11460 return yyerror_pvn(s, strlen(s), 0);
11464 Perl_yyerror_pv(pTHX_ const char *const s, U32 flags)
11466 PERL_ARGS_ASSERT_YYERROR_PV;
11467 return yyerror_pvn(s, strlen(s), flags);
11471 Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags)
11473 const char *context = NULL;
11476 SV * const where_sv = newSVpvs_flags("", SVs_TEMP);
11477 int yychar = PL_parser->yychar;
11479 /* Output error message 's' with length 'len'. 'flags' are SV flags that
11480 * apply. If the number of errors found is large enough, it abandons
11481 * parsing. If 's' is NULL, there is no message, and it abandons
11482 * processing unconditionally */
11485 if (!yychar || (yychar == ';' && !PL_rsfp))
11486 sv_catpvs(where_sv, "at EOF");
11487 else if ( PL_oldoldbufptr
11488 && PL_bufptr > PL_oldoldbufptr
11489 && PL_bufptr - PL_oldoldbufptr < 200
11490 && PL_oldoldbufptr != PL_oldbufptr
11491 && PL_oldbufptr != PL_bufptr)
11495 The code below is removed for NetWare because it
11496 abends/crashes on NetWare when the script has error such as
11497 not having the closing quotes like:
11498 if ($var eq "value)
11499 Checking of white spaces is anyway done in NetWare code.
11502 while (isSPACE(*PL_oldoldbufptr))
11505 context = PL_oldoldbufptr;
11506 contlen = PL_bufptr - PL_oldoldbufptr;
11508 else if ( PL_oldbufptr
11509 && PL_bufptr > PL_oldbufptr
11510 && PL_bufptr - PL_oldbufptr < 200
11511 && PL_oldbufptr != PL_bufptr) {
11514 The code below is removed for NetWare because it
11515 abends/crashes on NetWare when the script has error such as
11516 not having the closing quotes like:
11517 if ($var eq "value)
11518 Checking of white spaces is anyway done in NetWare code.
11521 while (isSPACE(*PL_oldbufptr))
11524 context = PL_oldbufptr;
11525 contlen = PL_bufptr - PL_oldbufptr;
11527 else if (yychar > 255)
11528 sv_catpvs(where_sv, "next token ???");
11529 else if (yychar == YYEMPTY) {
11530 if (PL_lex_state == LEX_NORMAL)
11531 sv_catpvs(where_sv, "at end of line");
11532 else if (PL_lex_inpat)
11533 sv_catpvs(where_sv, "within pattern");
11535 sv_catpvs(where_sv, "within string");
11538 sv_catpvs(where_sv, "next char ");
11540 Perl_sv_catpvf(aTHX_ where_sv, "^%c", toCTRL(yychar));
11541 else if (isPRINT_LC(yychar)) {
11542 const char string = yychar;
11543 sv_catpvn(where_sv, &string, 1);
11546 Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255);
11548 msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP);
11549 Perl_sv_catpvf(aTHX_ msg, " at %s line %" IVdf ", ",
11550 OutCopFILE(PL_curcop),
11551 (IV)(PL_parser->preambling == NOLINE
11552 ? CopLINE(PL_curcop)
11553 : PL_parser->preambling));
11555 Perl_sv_catpvf(aTHX_ msg, "near \"%" UTF8f "\"\n",
11556 UTF8fARG(UTF, contlen, context));
11558 Perl_sv_catpvf(aTHX_ msg, "%" SVf "\n", SVfARG(where_sv));
11559 if ( PL_multi_start < PL_multi_end
11560 && (U32)(CopLINE(PL_curcop) - PL_multi_end) <= 1)
11562 Perl_sv_catpvf(aTHX_ msg,
11563 " (Might be a runaway multi-line %c%c string starting on"
11564 " line %" IVdf ")\n",
11565 (int)PL_multi_open,(int)PL_multi_close,(IV)PL_multi_start);
11568 if (PL_in_eval & EVAL_WARNONLY) {
11569 PL_in_eval &= ~EVAL_WARNONLY;
11570 Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%" SVf, SVfARG(msg));
11576 if (s == NULL || PL_error_count >= 10) {
11577 const char * msg = "";
11578 const char * const name = OutCopFILE(PL_curcop);
11581 SV * errsv = ERRSV;
11582 if (SvCUR(errsv)) {
11583 msg = Perl_form(aTHX_ "%" SVf, SVfARG(errsv));
11588 abort_execution(msg, name);
11591 Perl_croak(aTHX_ "%s%s has too many errors.\n", msg, name);
11595 PL_in_my_stash = NULL;
11600 S_swallow_bom(pTHX_ U8 *s)
11602 const STRLEN slen = SvCUR(PL_linestr);
11604 PERL_ARGS_ASSERT_SWALLOW_BOM;
11608 if (s[1] == 0xFE) {
11609 /* UTF-16 little-endian? (or UTF-32LE?) */
11610 if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */
11611 /* diag_listed_as: Unsupported script encoding %s */
11612 Perl_croak(aTHX_ "Unsupported script encoding UTF-32LE");
11613 #ifndef PERL_NO_UTF16_FILTER
11614 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (BOM)\n");
11616 if (PL_bufend > (char*)s) {
11617 s = add_utf16_textfilter(s, TRUE);
11620 /* diag_listed_as: Unsupported script encoding %s */
11621 Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE");
11626 if (s[1] == 0xFF) { /* UTF-16 big-endian? */
11627 #ifndef PERL_NO_UTF16_FILTER
11628 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n");
11630 if (PL_bufend > (char *)s) {
11631 s = add_utf16_textfilter(s, FALSE);
11634 /* diag_listed_as: Unsupported script encoding %s */
11635 Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE");
11639 case BOM_UTF8_FIRST_BYTE: {
11640 const STRLEN len = sizeof(BOM_UTF8_TAIL) - 1; /* Exclude trailing NUL */
11641 if (slen > len && memEQ(s+1, BOM_UTF8_TAIL, len)) {
11642 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n");
11643 s += len + 1; /* UTF-8 */
11650 if (s[2] == 0xFE && s[3] == 0xFF) {
11651 /* UTF-32 big-endian */
11652 /* diag_listed_as: Unsupported script encoding %s */
11653 Perl_croak(aTHX_ "Unsupported script encoding UTF-32BE");
11656 else if (s[2] == 0 && s[3] != 0) {
11659 * are a good indicator of UTF-16BE. */
11660 #ifndef PERL_NO_UTF16_FILTER
11661 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n");
11662 s = add_utf16_textfilter(s, FALSE);
11664 /* diag_listed_as: Unsupported script encoding %s */
11665 Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE");
11672 if (slen > 3 && s[1] == 0 && s[2] != 0 && s[3] == 0) {
11675 * are a good indicator of UTF-16LE. */
11676 #ifndef PERL_NO_UTF16_FILTER
11677 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n");
11678 s = add_utf16_textfilter(s, TRUE);
11680 /* diag_listed_as: Unsupported script encoding %s */
11681 Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE");
11689 #ifndef PERL_NO_UTF16_FILTER
11691 S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
11693 SV *const filter = FILTER_DATA(idx);
11694 /* We re-use this each time round, throwing the contents away before we
11696 SV *const utf16_buffer = MUTABLE_SV(IoTOP_GV(filter));
11697 SV *const utf8_buffer = filter;
11698 IV status = IoPAGE(filter);
11699 const bool reverse = cBOOL(IoLINES(filter));
11702 PERL_ARGS_ASSERT_UTF16_TEXTFILTER;
11704 /* As we're automatically added, at the lowest level, and hence only called
11705 from this file, we can be sure that we're not called in block mode. Hence
11706 don't bother writing code to deal with block mode. */
11708 Perl_croak(aTHX_ "panic: utf16_textfilter called in block mode (for %d characters)", maxlen);
11711 Perl_croak(aTHX_ "panic: utf16_textfilter called after error (status=%" IVdf ")", status);
11713 DEBUG_P(PerlIO_printf(Perl_debug_log,
11714 "utf16_textfilter(%p,%ce): idx=%d maxlen=%d status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n",
11715 FPTR2DPTR(void *, S_utf16_textfilter),
11716 reverse ? 'l' : 'b', idx, maxlen, status,
11717 (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer)));
11724 /* First, look in our buffer of existing UTF-8 data: */
11725 char *nl = (char *)memchr(SvPVX(utf8_buffer), '\n', SvCUR(utf8_buffer));
11729 } else if (status == 0) {
11731 IoPAGE(filter) = 0;
11732 nl = SvEND(utf8_buffer);
11735 STRLEN got = nl - SvPVX(utf8_buffer);
11736 /* Did we have anything to append? */
11738 sv_catpvn(sv, SvPVX(utf8_buffer), got);
11739 /* Everything else in this code works just fine if SVp_POK isn't
11740 set. This, however, needs it, and we need it to work, else
11741 we loop infinitely because the buffer is never consumed. */
11742 sv_chop(utf8_buffer, nl);
11746 /* OK, not a complete line there, so need to read some more UTF-16.
11747 Read an extra octect if the buffer currently has an odd number. */
11751 if (SvCUR(utf16_buffer) >= 2) {
11752 /* Location of the high octet of the last complete code point.
11753 Gosh, UTF-16 is a pain. All the benefits of variable length,
11754 *coupled* with all the benefits of partial reads and
11756 const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
11757 + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
11759 if (*last_hi < 0xd8 || *last_hi > 0xdb) {
11763 /* We have the first half of a surrogate. Read more. */
11764 DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
11767 status = FILTER_READ(idx + 1, utf16_buffer,
11768 160 + (SvCUR(utf16_buffer) & 1));
11769 DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%" IVdf " SvCUR(sv)=%" UVuf "\n", status, (UV)SvCUR(utf16_buffer)));
11770 DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
11773 IoPAGE(filter) = status;
11778 chars = SvCUR(utf16_buffer) >> 1;
11779 have = SvCUR(utf8_buffer);
11780 SvGROW(utf8_buffer, have + chars * 3 + 1);
11783 end = utf16_to_utf8_reversed((U8*)SvPVX(utf16_buffer),
11784 (U8*)SvPVX_const(utf8_buffer) + have,
11785 chars * 2, &newlen);
11787 end = utf16_to_utf8((U8*)SvPVX(utf16_buffer),
11788 (U8*)SvPVX_const(utf8_buffer) + have,
11789 chars * 2, &newlen);
11791 SvCUR_set(utf8_buffer, have + newlen);
11794 /* No need to keep this SV "well-formed" with a '\0' after the end, as
11795 it's private to us, and utf16_to_utf8{,reversed} take a
11796 (pointer,length) pair, rather than a NUL-terminated string. */
11797 if(SvCUR(utf16_buffer) & 1) {
11798 *SvPVX(utf16_buffer) = SvEND(utf16_buffer)[-1];
11799 SvCUR_set(utf16_buffer, 1);
11801 SvCUR_set(utf16_buffer, 0);
11804 DEBUG_P(PerlIO_printf(Perl_debug_log,
11805 "utf16_textfilter: returns, status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n",
11807 (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer)));
11808 DEBUG_P({ sv_dump(utf8_buffer); sv_dump(sv);});
11813 S_add_utf16_textfilter(pTHX_ U8 *const s, bool reversed)
11815 SV *filter = filter_add(S_utf16_textfilter, NULL);
11817 PERL_ARGS_ASSERT_ADD_UTF16_TEXTFILTER;
11819 IoTOP_GV(filter) = MUTABLE_GV(newSVpvn((char *)s, PL_bufend - (char*)s));
11821 IoLINES(filter) = reversed;
11822 IoPAGE(filter) = 1; /* Not EOF */
11824 /* Sadly, we have to return a valid pointer, come what may, so we have to
11825 ignore any error return from this. */
11826 SvCUR_set(PL_linestr, 0);
11827 if (FILTER_READ(0, PL_linestr, 0)) {
11828 SvUTF8_on(PL_linestr);
11830 SvUTF8_on(PL_linestr);
11832 PL_bufend = SvEND(PL_linestr);
11833 return (U8*)SvPVX(PL_linestr);
11838 Returns a pointer to the next character after the parsed
11839 vstring, as well as updating the passed in sv.
11841 Function must be called like
11843 sv = sv_2mortal(newSV(5));
11844 s = scan_vstring(s,e,sv);
11846 where s and e are the start and end of the string.
11847 The sv should already be large enough to store the vstring
11848 passed in, for performance reasons.
11850 This function may croak if fatal warnings are enabled in the
11851 calling scope, hence the sv_2mortal in the example (to prevent
11852 a leak). Make sure to do SvREFCNT_inc afterwards if you use
11858 Perl_scan_vstring(pTHX_ const char *s, const char *const e, SV *sv)
11860 const char *pos = s;
11861 const char *start = s;
11863 PERL_ARGS_ASSERT_SCAN_VSTRING;
11865 if (*pos == 'v') pos++; /* get past 'v' */
11866 while (pos < e && (isDIGIT(*pos) || *pos == '_'))
11868 if ( *pos != '.') {
11869 /* this may not be a v-string if followed by => */
11870 const char *next = pos;
11871 while (next < e && isSPACE(*next))
11873 if ((e - next) >= 2 && *next == '=' && next[1] == '>' ) {
11874 /* return string not v-string */
11875 sv_setpvn(sv,(char *)s,pos-s);
11876 return (char *)pos;
11880 if (!isALPHA(*pos)) {
11881 U8 tmpbuf[UTF8_MAXBYTES+1];
11884 s++; /* get past 'v' */
11889 /* this is atoi() that tolerates underscores */
11892 const char *end = pos;
11894 while (--end >= s) {
11896 const UV orev = rev;
11897 rev += (*end - '0') * mult;
11900 /* diag_listed_as: Integer overflow in %s number */
11901 Perl_ck_warner_d(aTHX_ packWARN(WARN_OVERFLOW),
11902 "Integer overflow in decimal number");
11906 /* Append native character for the rev point */
11907 tmpend = uvchr_to_utf8(tmpbuf, rev);
11908 sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
11909 if (!UVCHR_IS_INVARIANT(rev))
11911 if (pos + 1 < e && *pos == '.' && isDIGIT(pos[1]))
11917 while (pos < e && (isDIGIT(*pos) || *pos == '_'))
11921 sv_magic(sv,NULL,PERL_MAGIC_vstring,(const char*)start, pos-start);
11928 Perl_keyword_plugin_standard(pTHX_
11929 char *keyword_ptr, STRLEN keyword_len, OP **op_ptr)
11931 PERL_ARGS_ASSERT_KEYWORD_PLUGIN_STANDARD;
11932 PERL_UNUSED_CONTEXT;
11933 PERL_UNUSED_ARG(keyword_ptr);
11934 PERL_UNUSED_ARG(keyword_len);
11935 PERL_UNUSED_ARG(op_ptr);
11936 return KEYWORD_PLUGIN_DECLINE;
11939 #define parse_recdescent(g,p) S_parse_recdescent(aTHX_ g,p)
11941 S_parse_recdescent(pTHX_ int gramtype, I32 fakeeof)
11943 SAVEI32(PL_lex_brackets);
11944 if (PL_lex_brackets > 100)
11945 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
11946 PL_lex_brackstack[PL_lex_brackets++] = XFAKEEOF;
11947 SAVEI32(PL_lex_allbrackets);
11948 PL_lex_allbrackets = 0;
11949 SAVEI8(PL_lex_fakeeof);
11950 PL_lex_fakeeof = (U8)fakeeof;
11951 if(yyparse(gramtype) && !PL_parser->error_count)
11952 qerror(Perl_mess(aTHX_ "Parse error"));
11955 #define parse_recdescent_for_op(g,p) S_parse_recdescent_for_op(aTHX_ g,p)
11957 S_parse_recdescent_for_op(pTHX_ int gramtype, I32 fakeeof)
11961 SAVEVPTR(PL_eval_root);
11962 PL_eval_root = NULL;
11963 parse_recdescent(gramtype, fakeeof);
11969 #define parse_expr(p,f) S_parse_expr(aTHX_ p,f)
11971 S_parse_expr(pTHX_ I32 fakeeof, U32 flags)
11974 if (flags & ~PARSE_OPTIONAL)
11975 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_expr");
11976 exprop = parse_recdescent_for_op(GRAMEXPR, fakeeof);
11977 if (!exprop && !(flags & PARSE_OPTIONAL)) {
11978 if (!PL_parser->error_count)
11979 qerror(Perl_mess(aTHX_ "Parse error"));
11980 exprop = newOP(OP_NULL, 0);
11986 =for apidoc Amx|OP *|parse_arithexpr|U32 flags
11988 Parse a Perl arithmetic expression. This may contain operators of precedence
11989 down to the bit shift operators. The expression must be followed (and thus
11990 terminated) either by a comparison or lower-precedence operator or by
11991 something that would normally terminate an expression such as semicolon.
11992 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
11993 otherwise it is mandatory. It is up to the caller to ensure that the
11994 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
11995 the source of the code to be parsed and the lexical context for the
11998 The op tree representing the expression is returned. If an optional
11999 expression is absent, a null pointer is returned, otherwise the pointer
12002 If an error occurs in parsing or compilation, in most cases a valid op
12003 tree is returned anyway. The error is reflected in the parser state,
12004 normally resulting in a single exception at the top level of parsing
12005 which covers all the compilation errors that occurred. Some compilation
12006 errors, however, will throw an exception immediately.
12012 Perl_parse_arithexpr(pTHX_ U32 flags)
12014 return parse_expr(LEX_FAKEEOF_COMPARE, flags);
12018 =for apidoc Amx|OP *|parse_termexpr|U32 flags
12020 Parse a Perl term expression. This may contain operators of precedence
12021 down to the assignment operators. The expression must be followed (and thus
12022 terminated) either by a comma or lower-precedence operator or by
12023 something that would normally terminate an expression such as semicolon.
12024 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
12025 otherwise it is mandatory. It is up to the caller to ensure that the
12026 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
12027 the source of the code to be parsed and the lexical context for the
12030 The op tree representing the expression is returned. If an optional
12031 expression is absent, a null pointer is returned, otherwise the pointer
12034 If an error occurs in parsing or compilation, in most cases a valid op
12035 tree is returned anyway. The error is reflected in the parser state,
12036 normally resulting in a single exception at the top level of parsing
12037 which covers all the compilation errors that occurred. Some compilation
12038 errors, however, will throw an exception immediately.
12044 Perl_parse_termexpr(pTHX_ U32 flags)
12046 return parse_expr(LEX_FAKEEOF_COMMA, flags);
12050 =for apidoc Amx|OP *|parse_listexpr|U32 flags
12052 Parse a Perl list expression. This may contain operators of precedence
12053 down to the comma operator. The expression must be followed (and thus
12054 terminated) either by a low-precedence logic operator such as C<or> or by
12055 something that would normally terminate an expression such as semicolon.
12056 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
12057 otherwise it is mandatory. It is up to the caller to ensure that the
12058 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
12059 the source of the code to be parsed and the lexical context for the
12062 The op tree representing the expression is returned. If an optional
12063 expression is absent, a null pointer is returned, otherwise the pointer
12066 If an error occurs in parsing or compilation, in most cases a valid op
12067 tree is returned anyway. The error is reflected in the parser state,
12068 normally resulting in a single exception at the top level of parsing
12069 which covers all the compilation errors that occurred. Some compilation
12070 errors, however, will throw an exception immediately.
12076 Perl_parse_listexpr(pTHX_ U32 flags)
12078 return parse_expr(LEX_FAKEEOF_LOWLOGIC, flags);
12082 =for apidoc Amx|OP *|parse_fullexpr|U32 flags
12084 Parse a single complete Perl expression. This allows the full
12085 expression grammar, including the lowest-precedence operators such
12086 as C<or>. The expression must be followed (and thus terminated) by a
12087 token that an expression would normally be terminated by: end-of-file,
12088 closing bracketing punctuation, semicolon, or one of the keywords that
12089 signals a postfix expression-statement modifier. If C<flags> has the
12090 C<PARSE_OPTIONAL> bit set, then the expression is optional, otherwise it is
12091 mandatory. It is up to the caller to ensure that the dynamic parser
12092 state (L</PL_parser> et al) is correctly set to reflect the source of
12093 the code to be parsed and the lexical context for the expression.
12095 The op tree representing the expression is returned. If an optional
12096 expression is absent, a null pointer is returned, otherwise the pointer
12099 If an error occurs in parsing or compilation, in most cases a valid op
12100 tree is returned anyway. The error is reflected in the parser state,
12101 normally resulting in a single exception at the top level of parsing
12102 which covers all the compilation errors that occurred. Some compilation
12103 errors, however, will throw an exception immediately.
12109 Perl_parse_fullexpr(pTHX_ U32 flags)
12111 return parse_expr(LEX_FAKEEOF_NONEXPR, flags);
12115 =for apidoc Amx|OP *|parse_block|U32 flags
12117 Parse a single complete Perl code block. This consists of an opening
12118 brace, a sequence of statements, and a closing brace. The block
12119 constitutes a lexical scope, so C<my> variables and various compile-time
12120 effects can be contained within it. It is up to the caller to ensure
12121 that the dynamic parser state (L</PL_parser> et al) is correctly set to
12122 reflect the source of the code to be parsed and the lexical context for
12125 The op tree representing the code block is returned. This is always a
12126 real op, never a null pointer. It will normally be a C<lineseq> list,
12127 including C<nextstate> or equivalent ops. No ops to construct any kind
12128 of runtime scope are included by virtue of it being a block.
12130 If an error occurs in parsing or compilation, in most cases a valid op
12131 tree (most likely null) is returned anyway. The error is reflected in
12132 the parser state, normally resulting in a single exception at the top
12133 level of parsing which covers all the compilation errors that occurred.
12134 Some compilation errors, however, will throw an exception immediately.
12136 The C<flags> parameter is reserved for future use, and must always
12143 Perl_parse_block(pTHX_ U32 flags)
12146 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_block");
12147 return parse_recdescent_for_op(GRAMBLOCK, LEX_FAKEEOF_NEVER);
12151 =for apidoc Amx|OP *|parse_barestmt|U32 flags
12153 Parse a single unadorned Perl statement. This may be a normal imperative
12154 statement or a declaration that has compile-time effect. It does not
12155 include any label or other affixture. It is up to the caller to ensure
12156 that the dynamic parser state (L</PL_parser> et al) is correctly set to
12157 reflect the source of the code to be parsed and the lexical context for
12160 The op tree representing the statement is returned. This may be a
12161 null pointer if the statement is null, for example if it was actually
12162 a subroutine definition (which has compile-time side effects). If not
12163 null, it will be ops directly implementing the statement, suitable to
12164 pass to L</newSTATEOP>. It will not normally include a C<nextstate> or
12165 equivalent op (except for those embedded in a scope contained entirely
12166 within the statement).
12168 If an error occurs in parsing or compilation, in most cases a valid op
12169 tree (most likely null) is returned anyway. The error is reflected in
12170 the parser state, normally resulting in a single exception at the top
12171 level of parsing which covers all the compilation errors that occurred.
12172 Some compilation errors, however, will throw an exception immediately.
12174 The C<flags> parameter is reserved for future use, and must always
12181 Perl_parse_barestmt(pTHX_ U32 flags)
12184 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_barestmt");
12185 return parse_recdescent_for_op(GRAMBARESTMT, LEX_FAKEEOF_NEVER);
12189 =for apidoc Amx|SV *|parse_label|U32 flags
12191 Parse a single label, possibly optional, of the type that may prefix a
12192 Perl statement. It is up to the caller to ensure that the dynamic parser
12193 state (L</PL_parser> et al) is correctly set to reflect the source of
12194 the code to be parsed. If C<flags> has the C<PARSE_OPTIONAL> bit set, then the
12195 label is optional, otherwise it is mandatory.
12197 The name of the label is returned in the form of a fresh scalar. If an
12198 optional label is absent, a null pointer is returned.
12200 If an error occurs in parsing, which can only occur if the label is
12201 mandatory, a valid label is returned anyway. The error is reflected in
12202 the parser state, normally resulting in a single exception at the top
12203 level of parsing which covers all the compilation errors that occurred.
12209 Perl_parse_label(pTHX_ U32 flags)
12211 if (flags & ~PARSE_OPTIONAL)
12212 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_label");
12214 PL_parser->yychar = yylex();
12215 if (PL_parser->yychar == LABEL) {
12216 char * const lpv = pl_yylval.pval;
12217 STRLEN llen = strlen(lpv);
12218 PL_parser->yychar = YYEMPTY;
12219 return newSVpvn_flags(lpv, llen, lpv[llen+1] ? SVf_UTF8 : 0);
12226 STRLEN wlen, bufptr_pos;
12229 if (!isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
12231 t = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &wlen);
12232 if (word_takes_any_delimiter(s, wlen))
12234 bufptr_pos = s - SvPVX(PL_linestr);
12236 lex_read_space(LEX_KEEP_PREVIOUS);
12238 s = SvPVX(PL_linestr) + bufptr_pos;
12239 if (t[0] == ':' && t[1] != ':') {
12240 PL_oldoldbufptr = PL_oldbufptr;
12243 return newSVpvn_flags(s, wlen, UTF ? SVf_UTF8 : 0);
12247 if (flags & PARSE_OPTIONAL) {
12250 qerror(Perl_mess(aTHX_ "Parse error"));
12251 return newSVpvs("x");
12258 =for apidoc Amx|OP *|parse_fullstmt|U32 flags
12260 Parse a single complete Perl statement. This may be a normal imperative
12261 statement or a declaration that has compile-time effect, and may include
12262 optional labels. It is up to the caller to ensure that the dynamic
12263 parser state (L</PL_parser> et al) is correctly set to reflect the source
12264 of the code to be parsed and the lexical context for the statement.
12266 The op tree representing the statement is returned. This may be a
12267 null pointer if the statement is null, for example if it was actually
12268 a subroutine definition (which has compile-time side effects). If not
12269 null, it will be the result of a L</newSTATEOP> call, normally including
12270 a C<nextstate> or equivalent op.
12272 If an error occurs in parsing or compilation, in most cases a valid op
12273 tree (most likely null) is returned anyway. The error is reflected in
12274 the parser state, normally resulting in a single exception at the top
12275 level of parsing which covers all the compilation errors that occurred.
12276 Some compilation errors, however, will throw an exception immediately.
12278 The C<flags> parameter is reserved for future use, and must always
12285 Perl_parse_fullstmt(pTHX_ U32 flags)
12288 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_fullstmt");
12289 return parse_recdescent_for_op(GRAMFULLSTMT, LEX_FAKEEOF_NEVER);
12293 =for apidoc Amx|OP *|parse_stmtseq|U32 flags
12295 Parse a sequence of zero or more Perl statements. These may be normal
12296 imperative statements, including optional labels, or declarations
12297 that have compile-time effect, or any mixture thereof. The statement
12298 sequence ends when a closing brace or end-of-file is encountered in a
12299 place where a new statement could have validly started. It is up to
12300 the caller to ensure that the dynamic parser state (L</PL_parser> et al)
12301 is correctly set to reflect the source of the code to be parsed and the
12302 lexical context for the statements.
12304 The op tree representing the statement sequence is returned. This may
12305 be a null pointer if the statements were all null, for example if there
12306 were no statements or if there were only subroutine definitions (which
12307 have compile-time side effects). If not null, it will be a C<lineseq>
12308 list, normally including C<nextstate> or equivalent ops.
12310 If an error occurs in parsing or compilation, in most cases a valid op
12311 tree is returned anyway. The error is reflected in the parser state,
12312 normally resulting in a single exception at the top level of parsing
12313 which covers all the compilation errors that occurred. Some compilation
12314 errors, however, will throw an exception immediately.
12316 The C<flags> parameter is reserved for future use, and must always
12323 Perl_parse_stmtseq(pTHX_ U32 flags)
12328 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_stmtseq");
12329 stmtseqop = parse_recdescent_for_op(GRAMSTMTSEQ, LEX_FAKEEOF_CLOSING);
12330 c = lex_peek_unichar(0);
12331 if (c != -1 && c != /*{*/'}')
12332 qerror(Perl_mess(aTHX_ "Parse error"));
12337 * ex: set ts=8 sts=4 sw=4 et: