3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "dquote_inline.h"
43 #define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
98 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
100 #define XENUMMASK 0x3f
101 #define XFAKEEOF 0x40
102 #define XFAKEBRACK 0x80
104 #ifdef USE_UTF8_SCRIPTS
105 # define UTF cBOOL(!IN_BYTES)
107 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
110 /* The maximum number of characters preceding the unrecognized one to display */
111 #define UNRECOGNIZED_PRECEDE_COUNT 10
113 /* In variables named $^X, these are the legal values for X.
114 * 1999-02-27 mjd-perl-patch@plover.com */
115 #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
117 #define SPACE_OR_TAB(c) isBLANK_A(c)
119 #define HEXFP_PEEK(s) \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
124 /* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
126 * can get by with a single comparison (if the compiler is smart enough).
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
132 /* #define LEX_NOTPARSING 11 is done in perl.h. */
134 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
135 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
140 /* at end of code, eg "$x" followed by: */
141 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146 #define LEX_INTERPCONST 2 /* NOT USED */
147 #define LEX_FORMLINE 1 /* expecting a format line */
151 static const char* const lex_state_names[] = {
166 #include "keywords.h"
168 /* CLINE is a macro that ensures PL_copline has a sane value */
170 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
173 * Convenience functions to return different tokens and prime the
174 * lexer for the next token. They all take an argument.
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
187 * FUN0OP : zero-argument function, with its op created in this file
188 * FUN1 : not used, except for not, which isn't a UNIOP
189 * BOop : bitwise or or xor
191 * BCop : bitwise complement
192 * SHop : shift operator
193 * PWop : power operator
194 * PMop : pattern-matching operator
195 * Aop : addition-level operator
196 * AopNOASSIGN : addition-level operator that is never part of .=
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
199 * Rop : relational operator <= != gt
201 * Also see LOP and lop() below.
204 #ifdef DEBUGGING /* Serve -DT. */
205 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
207 # define REPORT(retval) (retval)
210 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
212 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
213 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
217 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
218 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
222 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
224 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
225 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
226 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
228 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
232 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
233 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
234 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
235 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
236 #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237 #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
239 /* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
244 #define UNI3(f,x,have_x) { \
245 pl_yylval.ival = f; \
246 if (have_x) PL_expect = x; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
251 return REPORT( (int)FUNC1 ); \
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 #define UNI(f) UNI3(f,XTERM,1)
256 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
257 #define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
262 #define UNIBRACK(f) UNI3(f,0,0)
264 /* grandfather return to old style */
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
275 #define COPLINE_INC_WITH_HERELINES \
277 CopLINE_inc(PL_curcop); \
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
282 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284 #define COPLINE_SET_FROM_MULTI_END \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
288 PL_parser->herelines = 0; \
294 /* how to interpret the pl_yylval associated with the token */
298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
303 static struct debug_tokens {
305 enum token_type type;
307 } const debug_tokens[] =
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
382 { 0, TOKENTYPE_NONE, NULL }
385 /* dump the returned token in rv, plus any optional arg in pl_yylval */
388 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
390 PERL_ARGS_ASSERT_TOKEREPORT;
393 const char *name = NULL;
394 enum token_type type = TOKENTYPE_NONE;
395 const struct debug_tokens *p;
396 SV* const report = newSVpvs("<== ");
398 for (p = debug_tokens; p->token; p++) {
399 if (p->token == (int)rv) {
406 Perl_sv_catpv(aTHX_ report, name);
407 else if (isGRAPH(rv))
409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
411 sv_catpvs(report, " (pending identifier)");
414 sv_catpvs(report, "EOF");
416 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
421 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
425 PL_op_name[lvalp->ival]);
428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
430 case TOKENTYPE_OPVAL:
432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
435 Perl_sv_catpvf(aTHX_ report, " %s",
436 SvPEEK(cSVOPx_sv(lvalp->opval)));
441 sv_catpvs(report, "(opval=null)");
444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
450 /* print the buffer with suitable escapes */
453 S_printbuf(pTHX_ const char *const fmt, const char *const s)
455 SV* const tmp = newSVpvs("");
457 PERL_ARGS_ASSERT_PRINTBUF;
459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
470 * This subroutine looks for an '=' next to the operator that has just been
471 * parsed and turns it into an ASSIGNOP if it finds one.
475 S_ao(pTHX_ int toketype)
477 if (*PL_bufptr == '=') {
479 if (toketype == ANDAND)
480 pl_yylval.ival = OP_ANDASSIGN;
481 else if (toketype == OROR)
482 pl_yylval.ival = OP_ORASSIGN;
483 else if (toketype == DORDOR)
484 pl_yylval.ival = OP_DORASSIGN;
487 return REPORT(toketype);
492 * When Perl expects an operator and finds something else, no_op
493 * prints the warning. It always prints "<something> found where
494 * operator expected. It prints "Missing semicolon on previous line?"
495 * if the surprise occurs at the start of the line. "do you need to
496 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
497 * where the compiler doesn't know if foo is a method call or a function.
498 * It prints "Missing operator before end of line" if there's nothing
499 * after the missing operator, or "... before <...>" if there is something
500 * after the missing operator.
502 * PL_bufptr is expected to point to the start of the thing that was found,
503 * and s after the next token or partial token.
507 S_no_op(pTHX_ const char *const what, char *s)
509 char * const oldbp = PL_bufptr;
510 const bool is_first = (PL_oldbufptr == PL_linestart);
512 PERL_ARGS_ASSERT_NO_OP;
518 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
519 if (ckWARN_d(WARN_SYNTAX)) {
521 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
522 "\t(Missing semicolon on previous line?)\n");
523 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
528 for (t = PL_oldoldbufptr;
529 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
530 t += UTF ? UTF8SKIP(t) : 1)
534 if (t < PL_bufptr && isSPACE(*t))
535 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
536 "\t(Do you need to predeclare %" UTF8f "?)\n",
537 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
541 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
542 "\t(Missing operator before %" UTF8f "?)\n",
543 UTF8fARG(UTF, s - oldbp, oldbp));
551 * Complain about missing quote/regexp/heredoc terminator.
552 * If it's called with NULL then it cauterizes the line buffer.
553 * If we're in a delimited string and the delimiter is a control
554 * character, it's reformatted into a two-char sequence like ^C.
559 S_missingterm(pTHX_ char *s)
561 char tmpbuf[UTF8_MAXBYTES + 1];
566 char * const nl = strrchr(s,'\n');
571 else if (PL_multi_close < 32) {
573 tmpbuf[1] = (char)toCTRL(PL_multi_close);
578 if (LIKELY(PL_multi_close < 256)) {
579 *tmpbuf = (char)PL_multi_close;
584 *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
588 q = strchr(s,'"') ? '\'' : '"';
589 sv = sv_2mortal(newSVpv(s,0));
592 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf
593 "%c anywhere before EOF",q,SVfARG(sv),q);
599 * Check whether the named feature is enabled.
602 Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
604 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
606 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
608 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
610 if (namelen > MAX_FEATURE_LEN)
612 memcpy(&he_name[8], name, namelen);
614 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
615 REFCOUNTED_HE_EXISTS));
619 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
620 * utf16-to-utf8-reversed.
623 #ifdef PERL_CR_FILTER
627 const char *s = SvPVX_const(sv);
628 const char * const e = s + SvCUR(sv);
630 PERL_ARGS_ASSERT_STRIP_RETURN;
632 /* outer loop optimized to do nothing if there are no CR-LFs */
634 if (*s++ == '\r' && *s == '\n') {
635 /* hit a CR-LF, need to copy the rest */
639 if (*s == '\r' && s[1] == '\n')
650 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
652 const I32 count = FILTER_READ(idx+1, sv, maxlen);
653 if (count > 0 && !maxlen)
660 =for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
662 Creates and initialises a new lexer/parser state object, supplying
663 a context in which to lex and parse from a new source of Perl code.
664 A pointer to the new state object is placed in L</PL_parser>. An entry
665 is made on the save stack so that upon unwinding, the new state object
666 will be destroyed and the former value of L</PL_parser> will be restored.
667 Nothing else need be done to clean up the parsing context.
669 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
670 non-null, provides a string (in SV form) containing code to be parsed.
671 A copy of the string is made, so subsequent modification of C<line>
672 does not affect parsing. C<rsfp>, if non-null, provides an input stream
673 from which code will be read to be parsed. If both are non-null, the
674 code in C<line> comes first and must consist of complete lines of input,
675 and C<rsfp> supplies the remainder of the source.
677 The C<flags> parameter is reserved for future use. Currently it is only
678 used by perl internally, so extensions should always pass zero.
683 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
684 can share filters with the current parser.
685 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
686 caller, hence isn't owned by the parser, so shouldn't be closed on parser
687 destruction. This is used to handle the case of defaulting to reading the
688 script from the standard input because no filename was given on the command
689 line (without getting confused by situation where STDIN has been closed, so
690 the script handle is opened on fd 0) */
693 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
695 const char *s = NULL;
696 yy_parser *parser, *oparser;
698 if (flags && flags & ~LEX_START_FLAGS)
699 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
701 /* create and initialise a parser */
703 Newxz(parser, 1, yy_parser);
704 parser->old_parser = oparser = PL_parser;
707 parser->stack = NULL;
708 parser->stack_max1 = NULL;
711 /* on scope exit, free this parser and restore any outer one */
713 parser->saved_curcop = PL_curcop;
715 /* initialise lexer state */
717 parser->nexttoke = 0;
718 parser->error_count = oparser ? oparser->error_count : 0;
719 parser->copline = parser->preambling = NOLINE;
720 parser->lex_state = LEX_NORMAL;
721 parser->expect = XSTATE;
723 parser->recheck_utf8_validity = FALSE;
724 parser->rsfp_filters =
725 !(flags & LEX_START_SAME_FILTER) || !oparser
727 : MUTABLE_AV(SvREFCNT_inc(
728 oparser->rsfp_filters
729 ? oparser->rsfp_filters
730 : (oparser->rsfp_filters = newAV())
733 Newx(parser->lex_brackstack, 120, char);
734 Newx(parser->lex_casestack, 12, char);
735 *parser->lex_casestack = '\0';
736 Newxz(parser->lex_shared, 1, LEXSHARED);
740 const U8* first_bad_char_loc;
742 s = SvPV_const(line, len);
745 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
747 &first_bad_char_loc)))
749 _force_out_malformed_utf8_message(first_bad_char_loc,
750 (U8 *) s + SvCUR(line),
752 1 /* 1 means die */ );
753 NOT_REACHED; /* NOTREACHED */
756 parser->linestr = flags & LEX_START_COPIED
757 ? SvREFCNT_inc_simple_NN(line)
758 : newSVpvn_flags(s, len, SvUTF8(line));
760 sv_catpvs(parser->linestr, "\n;");
762 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
765 parser->oldoldbufptr =
768 parser->linestart = SvPVX(parser->linestr);
769 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
770 parser->last_lop = parser->last_uni = NULL;
772 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
773 |LEX_DONT_CLOSE_RSFP));
774 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
775 |LEX_DONT_CLOSE_RSFP));
777 parser->in_pod = parser->filtered = 0;
781 /* delete a parser object */
784 Perl_parser_free(pTHX_ const yy_parser *parser)
786 PERL_ARGS_ASSERT_PARSER_FREE;
788 PL_curcop = parser->saved_curcop;
789 SvREFCNT_dec(parser->linestr);
791 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
792 PerlIO_clearerr(parser->rsfp);
793 else if (parser->rsfp && (!parser->old_parser
794 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
795 PerlIO_close(parser->rsfp);
796 SvREFCNT_dec(parser->rsfp_filters);
797 SvREFCNT_dec(parser->lex_stuff);
798 SvREFCNT_dec(parser->lex_sub_repl);
800 Safefree(parser->lex_brackstack);
801 Safefree(parser->lex_casestack);
802 Safefree(parser->lex_shared);
803 PL_parser = parser->old_parser;
808 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
810 I32 nexttoke = parser->nexttoke;
811 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
813 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
814 && parser->nextval[nexttoke].opval
815 && parser->nextval[nexttoke].opval->op_slabbed
816 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
817 op_free(parser->nextval[nexttoke].opval);
818 parser->nextval[nexttoke].opval = NULL;
825 =for apidoc AmxU|SV *|PL_parser-E<gt>linestr
827 Buffer scalar containing the chunk currently under consideration of the
828 text currently being lexed. This is always a plain string scalar (for
829 which C<SvPOK> is true). It is not intended to be used as a scalar by
830 normal scalar means; instead refer to the buffer directly by the pointer
831 variables described below.
833 The lexer maintains various C<char*> pointers to things in the
834 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
835 reallocated, all of these pointers must be updated. Don't attempt to
836 do this manually, but rather use L</lex_grow_linestr> if you need to
837 reallocate the buffer.
839 The content of the text chunk in the buffer is commonly exactly one
840 complete line of input, up to and including a newline terminator,
841 but there are situations where it is otherwise. The octets of the
842 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
843 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
844 flag on this scalar, which may disagree with it.
846 For direct examination of the buffer, the variable
847 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
848 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
849 of these pointers is usually preferable to examination of the scalar
850 through normal scalar means.
852 =for apidoc AmxU|char *|PL_parser-E<gt>bufend
854 Direct pointer to the end of the chunk of text currently being lexed, the
855 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
856 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
857 always located at the end of the buffer, and does not count as part of
858 the buffer's contents.
860 =for apidoc AmxU|char *|PL_parser-E<gt>bufptr
862 Points to the current position of lexing inside the lexer buffer.
863 Characters around this point may be freely examined, within
864 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
865 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
866 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
868 Lexing code (whether in the Perl core or not) moves this pointer past
869 the characters that it consumes. It is also expected to perform some
870 bookkeeping whenever a newline character is consumed. This movement
871 can be more conveniently performed by the function L</lex_read_to>,
872 which handles newlines appropriately.
874 Interpretation of the buffer's octets can be abstracted out by
875 using the slightly higher-level functions L</lex_peek_unichar> and
876 L</lex_read_unichar>.
878 =for apidoc AmxU|char *|PL_parser-E<gt>linestart
880 Points to the start of the current line inside the lexer buffer.
881 This is useful for indicating at which column an error occurred, and
882 not much else. This must be updated by any lexing code that consumes
883 a newline; the function L</lex_read_to> handles this detail.
889 =for apidoc Amx|bool|lex_bufutf8
891 Indicates whether the octets in the lexer buffer
892 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
893 of Unicode characters. If not, they should be interpreted as Latin-1
894 characters. This is analogous to the C<SvUTF8> flag for scalars.
896 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
897 contains valid UTF-8. Lexing code must be robust in the face of invalid
900 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
901 is significant, but not the whole story regarding the input character
902 encoding. Normally, when a file is being read, the scalar contains octets
903 and its C<SvUTF8> flag is off, but the octets should be interpreted as
904 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
905 however, the scalar may have the C<SvUTF8> flag on, and in this case its
906 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
907 is in effect. This logic may change in the future; use this function
908 instead of implementing the logic yourself.
914 Perl_lex_bufutf8(pTHX)
920 =for apidoc Amx|char *|lex_grow_linestr|STRLEN len
922 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
923 at least C<len> octets (including terminating C<NUL>). Returns a
924 pointer to the reallocated buffer. This is necessary before making
925 any direct modification of the buffer that would increase its length.
926 L</lex_stuff_pvn> provides a more convenient way to insert text into
929 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
930 this function updates all of the lexer's variables that point directly
937 Perl_lex_grow_linestr(pTHX_ STRLEN len)
941 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
942 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
945 linestr = PL_parser->linestr;
946 buf = SvPVX(linestr);
947 if (len <= SvLEN(linestr))
950 /* Is the lex_shared linestr SV the same as the current linestr SV?
951 * Only in this case does re_eval_start need adjusting, since it
952 * points within lex_shared->ls_linestr's buffer */
953 current = ( !PL_parser->lex_shared->ls_linestr
954 || linestr == PL_parser->lex_shared->ls_linestr);
956 bufend_pos = PL_parser->bufend - buf;
957 bufptr_pos = PL_parser->bufptr - buf;
958 oldbufptr_pos = PL_parser->oldbufptr - buf;
959 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
960 linestart_pos = PL_parser->linestart - buf;
961 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
962 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
963 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
964 PL_parser->lex_shared->re_eval_start - buf : 0;
966 buf = sv_grow(linestr, len);
968 PL_parser->bufend = buf + bufend_pos;
969 PL_parser->bufptr = buf + bufptr_pos;
970 PL_parser->oldbufptr = buf + oldbufptr_pos;
971 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
972 PL_parser->linestart = buf + linestart_pos;
973 if (PL_parser->last_uni)
974 PL_parser->last_uni = buf + last_uni_pos;
975 if (PL_parser->last_lop)
976 PL_parser->last_lop = buf + last_lop_pos;
977 if (current && PL_parser->lex_shared->re_eval_start)
978 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
983 =for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
985 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
986 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
987 reallocating the buffer if necessary. This means that lexing code that
988 runs later will see the characters as if they had appeared in the input.
989 It is not recommended to do this as part of normal parsing, and most
990 uses of this facility run the risk of the inserted characters being
991 interpreted in an unintended manner.
993 The string to be inserted is represented by C<len> octets starting
994 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
995 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
996 The characters are recoded for the lexer buffer, according to how the
997 buffer is currently being interpreted (L</lex_bufutf8>). If a string
998 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
999 function is more convenient.
1005 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
1009 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1010 if (flags & ~(LEX_STUFF_UTF8))
1011 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1013 if (flags & LEX_STUFF_UTF8) {
1016 STRLEN highhalf = 0; /* Count of variants */
1017 const char *p, *e = pv+len;
1018 for (p = pv; p != e; p++) {
1019 if (! UTF8_IS_INVARIANT(*p)) {
1025 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1026 bufptr = PL_parser->bufptr;
1027 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1028 SvCUR_set(PL_parser->linestr,
1029 SvCUR(PL_parser->linestr) + len+highhalf);
1030 PL_parser->bufend += len+highhalf;
1031 for (p = pv; p != e; p++) {
1033 if (! UTF8_IS_INVARIANT(c)) {
1034 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1035 *bufptr++ = UTF8_TWO_BYTE_LO(c);
1037 *bufptr++ = (char)c;
1042 if (flags & LEX_STUFF_UTF8) {
1043 STRLEN highhalf = 0;
1044 const char *p, *e = pv+len;
1045 for (p = pv; p != e; p++) {
1047 if (UTF8_IS_ABOVE_LATIN1(c)) {
1048 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1049 "non-Latin-1 character into Latin-1 input");
1050 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1053 } else assert(UTF8_IS_INVARIANT(c));
1057 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1058 bufptr = PL_parser->bufptr;
1059 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1060 SvCUR_set(PL_parser->linestr,
1061 SvCUR(PL_parser->linestr) + len-highhalf);
1062 PL_parser->bufend += len-highhalf;
1065 if (UTF8_IS_INVARIANT(*p)) {
1071 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1077 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1078 bufptr = PL_parser->bufptr;
1079 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1080 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1081 PL_parser->bufend += len;
1082 Copy(pv, bufptr, len, char);
1088 =for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1090 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1091 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1092 reallocating the buffer if necessary. This means that lexing code that
1093 runs later will see the characters as if they had appeared in the input.
1094 It is not recommended to do this as part of normal parsing, and most
1095 uses of this facility run the risk of the inserted characters being
1096 interpreted in an unintended manner.
1098 The string to be inserted is represented by octets starting at C<pv>
1099 and continuing to the first nul. These octets are interpreted as either
1100 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1101 in C<flags>. The characters are recoded for the lexer buffer, according
1102 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1103 If it is not convenient to nul-terminate a string to be inserted, the
1104 L</lex_stuff_pvn> function is more appropriate.
1110 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1112 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1113 lex_stuff_pvn(pv, strlen(pv), flags);
1117 =for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1119 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1120 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1121 reallocating the buffer if necessary. This means that lexing code that
1122 runs later will see the characters as if they had appeared in the input.
1123 It is not recommended to do this as part of normal parsing, and most
1124 uses of this facility run the risk of the inserted characters being
1125 interpreted in an unintended manner.
1127 The string to be inserted is the string value of C<sv>. The characters
1128 are recoded for the lexer buffer, according to how the buffer is currently
1129 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1130 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1131 need to construct a scalar.
1137 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1141 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1143 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1145 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1149 =for apidoc Amx|void|lex_unstuff|char *ptr
1151 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1152 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1153 This hides the discarded text from any lexing code that runs later,
1154 as if the text had never appeared.
1156 This is not the normal way to consume lexed text. For that, use
1163 Perl_lex_unstuff(pTHX_ char *ptr)
1167 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1168 buf = PL_parser->bufptr;
1170 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1173 bufend = PL_parser->bufend;
1175 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1176 unstuff_len = ptr - buf;
1177 Move(ptr, buf, bufend+1-ptr, char);
1178 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1179 PL_parser->bufend = bufend - unstuff_len;
1183 =for apidoc Amx|void|lex_read_to|char *ptr
1185 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1186 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1187 performing the correct bookkeeping whenever a newline character is passed.
1188 This is the normal way to consume lexed text.
1190 Interpretation of the buffer's octets can be abstracted out by
1191 using the slightly higher-level functions L</lex_peek_unichar> and
1192 L</lex_read_unichar>.
1198 Perl_lex_read_to(pTHX_ char *ptr)
1201 PERL_ARGS_ASSERT_LEX_READ_TO;
1202 s = PL_parser->bufptr;
1203 if (ptr < s || ptr > PL_parser->bufend)
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1205 for (; s != ptr; s++)
1207 COPLINE_INC_WITH_HERELINES;
1208 PL_parser->linestart = s+1;
1210 PL_parser->bufptr = ptr;
1214 =for apidoc Amx|void|lex_discard_to|char *ptr
1216 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1217 up to C<ptr>. The remaining content of the buffer will be moved, and
1218 all pointers into the buffer updated appropriately. C<ptr> must not
1219 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1220 it is not permitted to discard text that has yet to be lexed.
1222 Normally it is not necessarily to do this directly, because it suffices to
1223 use the implicit discarding behaviour of L</lex_next_chunk> and things
1224 based on it. However, if a token stretches across multiple lines,
1225 and the lexing code has kept multiple lines of text in the buffer for
1226 that purpose, then after completion of the token it would be wise to
1227 explicitly discard the now-unneeded earlier lines, to avoid future
1228 multi-line tokens growing the buffer without bound.
1234 Perl_lex_discard_to(pTHX_ char *ptr)
1238 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1239 buf = SvPVX(PL_parser->linestr);
1241 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1244 if (ptr > PL_parser->bufptr)
1245 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1246 discard_len = ptr - buf;
1247 if (PL_parser->oldbufptr < ptr)
1248 PL_parser->oldbufptr = ptr;
1249 if (PL_parser->oldoldbufptr < ptr)
1250 PL_parser->oldoldbufptr = ptr;
1251 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1252 PL_parser->last_uni = NULL;
1253 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1254 PL_parser->last_lop = NULL;
1255 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1256 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1257 PL_parser->bufend -= discard_len;
1258 PL_parser->bufptr -= discard_len;
1259 PL_parser->oldbufptr -= discard_len;
1260 PL_parser->oldoldbufptr -= discard_len;
1261 if (PL_parser->last_uni)
1262 PL_parser->last_uni -= discard_len;
1263 if (PL_parser->last_lop)
1264 PL_parser->last_lop -= discard_len;
1268 Perl_notify_parser_that_changed_to_utf8(pTHX)
1270 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1271 * off to on. At compile time, this has the effect of entering a 'use
1272 * utf8' section. This means that any input was not previously checked for
1273 * UTF-8 (because it was off), but now we do need to check it, or our
1274 * assumptions about the input being sane could be wrong, and we could
1275 * segfault. This routine just sets a flag so that the next time we look
1276 * at the input we do the well-formed UTF-8 check. If we aren't in the
1277 * proper phase, there may not be a parser object, but if there is, setting
1278 * the flag is harmless */
1281 PL_parser->recheck_utf8_validity = TRUE;
1286 =for apidoc Amx|bool|lex_next_chunk|U32 flags
1288 Reads in the next chunk of text to be lexed, appending it to
1289 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1290 looked to the end of the current chunk and wants to know more. It is
1291 usual, but not necessary, for lexing to have consumed the entirety of
1292 the current chunk at this time.
1294 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1295 chunk (i.e., the current chunk has been entirely consumed), normally the
1296 current chunk will be discarded at the same time that the new chunk is
1297 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1298 will not be discarded. If the current chunk has not been entirely
1299 consumed, then it will not be discarded regardless of the flag.
1301 Returns true if some new text was added to the buffer, or false if the
1302 buffer has reached the end of the input text.
1307 #define LEX_FAKE_EOF 0x80000000
1308 #define LEX_NO_TERM 0x40000000 /* here-doc */
1311 Perl_lex_next_chunk(pTHX_ U32 flags)
1315 STRLEN old_bufend_pos, new_bufend_pos;
1316 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1317 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1318 bool got_some_for_debugger = 0;
1321 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1322 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1323 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1325 linestr = PL_parser->linestr;
1326 buf = SvPVX(linestr);
1327 if (!(flags & LEX_KEEP_PREVIOUS)
1328 && PL_parser->bufptr == PL_parser->bufend)
1330 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1332 if (PL_parser->last_uni != PL_parser->bufend)
1333 PL_parser->last_uni = NULL;
1334 if (PL_parser->last_lop != PL_parser->bufend)
1335 PL_parser->last_lop = NULL;
1336 last_uni_pos = last_lop_pos = 0;
1340 old_bufend_pos = PL_parser->bufend - buf;
1341 bufptr_pos = PL_parser->bufptr - buf;
1342 oldbufptr_pos = PL_parser->oldbufptr - buf;
1343 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1344 linestart_pos = PL_parser->linestart - buf;
1345 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1346 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1348 if (flags & LEX_FAKE_EOF) {
1350 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1352 } else if (filter_gets(linestr, old_bufend_pos)) {
1354 got_some_for_debugger = 1;
1355 } else if (flags & LEX_NO_TERM) {
1358 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1361 /* End of real input. Close filehandle (unless it was STDIN),
1362 * then add implicit termination.
1364 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1365 PerlIO_clearerr(PL_parser->rsfp);
1366 else if (PL_parser->rsfp)
1367 (void)PerlIO_close(PL_parser->rsfp);
1368 PL_parser->rsfp = NULL;
1369 PL_parser->in_pod = PL_parser->filtered = 0;
1370 if (!PL_in_eval && PL_minus_p) {
1372 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1373 PL_minus_n = PL_minus_p = 0;
1374 } else if (!PL_in_eval && PL_minus_n) {
1375 sv_catpvs(linestr, /*{*/";}");
1378 sv_catpvs(linestr, ";");
1381 buf = SvPVX(linestr);
1382 new_bufend_pos = SvCUR(linestr);
1383 PL_parser->bufend = buf + new_bufend_pos;
1384 PL_parser->bufptr = buf + bufptr_pos;
1387 const U8* first_bad_char_loc;
1388 if (UNLIKELY(! is_utf8_string_loc(
1389 (U8 *) PL_parser->bufptr,
1390 PL_parser->bufend - PL_parser->bufptr,
1391 &first_bad_char_loc)))
1393 _force_out_malformed_utf8_message(first_bad_char_loc,
1394 (U8 *) PL_parser->bufend,
1396 1 /* 1 means die */ );
1397 NOT_REACHED; /* NOTREACHED */
1401 PL_parser->oldbufptr = buf + oldbufptr_pos;
1402 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1403 PL_parser->linestart = buf + linestart_pos;
1404 if (PL_parser->last_uni)
1405 PL_parser->last_uni = buf + last_uni_pos;
1406 if (PL_parser->last_lop)
1407 PL_parser->last_lop = buf + last_lop_pos;
1408 if (PL_parser->preambling != NOLINE) {
1409 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1410 PL_parser->preambling = NOLINE;
1412 if ( got_some_for_debugger
1413 && PERLDB_LINE_OR_SAVESRC
1414 && PL_curstash != PL_debstash)
1416 /* debugger active and we're not compiling the debugger code,
1417 * so store the line into the debugger's array of lines
1419 update_debugger_info(NULL, buf+old_bufend_pos,
1420 new_bufend_pos-old_bufend_pos);
1426 =for apidoc Amx|I32|lex_peek_unichar|U32 flags
1428 Looks ahead one (Unicode) character in the text currently being lexed.
1429 Returns the codepoint (unsigned integer value) of the next character,
1430 or -1 if lexing has reached the end of the input text. To consume the
1431 peeked character, use L</lex_read_unichar>.
1433 If the next character is in (or extends into) the next chunk of input
1434 text, the next chunk will be read in. Normally the current chunk will be
1435 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1436 bit set, then the current chunk will not be discarded.
1438 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1439 is encountered, an exception is generated.
1445 Perl_lex_peek_unichar(pTHX_ U32 flags)
1449 if (flags & ~(LEX_KEEP_PREVIOUS))
1450 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1451 s = PL_parser->bufptr;
1452 bufend = PL_parser->bufend;
1458 if (!lex_next_chunk(flags))
1460 s = PL_parser->bufptr;
1461 bufend = PL_parser->bufend;
1464 if (UTF8_IS_INVARIANT(head))
1466 if (UTF8_IS_START(head)) {
1467 len = UTF8SKIP(&head);
1468 while ((STRLEN)(bufend-s) < len) {
1469 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1471 s = PL_parser->bufptr;
1472 bufend = PL_parser->bufend;
1475 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1476 if (retlen == (STRLEN)-1) {
1477 _force_out_malformed_utf8_message((U8 *) s,
1480 1 /* 1 means die */ );
1481 NOT_REACHED; /* NOTREACHED */
1486 if (!lex_next_chunk(flags))
1488 s = PL_parser->bufptr;
1495 =for apidoc Amx|I32|lex_read_unichar|U32 flags
1497 Reads the next (Unicode) character in the text currently being lexed.
1498 Returns the codepoint (unsigned integer value) of the character read,
1499 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1500 if lexing has reached the end of the input text. To non-destructively
1501 examine the next character, use L</lex_peek_unichar> instead.
1503 If the next character is in (or extends into) the next chunk of input
1504 text, the next chunk will be read in. Normally the current chunk will be
1505 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1506 bit set, then the current chunk will not be discarded.
1508 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1509 is encountered, an exception is generated.
1515 Perl_lex_read_unichar(pTHX_ U32 flags)
1518 if (flags & ~(LEX_KEEP_PREVIOUS))
1519 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1520 c = lex_peek_unichar(flags);
1523 COPLINE_INC_WITH_HERELINES;
1525 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1527 ++(PL_parser->bufptr);
1533 =for apidoc Amx|void|lex_read_space|U32 flags
1535 Reads optional spaces, in Perl style, in the text currently being
1536 lexed. The spaces may include ordinary whitespace characters and
1537 Perl-style comments. C<#line> directives are processed if encountered.
1538 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1539 at a non-space character (or the end of the input text).
1541 If spaces extend into the next chunk of input text, the next chunk will
1542 be read in. Normally the current chunk will be discarded at the same
1543 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1544 chunk will not be discarded.
1549 #define LEX_NO_INCLINE 0x40000000
1550 #define LEX_NO_NEXT_CHUNK 0x80000000
1553 Perl_lex_read_space(pTHX_ U32 flags)
1556 const bool can_incline = !(flags & LEX_NO_INCLINE);
1557 bool need_incline = 0;
1558 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1559 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1560 s = PL_parser->bufptr;
1561 bufend = PL_parser->bufend;
1567 } while (!(c == '\n' || (c == 0 && s == bufend)));
1568 } else if (c == '\n') {
1571 PL_parser->linestart = s;
1577 } else if (isSPACE(c)) {
1579 } else if (c == 0 && s == bufend) {
1582 if (flags & LEX_NO_NEXT_CHUNK)
1584 PL_parser->bufptr = s;
1585 l = CopLINE(PL_curcop);
1586 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1587 got_more = lex_next_chunk(flags);
1588 CopLINE_set(PL_curcop, l);
1589 s = PL_parser->bufptr;
1590 bufend = PL_parser->bufend;
1593 if (can_incline && need_incline && PL_parser->rsfp) {
1603 PL_parser->bufptr = s;
1608 =for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1610 This function performs syntax checking on a prototype, C<proto>.
1611 If C<warn> is true, any illegal characters or mismatched brackets
1612 will trigger illegalproto warnings, declaring that they were
1613 detected in the prototype for C<name>.
1615 The return value is C<true> if this is a valid prototype, and
1616 C<false> if it is not, regardless of whether C<warn> was C<true> or
1619 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1626 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1628 STRLEN len, origlen;
1630 bool bad_proto = FALSE;
1631 bool in_brackets = FALSE;
1632 bool after_slash = FALSE;
1633 char greedy_proto = ' ';
1634 bool proto_after_greedy_proto = FALSE;
1635 bool must_be_last = FALSE;
1636 bool underscore = FALSE;
1637 bool bad_proto_after_underscore = FALSE;
1639 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1644 p = SvPV(proto, len);
1646 for (; len--; p++) {
1649 proto_after_greedy_proto = TRUE;
1651 if (!strchr(";@%", *p))
1652 bad_proto_after_underscore = TRUE;
1655 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1662 in_brackets = FALSE;
1663 else if ((*p == '@' || *p == '%')
1667 must_be_last = TRUE;
1676 after_slash = FALSE;
1681 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1684 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1685 origlen, UNI_DISPLAY_ISPRINT)
1686 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1688 if (proto_after_greedy_proto)
1689 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1690 "Prototype after '%c' for %" SVf " : %s",
1691 greedy_proto, SVfARG(name), p);
1693 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1694 "Missing ']' in prototype for %" SVf " : %s",
1697 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1698 "Illegal character in prototype for %" SVf " : %s",
1700 if (bad_proto_after_underscore)
1701 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1702 "Illegal character after '_' in prototype for %" SVf " : %s",
1706 return (! (proto_after_greedy_proto || bad_proto) );
1711 * This subroutine has nothing to do with tilting, whether at windmills
1712 * or pinball tables. Its name is short for "increment line". It
1713 * increments the current line number in CopLINE(PL_curcop) and checks
1714 * to see whether the line starts with a comment of the form
1715 * # line 500 "foo.pm"
1716 * If so, it sets the current line number and file to the values in the comment.
1720 S_incline(pTHX_ const char *s)
1728 PERL_ARGS_ASSERT_INCLINE;
1730 COPLINE_INC_WITH_HERELINES;
1731 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1732 && s+1 == PL_bufend && *s == ';') {
1733 /* fake newline in string eval */
1734 CopLINE_dec(PL_curcop);
1739 while (SPACE_OR_TAB(*s))
1741 if (strEQs(s, "line"))
1745 if (SPACE_OR_TAB(*s))
1749 while (SPACE_OR_TAB(*s))
1757 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1759 while (SPACE_OR_TAB(*s))
1761 if (*s == '"' && (t = strchr(s+1, '"'))) {
1767 while (*t && !isSPACE(*t))
1771 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1773 if (*e != '\n' && *e != '\0')
1774 return; /* false alarm */
1776 if (!grok_atoUV(n, &uv, &e))
1778 line_num = ((line_t)uv) - 1;
1781 const STRLEN len = t - s;
1783 if (!PL_rsfp && !PL_parser->filtered) {
1784 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1785 * to *{"::_<newfilename"} */
1786 /* However, the long form of evals is only turned on by the
1787 debugger - usually they're "(eval %lu)" */
1788 GV * const cfgv = CopFILEGV(PL_curcop);
1791 STRLEN tmplen2 = len;
1795 if (tmplen2 + 2 <= sizeof smallbuf)
1798 Newx(tmpbuf2, tmplen2 + 2, char);
1803 memcpy(tmpbuf2 + 2, s, tmplen2);
1806 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1808 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1809 /* adjust ${"::_<newfilename"} to store the new file name */
1810 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1811 /* The line number may differ. If that is the case,
1812 alias the saved lines that are in the array.
1813 Otherwise alias the whole array. */
1814 if (CopLINE(PL_curcop) == line_num) {
1815 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1816 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1818 else if (GvAV(cfgv)) {
1819 AV * const av = GvAV(cfgv);
1820 const I32 start = CopLINE(PL_curcop)+1;
1821 I32 items = AvFILLp(av) - start;
1823 AV * const av2 = GvAVn(gv2);
1824 SV **svp = AvARRAY(av) + start;
1825 I32 l = (I32)line_num+1;
1827 av_store(av2, l++, SvREFCNT_inc(*svp++));
1832 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1835 CopFILE_free(PL_curcop);
1836 CopFILE_setn(PL_curcop, s, len);
1838 CopLINE_set(PL_curcop, line_num);
1842 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1844 AV *av = CopFILEAVx(PL_curcop);
1847 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1849 sv = *av_fetch(av, 0, 1);
1850 SvUPGRADE(sv, SVt_PVMG);
1852 if (!SvPOK(sv)) SvPVCLEAR(sv);
1854 sv_catsv(sv, orig_sv);
1856 sv_catpvn(sv, buf, len);
1861 if (PL_parser->preambling == NOLINE)
1862 av_store(av, CopLINE(PL_curcop), sv);
1868 * Called to gobble the appropriate amount and type of whitespace.
1869 * Skips comments as well.
1870 * Returns the next character after the whitespace that is skipped.
1873 * Same thing, but look ahead without incrementing line numbers or
1874 * adjusting PL_linestart.
1877 #define skipspace(s) skipspace_flags(s, 0)
1878 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1881 S_skipspace_flags(pTHX_ char *s, U32 flags)
1883 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1884 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1885 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1888 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1890 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1891 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1892 LEX_NO_NEXT_CHUNK : 0));
1894 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1895 if (PL_linestart > PL_bufptr)
1896 PL_bufptr = PL_linestart;
1904 * Check the unary operators to ensure there's no ambiguity in how they're
1905 * used. An ambiguous piece of code would be:
1907 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1908 * the +5 is its argument.
1917 if (PL_oldoldbufptr != PL_last_uni)
1919 while (isSPACE(*PL_last_uni))
1922 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
1923 s += UTF ? UTF8SKIP(s) : 1;
1924 if ((t = strchr(s, '(')) && t < PL_bufptr)
1927 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1928 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
1929 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1933 * LOP : macro to build a list operator. Its behaviour has been replaced
1934 * with a subroutine, S_lop() for which LOP is just another name.
1937 #define LOP(f,x) return lop(f,x,s)
1941 * Build a list operator (or something that might be one). The rules:
1942 * - if we have a next token, then it's a list operator (no parens) for
1943 * which the next token has already been parsed; e.g.,
1946 * - if the next thing is an opening paren, then it's a function
1947 * - else it's a list operator
1951 S_lop(pTHX_ I32 f, U8 x, char *s)
1953 PERL_ARGS_ASSERT_LOP;
1958 PL_last_lop = PL_oldbufptr;
1959 PL_last_lop_op = (OPCODE)f;
1964 return REPORT(FUNC);
1967 return REPORT(FUNC);
1970 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1971 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
1972 return REPORT(LSTOP);
1978 * When the lexer realizes it knows the next token (for instance,
1979 * it is reordering tokens for the parser) then it can call S_force_next
1980 * to know what token to return the next time the lexer is called. Caller
1981 * will need to set PL_nextval[] and possibly PL_expect to ensure
1982 * the lexer handles the token correctly.
1986 S_force_next(pTHX_ I32 type)
1990 PerlIO_printf(Perl_debug_log, "### forced token:\n");
1991 tokereport(type, &NEXTVAL_NEXTTOKE);
1994 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
1995 PL_nexttype[PL_nexttoke] = type;
2002 * This subroutine handles postfix deref syntax after the arrow has already
2003 * been emitted. @* $* etc. are emitted as two separate token right here.
2004 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2005 * only the first, leaving yylex to find the next.
2009 S_postderef(pTHX_ int const funny, char const next)
2011 assert(funny == DOLSHARP || strchr("$@%&*", funny));
2013 PL_expect = XOPERATOR;
2014 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
2015 assert('@' == funny || '$' == funny || DOLSHARP == funny);
2016 PL_lex_state = LEX_INTERPEND;
2018 force_next(POSTJOIN);
2024 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2025 && !PL_lex_brackets)
2027 PL_expect = XOPERATOR;
2036 int yyc = PL_parser->yychar;
2037 if (yyc != YYEMPTY) {
2039 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2040 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
2041 PL_lex_allbrackets--;
2043 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2044 } else if (yyc == '('/*)*/) {
2045 PL_lex_allbrackets--;
2050 PL_parser->yychar = YYEMPTY;
2055 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2057 SV * const sv = newSVpvn_utf8(start, len,
2060 && !is_utf8_invariant_string((const U8*)start, len)
2061 && is_utf8_string((const U8*)start, len));
2067 * When the lexer knows the next thing is a word (for instance, it has
2068 * just seen -> and it knows that the next char is a word char, then
2069 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2073 * char *start : buffer position (must be within PL_linestr)
2074 * int token : PL_next* will be this type of bare word
2075 * (e.g., METHOD,BAREWORD)
2076 * int check_keyword : if true, Perl checks to make sure the word isn't
2077 * a keyword (do this if the word is a label, e.g. goto FOO)
2078 * int allow_pack : if true, : characters will also be allowed (require,
2079 * use, etc. do this)
2083 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2088 PERL_ARGS_ASSERT_FORCE_WORD;
2090 start = skipspace(start);
2092 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2093 || (allow_pack && *s == ':' && s[1] == ':') )
2095 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2096 if (check_keyword) {
2097 char *s2 = PL_tokenbuf;
2099 if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
2101 if (keyword(s2, len2, 0))
2104 if (token == METHOD) {
2109 PL_expect = XOPERATOR;
2112 NEXTVAL_NEXTTOKE.opval
2113 = newSVOP(OP_CONST,0,
2114 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2115 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2123 * Called when the lexer wants $foo *foo &foo etc, but the program
2124 * text only contains the "foo" portion. The first argument is a pointer
2125 * to the "foo", and the second argument is the type symbol to prefix.
2126 * Forces the next token to be a "BAREWORD".
2127 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2131 S_force_ident(pTHX_ const char *s, int kind)
2133 PERL_ARGS_ASSERT_FORCE_IDENT;
2136 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2137 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2138 UTF ? SVf_UTF8 : 0));
2139 NEXTVAL_NEXTTOKE.opval = o;
2140 force_next(BAREWORD);
2142 o->op_private = OPpCONST_ENTERED;
2143 /* XXX see note in pp_entereval() for why we forgo typo
2144 warnings if the symbol must be introduced in an eval.
2146 gv_fetchpvn_flags(s, len,
2147 (PL_in_eval ? GV_ADDMULTI
2148 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2149 kind == '$' ? SVt_PV :
2150 kind == '@' ? SVt_PVAV :
2151 kind == '%' ? SVt_PVHV :
2159 S_force_ident_maybe_lex(pTHX_ char pit)
2161 NEXTVAL_NEXTTOKE.ival = pit;
2166 Perl_str_to_version(pTHX_ SV *sv)
2171 const char *start = SvPV_const(sv,len);
2172 const char * const end = start + len;
2173 const bool utf = cBOOL(SvUTF8(sv));
2175 PERL_ARGS_ASSERT_STR_TO_VERSION;
2177 while (start < end) {
2181 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2186 retval += ((NV)n)/nshift;
2195 * Forces the next token to be a version number.
2196 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2197 * and if "guessing" is TRUE, then no new token is created (and the caller
2198 * must use an alternative parsing method).
2202 S_force_version(pTHX_ char *s, int guessing)
2207 PERL_ARGS_ASSERT_FORCE_VERSION;
2215 while (isDIGIT(*d) || *d == '_' || *d == '.')
2217 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2219 s = scan_num(s, &pl_yylval);
2220 version = pl_yylval.opval;
2221 ver = cSVOPx(version)->op_sv;
2222 if (SvPOK(ver) && !SvNIOK(ver)) {
2223 SvUPGRADE(ver, SVt_PVNV);
2224 SvNV_set(ver, str_to_version(ver));
2225 SvNOK_on(ver); /* hint that it is a version */
2228 else if (guessing) {
2233 /* NOTE: The parser sees the package name and the VERSION swapped */
2234 NEXTVAL_NEXTTOKE.opval = version;
2235 force_next(BAREWORD);
2241 * S_force_strict_version
2242 * Forces the next token to be a version number using strict syntax rules.
2246 S_force_strict_version(pTHX_ char *s)
2249 const char *errstr = NULL;
2251 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2253 while (isSPACE(*s)) /* leading whitespace */
2256 if (is_STRICT_VERSION(s,&errstr)) {
2258 s = (char *)scan_version(s, ver, 0);
2259 version = newSVOP(OP_CONST, 0, ver);
2261 else if ((*s != ';' && *s != '{' && *s != '}' )
2262 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2266 yyerror(errstr); /* version required */
2270 /* NOTE: The parser sees the package name and the VERSION swapped */
2271 NEXTVAL_NEXTTOKE.opval = version;
2272 force_next(BAREWORD);
2279 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2280 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2281 * unchanged, and a new SV containing the modified input is returned.
2285 S_tokeq(pTHX_ SV *sv)
2292 PERL_ARGS_ASSERT_TOKEQ;
2296 assert (!SvIsCOW(sv));
2297 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2301 /* This is relying on the SV being "well formed" with a trailing '\0' */
2302 while (s < send && !(*s == '\\' && s[1] == '\\'))
2307 if ( PL_hints & HINT_NEW_STRING ) {
2308 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2309 SVs_TEMP | SvUTF8(sv));
2313 if (s + 1 < send && (s[1] == '\\'))
2314 s++; /* all that, just for this */
2319 SvCUR_set(sv, d - SvPVX_const(sv));
2321 if ( PL_hints & HINT_NEW_STRING )
2322 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
2327 * Now come three functions related to double-quote context,
2328 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2329 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2330 * interact with PL_lex_state, and create fake ( ... ) argument lists
2331 * to handle functions and concatenation.
2335 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2340 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2342 * Pattern matching will set PL_lex_op to the pattern-matching op to
2343 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2345 * OP_CONST is easy--just make the new op and return.
2347 * Everything else becomes a FUNC.
2349 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2350 * had an OP_CONST. This just sets us up for a
2351 * call to S_sublex_push().
2355 S_sublex_start(pTHX)
2357 const I32 op_type = pl_yylval.ival;
2359 if (op_type == OP_NULL) {
2360 pl_yylval.opval = PL_lex_op;
2364 if (op_type == OP_CONST) {
2365 SV *sv = PL_lex_stuff;
2366 PL_lex_stuff = NULL;
2369 if (SvTYPE(sv) == SVt_PVIV) {
2370 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2372 const char * const p = SvPV_const(sv, len);
2373 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2377 pl_yylval.opval = newSVOP(op_type, 0, sv);
2381 PL_parser->lex_super_state = PL_lex_state;
2382 PL_parser->lex_sub_inwhat = (U16)op_type;
2383 PL_parser->lex_sub_op = PL_lex_op;
2384 PL_lex_state = LEX_INTERPPUSH;
2388 pl_yylval.opval = PL_lex_op;
2398 * Create a new scope to save the lexing state. The scope will be
2399 * ended in S_sublex_done. Returns a '(', starting the function arguments
2400 * to the uc, lc, etc. found before.
2401 * Sets PL_lex_state to LEX_INTERPCONCAT.
2408 const bool is_heredoc = PL_multi_close == '<';
2411 PL_lex_state = PL_parser->lex_super_state;
2412 SAVEI8(PL_lex_dojoin);
2413 SAVEI32(PL_lex_brackets);
2414 SAVEI32(PL_lex_allbrackets);
2415 SAVEI32(PL_lex_formbrack);
2416 SAVEI8(PL_lex_fakeeof);
2417 SAVEI32(PL_lex_casemods);
2418 SAVEI32(PL_lex_starts);
2419 SAVEI8(PL_lex_state);
2420 SAVESPTR(PL_lex_repl);
2421 SAVEVPTR(PL_lex_inpat);
2422 SAVEI16(PL_lex_inwhat);
2425 SAVECOPLINE(PL_curcop);
2426 SAVEI32(PL_multi_end);
2427 SAVEI32(PL_parser->herelines);
2428 PL_parser->herelines = 0;
2430 SAVEIV(PL_multi_close);
2431 SAVEPPTR(PL_bufptr);
2432 SAVEPPTR(PL_bufend);
2433 SAVEPPTR(PL_oldbufptr);
2434 SAVEPPTR(PL_oldoldbufptr);
2435 SAVEPPTR(PL_last_lop);
2436 SAVEPPTR(PL_last_uni);
2437 SAVEPPTR(PL_linestart);
2438 SAVESPTR(PL_linestr);
2439 SAVEGENERICPV(PL_lex_brackstack);
2440 SAVEGENERICPV(PL_lex_casestack);
2441 SAVEGENERICPV(PL_parser->lex_shared);
2442 SAVEBOOL(PL_parser->lex_re_reparsing);
2443 SAVEI32(PL_copline);
2445 /* The here-doc parser needs to be able to peek into outer lexing
2446 scopes to find the body of the here-doc. So we put PL_linestr and
2447 PL_bufptr into lex_shared, to ‘share’ those values.
2449 PL_parser->lex_shared->ls_linestr = PL_linestr;
2450 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2452 PL_linestr = PL_lex_stuff;
2453 PL_lex_repl = PL_parser->lex_sub_repl;
2454 PL_lex_stuff = NULL;
2455 PL_parser->lex_sub_repl = NULL;
2457 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2458 set for an inner quote-like operator and then an error causes scope-
2459 popping. We must not have a PL_lex_stuff value left dangling, as
2460 that breaks assumptions elsewhere. See bug #123617. */
2461 SAVEGENERICSV(PL_lex_stuff);
2462 SAVEGENERICSV(PL_parser->lex_sub_repl);
2464 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2465 = SvPVX(PL_linestr);
2466 PL_bufend += SvCUR(PL_linestr);
2467 PL_last_lop = PL_last_uni = NULL;
2468 SAVEFREESV(PL_linestr);
2469 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2471 PL_lex_dojoin = FALSE;
2472 PL_lex_brackets = PL_lex_formbrack = 0;
2473 PL_lex_allbrackets = 0;
2474 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2475 Newx(PL_lex_brackstack, 120, char);
2476 Newx(PL_lex_casestack, 12, char);
2477 PL_lex_casemods = 0;
2478 *PL_lex_casestack = '\0';
2480 PL_lex_state = LEX_INTERPCONCAT;
2482 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2483 PL_copline = NOLINE;
2485 Newxz(shared, 1, LEXSHARED);
2486 shared->ls_prev = PL_parser->lex_shared;
2487 PL_parser->lex_shared = shared;
2489 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2490 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2491 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2492 PL_lex_inpat = PL_parser->lex_sub_op;
2494 PL_lex_inpat = NULL;
2496 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2497 PL_in_eval &= ~EVAL_RE_REPARSING;
2504 * Restores lexer state after a S_sublex_push.
2510 if (!PL_lex_starts++) {
2511 SV * const sv = newSVpvs("");
2512 if (SvUTF8(PL_linestr))
2514 PL_expect = XOPERATOR;
2515 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2519 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2520 PL_lex_state = LEX_INTERPCASEMOD;
2524 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2525 assert(PL_lex_inwhat != OP_TRANSR);
2527 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2528 PL_linestr = PL_lex_repl;
2530 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2531 PL_bufend += SvCUR(PL_linestr);
2532 PL_last_lop = PL_last_uni = NULL;
2533 PL_lex_dojoin = FALSE;
2534 PL_lex_brackets = 0;
2535 PL_lex_allbrackets = 0;
2536 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2537 PL_lex_casemods = 0;
2538 *PL_lex_casestack = '\0';
2540 if (SvEVALED(PL_lex_repl)) {
2541 PL_lex_state = LEX_INTERPNORMAL;
2543 /* we don't clear PL_lex_repl here, so that we can check later
2544 whether this is an evalled subst; that means we rely on the
2545 logic to ensure sublex_done() is called again only via the
2546 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2549 PL_lex_state = LEX_INTERPCONCAT;
2552 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2553 CopLINE(PL_curcop) +=
2554 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2555 + PL_parser->herelines;
2556 PL_parser->herelines = 0;
2561 const line_t l = CopLINE(PL_curcop);
2563 if (PL_multi_close == '<')
2564 PL_parser->herelines += l - PL_multi_end;
2565 PL_bufend = SvPVX(PL_linestr);
2566 PL_bufend += SvCUR(PL_linestr);
2567 PL_expect = XOPERATOR;
2573 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2575 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2576 * interior, hence to the "}". Finds what the name resolves to, returning
2577 * an SV* containing it; NULL if no valid one found */
2579 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
2586 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2588 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2591 /* diag_listed_as: Unknown charname '%s' */
2592 yyerror("Unknown charname ''");
2596 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2597 /* include the <}> */
2598 e - backslash_ptr + 1);
2600 SvREFCNT_dec_NN(res);
2604 /* See if the charnames handler is the Perl core's, and if so, we can skip
2605 * the validation needed for a user-supplied one, as Perl's does its own
2607 table = GvHV(PL_hintgv); /* ^H */
2608 cvp = hv_fetchs(table, "charnames", FALSE);
2609 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2610 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2612 const char * const name = HvNAME(stash);
2613 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2614 && strEQ(name, "_charnames")) {
2619 /* Here, it isn't Perl's charname handler. We can't rely on a
2620 * user-supplied handler to validate the input name. For non-ut8 input,
2621 * look to see that the first character is legal. Then loop through the
2622 * rest checking that each is a continuation */
2624 /* This code makes the reasonable assumption that the only Latin1-range
2625 * characters that begin a character name alias are alphabetic, otherwise
2626 * would have to create a isCHARNAME_BEGIN macro */
2629 if (! isALPHAU(*s)) {
2634 if (! isCHARNAME_CONT(*s)) {
2637 if (*s == ' ' && *(s-1) == ' ') {
2644 /* Similarly for utf8. For invariants can check directly; for other
2645 * Latin1, can calculate their code point and check; otherwise use a
2647 if (UTF8_IS_INVARIANT(*s)) {
2648 if (! isALPHAU(*s)) {
2652 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2653 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2659 if (! PL_utf8_charname_begin) {
2660 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2661 PL_utf8_charname_begin = _core_swash_init("utf8",
2662 "_Perl_Charname_Begin",
2664 1, 0, NULL, &flags);
2666 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2673 if (UTF8_IS_INVARIANT(*s)) {
2674 if (! isCHARNAME_CONT(*s)) {
2677 if (*s == ' ' && *(s-1) == ' ') {
2682 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2683 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2690 if (! PL_utf8_charname_continue) {
2691 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2692 PL_utf8_charname_continue = _core_swash_init("utf8",
2693 "_Perl_Charname_Continue",
2695 1, 0, NULL, &flags);
2697 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2704 if (*(s-1) == ' ') {
2707 "charnames alias definitions may not contain trailing "
2708 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2709 (int)(s - backslash_ptr + 1), backslash_ptr,
2710 (int)(e - s + 1), s + 1
2712 UTF ? SVf_UTF8 : 0);
2716 if (SvUTF8(res)) { /* Don't accept malformed input */
2717 const U8* first_bad_char_loc;
2719 const char* const str = SvPV_const(res, len);
2720 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2721 &first_bad_char_loc)))
2723 _force_out_malformed_utf8_message(first_bad_char_loc,
2724 (U8 *) PL_parser->bufend,
2726 0 /* 0 means don't die */ );
2729 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2730 (int) (e - backslash_ptr + 1), backslash_ptr,
2731 (int) ((char *) first_bad_char_loc - str), str
2742 /* The final %.*s makes sure that should the trailing NUL be missing
2743 * that this print won't run off the end of the string */
2746 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2747 (int)(s - backslash_ptr + 1), backslash_ptr,
2748 (int)(e - s + 1), s + 1
2750 UTF ? SVf_UTF8 : 0);
2757 "charnames alias definitions may not contain a sequence of "
2758 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2759 (int)(s - backslash_ptr + 1), backslash_ptr,
2760 (int)(e - s + 1), s + 1
2762 UTF ? SVf_UTF8 : 0);
2769 Extracts the next constant part of a pattern, double-quoted string,
2770 or transliteration. This is terrifying code.
2772 For example, in parsing the double-quoted string "ab\x63$d", it would
2773 stop at the '$' and return an OP_CONST containing 'abc'.
2775 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2776 processing a pattern (PL_lex_inpat is true), a transliteration
2777 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2779 Returns a pointer to the character scanned up to. If this is
2780 advanced from the start pointer supplied (i.e. if anything was
2781 successfully parsed), will leave an OP_CONST for the substring scanned
2782 in pl_yylval. Caller must intuit reason for not parsing further
2783 by looking at the next characters herself.
2787 \N{FOO} => \N{U+hex_for_character_FOO}
2788 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2791 all other \-char, including \N and \N{ apart from \N{ABC}
2794 @ and $ where it appears to be a var, but not for $ as tail anchor
2798 In transliterations:
2799 characters are VERY literal, except for - not at the start or end
2800 of the string, which indicates a range. However some backslash sequences
2801 are recognized: \r, \n, and the like
2802 \007 \o{}, \x{}, \N{}
2803 If all elements in the transliteration are below 256,
2804 scan_const expands the range to the full set of intermediate
2805 characters. If the range is in utf8, the hyphen is replaced with
2806 a certain range mark which will be handled by pmtrans() in op.c.
2808 In double-quoted strings:
2810 all those recognized in transliterations
2811 deprecated backrefs: \1 (in substitution replacements)
2812 case and quoting: \U \Q \E
2815 scan_const does *not* construct ops to handle interpolated strings.
2816 It stops processing as soon as it finds an embedded $ or @ variable
2817 and leaves it to the caller to work out what's going on.
2819 embedded arrays (whether in pattern or not) could be:
2820 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2822 $ in double-quoted strings must be the symbol of an embedded scalar.
2824 $ in pattern could be $foo or could be tail anchor. Assumption:
2825 it's a tail anchor if $ is the last thing in the string, or if it's
2826 followed by one of "()| \r\n\t"
2828 \1 (backreferences) are turned into $1 in substitutions
2830 The structure of the code is
2831 while (there's a character to process) {
2832 handle transliteration ranges
2833 skip regexp comments /(?#comment)/ and codes /(?{code})/
2834 skip #-initiated comments in //x patterns
2835 check for embedded arrays
2836 check for embedded scalars
2838 deprecate \1 in substitution replacements
2839 handle string-changing backslashes \l \U \Q \E, etc.
2840 switch (what was escaped) {
2841 handle \- in a transliteration (becomes a literal -)
2842 if a pattern and not \N{, go treat as regular character
2843 handle \132 (octal characters)
2844 handle \x15 and \x{1234} (hex characters)
2845 handle \N{name} (named characters, also \N{3,5} in a pattern)
2846 handle \cV (control characters)
2847 handle printf-style backslashes (\f, \r, \n, etc)
2850 } (end if backslash)
2851 handle regular character
2852 } (end while character to read)
2857 S_scan_const(pTHX_ char *start)
2859 char *send = PL_bufend; /* end of the constant */
2860 SV *sv = newSV(send - start); /* sv for the constant. See note below
2862 char *s = start; /* start of the constant */
2863 char *d = SvPVX(sv); /* destination for copies */
2864 bool dorange = FALSE; /* are we in a translit range? */
2865 bool didrange = FALSE; /* did we just finish a range? */
2866 bool in_charclass = FALSE; /* within /[...]/ */
2867 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2868 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2869 UTF8? But, this can show as true
2870 when the source isn't utf8, as for
2871 example when it is entirely composed
2873 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2874 number of characters found so far
2875 that will expand (into 2 bytes)
2876 should we have to convert to
2878 SV *res; /* result from charnames */
2879 STRLEN offset_to_max; /* The offset in the output to where the range
2880 high-end character is temporarily placed */
2882 /* Does something require special handling in tr/// ? This avoids extra
2883 * work in a less likely case. As such, khw didn't feel it was worth
2884 * adding any branches to the more mainline code to handle this, which
2885 * means that this doesn't get set in some circumstances when things like
2886 * \x{100} get expanded out. As a result there needs to be extra testing
2887 * done in the tr code */
2888 bool has_above_latin1 = FALSE;
2890 /* Note on sizing: The scanned constant is placed into sv, which is
2891 * initialized by newSV() assuming one byte of output for every byte of
2892 * input. This routine expects newSV() to allocate an extra byte for a
2893 * trailing NUL, which this routine will append if it gets to the end of
2894 * the input. There may be more bytes of input than output (eg., \N{LATIN
2895 * CAPITAL LETTER A}), or more output than input if the constant ends up
2896 * recoded to utf8, but each time a construct is found that might increase
2897 * the needed size, SvGROW() is called. Its size parameter each time is
2898 * based on the best guess estimate at the time, namely the length used so
2899 * far, plus the length the current construct will occupy, plus room for
2900 * the trailing NUL, plus one byte for every input byte still unscanned */
2902 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2905 int backslash_N = 0; /* ? was the character from \N{} */
2906 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2907 platform-specific like \x65 */
2910 PERL_ARGS_ASSERT_SCAN_CONST;
2912 assert(PL_lex_inwhat != OP_TRANSR);
2913 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2914 /* If we are doing a trans and we know we want UTF8 set expectation */
2915 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2916 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2919 /* Protect sv from errors and fatal warnings. */
2920 ENTER_with_name("scan_const");
2924 || dorange /* Handle tr/// range at right edge of input */
2927 /* get transliterations out of the way (they're most literal) */
2928 if (PL_lex_inwhat == OP_TRANS) {
2930 /* But there isn't any special handling necessary unless there is a
2931 * range, so for most cases we just drop down and handle the value
2932 * as any other. There are two exceptions.
2934 * 1. A hyphen indicates that we are actually going to have a
2935 * range. In this case, skip the '-', set a flag, then drop
2936 * down to handle what should be the end range value.
2937 * 2. After we've handled that value, the next time through, that
2938 * flag is set and we fix up the range.
2940 * Ranges entirely within Latin1 are expanded out entirely, in
2941 * order to make the transliteration a simple table look-up.
2942 * Ranges that extend above Latin1 have to be done differently, so
2943 * there is no advantage to expanding them here, so they are
2944 * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte
2945 * signifies a hyphen without any possible ambiguity. On EBCDIC
2946 * machines, if the range is expressed as Unicode, the Latin1
2947 * portion is expanded out even if the range extends above
2948 * Latin1. This is because each code point in it has to be
2949 * processed here individually to get its native translation */
2953 /* Here, we don't think we're in a range. If the new character
2954 * is not a hyphen; or if it is a hyphen, but it's too close to
2955 * either edge to indicate a range, then it's a regular
2957 if (*s != '-' || s >= send - 1 || s == start) {
2959 /* A regular character. Process like any other, but first
2960 * clear any flags */
2964 non_portable_endpoint = 0;
2967 /* The tests here for being above Latin1 and similar ones
2968 * in the following 'else' suffice to find all such
2969 * occurences in the constant, except those added by a
2970 * backslash escape sequence, like \x{100}. Mostly, those
2971 * set 'has_above_latin1' as appropriate */
2972 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2973 has_above_latin1 = TRUE;
2976 /* Drops down to generic code to process current byte */
2978 else { /* Is a '-' in the context where it means a range */
2979 if (didrange) { /* Something like y/A-C-Z// */
2980 Perl_croak(aTHX_ "Ambiguous range in transliteration"
2986 s++; /* Skip past the hyphen */
2988 /* d now points to where the end-range character will be
2989 * placed. Save it so won't have to go finding it later,
2990 * and drop down to get that character. (Actually we
2991 * instead save the offset, to handle the case where a
2992 * realloc in the meantime could change the actual
2993 * pointer). We'll finish processing the range the next
2994 * time through the loop */
2995 offset_to_max = d - SvPVX_const(sv);
2997 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2998 has_above_latin1 = TRUE;
3001 /* Drops down to generic code to process current byte */
3003 } /* End of not a range */
3005 /* Here we have parsed a range. Now must handle it. At this
3007 * 'sv' is a SV* that contains the output string we are
3008 * constructing. The final two characters in that string
3009 * are the range start and range end, in order.
3010 * 'd' points to just beyond the range end in the 'sv' string,
3011 * where we would next place something
3012 * 'offset_to_max' is the offset in 'sv' at which the character
3013 * (the range's maximum end point) before 'd' begins.
3015 char * max_ptr = SvPVX(sv) + offset_to_max;
3018 IV range_max; /* last character in range */
3020 Size_t offset_to_min = 0;
3023 bool convert_unicode;
3024 IV real_range_max = 0;
3026 /* Get the code point values of the range ends. */
3028 /* We know the utf8 is valid, because we just constructed
3029 * it ourselves in previous loop iterations */
3030 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3031 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3032 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3034 /* This compensates for not all code setting
3035 * 'has_above_latin1', so that we don't skip stuff that
3036 * should be executed */
3037 if (range_max > 255) {
3038 has_above_latin1 = TRUE;
3042 min_ptr = max_ptr - 1;
3043 range_min = * (U8*) min_ptr;
3044 range_max = * (U8*) max_ptr;
3047 /* If the range is just a single code point, like tr/a-a/.../,
3048 * that code point is already in the output, twice. We can
3049 * just back up over the second instance and avoid all the rest
3050 * of the work. But if it is a variant character, it's been
3051 * counted twice, so decrement. (This unlikely scenario is
3052 * special cased, like the one for a range of 2 code points
3053 * below, only because the main-line code below needs a range
3054 * of 3 or more to work without special casing. Might as well
3055 * get it out of the way now.) */
3056 if (UNLIKELY(range_max == range_min)) {
3058 if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3059 utf8_variant_count--;
3065 /* On EBCDIC platforms, we may have to deal with portable
3066 * ranges. These happen if at least one range endpoint is a
3067 * Unicode value (\N{...}), or if the range is a subset of
3068 * [A-Z] or [a-z], and both ends are literal characters,
3069 * like 'A', and not like \x{C1} */
3071 cBOOL(backslash_N) /* \N{} forces Unicode,
3072 hence portable range */
3073 || ( ! non_portable_endpoint
3074 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3075 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3076 if (convert_unicode) {
3078 /* Special handling is needed for these portable ranges.
3079 * They are defined to be in Unicode terms, which includes
3080 * all the Unicode code points between the end points.
3081 * Convert to Unicode to get the Unicode range. Later we
3082 * will convert each code point in the range back to
3084 range_min = NATIVE_TO_UNI(range_min);
3085 range_max = NATIVE_TO_UNI(range_max);
3089 if (range_min > range_max) {
3091 if (convert_unicode) {
3092 /* Need to convert back to native for meaningful
3093 * messages for this platform */
3094 range_min = UNI_TO_NATIVE(range_min);
3095 range_max = UNI_TO_NATIVE(range_max);
3098 /* Use the characters themselves for the error message if
3099 * ASCII printables; otherwise some visible representation
3101 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3103 "Invalid range \"%c-%c\" in transliteration operator",
3104 (char)range_min, (char)range_max);
3107 else if (convert_unicode) {
3108 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3110 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3111 UVXf "}\" in transliteration operator",
3112 range_min, range_max);
3116 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3118 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3119 " in transliteration operator",
3120 range_min, range_max);
3124 /* If the range is exactly two code points long, they are
3125 * already both in the output */
3126 if (UNLIKELY(range_min + 1 == range_max)) {
3130 /* Here the range contains at least 3 code points */
3134 /* If everything in the transliteration is below 256, we
3135 * can avoid special handling later. A translation table
3136 * for each of those bytes is created by op.c. So we
3137 * expand out all ranges to their constituent code points.
3138 * But if we've encountered something above 255, the
3139 * expanding won't help, so skip doing that. But if it's
3140 * EBCDIC, we may have to look at each character below 256
3141 * if we have to convert to/from Unicode values */
3142 if ( has_above_latin1
3144 && (range_min > 255 || ! convert_unicode)
3147 /* Move the high character one byte to the right; then
3148 * insert between it and the range begin, an illegal
3149 * byte which serves to indicate this is a range (using
3150 * a '-' would be ambiguous). */
3152 while (e-- > max_ptr) {
3155 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3159 /* Here, we're going to expand out the range. For EBCDIC
3160 * the range can extend above 255 (not so in ASCII), so
3161 * for EBCDIC, split it into the parts above and below
3164 if (range_max > 255) {
3165 real_range_max = range_max;
3171 /* Here we need to expand out the string to contain each
3172 * character in the range. Grow the output to handle this.
3173 * For non-UTF8, we need a byte for each code point in the
3174 * range, minus the three that we've already allocated for: the
3175 * hyphen, the min, and the max. For UTF-8, we need this
3176 * plus an extra byte for each code point that occupies two
3177 * bytes (is variant) when in UTF-8 (except we've already
3178 * allocated for the end points, including if they are
3179 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3180 * platforms, it's easy to calculate a precise number. To
3181 * start, we count the variants in the range, which we need
3182 * elsewhere in this function anyway. (For the case where it
3183 * isn't easy to calculate, 'extras' has been initialized to 0,
3184 * and the calculation is done in a loop further down.) */
3186 if (convert_unicode)
3189 /* This is executed unconditionally on ASCII, and for
3190 * Unicode ranges on EBCDIC. Under these conditions, all
3191 * code points above a certain value are variant; and none
3192 * under that value are. We just need to find out how much
3193 * of the range is above that value. We don't count the
3194 * end points here, as they will already have been counted
3195 * as they were parsed. */
3196 if (range_min >= UTF_CONTINUATION_MARK) {
3198 /* The whole range is made up of variants */
3199 extras = (range_max - 1) - (range_min + 1) + 1;
3201 else if (range_max >= UTF_CONTINUATION_MARK) {
3203 /* Only the higher portion of the range is variants */
3204 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3207 utf8_variant_count += extras;
3210 /* The base growth is the number of code points in the range,
3211 * not including the endpoints, which have already been sized
3212 * for (and output). We don't subtract for the hyphen, as it
3213 * has been parsed but not output, and the SvGROW below is
3214 * based only on what's been output plus what's left to parse.
3216 grow = (range_max - 1) - (range_min + 1) + 1;
3220 /* In some cases in EBCDIC, we haven't yet calculated a
3221 * precise amount needed for the UTF-8 variants. Just
3222 * assume the worst case, that everything will expand by a
3224 if (! convert_unicode) {
3230 /* Otherwise we know exactly how many variants there
3231 * are in the range. */
3236 /* Grow, but position the output to overwrite the range min end
3237 * point, because in some cases we overwrite that */
3238 SvCUR_set(sv, d - SvPVX_const(sv));
3239 offset_to_min = min_ptr - SvPVX_const(sv);
3241 /* See Note on sizing above. */
3242 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3245 + 1 /* Trailing NUL */ );
3247 /* Now, we can expand out the range. */
3249 if (convert_unicode) {
3252 /* Recall that the min and max are now in Unicode terms, so
3253 * we have to convert each character to its native
3256 for (i = range_min; i <= range_max; i++) {
3257 append_utf8_from_native_byte(
3258 LATIN1_TO_NATIVE((U8) i),
3263 for (i = range_min; i <= range_max; i++) {
3264 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3270 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3272 /* Here, no conversions are necessary, which means that the
3273 * first character in the range is already in 'd' and
3274 * valid, so we can skip overwriting it */
3278 for (i = range_min + 1; i <= range_max; i++) {
3279 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3285 assert(range_min + 1 <= range_max);
3286 for (i = range_min + 1; i < range_max; i++) {
3288 /* In this case on EBCDIC, we haven't calculated
3289 * the variants. Do it here, as we go along */
3290 if (! UVCHR_IS_INVARIANT(i)) {
3291 utf8_variant_count++;
3297 /* The range_max is done outside the loop so as to
3298 * avoid having to special case not incrementing
3299 * 'utf8_variant_count' on EBCDIC (it's already been
3300 * counted when originally parsed) */
3301 *d++ = (char) range_max;
3306 /* If the original range extended above 255, add in that
3308 if (real_range_max) {
3309 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3310 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3311 if (real_range_max > 0x100) {
3312 if (real_range_max > 0x101) {
3313 *d++ = (char) ILLEGAL_UTF8_BYTE;
3315 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3321 /* mark the range as done, and continue */
3325 non_portable_endpoint = 0;
3329 } /* End of is a range */
3330 } /* End of transliteration. Joins main code after these else's */
3331 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3334 while (s1 >= start && *s1-- == '\\')
3337 in_charclass = TRUE;
3339 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3342 while (s1 >= start && *s1-- == '\\')
3345 in_charclass = FALSE;
3347 /* skip for regexp comments /(?#comment)/, except for the last
3348 * char, which will be done separately. Stop on (?{..}) and
3350 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3352 while (s+1 < send && *s != ')')
3355 else if (!PL_lex_casemods
3356 && ( s[2] == '{' /* This should match regcomp.c */
3357 || (s[2] == '?' && s[3] == '{')))
3362 /* likewise skip #-initiated comments in //x patterns */
3366 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3368 while (s < send && *s != '\n')
3371 /* no further processing of single-quoted regex */
3372 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3373 goto default_action;
3375 /* check for embedded arrays
3376 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3378 else if (*s == '@' && s[1]) {
3380 ? isIDFIRST_utf8_safe(s+1, send)
3381 : isWORDCHAR_A(s[1]))
3385 if (strchr(":'{$", s[1]))
3387 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3388 break; /* in regexp, neither @+ nor @- are interpolated */
3390 /* check for embedded scalars. only stop if we're sure it's a
3392 else if (*s == '$') {
3393 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3395 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
3397 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3398 "Possible unintended interpolation of $\\ in regex");
3400 break; /* in regexp, $ might be tail anchor */
3404 /* End of else if chain - OP_TRANS rejoin rest */
3406 if (UNLIKELY(s >= send)) {
3412 if (*s == '\\' && s+1 < send) {
3413 char* e; /* Can be used for ending '}', etc. */
3417 /* warn on \1 - \9 in substitution replacements, but note that \11
3418 * is an octal; and \19 is \1 followed by '9' */
3419 if (PL_lex_inwhat == OP_SUBST
3425 /* diag_listed_as: \%d better written as $%d */
3426 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3431 /* string-change backslash escapes */
3432 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
3436 /* In a pattern, process \N, but skip any other backslash escapes.
3437 * This is because we don't want to translate an escape sequence
3438 * into a meta symbol and have the regex compiler use the meta
3439 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3440 * in spite of this, we do have to process \N here while the proper
3441 * charnames handler is in scope. See bugs #56444 and #62056.
3443 * There is a complication because \N in a pattern may also stand
3444 * for 'match a non-nl', and not mean a charname, in which case its
3445 * processing should be deferred to the regex compiler. To be a
3446 * charname it must be followed immediately by a '{', and not look
3447 * like \N followed by a curly quantifier, i.e., not something like
3448 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3450 else if (PL_lex_inpat
3453 || regcurly(s + 1)))
3456 goto default_action;
3462 if ((isALPHANUMERIC(*s)))
3463 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3464 "Unrecognized escape \\%c passed through",
3466 /* default action is to copy the quoted character */
3467 goto default_action;
3470 /* eg. \132 indicates the octal constant 0132 */
3471 case '0': case '1': case '2': case '3':
3472 case '4': case '5': case '6': case '7':
3474 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
3476 uv = grok_oct(s, &len, &flags, NULL);
3478 if (len < 3 && s < send && isDIGIT(*s)
3479 && ckWARN(WARN_MISC))
3481 Perl_warner(aTHX_ packWARN(WARN_MISC),
3482 "%s", form_short_octal_warning(s, len));
3485 goto NUM_ESCAPE_INSERT;
3487 /* eg. \o{24} indicates the octal constant \024 */
3492 bool valid = grok_bslash_o(&s, &uv, &error,
3493 TRUE, /* Output warning */
3494 FALSE, /* Not strict */
3495 TRUE, /* Output warnings for
3500 uv = 0; /* drop through to ensure range ends are set */
3502 goto NUM_ESCAPE_INSERT;
3505 /* eg. \x24 indicates the hex constant 0x24 */
3510 bool valid = grok_bslash_x(&s, &uv, &error,
3511 TRUE, /* Output warning */
3512 FALSE, /* Not strict */
3513 TRUE, /* Output warnings for
3518 uv = 0; /* drop through to ensure range ends are set */
3523 /* Insert oct or hex escaped character. */
3525 /* Here uv is the ordinal of the next character being added */
3526 if (UVCHR_IS_INVARIANT(uv)) {
3530 if (!has_utf8 && uv > 255) {
3532 /* Here, 'uv' won't fit unless we convert to UTF-8.
3533 * If we've only seen invariants so far, all we have to
3534 * do is turn on the flag */
3535 if (utf8_variant_count == 0) {
3539 SvCUR_set(sv, d - SvPVX_const(sv));
3543 sv_utf8_upgrade_flags_grow(
3545 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3547 /* Since we're having to grow here,
3548 * make sure we have enough room for
3549 * this escape and a NUL, so the
3550 * code immediately below won't have
3551 * to actually grow again */
3553 + (STRLEN)(send - s) + 1);
3554 d = SvPVX(sv) + SvCUR(sv);
3557 has_above_latin1 = TRUE;
3563 utf8_variant_count++;
3566 /* Usually, there will already be enough room in 'sv'
3567 * since such escapes are likely longer than any UTF-8
3568 * sequence they can end up as. This isn't the case on
3569 * EBCDIC where \x{40000000} contains 12 bytes, and the
3570 * UTF-8 for it contains 14. And, we have to allow for
3571 * a trailing NUL. It probably can't happen on ASCII
3572 * platforms, but be safe. See Note on sizing above. */
3573 const STRLEN needed = d - SvPVX(sv)
3577 if (UNLIKELY(needed > SvLEN(sv))) {
3578 SvCUR_set(sv, d - SvPVX_const(sv));
3579 d = SvCUR(sv) + SvGROW(sv, needed);
3582 d = (char*)uvchr_to_utf8((U8*)d, uv);
3583 if (PL_lex_inwhat == OP_TRANS
3584 && PL_parser->lex_sub_op)
3586 PL_parser->lex_sub_op->op_private |=
3587 (PL_lex_repl ? OPpTRANS_FROM_UTF
3593 non_portable_endpoint++;
3598 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3599 * named character, like \N{LATIN SMALL LETTER A}, or a named
3600 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3601 * GRAVE} (except y/// can't handle the latter, croaking). For
3602 * convenience all three forms are referred to as "named
3603 * characters" below.
3605 * For patterns, \N also can mean to match a non-newline. Code
3606 * before this 'switch' statement should already have handled
3607 * this situation, and hence this code only has to deal with
3608 * the named character cases.
3610 * For non-patterns, the named characters are converted to
3611 * their string equivalents. In patterns, named characters are
3612 * not converted to their ultimate forms for the same reasons
3613 * that other escapes aren't. Instead, they are converted to
3614 * the \N{U+...} form to get the value from the charnames that
3615 * is in effect right now, while preserving the fact that it
3616 * was a named character, so that the regex compiler knows
3619 * The structure of this section of code (besides checking for
3620 * errors and upgrading to utf8) is:
3621 * If the named character is of the form \N{U+...}, pass it
3622 * through if a pattern; otherwise convert the code point
3624 * Otherwise must be some \N{NAME}: convert to
3625 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3627 * Transliteration is an exception. The conversion to utf8 is
3628 * only done if the code point requires it to be representable.
3630 * Here, 's' points to the 'N'; the test below is guaranteed to
3631 * succeed if we are being called on a pattern, as we already
3632 * know from a test above that the next character is a '{'. A
3633 * non-pattern \N must mean 'named character', which requires
3637 yyerror("Missing braces on \\N{}");
3643 /* If there is no matching '}', it is an error. */
3644 if (! (e = strchr(s, '}'))) {
3645 if (! PL_lex_inpat) {
3646 yyerror("Missing right brace on \\N{}");
3648 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3650 yyquit(); /* Have exhausted the input. */
3653 /* Here it looks like a named character */
3655 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3656 s += 2; /* Skip to next char after the 'U+' */
3659 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3660 /* Check the syntax. */
3663 if (!isXDIGIT(*s)) {
3666 "Invalid hexadecimal number in \\N{U+...}"
3675 else if ((*s == '.' || *s == '_')
3681 /* Pass everything through unchanged.
3682 * +1 is for the '}' */
3683 Copy(orig_s, d, e - orig_s + 1, char);
3684 d += e - orig_s + 1;
3686 else { /* Not a pattern: convert the hex to string */
3687 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3688 | PERL_SCAN_SILENT_ILLDIGIT
3689 | PERL_SCAN_DISALLOW_PREFIX;
3691 uv = grok_hex(s, &len, &flags, NULL);
3692 if (len == 0 || (len != (STRLEN)(e - s)))
3695 /* For non-tr///, if the destination is not in utf8,
3696 * unconditionally recode it to be so. This is
3697 * because \N{} implies Unicode semantics, and scalars
3698 * have to be in utf8 to guarantee those semantics.
3699 * tr/// doesn't care about Unicode rules, so no need
3700 * there to upgrade to UTF-8 for small enough code
3702 if (! has_utf8 && ( uv > 0xFF
3703 || PL_lex_inwhat != OP_TRANS))
3705 /* See Note on sizing above. */
3706 const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
3708 SvCUR_set(sv, d - SvPVX_const(sv));
3712 if (utf8_variant_count == 0) {
3714 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3717 sv_utf8_upgrade_flags_grow(
3719 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3721 d = SvPVX(sv) + SvCUR(sv);
3725 has_above_latin1 = TRUE;
3728 /* Add the (Unicode) code point to the output. */
3729 if (! has_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3730 *d++ = (char) LATIN1_TO_NATIVE(uv);
3733 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3737 else /* Here is \N{NAME} but not \N{U+...}. */
3738 if ((res = get_and_check_backslash_N_name(s, e)))
3741 const char *str = SvPV_const(res, len);
3744 if (! len) { /* The name resolved to an empty string */
3745 Copy("\\N{}", d, 4, char);
3749 /* In order to not lose information for the regex
3750 * compiler, pass the result in the specially made
3751 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3752 * the code points in hex of each character
3753 * returned by charnames */
3755 const char *str_end = str + len;
3756 const STRLEN off = d - SvPVX_const(sv);
3758 if (! SvUTF8(res)) {
3759 /* For the non-UTF-8 case, we can determine the
3760 * exact length needed without having to parse
3761 * through the string. Each character takes up
3762 * 2 hex digits plus either a trailing dot or
3764 const char initial_text[] = "\\N{U+";
3765 const STRLEN initial_len = sizeof(initial_text)
3767 d = off + SvGROW(sv, off
3770 /* +1 for trailing NUL */
3773 + (STRLEN)(send - e));
3774 Copy(initial_text, d, initial_len, char);
3776 while (str < str_end) {
3779 my_snprintf(hex_string,
3783 /* The regex compiler is
3784 * expecting Unicode, not
3786 NATIVE_TO_LATIN1(*str));
3787 PERL_MY_SNPRINTF_POST_GUARD(len,
3788 sizeof(hex_string));
3789 Copy(hex_string, d, 3, char);
3793 d--; /* Below, we will overwrite the final
3794 dot with a right brace */
3797 STRLEN char_length; /* cur char's byte length */
3799 /* and the number of bytes after this is
3800 * translated into hex digits */
3801 STRLEN output_length;
3803 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3804 * for max('U+', '.'); and 1 for NUL */
3805 char hex_string[2 * UTF8_MAXBYTES + 5];
3807 /* Get the first character of the result. */
3808 U32 uv = utf8n_to_uvchr((U8 *) str,
3812 /* Convert first code point to Unicode hex,
3813 * including the boiler plate before it. */
3815 my_snprintf(hex_string, sizeof(hex_string),
3817 (unsigned int) NATIVE_TO_UNI(uv));
3819 /* Make sure there is enough space to hold it */
3820 d = off + SvGROW(sv, off
3822 + (STRLEN)(send - e)
3823 + 2); /* '}' + NUL */
3825 Copy(hex_string, d, output_length, char);
3828 /* For each subsequent character, append dot and
3829 * its Unicode code point in hex */
3830 while ((str += char_length) < str_end) {
3831 const STRLEN off = d - SvPVX_const(sv);
3832 U32 uv = utf8n_to_uvchr((U8 *) str,
3837 my_snprintf(hex_string,
3840 (unsigned int) NATIVE_TO_UNI(uv));
3842 d = off + SvGROW(sv, off
3844 + (STRLEN)(send - e)
3845 + 2); /* '}' + NUL */
3846 Copy(hex_string, d, output_length, char);
3851 *d++ = '}'; /* Done. Add the trailing brace */
3854 else { /* Here, not in a pattern. Convert the name to a
3857 if (PL_lex_inwhat == OP_TRANS) {
3858 str = SvPV_const(res, len);
3859 if (len > ((SvUTF8(res))
3863 yyerror(Perl_form(aTHX_
3864 "%.*s must not be a named sequence"
3865 " in transliteration operator",
3866 /* +1 to include the "}" */
3867 (int) (e + 1 - start), start));
3869 goto end_backslash_N;
3872 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
3873 has_above_latin1 = TRUE;
3877 else if (! SvUTF8(res)) {
3878 /* Make sure \N{} return is UTF-8. This is because
3879 * \N{} implies Unicode semantics, and scalars have
3880 * to be in utf8 to guarantee those semantics; but
3881 * not needed in tr/// */
3882 sv_utf8_upgrade_flags(res, 0);
3883 str = SvPV_const(res, len);
3886 /* Upgrade destination to be utf8 if this new
3888 if (! has_utf8 && SvUTF8(res)) {
3889 /* See Note on sizing above. */
3890 const STRLEN extra = len + (send - s) + 1;
3892 SvCUR_set(sv, d - SvPVX_const(sv));
3896 if (utf8_variant_count == 0) {
3898 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3901 sv_utf8_upgrade_flags_grow(sv,
3902 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3904 d = SvPVX(sv) + SvCUR(sv);
3907 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3909 /* See Note on sizing above. (NOTE: SvCUR() is not
3910 * set correctly here). */
3911 const STRLEN extra = len + (send - e) + 1;
3912 const STRLEN off = d - SvPVX_const(sv);
3913 d = off + SvGROW(sv, off + extra);
3915 Copy(str, d, len, char);
3921 } /* End \N{NAME} */
3925 backslash_N++; /* \N{} is defined to be Unicode */
3927 s = e + 1; /* Point to just after the '}' */
3930 /* \c is a control character */
3934 *d++ = grok_bslash_c(*s, 1);
3937 yyerror("Missing control char name in \\c");
3938 yyquit(); /* Are at end of input, no sense continuing */
3941 non_portable_endpoint++;
3945 /* printf-style backslashes, formfeeds, newlines, etc */
3971 } /* end if (backslash) */
3974 /* Just copy the input to the output, though we may have to convert
3977 * If the input has the same representation in UTF-8 as not, it will be
3978 * a single byte, and we don't care about UTF8ness; just copy the byte */
3979 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
3982 else if (! this_utf8 && ! has_utf8) {
3983 /* If neither source nor output is UTF-8, is also a single byte,
3984 * just copy it; but this byte counts should we later have to
3985 * convert to UTF-8 */
3987 utf8_variant_count++;
3989 else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */
3990 const STRLEN len = UTF8SKIP(s);
3992 /* We expect the source to have already been checked for
3994 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
3996 Copy(s, d, len, U8);
4000 else { /* UTF8ness matters and doesn't match, need to convert */
4002 const UV nextuv = (this_utf8)
4003 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
4005 STRLEN need = UVCHR_SKIP(nextuv);
4008 SvCUR_set(sv, d - SvPVX_const(sv));
4012 /* See Note on sizing above. */
4013 need += (STRLEN)(send - s) + 1;
4015 if (utf8_variant_count == 0) {
4017 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
4020 sv_utf8_upgrade_flags_grow(sv,
4021 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4023 d = SvPVX(sv) + SvCUR(sv);
4026 } else if (need > len) {
4027 /* encoded value larger than old, may need extra space (NOTE:
4028 * SvCUR() is not set correctly here). See Note on sizing
4030 const STRLEN extra = need + (send - s) + 1;
4031 const STRLEN off = d - SvPVX_const(sv);
4032 d = off + SvGROW(sv, off + extra);
4036 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
4038 } /* while loop to process each character */
4040 /* terminate the string and set up the sv */
4042 SvCUR_set(sv, d - SvPVX_const(sv));
4043 if (SvCUR(sv) >= SvLEN(sv))
4044 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %" UVuf
4045 " >= %" UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
4050 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
4051 PL_parser->lex_sub_op->op_private |=
4052 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
4056 /* shrink the sv if we allocated more than we used */
4057 if (SvCUR(sv) + 5 < SvLEN(sv)) {
4058 SvPV_shrink_to_cur(sv);
4061 /* return the substring (via pl_yylval) only if we parsed anything */
4064 for (; s2 < s; s2++) {
4066 COPLINE_INC_WITH_HERELINES;
4068 SvREFCNT_inc_simple_void_NN(sv);
4069 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
4070 && ! PL_parser->lex_re_reparsing)
4072 const char *const key = PL_lex_inpat ? "qr" : "q";
4073 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
4077 if (PL_lex_inwhat == OP_TRANS) {
4080 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
4083 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
4091 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
4094 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
4096 LEAVE_with_name("scan_const");
4101 * Returns TRUE if there's more to the expression (e.g., a subscript),
4104 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
4106 * ->[ and ->{ return TRUE
4107 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
4108 * { and [ outside a pattern are always subscripts, so return TRUE
4109 * if we're outside a pattern and it's not { or [, then return FALSE
4110 * if we're in a pattern and the first char is a {
4111 * {4,5} (any digits around the comma) returns FALSE
4112 * if we're in a pattern and the first char is a [
4114 * [SOMETHING] has a funky algorithm to decide whether it's a
4115 * character class or not. It has to deal with things like
4116 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4117 * anything else returns TRUE
4120 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4123 S_intuit_more(pTHX_ char *s)
4125 PERL_ARGS_ASSERT_INTUIT_MORE;
4127 if (PL_lex_brackets)
4129 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4131 if (*s == '-' && s[1] == '>'
4132 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4133 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4134 ||(s[2] == '@' && strchr("*[{",s[3])) ))
4136 if (*s != '{' && *s != '[')
4141 /* In a pattern, so maybe we have {n,m}. */
4149 /* On the other hand, maybe we have a character class */
4152 if (*s == ']' || *s == '^')
4155 /* this is terrifying, and it works */
4158 const char * const send = strchr(s,']');
4159 unsigned char un_char, last_un_char;
4160 char tmpbuf[sizeof PL_tokenbuf * 4];
4162 if (!send) /* has to be an expression */
4164 weight = 2; /* let's weigh the evidence */
4168 else if (isDIGIT(*s)) {
4170 if (isDIGIT(s[1]) && s[2] == ']')
4176 Zero(seen,256,char);
4178 for (; s < send; s++) {
4179 last_un_char = un_char;
4180 un_char = (unsigned char)*s;
4185 weight -= seen[un_char] * 10;
4186 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4188 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4189 len = (int)strlen(tmpbuf);
4190 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4191 UTF ? SVf_UTF8 : 0, SVt_PV))
4198 && strchr("[#!%*<>()-=",s[1]))
4200 if (/*{*/ strchr("])} =",s[2]))
4209 if (strchr("wds]",s[1]))
4211 else if (seen[(U8)'\''] || seen[(U8)'"'])
4213 else if (strchr("rnftbxcav",s[1]))
4215 else if (isDIGIT(s[1])) {
4217 while (s[1] && isDIGIT(s[1]))
4227 if (strchr("aA01! ",last_un_char))
4229 if (strchr("zZ79~",s[1]))
4231 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4232 weight -= 5; /* cope with negative subscript */
4235 if (!isWORDCHAR(last_un_char)
4236 && !(last_un_char == '$' || last_un_char == '@'
4237 || last_un_char == '&')
4238 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4242 if (keyword(d, s - d, 0))
4245 if (un_char == last_un_char + 1)
4247 weight -= seen[un_char];
4252 if (weight >= 0) /* probably a character class */
4262 * Does all the checking to disambiguate
4264 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4265 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
4267 * First argument is the stuff after the first token, e.g. "bar".
4269 * Not a method if foo is a filehandle.
4270 * Not a method if foo is a subroutine prototyped to take a filehandle.
4271 * Not a method if it's really "Foo $bar"
4272 * Method if it's "foo $bar"
4273 * Not a method if it's really "print foo $bar"
4274 * Method if it's really "foo package::" (interpreted as package->foo)
4275 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4276 * Not a method if bar is a filehandle or package, but is quoted with
4281 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4283 char *s = start + (*start == '$');
4284 char tmpbuf[sizeof PL_tokenbuf];
4287 /* Mustn't actually add anything to a symbol table.
4288 But also don't want to "initialise" any placeholder
4289 constants that might already be there into full
4290 blown PVGVs with attached PVCV. */
4292 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4294 PERL_ARGS_ASSERT_INTUIT_METHOD;
4296 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4298 if (cv && SvPOK(cv)) {
4299 const char *proto = CvPROTO(cv);
4301 while (*proto && (isSPACE(*proto) || *proto == ';'))
4308 if (*start == '$') {
4309 SSize_t start_off = start - SvPVX(PL_linestr);
4310 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4311 || isUPPER(*PL_tokenbuf))
4313 /* this could be $# */
4316 PL_bufptr = SvPVX(PL_linestr) + start_off;
4318 return *s == '(' ? FUNCMETH : METHOD;
4321 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
4322 /* start is the beginning of the possible filehandle/object,
4323 * and s is the end of it
4324 * tmpbuf is a copy of it (but with single quotes as double colons)
4327 if (!keyword(tmpbuf, len, 0)) {
4328 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4333 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4334 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4336 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4337 && (!isGV(indirgv) || GvCVu(indirgv)))
4339 /* filehandle or package name makes it a method */
4340 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4342 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4343 return 0; /* no assumptions -- "=>" quotes bareword */
4345 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4346 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4347 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4349 force_next(BAREWORD);
4351 return *s == '(' ? FUNCMETH : METHOD;
4357 /* Encoded script support. filter_add() effectively inserts a
4358 * 'pre-processing' function into the current source input stream.
4359 * Note that the filter function only applies to the current source file
4360 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4362 * The datasv parameter (which may be NULL) can be used to pass
4363 * private data to this instance of the filter. The filter function
4364 * can recover the SV using the FILTER_DATA macro and use it to
4365 * store private buffers and state information.
4367 * The supplied datasv parameter is upgraded to a PVIO type
4368 * and the IoDIRP/IoANY field is used to store the function pointer,
4369 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4370 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4371 * private use must be set using malloc'd pointers.
4375 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4383 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4384 Perl_croak(aTHX_ "Source filters apply only to byte streams");
4386 if (!PL_rsfp_filters)
4387 PL_rsfp_filters = newAV();
4390 SvUPGRADE(datasv, SVt_PVIO);
4391 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
4392 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
4393 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
4394 FPTR2DPTR(void *, IoANY(datasv)),
4395 SvPV_nolen(datasv)));
4396 av_unshift(PL_rsfp_filters, 1);
4397 av_store(PL_rsfp_filters, 0, datasv) ;
4399 !PL_parser->filtered
4400 && PL_parser->lex_flags & LEX_EVALBYTES
4401 && PL_bufptr < PL_bufend
4403 const char *s = PL_bufptr;
4404 while (s < PL_bufend) {
4406 SV *linestr = PL_parser->linestr;
4407 char *buf = SvPVX(linestr);
4408 STRLEN const bufptr_pos = PL_parser->bufptr - buf;
4409 STRLEN const oldbufptr_pos = PL_parser->oldbufptr - buf;
4410 STRLEN const oldoldbufptr_pos=PL_parser->oldoldbufptr-buf;
4411 STRLEN const linestart_pos = PL_parser->linestart - buf;
4412 STRLEN const last_uni_pos =
4413 PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
4414 STRLEN const last_lop_pos =
4415 PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
4416 av_push(PL_rsfp_filters, linestr);
4417 PL_parser->linestr =
4418 newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
4419 buf = SvPVX(PL_parser->linestr);
4420 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
4421 PL_parser->bufptr = buf + bufptr_pos;
4422 PL_parser->oldbufptr = buf + oldbufptr_pos;
4423 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
4424 PL_parser->linestart = buf + linestart_pos;
4425 if (PL_parser->last_uni)
4426 PL_parser->last_uni = buf + last_uni_pos;
4427 if (PL_parser->last_lop)
4428 PL_parser->last_lop = buf + last_lop_pos;
4429 SvLEN(linestr) = SvCUR(linestr);
4430 SvCUR(linestr) = s-SvPVX(linestr);
4431 PL_parser->filtered = 1;
4441 /* Delete most recently added instance of this filter function. */
4443 Perl_filter_del(pTHX_ filter_t funcp)
4447 PERL_ARGS_ASSERT_FILTER_DEL;
4450 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
4451 FPTR2DPTR(void*, funcp)));
4453 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
4455 /* if filter is on top of stack (usual case) just pop it off */
4456 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
4457 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
4458 sv_free(av_pop(PL_rsfp_filters));
4462 /* we need to search for the correct entry and clear it */
4463 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
4467 /* Invoke the idxth filter function for the current rsfp. */
4468 /* maxlen 0 = read one text line */
4470 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
4474 /* This API is bad. It should have been using unsigned int for maxlen.
4475 Not sure if we want to change the API, but if not we should sanity
4476 check the value here. */
4477 unsigned int correct_length = maxlen < 0 ? PERL_INT_MAX : maxlen;
4479 PERL_ARGS_ASSERT_FILTER_READ;
4481 if (!PL_parser || !PL_rsfp_filters)
4483 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
4484 /* Provide a default input filter to make life easy. */
4485 /* Note that we append to the line. This is handy. */
4486 DEBUG_P(PerlIO_printf(Perl_debug_log,
4487 "filter_read %d: from rsfp\n", idx));
4488 if (correct_length) {
4491 const int old_len = SvCUR(buf_sv);
4493 /* ensure buf_sv is large enough */
4494 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
4495 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
4496 correct_length)) <= 0) {
4497 if (PerlIO_error(PL_rsfp))
4498 return -1; /* error */
4500 return 0 ; /* end of file */
4502 SvCUR_set(buf_sv, old_len + len) ;
4503 SvPVX(buf_sv)[old_len + len] = '\0';
4506 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
4507 if (PerlIO_error(PL_rsfp))
4508 return -1; /* error */
4510 return 0 ; /* end of file */
4513 return SvCUR(buf_sv);
4515 /* Skip this filter slot if filter has been deleted */
4516 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
4517 DEBUG_P(PerlIO_printf(Perl_debug_log,
4518 "filter_read %d: skipped (filter deleted)\n",
4520 return FILTER_READ(idx+1, buf_sv, correct_length); /* recurse */
4522 if (SvTYPE(datasv) != SVt_PVIO) {
4523 if (correct_length) {
4525 const STRLEN remainder = SvLEN(datasv) - SvCUR(datasv);
4526 if (!remainder) return 0; /* eof */
4527 if (correct_length > remainder) correct_length = remainder;
4528 sv_catpvn(buf_sv, SvEND(datasv), correct_length);
4529 SvCUR_set(datasv, SvCUR(datasv) + correct_length);
4532 const char *s = SvEND(datasv);
4533 const char *send = SvPVX(datasv) + SvLEN(datasv);
4541 if (s == send) return 0; /* eof */
4542 sv_catpvn(buf_sv, SvEND(datasv), s-SvEND(datasv));
4543 SvCUR_set(datasv, s-SvPVX(datasv));
4545 return SvCUR(buf_sv);
4547 /* Get function pointer hidden within datasv */
4548 funcp = DPTR2FPTR(filter_t, IoANY(datasv));
4549 DEBUG_P(PerlIO_printf(Perl_debug_log,
4550 "filter_read %d: via function %p (%s)\n",
4551 idx, (void*)datasv, SvPV_nolen_const(datasv)));
4552 /* Call function. The function is expected to */
4553 /* call "FILTER_READ(idx+1, buf_sv)" first. */
4554 /* Return: <0:error, =0:eof, >0:not eof */
4555 return (*funcp)(aTHX_ idx, buf_sv, correct_length);
4559 S_filter_gets(pTHX_ SV *sv, STRLEN append)
4561 PERL_ARGS_ASSERT_FILTER_GETS;
4563 #ifdef PERL_CR_FILTER
4564 if (!PL_rsfp_filters) {
4565 filter_add(S_cr_textfilter,NULL);
4568 if (PL_rsfp_filters) {
4570 SvCUR_set(sv, 0); /* start with empty line */
4571 if (FILTER_READ(0, sv, 0) > 0)
4572 return ( SvPVX(sv) ) ;
4577 return (sv_gets(sv, PL_rsfp, append));
4581 S_find_in_my_stash(pTHX_ const char *pkgname, STRLEN len)
4585 PERL_ARGS_ASSERT_FIND_IN_MY_STASH;
4587 if (len == 11 && *pkgname == '_' && strEQ(pkgname, "__PACKAGE__"))
4591 && (pkgname[len - 2] == ':' && pkgname[len - 1] == ':')
4592 && (gv = gv_fetchpvn_flags(pkgname,
4594 ( UTF ? SVf_UTF8 : 0 ), SVt_PVHV)))
4596 return GvHV(gv); /* Foo:: */
4599 /* use constant CLASS => 'MyClass' */
4600 gv = gv_fetchpvn_flags(pkgname, len, UTF ? SVf_UTF8 : 0, SVt_PVCV);
4601 if (gv && GvCV(gv)) {
4602 SV * const sv = cv_const_sv(GvCV(gv));
4604 return gv_stashsv(sv, 0);
4607 return gv_stashpvn(pkgname, len, UTF ? SVf_UTF8 : 0);
4612 S_tokenize_use(pTHX_ int is_use, char *s) {
4613 PERL_ARGS_ASSERT_TOKENIZE_USE;
4615 if (PL_expect != XSTATE)
4616 yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression",
4617 is_use ? "use" : "no"));
4620 if (isDIGIT(*s) || (*s == 'v' && isDIGIT(s[1]))) {
4621 s = force_version(s, TRUE);
4622 if (*s == ';' || *s == '}'
4623 || (s = skipspace(s), (*s == ';' || *s == '}'))) {
4624 NEXTVAL_NEXTTOKE.opval = NULL;
4625 force_next(BAREWORD);
4627 else if (*s == 'v') {
4628 s = force_word(s,BAREWORD,FALSE,TRUE);
4629 s = force_version(s, FALSE);
4633 s = force_word(s,BAREWORD,FALSE,TRUE);
4634 s = force_version(s, FALSE);
4636 pl_yylval.ival = is_use;
4640 static const char* const exp_name[] =
4641 { "OPERATOR", "TERM", "REF", "STATE", "BLOCK", "ATTRBLOCK",
4642 "ATTRTERM", "TERMBLOCK", "XBLOCKTERM", "POSTDEREF",
4643 "SIGVAR", "TERMORDORDOR"
4647 #define word_takes_any_delimiter(p,l) S_word_takes_any_delimiter(p,l)
4649 S_word_takes_any_delimiter(char *p, STRLEN len)
4651 return (len == 1 && strchr("msyq", p[0]))
4653 && ((p[0] == 't' && p[1] == 'r')
4654 || (p[0] == 'q' && strchr("qwxr", p[1]))));
4658 S_check_scalar_slice(pTHX_ char *s)
4661 while (SPACE_OR_TAB(*s)) s++;
4662 if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2,
4668 while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
4669 || (*s && strchr(" \t$#+-'\"", *s)))
4671 s += UTF ? UTF8SKIP(s) : 1;
4673 if (*s == '}' || *s == ']')
4674 pl_yylval.ival = OPpSLICEWARNING;
4677 #define lex_token_boundary() S_lex_token_boundary(aTHX)
4679 S_lex_token_boundary(pTHX)
4681 PL_oldoldbufptr = PL_oldbufptr;
4682 PL_oldbufptr = PL_bufptr;
4685 #define vcs_conflict_marker(s) S_vcs_conflict_marker(aTHX_ s)
4687 S_vcs_conflict_marker(pTHX_ char *s)
4689 lex_token_boundary();
4691 yyerror("Version control conflict marker");
4692 while (s < PL_bufend && *s != '\n')
4700 Works out what to call the token just pulled out of the input
4701 stream. The yacc parser takes care of taking the ops we return and
4702 stitching them into a tree.
4705 The type of the next token
4708 Check if we have already built the token; if so, use it.
4709 Switch based on the current state:
4710 - if we have a case modifier in a string, deal with that
4711 - handle other cases of interpolation inside a string
4712 - scan the next line if we are inside a format
4713 In the normal state, switch on the next character:
4715 if alphabetic, go to key lookup
4716 unrecognized character - croak
4717 - 0/4/26: handle end-of-line or EOF
4718 - cases for whitespace
4719 - \n and #: handle comments and line numbers
4720 - various operators, brackets and sigils
4723 - 'v': vstrings (or go to key lookup)
4724 - 'x' repetition operator (or go to key lookup)
4725 - other ASCII alphanumerics (key lookup begins here):
4728 scan built-in keyword (but do nothing with it yet)
4729 check for statement label
4730 check for lexical subs
4731 goto just_a_word if there is one
4732 see whether built-in keyword is overridden
4733 switch on keyword number:
4734 - default: just_a_word:
4735 not a built-in keyword; handle bareword lookup
4736 disambiguate between method and sub call
4737 fall back to bareword
4738 - cases for built-in keywords
4746 char *s = PL_bufptr;
4750 const bool saw_infix_sigil = cBOOL(PL_parser->saw_infix_sigil);
4754 /* orig_keyword, gvp, and gv are initialized here because
4755 * jump to the label just_a_word_zero can bypass their
4756 * initialization later. */
4757 I32 orig_keyword = 0;
4761 if (UNLIKELY(PL_parser->recheck_utf8_validity)) {
4762 const U8* first_bad_char_loc;
4763 if (UTF && UNLIKELY(! is_utf8_string_loc((U8 *) PL_bufptr,
4764 PL_bufend - PL_bufptr,
4765 &first_bad_char_loc)))
4767 _force_out_malformed_utf8_message(first_bad_char_loc,
4770 1 /* 1 means die */ );
4771 NOT_REACHED; /* NOTREACHED */
4773 PL_parser->recheck_utf8_validity = FALSE;
4776 SV* tmp = newSVpvs("");
4777 PerlIO_printf(Perl_debug_log, "### %" IVdf ":LEX_%s/X%s %s\n",
4778 (IV)CopLINE(PL_curcop),
4779 lex_state_names[PL_lex_state],
4780 exp_name[PL_expect],
4781 pv_display(tmp, s, strlen(s), 0, 60));
4785 /* when we've already built the next token, just pull it out of the queue */
4788 pl_yylval = PL_nextval[PL_nexttoke];
4791 next_type = PL_nexttype[PL_nexttoke];
4792 if (next_type & (7<<24)) {
4793 if (next_type & (1<<24)) {
4794 if (PL_lex_brackets > 100)
4795 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
4796 PL_lex_brackstack[PL_lex_brackets++] =
4797 (char) ((next_type >> 16) & 0xff);
4799 if (next_type & (2<<24))
4800 PL_lex_allbrackets++;
4801 if (next_type & (4<<24))
4802 PL_lex_allbrackets--;
4803 next_type &= 0xffff;
4805 return REPORT(next_type == 'p' ? pending_ident() : next_type);
4809 switch (PL_lex_state) {
4811 case LEX_INTERPNORMAL:
4814 /* interpolated case modifiers like \L \U, including \Q and \E.
4815 when we get here, PL_bufptr is at the \
4817 case LEX_INTERPCASEMOD:
4819 if (PL_bufptr != PL_bufend && *PL_bufptr != '\\')
4821 "panic: INTERPCASEMOD bufptr=%p, bufend=%p, *bufptr=%u",
4822 PL_bufptr, PL_bufend, *PL_bufptr);
4824 /* handle \E or end of string */
4825 if (PL_bufptr == PL_bufend || PL_bufptr[1] == 'E') {
4827 if (PL_lex_casemods) {
4828 const char oldmod = PL_lex_casestack[--PL_lex_casemods];
4829 PL_lex_casestack[PL_lex_casemods] = '\0';
4831 if (PL_bufptr != PL_bufend
4832 && (oldmod == 'L' || oldmod == 'U' || oldmod == 'Q'
4833 || oldmod == 'F')) {
4835 PL_lex_state = LEX_INTERPCONCAT;
4837 PL_lex_allbrackets--;
4840 else if ( PL_bufptr != PL_bufend && PL_bufptr[1] == 'E' ) {
4841 /* Got an unpaired \E */
4842 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
4843 "Useless use of \\E");
4845 if (PL_bufptr != PL_bufend)
4847 PL_lex_state = LEX_INTERPCONCAT;
4851 DEBUG_T({ PerlIO_printf(Perl_debug_log,
4852 "### Saw case modifier\n"); });
4854 if (s[1] == '\\' && s[2] == 'E') {
4856 PL_lex_state = LEX_INTERPCONCAT;
4861 if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3))
4862 tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */
4863 if ((*s == 'L' || *s == 'U' || *s == 'F')
4864 && (strpbrk(PL_lex_casestack, "LUF")))
4866 PL_lex_casestack[--PL_lex_casemods] = '\0';
4867 PL_lex_allbrackets--;
4870 if (PL_lex_casemods > 10)
4871 Renew(PL_lex_casestack, PL_lex_casemods + 2, char);
4872 PL_lex_casestack[PL_lex_casemods++] = *s;
4873 PL_lex_casestack[PL_lex_casemods] = '\0';
4874 PL_lex_state = LEX_INTERPCONCAT;
4875 NEXTVAL_NEXTTOKE.ival = 0;
4876 force_next((2<<24)|'(');
4878 NEXTVAL_NEXTTOKE.ival = OP_LCFIRST;
4880 NEXTVAL_NEXTTOKE.ival = OP_UCFIRST;
4882 NEXTVAL_NEXTTOKE.ival = OP_LC;
4884 NEXTVAL_NEXTTOKE.ival = OP_UC;
4886 NEXTVAL_NEXTTOKE.ival = OP_QUOTEMETA;
4888 NEXTVAL_NEXTTOKE.ival = OP_FC;
4890 Perl_croak(aTHX_ "panic: yylex, *s=%u", *s);
4894 if (PL_lex_starts) {
4897 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
4898 if (PL_lex_casemods == 1 && PL_lex_inpat)
4901 AopNOASSIGN(OP_CONCAT);
4907 case LEX_INTERPPUSH:
4908 return REPORT(sublex_push());
4910 case LEX_INTERPSTART:
4911 if (PL_bufptr == PL_bufend)
4912 return REPORT(sublex_done());
4913 DEBUG_T({ if(*PL_bufptr != '(') PerlIO_printf(Perl_debug_log,
4914 "### Interpolated variable\n"); });
4916 /* for /@a/, we leave the joining for the regex engine to do
4917 * (unless we're within \Q etc) */
4918 PL_lex_dojoin = (*PL_bufptr == '@'
4919 && (!PL_lex_inpat || PL_lex_casemods));
4920 PL_lex_state = LEX_INTERPNORMAL;
4921 if (PL_lex_dojoin) {
4922 NEXTVAL_NEXTTOKE.ival = 0;
4924 force_ident("\"", '$');
4925 NEXTVAL_NEXTTOKE.ival = 0;
4927 NEXTVAL_NEXTTOKE.ival = 0;
4928 force_next((2<<24)|'(');
4929 NEXTVAL_NEXTTOKE.ival = OP_JOIN; /* emulate join($", ...) */
4932 /* Convert (?{...}) and friends to 'do {...}' */
4933 if (PL_lex_inpat && *PL_bufptr == '(') {
4934 PL_parser->lex_shared->re_eval_start = PL_bufptr;
4936 if (*PL_bufptr != '{')
4938 PL_expect = XTERMBLOCK;
4942 if (PL_lex_starts++) {
4944 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
4945 if (!PL_lex_casemods && PL_lex_inpat)
4948 AopNOASSIGN(OP_CONCAT);
4952 case LEX_INTERPENDMAYBE:
4953 if (intuit_more(PL_bufptr)) {
4954 PL_lex_state = LEX_INTERPNORMAL; /* false alarm, more expr */
4960 if (PL_lex_dojoin) {
4961 const U8 dojoin_was = PL_lex_dojoin;
4962 PL_lex_dojoin = FALSE;
4963 PL_lex_state = LEX_INTERPCONCAT;
4964 PL_lex_allbrackets--;
4965 return REPORT(dojoin_was == 1 ? (int)')' : (int)POSTJOIN);
4967 if (PL_lex_inwhat == OP_SUBST && PL_linestr == PL_lex_repl
4968 && SvEVALED(PL_lex_repl))
4970 if (PL_bufptr != PL_bufend)
4971 Perl_croak(aTHX_ "Bad evalled substitution pattern");
4974 /* Paranoia. re_eval_start is adjusted when S_scan_heredoc sets
4975 re_eval_str. If the here-doc body’s length equals the previous
4976 value of re_eval_start, re_eval_start will now be null. So
4977 check re_eval_str as well. */
4978 if (PL_parser->lex_shared->re_eval_start
4979 || PL_parser->lex_shared->re_eval_str) {
4981 if (*PL_bufptr != ')')
4982 Perl_croak(aTHX_ "Sequence (?{...}) not terminated with ')'");
4984 /* having compiled a (?{..}) expression, return the original
4985 * text too, as a const */
4986 if (PL_parser->lex_shared->re_eval_str) {
4987 sv = PL_parser->lex_shared->re_eval_str;
4988 PL_parser->lex_shared->re_eval_str = NULL;
4990 PL_bufptr - PL_parser->lex_shared->re_eval_start);
4991 SvPV_shrink_to_cur(sv);
4993 else sv = newSVpvn(PL_parser->lex_shared->re_eval_start,
4994 PL_bufptr - PL_parser->lex_shared->re_eval_start);
4995 NEXTVAL_NEXTTOKE.opval =
4996 newSVOP(OP_CONST, 0,
4999 PL_parser->lex_shared->re_eval_start = NULL;
5005 case LEX_INTERPCONCAT:
5007 if (PL_lex_brackets)
5008 Perl_croak(aTHX_ "panic: INTERPCONCAT, lex_brackets=%ld",
5009 (long) PL_lex_brackets);
5011 if (PL_bufptr == PL_bufend)
5012 return REPORT(sublex_done());
5014 /* m'foo' still needs to be parsed for possible (?{...}) */
5015 if (SvIVX(PL_linestr) == '\'' && !PL_lex_inpat) {
5016 SV *sv = newSVsv(PL_linestr);
5018 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
5022 int save_error_count = PL_error_count;
5024 s = scan_const(PL_bufptr);
5026 /* Set flag if this was a pattern and there were errors. op.c will
5027 * refuse to compile a pattern with this flag set. Otherwise, we
5028 * could get segfaults, etc. */
5029 if (PL_lex_inpat && PL_error_count > save_error_count) {
5030 ((PMOP*)PL_lex_inpat)->op_pmflags |= PMf_HAS_ERROR;
5033 PL_lex_state = LEX_INTERPCASEMOD;
5035 PL_lex_state = LEX_INTERPSTART;
5038 if (s != PL_bufptr) {
5039 NEXTVAL_NEXTTOKE = pl_yylval;
5042 if (PL_lex_starts++) {
5043 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
5044 if (!PL_lex_casemods && PL_lex_inpat)
5047 AopNOASSIGN(OP_CONCAT);
5057 s = scan_formline(PL_bufptr);
5058 if (!PL_lex_formbrack)
5067 /* We really do *not* want PL_linestr ever becoming a COW. */
5068 assert (!SvIsCOW(PL_linestr));
5070 PL_oldoldbufptr = PL_oldbufptr;
5072 PL_parser->saw_infix_sigil = 0;
5074 if (PL_in_my == KEY_sigvar) {
5075 /* we expect the sigil and optional var name part of a
5076 * signature element here. Since a '$' is not necessarily
5077 * followed by a var name, handle it specially here; the general
5078 * yylex code would otherwise try to interpret whatever follows
5079 * as a var; e.g. ($, ...) would be seen as the var '$,'
5086 PL_bufptr = s; /* for error reporting */
5091 /* spot stuff that looks like an prototype */
5092 if (strchr("$:@%&*;\\[]", *s)) {
5093 yyerror("Illegal character following sigil in a subroutine signature");
5096 /* '$#' is banned, while '$ # comment' isn't */
5098 yyerror("'#' not allowed immediately following a sigil in a subroutine signature");
5102 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5103 char *dest = PL_tokenbuf + 1;
5104 /* read var name, including sigil, into PL_tokenbuf */
5105 PL_tokenbuf[0] = sigil;
5106 parse_ident(&s, &dest, dest + sizeof(PL_tokenbuf) - 1,
5107 0, cBOOL(UTF), FALSE);
5109 assert(PL_tokenbuf[1]); /* we have a variable name */
5110 NEXTVAL_NEXTTOKE.ival = sigil;
5111 force_next('p'); /* force a signature pending identifier */
5115 PL_expect = XOPERATOR;
5121 case ',': /* handle ($a,,$b) */
5126 yyerror("A signature parameter must start with '$', '@' or '%'");
5127 /* very crude error recovery: skip to likely next signature
5129 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
5140 if (isIDFIRST_utf8_safe(s, PL_bufend)) {
5144 else if (isALNUMC(*s)) {
5148 SV *dsv = newSVpvs_flags("", SVs_TEMP);
5151 STRLEN skiplen = UTF8SKIP(s);
5152 STRLEN stravail = PL_bufend - s;
5153 c = sv_uni_display(dsv, newSVpvn_flags(s,
5154 skiplen > stravail ? stravail : skiplen,
5155 SVs_TEMP | SVf_UTF8),
5156 10, UNI_DISPLAY_ISPRINT);
5159 c = Perl_form(aTHX_ "\\x%02X", (unsigned char)*s);
5161 len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart);
5162 if (len > UNRECOGNIZED_PRECEDE_COUNT) {
5163 d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)PL_linestart) : s - UNRECOGNIZED_PRECEDE_COUNT;
5167 Perl_croak(aTHX_ "Unrecognized character %s; marked by <-- HERE after %" UTF8f "<-- HERE near column %d", c,
5168 UTF8fARG(UTF, (s - d), d),
5173 goto fake_eof; /* emulate EOF on ^D or ^Z */
5175 if ((!PL_rsfp || PL_lex_inwhat)
5176 && (!PL_parser->filtered || s+1 < PL_bufend)) {
5180 && PL_lex_brackstack[PL_lex_brackets-1] != XFAKEEOF)
5182 yyerror((const char *)
5184 ? "Format not terminated"
5185 : "Missing right curly or square bracket"));
5187 DEBUG_T( { PerlIO_printf(Perl_debug_log,
5188 "### Tokener got EOF\n");
5192 if (s++ < PL_bufend)
5193 goto retry; /* ignore stray nulls */
5196 if (!PL_in_eval && !PL_preambled) {
5197 PL_preambled = TRUE;
5199 /* Generate a string of Perl code to load the debugger.
5200 * If PERL5DB is set, it will return the contents of that,
5201 * otherwise a compile-time require of perl5db.pl. */
5203 const char * const pdb = PerlEnv_getenv("PERL5DB");
5206 sv_setpv(PL_linestr, pdb);
5207 sv_catpvs(PL_linestr,";");
5209 SETERRNO(0,SS_NORMAL);
5210 sv_setpvs(PL_linestr, "BEGIN { require 'perl5db.pl' };");
5212 PL_parser->preambling = CopLINE(PL_curcop);
5214 SvPVCLEAR(PL_linestr);
5215 if (PL_preambleav) {
5216 SV **svp = AvARRAY(PL_preambleav);
5217 SV **const end = svp + AvFILLp(PL_preambleav);
5219 sv_catsv(PL_linestr, *svp);
5221 sv_catpvs(PL_linestr, ";");
5223 sv_free(MUTABLE_SV(PL_preambleav));
5224 PL_preambleav = NULL;
5227 sv_catpvs(PL_linestr,
5228 "use feature ':5." STRINGIFY(PERL_VERSION) "';");
5229 if (PL_minus_n || PL_minus_p) {
5230 sv_catpvs(PL_linestr, "LINE: while (<>) {"/*}*/);
5232 sv_catpvs(PL_linestr,"chomp;");
5235 if ((*PL_splitstr == '/' || *PL_splitstr == '\''
5236 || *PL_splitstr == '"')
5237 && strchr(PL_splitstr + 1, *PL_splitstr))
5238 Perl_sv_catpvf(aTHX_ PL_linestr, "our @F=split(%s);", PL_splitstr);
5240 /* "q\0${splitstr}\0" is legal perl. Yes, even NUL
5241 bytes can be used as quoting characters. :-) */
5242 const char *splits = PL_splitstr;
5243 sv_catpvs(PL_linestr, "our @F=split(q\0");
5246 if (*splits == '\\')
5247 sv_catpvn(PL_linestr, splits, 1);
5248 sv_catpvn(PL_linestr, splits, 1);
5249 } while (*splits++);
5250 /* This loop will embed the trailing NUL of
5251 PL_linestr as the last thing it does before
5253 sv_catpvs(PL_linestr, ");");
5257 sv_catpvs(PL_linestr,"our @F=split(' ');");
5260 sv_catpvs(PL_linestr, "\n");
5261 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
5262 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5263 PL_last_lop = PL_last_uni = NULL;
5264 if (PERLDB_LINE_OR_SAVESRC && PL_curstash != PL_debstash)
5265 update_debugger_info(PL_linestr, NULL, 0);
5270 bof = cBOOL(PL_rsfp);
5273 fake_eof = LEX_FAKE_EOF;
5275 PL_bufptr = PL_bufend;
5276 COPLINE_INC_WITH_HERELINES;
5277 if (!lex_next_chunk(fake_eof)) {
5278 CopLINE_dec(PL_curcop);
5280 TOKEN(';'); /* not infinite loop because rsfp is NULL now */
5282 CopLINE_dec(PL_curcop);
5284 /* If it looks like the start of a BOM or raw UTF-16,
5285 * check if it in fact is. */
5288 || *(U8*)s == BOM_UTF8_FIRST_BYTE
5292 Off_t offset = (IV)PerlIO_tell(PL_rsfp);
5293 bof = (offset == (Off_t)SvCUR(PL_linestr));
5294 #if defined(PERLIO_USING_CRLF) && defined(PERL_TEXTMODE_SCRIPTS)
5295 /* offset may include swallowed CR */
5297 bof = (offset == (Off_t)SvCUR(PL_linestr)+1);
5300 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5301 s = swallow_bom((U8*)s);
5304 if (PL_parser->in_pod) {
5305 /* Incest with pod. */
5306 if (*s == '=' && strEQs(s, "=cut") && !isALPHA(s[4])) {
5307 SvPVCLEAR(PL_linestr);
5308 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
5309 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5310 PL_last_lop = PL_last_uni = NULL;
5311 PL_parser->in_pod = 0;
5314 if (PL_rsfp || PL_parser->filtered)
5316 } while (PL_parser->in_pod);
5317 PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s;
5318 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5319 PL_last_lop = PL_last_uni = NULL;
5320 if (CopLINE(PL_curcop) == 1) {
5321 while (s < PL_bufend && isSPACE(*s))
5323 if (*s == ':' && s[1] != ':') /* for csh execing sh scripts */
5327 if (*s == '#' && *(s+1) == '!')
5329 #ifdef ALTERNATE_SHEBANG
5331 static char const as[] = ALTERNATE_SHEBANG;
5332 if (*s == as[0] && strnEQ(s, as, sizeof(as) - 1))
5333 d = s + (sizeof(as) - 1);
5335 #endif /* ALTERNATE_SHEBANG */
5344 while (*d && !isSPACE(*d))
5348 #ifdef ARG_ZERO_IS_SCRIPT
5349 if (ipathend > ipath) {
5351 * HP-UX (at least) sets argv[0] to the script name,
5352 * which makes $^X incorrect. And Digital UNIX and Linux,
5353 * at least, set argv[0] to the basename of the Perl
5354 * interpreter. So, having found "#!", we'll set it right.
5356 SV* copfilesv = CopFILESV(PL_curcop);
5359 GvSV(gv_fetchpvs("\030", GV_ADD|GV_NOTQUAL,
5361 assert(SvPOK(x) || SvGMAGICAL(x));
5362 if (sv_eq(x, copfilesv)) {
5363 sv_setpvn(x, ipath, ipathend - ipath);
5369 const char *bstart = SvPV_const(copfilesv, blen);
5370 const char * const lstart = SvPV_const(x, llen);
5372 bstart += blen - llen;
5373 if (strnEQ(bstart, lstart, llen) && bstart[-1] == '/') {
5374 sv_setpvn(x, ipath, ipathend - ipath);
5381 /* Anything to do if no copfilesv? */
5383 TAINT_NOT; /* $^X is always tainted, but that's OK */
5385 #endif /* ARG_ZERO_IS_SCRIPT */
5390 d = instr(s,"perl -");
5392 d = instr(s,"perl");
5394 /* avoid getting into infinite loops when shebang
5395 * line contains "Perl" rather than "perl" */
5397 for (d = ipathend-4; d >= ipath; --d) {
5398 if (isALPHA_FOLD_EQ(*d, 'p')
5399 && !ibcmp(d, "perl", 4))
5409 #ifdef ALTERNATE_SHEBANG
5411 * If the ALTERNATE_SHEBANG on this system starts with a
5412 * character that can be part of a Perl expression, then if
5413 * we see it but not "perl", we're probably looking at the
5414 * start of Perl code, not a request to hand off to some
5415 * other interpreter. Similarly, if "perl" is there, but
5416 * not in the first 'word' of the line, we assume the line
5417 * contains the start of the Perl program.
5419 if (d && *s != '#') {
5420 const char *c = ipath;
5421 while (*c && !strchr("; \t\r\n\f\v#", *c))
5424 d = NULL; /* "perl" not in first word; ignore */
5426 *s = '#'; /* Don't try to parse shebang line */
5428 #endif /* ALTERNATE_SHEBANG */
5433 && !instr(s,"indir")
5434 && instr(PL_origargv[0],"perl"))
5441 while (s < PL_bufend && isSPACE(*s))
5443 if (s < PL_bufend) {
5444 Newx(newargv,PL_origargc+3,char*);
5446 while (s < PL_bufend && !isSPACE(*s))
5449 Copy(PL_origargv+1, newargv+2, PL_origargc+1, char*);
5452 newargv = PL_origargv;
5455 PerlProc_execv(ipath, EXEC_ARGV_CAST(newargv));
5457 Perl_croak(aTHX_ "Can't exec %s", ipath);
5460 while (*d && !isSPACE(*d))
5462 while (SPACE_OR_TAB(*d))
5466 const bool switches_done = PL_doswitches;
5467 const U32 oldpdb = PL_perldb;
5468 const bool oldn = PL_minus_n;
5469 const bool oldp = PL_minus_p;
5473 bool baduni = FALSE;
5475 const char *d2 = d1 + 1;
5476 if (parse_unicode_opts((const char **)&d2)
5480 if (baduni || isALPHA_FOLD_EQ(*d1, 'M')) {
5481 const char * const m = d1;
5482 while (*d1 && !isSPACE(*d1))
5484 Perl_croak(aTHX_ "Too late for \"-%.*s\" option",
5487 d1 = moreswitches(d1);
5489 if (PL_doswitches && !switches_done) {
5490 int argc = PL_origargc;
5491 char **argv = PL_origargv;
5494 } while (argc && argv[0][0] == '-' && argv[0][1]);
5495 init_argv_symbols(argc,argv);
5497 if ( (PERLDB_LINE_OR_SAVESRC && !oldpdb)
5498 || ((PL_minus_n || PL_minus_p) && !(oldn || oldp)))
5499 /* if we have already added "LINE: while (<>) {",
5500 we must not do it again */
5502 SvPVCLEAR(PL_linestr);
5503 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
5504 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
5505 PL_last_lop = PL_last_uni = NULL;
5506 PL_preambled = FALSE;
5507 if (PERLDB_LINE_OR_SAVESRC)
5508 (void)gv_fetchfile(PL_origfilename);
5515 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
5516 PL_lex_state = LEX_FORMLINE;
5517 force_next(FORMRBRACK);
5522 #ifdef PERL_STRICT_CR
5523 Perl_warn(aTHX_ "Illegal character \\%03o (carriage return)", '\r');
5525 "\t(Maybe you didn't strip carriage returns after a network transfer?)\n");
5527 case ' ': case '\t': case '\f': case '\v':
5532 if (PL_lex_state != LEX_NORMAL
5533 || (PL_in_eval && !PL_rsfp && !PL_parser->filtered))
5535 const bool in_comment = *s == '#';
5536 if (*s == '#' && s == PL_linestart && PL_in_eval
5537 && !PL_rsfp && !PL_parser->filtered) {
5538 /* handle eval qq[#line 1 "foo"\n ...] */
5539 CopLINE_dec(PL_curcop);
5543 while (d < PL_bufend && *d != '\n')
5547 else if (d > PL_bufend)
5548 /* Found by Ilya: feed random input to Perl. */
5549 Perl_croak(aTHX_ "panic: input overflow, %p > %p",
5552 if (in_comment && d == PL_bufend
5553 && PL_lex_state == LEX_INTERPNORMAL
5554 && PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
5555 && SvEVALED(PL_lex_repl) && d[-1] == '}') s--;
5558 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
5559 PL_lex_state = LEX_FORMLINE;
5560 force_next(FORMRBRACK);
5565 while (s < PL_bufend && *s != '\n')
5573 else if (s > PL_bufend)
5574 /* Found by Ilya: feed random input to Perl. */
5575 Perl_croak(aTHX_ "panic: input overflow");
5579 if (s[1] && isALPHA(s[1]) && !isWORDCHAR(s[2])) {
5587 while (s < PL_bufend && SPACE_OR_TAB(*s))
5590 if (strEQs(s,"=>")) {
5591 s = force_word(PL_bufptr,BAREWORD,FALSE,FALSE);
5592 DEBUG_T( { printbuf("### Saw unary minus before =>, forcing word %s\n", s); } );
5593 OPERATOR('-'); /* unary minus */
5596 case 'r': ftst = OP_FTEREAD; break;
5597 case 'w': ftst = OP_FTEWRITE; break;
5598 case 'x': ftst = OP_FTEEXEC; break;
5599 case 'o': ftst = OP_FTEOWNED; break;
5600 case 'R': ftst = OP_FTRREAD; break;
5601 case 'W': ftst = OP_FTRWRITE; break;
5602 case 'X': ftst = OP_FTREXEC; break;
5603 case 'O': ftst = OP_FTROWNED; break;
5604 case 'e': ftst = OP_FTIS; break;
5605 case 'z': ftst = OP_FTZERO; break;
5606 case 's': ftst = OP_FTSIZE; break;
5607 case 'f': ftst = OP_FTFILE; break;
5608 case 'd': ftst = OP_FTDIR; break;
5609 case 'l': ftst = OP_FTLINK; break;
5610 case 'p': ftst = OP_FTPIPE; break;
5611 case 'S': ftst = OP_FTSOCK; break;
5612 case 'u': ftst = OP_FTSUID; break;
5613 case 'g': ftst = OP_FTSGID; break;
5614 case 'k': ftst = OP_FTSVTX; break;
5615 case 'b': ftst = OP_FTBLK; break;
5616 case 'c': ftst = OP_FTCHR; break;
5617 case 't': ftst = OP_FTTTY; break;
5618 case 'T': ftst = OP_FTTEXT; break;
5619 case 'B': ftst = OP_FTBINARY; break;
5620 case 'M': case 'A': case 'C':
5621 gv_fetchpvs("\024", GV_ADD|GV_NOTQUAL, SVt_PV);
5623 case 'M': ftst = OP_FTMTIME; break;
5624 case 'A': ftst = OP_FTATIME; break;
5625 case 'C': ftst = OP_FTCTIME; break;
5633 PL_last_uni = PL_oldbufptr;
5634 PL_last_lop_op = (OPCODE)ftst;
5635 DEBUG_T( { PerlIO_printf(Perl_debug_log,
5636 "### Saw file test %c\n", (int)tmp);
5641 /* Assume it was a minus followed by a one-letter named
5642 * subroutine call (or a -bareword), then. */
5643 DEBUG_T( { PerlIO_printf(Perl_debug_log,
5644 "### '-%c' looked like a file test but was not\n",
5651 const char tmp = *s++;
5654 if (PL_expect == XOPERATOR)
5659 else if (*s == '>') {
5662 if (((*s == '$' || *s == '&') && s[1] == '*')
5663 ||(*s == '$' && s[1] == '#' && s[2] == '*')
5664 ||((*s == '@' || *s == '%') && strchr("*[{", s[1]))
5665 ||(*s == '*' && (s[1] == '*' || s[1] == '{'))
5668 PL_expect = XPOSTDEREF;
5671 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5672 s = force_word(s,METHOD,FALSE,TRUE);
5680 if (PL_expect == XOPERATOR) {
5682 && !PL_lex_allbrackets
5683 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5691 if (isSPACE(*s) || !isSPACE(*PL_bufptr))
5693 OPERATOR('-'); /* unary minus */
5699 const char tmp = *s++;
5702 if (PL_expect == XOPERATOR)
5707 if (PL_expect == XOPERATOR) {
5709 && !PL_lex_allbrackets
5710 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5718 if (isSPACE(*s) || !isSPACE(*PL_bufptr))
5725 if (PL_expect == XPOSTDEREF) POSTDEREF('*');
5726 if (PL_expect != XOPERATOR) {
5727 s = scan_ident(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE);
5728 PL_expect = XOPERATOR;
5729 force_ident(PL_tokenbuf, '*');
5737 if (*s == '=' && !PL_lex_allbrackets
5738 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5746 && !PL_lex_allbrackets
5747 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5752 PL_parser->saw_infix_sigil = 1;
5757 if (PL_expect == XOPERATOR) {
5759 && !PL_lex_allbrackets
5760 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5765 PL_parser->saw_infix_sigil = 1;
5768 else if (PL_expect == XPOSTDEREF) POSTDEREF('%');
5769 PL_tokenbuf[0] = '%';
5770 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
5772 if (!PL_tokenbuf[1]) {
5775 if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) && intuit_more(s)) {
5777 PL_tokenbuf[0] = '@';
5779 PL_expect = XOPERATOR;
5780 force_ident_maybe_lex('%');
5785 bof = FEATURE_BITWISE_IS_ENABLED;
5786 if (bof && s[1] == '.')
5788 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
5789 (s[1] == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE))
5795 BOop(bof ? d == s-2 ? OP_SBIT_XOR : OP_NBIT_XOR : OP_BIT_XOR);
5797 if (PL_lex_brackets > 100)
5798 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
5799 PL_lex_brackstack[PL_lex_brackets++] = 0;
5800 PL_lex_allbrackets++;
5802 const char tmp = *s++;
5807 && (PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR))
5809 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
5812 Perl_ck_warner_d(aTHX_
5813 packWARN(WARN_EXPERIMENTAL__SMARTMATCH),
5814 "Smartmatch is experimental");
5818 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.') {
5820 BCop(OP_SCOMPLEMENT);
5822 BCop(bof ? OP_NCOMPLEMENT : OP_COMPLEMENT);
5824 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMMA)
5831 goto just_a_word_zero_gv;
5837 switch (PL_expect) {
5839 if (!PL_in_my || PL_lex_state != LEX_NORMAL)
5841 PL_bufptr = s; /* update in case we back off */
5844 "Use of := for an empty attribute list is not allowed");
5851 PL_expect = XTERMBLOCK;
5855 while (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5858 d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
5859 if (isLOWER(*s) && (tmp = keyword(PL_tokenbuf, len, 0))) {
5860 if (tmp < 0) tmp = -tmp;
5875 sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0);
5877 d = scan_str(d,TRUE,TRUE,FALSE,NULL);
5882 Perl_croak(aTHX_ "Unterminated attribute parameter in attribute list");
5884 COPLINE_SET_FROM_MULTI_END;
5887 sv_catsv(sv, PL_lex_stuff);
5888 attrs = op_append_elem(OP_LIST, attrs,
5889 newSVOP(OP_CONST, 0, sv));
5890 SvREFCNT_dec_NN(PL_lex_stuff);
5891 PL_lex_stuff = NULL;
5894 /* NOTE: any CV attrs applied here need to be part of
5895 the CVf_BUILTIN_ATTRS define in cv.h! */
5896 if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "lvalue", len)) {
5898 CvLVALUE_on(PL_compcv);
5900 else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) {
5902 CvMETHOD_on(PL_compcv);
5904 else if (!PL_in_my && len == 5
5905 && strnEQ(SvPVX(sv), "const", len))
5908 Perl_ck_warner_d(aTHX_
5909 packWARN(WARN_EXPERIMENTAL__CONST_ATTR),
5910 ":const is experimental"
5912 CvANONCONST_on(PL_compcv);
5913 if (!CvANON(PL_compcv))
5914 yyerror(":const is not permitted on named "
5917 /* After we've set the flags, it could be argued that
5918 we don't need to do the attributes.pm-based setting
5919 process, and shouldn't bother appending recognized
5920 flags. To experiment with that, uncomment the
5921 following "else". (Note that's already been
5922 uncommented. That keeps the above-applied built-in
5923 attributes from being intercepted (and possibly
5924 rejected) by a package's attribute routines, but is
5925 justified by the performance win for the common case
5926 of applying only built-in attributes.) */
5928 attrs = op_append_elem(OP_LIST, attrs,
5929 newSVOP(OP_CONST, 0,
5933 if (*s == ':' && s[1] != ':')
5936 break; /* require real whitespace or :'s */
5937 /* XXX losing whitespace on sequential attributes here */
5942 && !(PL_expect == XOPERATOR
5943 ? (*s == '=' || *s == ')')
5944 : (*s == '{' || *s == '(')))
5946 const char q = ((*s == '\'') ? '"' : '\'');
5947 /* If here for an expression, and parsed no attrs, back
5949 if (PL_expect == XOPERATOR && !attrs) {
5953 /* MUST advance bufptr here to avoid bogus "at end of line"
5954 context messages from yyerror().
5957 yyerror( (const char *)
5959 ? Perl_form(aTHX_ "Invalid separator character "
5960 "%c%c%c in attribute list", q, *s, q)
5961 : "Unterminated attribute list" ) );
5969 NEXTVAL_NEXTTOKE.opval = attrs;
5975 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_CLOSING) {
5979 PL_lex_allbrackets--;
5983 if (PL_last_lop == PL_oldoldbufptr || PL_last_uni == PL_oldoldbufptr)
5984 PL_oldbufptr = PL_oldoldbufptr; /* allow print(STDOUT 123) */
5988 PL_lex_allbrackets++;
5991 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
5998 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_CLOSING)
6001 PL_lex_allbrackets--;
6007 if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF)
6010 if (PL_lex_brackets <= 0)
6011 /* diag_listed_as: Unmatched right %s bracket */
6012 yyerror("Unmatched right square bracket");
6015 PL_lex_allbrackets--;
6016 if (PL_lex_state == LEX_INTERPNORMAL) {
6017 if (PL_lex_brackets == 0) {
6018 if (*s == '-' && s[1] == '>')
6019 PL_lex_state = LEX_INTERPENDMAYBE;
6020 else if (*s != '[' && *s != '{')
6021 PL_lex_state = LEX_INTERPEND;
6028 if (PL_lex_brackets > 100) {
6029 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
6031 switch (PL_expect) {
6034 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6035 PL_lex_allbrackets++;
6036 OPERATOR(HASHBRACK);
6038 while (s < PL_bufend && SPACE_OR_TAB(*s))
6041 PL_tokenbuf[0] = '\0';
6042 if (d < PL_bufend && *d == '-') {
6043 PL_tokenbuf[0] = '-';
6045 while (d < PL_bufend && SPACE_OR_TAB(*d))
6048 if (d < PL_bufend && isIDFIRST_lazy_if_safe(d, PL_bufend, UTF)) {
6049 d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
6051 while (d < PL_bufend && SPACE_OR_TAB(*d))
6054 const char minus = (PL_tokenbuf[0] == '-');
6055 s = force_word(s + minus, BAREWORD, FALSE, TRUE);
6063 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6064 PL_lex_allbrackets++;
6069 PL_lex_brackstack[PL_lex_brackets++] = XSTATE;
6070 PL_lex_allbrackets++;
6074 PL_lex_brackstack[PL_lex_brackets++] = XTERM;
6075 PL_lex_allbrackets++;
6080 if (PL_oldoldbufptr == PL_last_lop)
6081 PL_lex_brackstack[PL_lex_brackets++] = XTERM;
6083 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6084 PL_lex_allbrackets++;
6087 if (PL_expect == XREF && PL_lex_state == LEX_INTERPNORMAL) {
6089 /* This hack is to get the ${} in the message. */
6091 yyerror("syntax error");
6094 OPERATOR(HASHBRACK);
6096 if (PL_expect == XREF && PL_oldoldbufptr != PL_last_lop) {
6097 /* ${...} or @{...} etc., but not print {...}
6098 * Skip the disambiguation and treat this as a block.
6100 goto block_expectation;
6102 /* This hack serves to disambiguate a pair of curlies
6103 * as being a block or an anon hash. Normally, expectation
6104 * determines that, but in cases where we're not in a
6105 * position to expect anything in particular (like inside
6106 * eval"") we have to resolve the ambiguity. This code
6107 * covers the case where the first term in the curlies is a
6108 * quoted string. Most other cases need to be explicitly
6109 * disambiguated by prepending a "+" before the opening
6110 * curly in order to force resolution as an anon hash.
6112 * XXX should probably propagate the outer expectation
6113 * into eval"" to rely less on this hack, but that could
6114 * potentially break current behavior of eval"".
6118 if (*s == '\'' || *s == '"' || *s == '`') {
6119 /* common case: get past first string, handling escapes */
6120 for (t++; t < PL_bufend && *t != *s;)
6125 else if (*s == 'q') {
6128 || ((*t == 'q' || *t == 'x') && ++t < PL_bufend
6129 && !isWORDCHAR(*t))))
6131 /* skip q//-like construct */
6133 char open, close, term;
6136 while (t < PL_bufend && isSPACE(*t))
6138 /* check for q => */
6139 if (t+1 < PL_bufend && t[0] == '=' && t[1] == '>') {
6140 OPERATOR(HASHBRACK);
6144 if (term && (tmps = strchr("([{< )]}> )]}>",term)))
6148 for (t++; t < PL_bufend; t++) {
6149 if (*t == '\\' && t+1 < PL_bufend && open != '\\')
6151 else if (*t == open)
6155 for (t++; t < PL_bufend; t++) {
6156 if (*t == '\\' && t+1 < PL_bufend)
6158 else if (*t == close && --brackets <= 0)
6160 else if (*t == open)
6167 /* skip plain q word */
6168 while ( t < PL_bufend
6169 && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
6171 t += UTF ? UTF8SKIP(t) : 1;
6174 else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) {
6175 t += UTF ? UTF8SKIP(t) : 1;
6176 while ( t < PL_bufend
6177 && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
6179 t += UTF ? UTF8SKIP(t) : 1;
6182 while (t < PL_bufend && isSPACE(*t))
6184 /* if comma follows first term, call it an anon hash */
6185 /* XXX it could be a comma expression with loop modifiers */
6186 if (t < PL_bufend && ((*t == ',' && (*s == 'q' || !isLOWER(*s)))
6187 || (*t == '=' && t[1] == '>')))
6188 OPERATOR(HASHBRACK);
6189 if (PL_expect == XREF)
6192 /* If there is an opening brace or 'sub:', treat it
6193 as a term to make ${{...}}{k} and &{sub:attr...}
6194 dwim. Otherwise, treat it as a statement, so
6195 map {no strict; ...} works.
6202 if (strEQs(s, "sub")) {
6213 PL_lex_brackstack[PL_lex_brackets-1] = XSTATE;
6219 pl_yylval.ival = CopLINE(PL_curcop);
6220 PL_copline = NOLINE; /* invalidate current command line number */
6221 TOKEN(formbrack ? '=' : '{');
6223 if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF)
6227 if (PL_lex_brackets <= 0)
6228 /* diag_listed_as: Unmatched right %s bracket */
6229 yyerror("Unmatched right curly bracket");
6231 PL_expect = (expectation)PL_lex_brackstack[--PL_lex_brackets];
6232 PL_lex_allbrackets--;
6233 if (PL_lex_state == LEX_INTERPNORMAL) {
6234 if (PL_lex_brackets == 0) {
6235 if (PL_expect & XFAKEBRACK) {
6236 PL_expect &= XENUMMASK;
6237 PL_lex_state = LEX_INTERPEND;
6239 return yylex(); /* ignore fake brackets */
6241 if (PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
6242 && SvEVALED(PL_lex_repl))
6243 PL_lex_state = LEX_INTERPEND;
6244 else if (*s == '-' && s[1] == '>')
6245 PL_lex_state = LEX_INTERPENDMAYBE;
6246 else if (*s != '[' && *s != '{')
6247 PL_lex_state = LEX_INTERPEND;
6250 if (PL_expect & XFAKEBRACK) {
6251 PL_expect &= XENUMMASK;
6253 return yylex(); /* ignore fake brackets */
6255 force_next(formbrack ? '.' : '}');
6256 if (formbrack) LEAVE;
6257 if (formbrack == 2) { /* means . where arguments were expected */
6263 if (PL_expect == XPOSTDEREF) POSTDEREF('&');
6266 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6267 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC)) {
6274 if (PL_expect == XOPERATOR) {
6275 if ( PL_bufptr == PL_linestart
6276 && ckWARN(WARN_SEMICOLON)
6277 && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
6279 CopLINE_dec(PL_curcop);
6280 Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
6281 CopLINE_inc(PL_curcop);
6284 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.')
6286 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6287 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE)) {
6293 PL_parser->saw_infix_sigil = 1;
6294 BAop(bof ? OP_NBIT_AND : OP_BIT_AND);
6300 PL_tokenbuf[0] = '&';
6301 s = scan_ident(s - 1, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, TRUE);
6302 pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
6303 if (PL_tokenbuf[1]) {
6304 force_ident_maybe_lex('&');
6313 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6314 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC)) {
6322 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.')
6324 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6325 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE)) {
6329 BOop(bof ? s == d ? OP_NBIT_OR : OP_SBIT_OR : OP_BIT_OR);
6333 const char tmp = *s++;
6335 if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, "=====")) {
6336 s = vcs_conflict_marker(s + 5);
6339 if (!PL_lex_allbrackets
6340 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6348 if (!PL_lex_allbrackets
6349 && PL_lex_fakeeof >= LEX_FAKEEOF_COMMA)
6358 if (tmp && isSPACE(*s) && ckWARN(WARN_SYNTAX)
6359 && strchr("+-*/%.^&|<",tmp))
6360 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6361 "Reversed %c= operator",(int)tmp);
6363 if (PL_expect == XSTATE
6365 && (s == PL_linestart+1 || s[-2] == '\n') )
6367 if ((PL_in_eval && !PL_rsfp && !PL_parser->filtered)
6368 || PL_lex_state != LEX_NORMAL) {
6373 if (strEQs(s,"=cut")) {
6387 PL_parser->in_pod = 1;
6391 if (PL_expect == XBLOCK) {
6393 #ifdef PERL_STRICT_CR
6394 while (SPACE_OR_TAB(*t))
6396 while (SPACE_OR_TAB(*t) || *t == '\r')
6399 if (*t == '\n' || *t == '#') {
6402 SAVEI8(PL_parser->form_lex_state);
6403 SAVEI32(PL_lex_formbrack);
6404 PL_parser->form_lex_state = PL_lex_state;
6405 PL_lex_formbrack = PL_lex_brackets + 1;
6409 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN) {
6418 const char tmp = *s++;
6420 /* was this !=~ where !~ was meant?
6421 * warn on m:!=~\s+([/?]|[msy]\W|tr\W): */
6423 if (*s == '~' && ckWARN(WARN_SYNTAX)) {
6424 const char *t = s+1;
6426 while (t < PL_bufend && isSPACE(*t))
6429 if (*t == '/' || *t == '?'
6430 || ((*t == 'm' || *t == 's' || *t == 'y')
6431 && !isWORDCHAR(t[1]))
6432 || (*t == 't' && t[1] == 'r' && !isWORDCHAR(t[2])))
6433 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6434 "!=~ should be !~");
6436 if (!PL_lex_allbrackets
6437 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6450 if (PL_expect != XOPERATOR) {
6451 if (s[1] != '<' && !strchr(s,'>'))
6453 if (s[1] == '<' && s[2] != '>') {
6454 if ((s == PL_linestart || s[-1] == '\n') && strEQs(s+2, "<<<<<")) {
6455 s = vcs_conflict_marker(s + 7);
6458 s = scan_heredoc(s);
6461 s = scan_inputsymbol(s);
6462 PL_expect = XOPERATOR;
6463 TOKEN(sublex_start());
6469 if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, "<<<<<")) {
6470 s = vcs_conflict_marker(s + 5);
6473 if (*s == '=' && !PL_lex_allbrackets
6474 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6479 SHop(OP_LEFT_SHIFT);
6484 if (!PL_lex_allbrackets
6485 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6493 if (!PL_lex_allbrackets
6494 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6503 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6511 const char tmp = *s++;
6513 if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, ">>>>>")) {
6514 s = vcs_conflict_marker(s + 5);
6517 if (*s == '=' && !PL_lex_allbrackets
6518 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6523 SHop(OP_RIGHT_SHIFT);
6525 else if (tmp == '=') {
6526 if (!PL_lex_allbrackets
6527 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6536 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6545 if (PL_expect == XPOSTDEREF) {
6548 POSTDEREF(DOLSHARP);
6554 && ( isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF)
6555 || strchr("{$:+-@", s[2])))
6557 PL_tokenbuf[0] = '@';
6558 s = scan_ident(s + 1, PL_tokenbuf + 1,
6559 sizeof PL_tokenbuf - 1, FALSE);
6560 if (PL_expect == XOPERATOR) {
6562 if (PL_bufptr > s) {
6564 PL_bufptr = PL_oldbufptr;
6566 no_op("Array length", d);
6568 if (!PL_tokenbuf[1])
6570 PL_expect = XOPERATOR;
6571 force_ident_maybe_lex('#');
6575 PL_tokenbuf[0] = '$';
6576 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
6577 if (PL_expect == XOPERATOR) {
6579 if (PL_bufptr > s) {
6581 PL_bufptr = PL_oldbufptr;
6585 if (!PL_tokenbuf[1]) {
6587 yyerror("Final $ should be \\$ or $name");
6593 const char tmp = *s;
6594 if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
6597 if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
6598 && intuit_more(s)) {
6600 PL_tokenbuf[0] = '@';
6601 if (ckWARN(WARN_SYNTAX)) {
6605 || isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)
6608 t += UTF ? UTF8SKIP(t) : 1;
6611 PL_bufptr = skipspace(PL_bufptr); /* XXX can realloc */
6612 while (t < PL_bufend && *t != ']')
6614 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6615 "Multidimensional syntax %" UTF8f " not supported",
6616 UTF8fARG(UTF,(int)((t - PL_bufptr) + 1), PL_bufptr));
6620 else if (*s == '{') {
6622 PL_tokenbuf[0] = '%';
6623 if (strEQ(PL_tokenbuf+1, "SIG") && ckWARN(WARN_SYNTAX)
6624 && (t = strchr(s, '}')) && (t = strchr(t, '=')))
6626 char tmpbuf[sizeof PL_tokenbuf];
6629 } while (isSPACE(*t));
6630 if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) {
6632 t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE,
6637 && get_cvn_flags(tmpbuf, len, UTF
6641 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6642 "You need to quote \"%" UTF8f "\"",
6643 UTF8fARG(UTF, len, tmpbuf));
6650 PL_expect = XOPERATOR;
6651 if (PL_lex_state == LEX_NORMAL && isSPACE((char)tmp)) {
6652 const bool islop = (PL_last_lop == PL_oldoldbufptr);
6653 if (!islop || PL_last_lop_op == OP_GREPSTART)
6654 PL_expect = XOPERATOR;
6655 else if (strchr("$@\"'`q", *s))
6656 PL_expect = XTERM; /* e.g. print $fh "foo" */
6657 else if ( strchr("&*<%", *s)
6658 && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF))
6660 PL_expect = XTERM; /* e.g. print $fh &sub */
6662 else if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
6663 char tmpbuf[sizeof PL_tokenbuf];
6665 scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
6666 if ((t2 = keyword(tmpbuf, len, 0))) {
6667 /* binary operators exclude handle interpretations */
6679 PL_expect = XTERM; /* e.g. print $fh length() */
6684 PL_expect = XTERM; /* e.g. print $fh subr() */
6687 else if (isDIGIT(*s))
6688 PL_expect = XTERM; /* e.g. print $fh 3 */
6689 else if (*s == '.' && isDIGIT(s[1]))
6690 PL_expect = XTERM; /* e.g. print $fh .3 */
6691 else if ((*s == '?' || *s == '-' || *s == '+')
6692 && !isSPACE(s[1]) && s[1] != '=')
6693 PL_expect = XTERM; /* e.g. print $fh -1 */
6694 else if (*s == '/' && !isSPACE(s[1]) && s[1] != '='
6696 PL_expect = XTERM; /* e.g. print $fh /.../
6697 XXX except DORDOR operator
6699 else if (*s == '<' && s[1] == '<' && !isSPACE(s[2])
6701 PL_expect = XTERM; /* print $fh <<"EOF" */
6704 force_ident_maybe_lex('$');
6708 if (PL_expect == XPOSTDEREF)
6710 PL_tokenbuf[0] = '@';
6711 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
6712 if (PL_expect == XOPERATOR) {
6714 if (PL_bufptr > s) {
6716 PL_bufptr = PL_oldbufptr;
6721 if (!PL_tokenbuf[1]) {
6724 if (PL_lex_state == LEX_NORMAL)
6726 if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) && intuit_more(s)) {
6728 PL_tokenbuf[0] = '%';
6730 /* Warn about @ where they meant $. */
6731 if (*s == '[' || *s == '{') {
6732 if (ckWARN(WARN_SYNTAX)) {
6733 S_check_scalar_slice(aTHX_ s);
6737 PL_expect = XOPERATOR;
6738 force_ident_maybe_lex('@');
6741 case '/': /* may be division, defined-or, or pattern */
6742 if ((PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR) && s[1] == '/') {
6743 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6744 (s[2] == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC))
6749 else if (PL_expect == XOPERATOR) {
6751 if (*s == '=' && !PL_lex_allbrackets
6752 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6760 /* Disable warning on "study /blah/" */
6761 if ( PL_oldoldbufptr == PL_last_uni
6762 && ( *PL_last_uni != 's' || s - PL_last_uni < 5
6763 || memNE(PL_last_uni, "study", 5)
6764 || isWORDCHAR_lazy_if_safe(PL_last_uni+5, PL_bufend, UTF)
6767 s = scan_pat(s,OP_MATCH);
6768 TERM(sublex_start());
6771 case '?': /* conditional */
6773 if (!PL_lex_allbrackets
6774 && PL_lex_fakeeof >= LEX_FAKEEOF_IFELSE)
6779 PL_lex_allbrackets++;
6783 if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack
6784 #ifdef PERL_STRICT_CR
6787 && (s[1] == '\n' || (s[1] == '\r' && s[2] == '\n'))
6789 && (s == PL_linestart || s[-1] == '\n') )
6792 formbrack = 2; /* dot seen where arguments expected */
6795 if (PL_expect == XSTATE && s[1] == '.' && s[2] == '.') {
6799 if (PL_expect == XOPERATOR || !isDIGIT(s[1])) {
6802 if (!PL_lex_allbrackets
6803 && PL_lex_fakeeof >= LEX_FAKEEOF_RANGE)
6811 pl_yylval.ival = OPf_SPECIAL;
6817 if (*s == '=' && !PL_lex_allbrackets
6818 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6826 case '0': case '1': case '2': case '3': case '4':
6827 case '5': case '6': case '7': case '8': case '9':
6828 s = scan_num(s, &pl_yylval);
6829 DEBUG_T( { printbuf("### Saw number in %s\n", s); } );
6830 if (PL_expect == XOPERATOR)
6835 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6838 COPLINE_SET_FROM_MULTI_END;
6839 DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
6840 if (PL_expect == XOPERATOR) {
6843 pl_yylval.ival = OP_CONST;
6844 TERM(sublex_start());
6847 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6850 printbuf("### Saw string before %s\n", s);
6852 PerlIO_printf(Perl_debug_log,
6853 "### Saw unterminated string\n");
6855 if (PL_expect == XOPERATOR) {
6860 pl_yylval.ival = OP_CONST;
6861 /* FIXME. I think that this can be const if char *d is replaced by
6862 more localised variables. */
6863 for (d = SvPV(PL_lex_stuff, len); len; len--, d++) {
6864 if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) {
6865 pl_yylval.ival = OP_STRINGIFY;
6869 if (pl_yylval.ival == OP_CONST)
6870 COPLINE_SET_FROM_MULTI_END;
6871 TERM(sublex_start());
6874 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6877 printbuf("### Saw backtick string before %s\n", s);
6879 PerlIO_printf(Perl_debug_log,
6880 "### Saw unterminated backtick string\n");
6882 if (PL_expect == XOPERATOR)
6883 no_op("Backticks",s);
6886 pl_yylval.ival = OP_BACKTICK;
6887 TERM(sublex_start());
6891 if (PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
6893 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),"Can't use \\%c to mean $%c in expression",
6895 if (PL_expect == XOPERATOR)
6896 no_op("Backslash",s);
6900 if (isDIGIT(s[1]) && PL_expect != XOPERATOR) {
6901 char *start = s + 2;
6902 while (isDIGIT(*start) || *start == '_')
6904 if (*start == '.' && isDIGIT(start[1])) {
6905 s = scan_num(s, &pl_yylval);
6908 else if ((*start == ':' && start[1] == ':')
6909 || (PL_expect == XSTATE && *start == ':'))
6911 else if (PL_expect == XSTATE) {
6913 while (d < PL_bufend && isSPACE(*d)) d++;
6914 if (*d == ':') goto keylookup;
6916 /* avoid v123abc() or $h{v1}, allow C<print v10;> */
6917 if (!isALPHA(*start) && (PL_expect == XTERM
6918 || PL_expect == XREF || PL_expect == XSTATE
6919 || PL_expect == XTERMORDORDOR)) {
6920 GV *const gv = gv_fetchpvn_flags(s, start - s,
6921 UTF ? SVf_UTF8 : 0, SVt_PVCV);
6923 s = scan_num(s, &pl_yylval);
6930 if (isDIGIT(s[1]) && PL_expect == XOPERATOR) {
6983 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
6985 /* Some keywords can be followed by any delimiter, including ':' */
6986 anydelim = word_takes_any_delimiter(PL_tokenbuf, len);
6988 /* x::* is just a word, unless x is "CORE" */
6989 if (!anydelim && *s == ':' && s[1] == ':') {
6990 if (strEQ(PL_tokenbuf, "CORE")) goto case_KEY_CORE;
6995 while (d < PL_bufend && isSPACE(*d))
6996 d++; /* no comments skipped here, or s### is misparsed */
6998 /* Is this a word before a => operator? */
6999 if (*d == '=' && d[1] == '>') {
7003 = newSVOP(OP_CONST, 0,
7004 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
7005 pl_yylval.opval->op_private = OPpCONST_BARE;
7009 /* Check for plugged-in keyword */
7013 char *saved_bufptr = PL_bufptr;
7015 result = PL_keyword_plugin(aTHX_ PL_tokenbuf, len, &o);
7017 if (result == KEYWORD_PLUGIN_DECLINE) {
7018 /* not a plugged-in keyword */
7019 PL_bufptr = saved_bufptr;
7020 } else if (result == KEYWORD_PLUGIN_STMT) {
7021 pl_yylval.opval = o;
7023 if (!PL_nexttoke) PL_expect = XSTATE;
7024 return REPORT(PLUGSTMT);
7025 } else if (result == KEYWORD_PLUGIN_EXPR) {
7026 pl_yylval.opval = o;
7028 if (!PL_nexttoke) PL_expect = XOPERATOR;
7029 return REPORT(PLUGEXPR);
7031 Perl_croak(aTHX_ "Bad plugin affecting keyword '%s'",
7036 /* Check for built-in keyword */
7037 tmp = keyword(PL_tokenbuf, len, 0);
7039 /* Is this a label? */
7040 if (!anydelim && PL_expect == XSTATE
7041 && d < PL_bufend && *d == ':' && *(d + 1) != ':') {
7043 pl_yylval.pval = savepvn(PL_tokenbuf, len+1);
7044 pl_yylval.pval[len] = '\0';
7045 pl_yylval.pval[len+1] = UTF ? 1 : 0;
7050 /* Check for lexical sub */
7051 if (PL_expect != XOPERATOR) {
7052 char tmpbuf[sizeof PL_tokenbuf + 1];
7054 Copy(PL_tokenbuf, tmpbuf+1, len, char);
7055 off = pad_findmy_pvn(tmpbuf, len+1, 0);
7056 if (off != NOT_IN_PAD) {
7057 assert(off); /* we assume this is boolean-true below */
7058 if (PAD_COMPNAME_FLAGS_isOUR(off)) {
7059 HV * const stash = PAD_COMPNAME_OURSTASH(off);
7060 HEK * const stashname = HvNAME_HEK(stash);
7061 sv = newSVhek(stashname);
7062 sv_catpvs(sv, "::");
7063 sv_catpvn_flags(sv, PL_tokenbuf, len,
7064 (UTF ? SV_CATUTF8 : SV_CATBYTES));
7065 gv = gv_fetchsv(sv, GV_NOADD_NOINIT | SvUTF8(sv),
7075 rv2cv_op = newOP(OP_PADANY, 0);
7076 rv2cv_op->op_targ = off;
7077 cv = find_lexical_cv(off);
7085 if (tmp < 0) { /* second-class keyword? */
7086 GV *ogv = NULL; /* override (winner) */
7087 GV *hgv = NULL; /* hidden (loser) */
7088 if (PL_expect != XOPERATOR && (*s != ':' || s[1] != ':')) {
7090 if ((gv = gv_fetchpvn_flags(PL_tokenbuf, len,
7091 (UTF ? SVf_UTF8 : 0)|GV_NOTQUAL,
7093 && (cv = GvCVu(gv)))
7095 if (GvIMPORTED_CV(gv))
7097 else if (! CvMETHOD(cv))
7101 && (gvp = (GV**)hv_fetch(PL_globalstash, PL_tokenbuf,
7104 && (isGV_with_GP(gv)
7105 ? GvCVu(gv) && GvIMPORTED_CV(gv)
7106 : SvPCS_IMPORTED(gv)
7107 && (gv_init(gv, PL_globalstash, PL_tokenbuf,
7115 tmp = 0; /* overridden by import or by GLOBAL */
7118 && -tmp==KEY_lock /* XXX generalizable kludge */
7121 tmp = 0; /* any sub overrides "weak" keyword */
7123 else { /* no override */
7125 if (tmp == KEY_dump) {
7126 Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
7127 "dump() better written as CORE::dump(). "
7128 "dump() will no longer be available "
7133 if (hgv && tmp != KEY_x) /* never ambiguous */
7134 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
7135 "Ambiguous call resolved as CORE::%s(), "
7136 "qualify as such or use &",
7141 if (tmp && tmp != KEY___DATA__ && tmp != KEY___END__
7142 && (!anydelim || *s != '#')) {
7143 /* no override, and not s### either; skipspace is safe here
7144 * check for => on following line */
7146 STRLEN bufoff = PL_bufptr - SvPVX(PL_linestr);
7147 STRLEN soff = s - SvPVX(PL_linestr);
7149 arrow = *s == '=' && s[1] == '>';
7150 PL_bufptr = SvPVX(PL_linestr) + bufoff;
7151 s = SvPVX(PL_linestr) + soff;
7159 /* Trade off - by using this evil construction we can pull the
7160 variable gv into the block labelled keylookup. If not, then
7161 we have to give it function scope so that the goto from the
7162 earlier ':' case doesn't bypass the initialisation. */
7163 just_a_word_zero_gv:
7172 default: /* not a keyword */
7175 const char lastchar = (PL_bufptr == PL_oldoldbufptr ? 0 : PL_bufptr[-1]);
7179 /* Get the rest if it looks like a package qualifier */
7181 if (*s == '\'' || (*s == ':' && s[1] == ':')) {
7183 s = scan_word(s, PL_tokenbuf + len, sizeof PL_tokenbuf - len,
7186 Perl_croak(aTHX_ "Bad name after %" UTF8f "%s",
7187 UTF8fARG(UTF, len, PL_tokenbuf),
7188 *s == '\'' ? "'" : "::");
7193 if (PL_expect == XOPERATOR) {
7194 if (PL_bufptr == PL_linestart) {
7195 CopLINE_dec(PL_curcop);
7196 Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
7197 CopLINE_inc(PL_curcop);
7200 no_op("Bareword",s);
7203 /* See if the name is "Foo::",
7204 in which case Foo is a bareword
7205 (and a package name). */
7208 && PL_tokenbuf[len - 2] == ':'
7209 && PL_tokenbuf[len - 1] == ':')
7211 if (ckWARN(WARN_BAREWORD)
7212 && ! gv_fetchpvn_flags(PL_tokenbuf, len, UTF ? SVf_UTF8 : 0, SVt_PVHV))
7213 Perl_warner(aTHX_ packWARN(WARN_BAREWORD),
7214 "Bareword \"%" UTF8f
7215 "\" refers to nonexistent package",
7216 UTF8fARG(UTF, len, PL_tokenbuf));
7218 PL_tokenbuf[len] = '\0';
7227 /* if we saw a global override before, get the right name */
7230 sv = S_newSV_maybe_utf8(aTHX_ PL_tokenbuf,
7233 SV * const tmp_sv = sv;
7234 sv = newSVpvs("CORE::GLOBAL::");
7235 sv_catsv(sv, tmp_sv);
7236 SvREFCNT_dec(tmp_sv);
7240 /* Presume this is going to be a bareword of some sort. */
7242 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
7243 pl_yylval.opval->op_private = OPpCONST_BARE;
7245 /* And if "Foo::", then that's what it certainly is. */
7251 OP *const_op = newSVOP(OP_CONST, 0, SvREFCNT_inc_NN(sv));
7252 const_op->op_private = OPpCONST_BARE;
7254 newCVREF(OPpMAY_RETURN_CONSTANT<<8, const_op);
7258 : SvROK(gv) && SvTYPE(SvRV(gv)) == SVt_PVCV
7261 : rv2cv_op_cv(rv2cv_op, RV2CVOPCV_RETURN_STUB);
7264 /* Use this var to track whether intuit_method has been
7265 called. intuit_method returns 0 or > 255. */
7268 /* See if it's the indirect object for a list operator. */
7271 && PL_oldoldbufptr < PL_bufptr
7272 && (PL_oldoldbufptr == PL_last_lop
7273 || PL_oldoldbufptr == PL_last_uni)
7274 && /* NO SKIPSPACE BEFORE HERE! */
7276 || ((PL_opargs[PL_last_lop_op] >> OASHIFT)& 7)
7279 bool immediate_paren = *s == '(';
7282 /* (Now we can afford to cross potential line boundary.) */
7285 /* intuit_method() can indirectly call lex_next_chunk(),
7288 s_off = s - SvPVX(PL_linestr);
7289 /* Two barewords in a row may indicate method call. */
7290 if ( ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
7292 && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
7294 /* the code at method: doesn't use s */
7297 s = SvPVX(PL_linestr) + s_off;
7299 /* If not a declared subroutine, it's an indirect object. */
7300 /* (But it's an indir obj regardless for sort.) */
7301 /* Also, if "_" follows a filetest operator, it's a bareword */
7304 ( !immediate_paren && (PL_last_lop_op == OP_SORT
7306 && (PL_last_lop_op != OP_MAPSTART
7307 && PL_last_lop_op != OP_GREPSTART))))
7308 || (PL_tokenbuf[0] == '_' && PL_tokenbuf[1] == '\0'
7309 && ((PL_opargs[PL_last_lop_op] & OA_CLASS_MASK)
7313 PL_expect = (PL_last_lop == PL_oldoldbufptr) ? XTERM : XOPERATOR;
7318 PL_expect = XOPERATOR;
7321 /* Is this a word before a => operator? */
7322 if (*s == '=' && s[1] == '>' && !pkgname) {
7325 if (gvp || (lex && !off)) {
7326 assert (cSVOPx(pl_yylval.opval)->op_sv == sv);
7327 /* This is our own scalar, created a few lines
7328 above, so this is safe. */
7330 sv_setpv(sv, PL_tokenbuf);
7331 if (UTF && !IN_BYTES
7332 && is_utf8_string((U8*)PL_tokenbuf, len))
7339 /* If followed by a paren, it's certainly a subroutine. */
7344 while (SPACE_OR_TAB(*d))
7346 if (*d == ')' && (sv = cv_const_sv_or_av(cv))) {
7351 NEXTVAL_NEXTTOKE.opval =
7352 off ? rv2cv_op : pl_yylval.opval;
7354 op_free(pl_yylval.opval), force_next(PRIVATEREF);
7355 else op_free(rv2cv_op), force_next(BAREWORD);
7360 /* If followed by var or block, call it a method (unless sub) */
7362 if ((*s == '$' || *s == '{') && !cv) {
7364 PL_last_lop = PL_oldbufptr;
7365 PL_last_lop_op = OP_METHOD;
7366 if (!PL_lex_allbrackets
7367 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7369 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7371 PL_expect = XBLOCKTERM;
7373 return REPORT(METHOD);
7376 /* If followed by a bareword, see if it looks like indir obj. */
7380 && (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || *s == '$')
7381 && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
7385 assert(cSVOPx(pl_yylval.opval)->op_sv == sv);
7387 sv_setpvn(sv, PL_tokenbuf, len);
7388 if (UTF && !IN_BYTES
7389 && is_utf8_string((U8*)PL_tokenbuf, len))
7391 else SvUTF8_off(sv);
7394 if (tmp == METHOD && !PL_lex_allbrackets
7395 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7397 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7402 /* Not a method, so call it a subroutine (if defined) */
7405 /* Check for a constant sub */
7406 if ((sv = cv_const_sv_or_av(cv))) {
7409 SvREFCNT_dec(((SVOP*)pl_yylval.opval)->op_sv);
7410 ((SVOP*)pl_yylval.opval)->op_sv = SvREFCNT_inc_simple(sv);
7411 if (SvTYPE(sv) == SVt_PVAV)
7412 pl_yylval.opval = newUNOP(OP_RV2AV, OPf_PARENS,
7415 pl_yylval.opval->op_private = 0;
7416 pl_yylval.opval->op_folded = 1;
7417 pl_yylval.opval->op_flags |= OPf_SPECIAL;
7422 op_free(pl_yylval.opval);
7424 off ? newCVREF(0, rv2cv_op) : rv2cv_op;
7425 pl_yylval.opval->op_private |= OPpENTERSUB_NOPAREN;
7426 PL_last_lop = PL_oldbufptr;
7427 PL_last_lop_op = OP_ENTERSUB;
7428 /* Is there a prototype? */
7432 STRLEN protolen = CvPROTOLEN(cv);
7433 const char *proto = CvPROTO(cv);
7435 proto = S_strip_spaces(aTHX_ proto, &protolen);
7438 if ((optional = *proto == ';'))
7441 while (*proto == ';');
7445 *proto == '$' || *proto == '_'
7446 || *proto == '*' || *proto == '+'
7451 *proto == '\\' && proto[1] && proto[2] == '\0'
7454 UNIPROTO(UNIOPSUB,optional);
7455 if (*proto == '\\' && proto[1] == '[') {
7456 const char *p = proto + 2;
7457 while(*p && *p != ']')
7459 if(*p == ']' && !p[1])
7460 UNIPROTO(UNIOPSUB,optional);
7462 if (*proto == '&' && *s == '{') {
7464 sv_setpvs(PL_subname, "__ANON__");
7466 sv_setpvs(PL_subname, "__ANON__::__ANON__");
7467 if (!PL_lex_allbrackets
7468 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7470 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7475 NEXTVAL_NEXTTOKE.opval = pl_yylval.opval;
7477 force_next(off ? PRIVATEREF : BAREWORD);
7478 if (!PL_lex_allbrackets
7479 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7481 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7486 /* Call it a bare word */
7488 if (PL_hints & HINT_STRICT_SUBS)
7489 pl_yylval.opval->op_private |= OPpCONST_STRICT;
7492 /* after "print" and similar functions (corresponding to
7493 * "F? L" in opcode.pl), whatever wasn't already parsed as
7494 * a filehandle should be subject to "strict subs".
7495 * Likewise for the optional indirect-object argument to system
7496 * or exec, which can't be a bareword */
7497 if ((PL_last_lop_op == OP_PRINT
7498 || PL_last_lop_op == OP_PRTF
7499 || PL_last_lop_op == OP_SAY
7500 || PL_last_lop_op == OP_SYSTEM
7501 || PL_last_lop_op == OP_EXEC)
7502 && (PL_hints & HINT_STRICT_SUBS))
7503 pl_yylval.opval->op_private |= OPpCONST_STRICT;
7504 if (lastchar != '-') {
7505 if (ckWARN(WARN_RESERVED)) {
7509 if (!*d && !gv_stashpv(PL_tokenbuf, UTF ? SVf_UTF8 : 0))
7511 /* PL_warn_reserved is constant */
7512 GCC_DIAG_IGNORE(-Wformat-nonliteral);
7513 Perl_warner(aTHX_ packWARN(WARN_RESERVED), PL_warn_reserved,
7523 if ((lastchar == '*' || lastchar == '%' || lastchar == '&')
7524 && saw_infix_sigil) {
7525 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
7526 "Operator or semicolon missing before %c%" UTF8f,
7528 UTF8fARG(UTF, strlen(PL_tokenbuf),
7530 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
7531 "Ambiguous use of %c resolved as operator %c",
7532 lastchar, lastchar);
7539 newSVOP(OP_CONST, 0, newSVpv(CopFILE(PL_curcop),0))
7544 newSVOP(OP_CONST, 0,
7545 Perl_newSVpvf(aTHX_ "%" IVdf, (IV)CopLINE(PL_curcop)))
7548 case KEY___PACKAGE__:
7550 newSVOP(OP_CONST, 0,
7552 ? newSVhek(HvNAME_HEK(PL_curstash))
7559 if (PL_rsfp && (!PL_in_eval || PL_tokenbuf[2] == 'D')) {
7560 HV * const stash = PL_tokenbuf[2] == 'D' && PL_curstash
7563 gv = (GV *)*hv_fetchs(stash, "DATA", 1);
7565 gv_init(gv,stash,"DATA",4,0);
7568 GvIOp(gv) = newIO();
7569 IoIFP(GvIOp(gv)) = PL_rsfp;
7570 #if defined(HAS_FCNTL) && defined(F_SETFD) && defined(FD_CLOEXEC)
7572 const int fd = PerlIO_fileno(PL_rsfp);
7574 fcntl(fd,F_SETFD, FD_CLOEXEC);
7578 /* Mark this internal pseudo-handle as clean */
7579 IoFLAGS(GvIOp(gv)) |= IOf_UNTAINT;
7580 if ((PerlIO*)PL_rsfp == PerlIO_stdin())
7581 IoTYPE(GvIOp(gv)) = IoTYPE_STD;
7583 IoTYPE(GvIOp(gv)) = IoTYPE_RDONLY;
7584 #if defined(WIN32) && !defined(PERL_TEXTMODE_SCRIPTS)
7585 /* if the script was opened in binmode, we need to revert
7586 * it to text mode for compatibility; but only iff it has CRs
7587 * XXX this is a questionable hack at best. */
7588 if (PL_bufend-PL_bufptr > 2
7589 && PL_bufend[-1] == '\n' && PL_bufend[-2] == '\r')
7592 if (IoTYPE(GvIOp(gv)) == IoTYPE_RDONLY) {
7593 loc = PerlIO_tell(PL_rsfp);
7594 (void)PerlIO_seek(PL_rsfp, 0L, 0);
7597 if (PerlLIO_setmode(PL_rsfp, O_TEXT) != -1) {
7599 if (PerlLIO_setmode(PerlIO_fileno(PL_rsfp), O_TEXT) != -1) {
7600 #endif /* NETWARE */
7602 PerlIO_seek(PL_rsfp, loc, 0);
7606 #ifdef PERLIO_LAYERS
7609 PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8");
7618 FUN0OP(CvCLONE(PL_compcv)
7619 ? newOP(OP_RUNCV, 0)
7620 : newPVOP(OP_RUNCV,0,NULL));
7629 if (PL_expect == XSTATE) {
7640 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
7641 if ((*s == ':' && s[1] == ':')
7642 || (!(tmp = keyword(PL_tokenbuf, len, 1)) && *s == '\''))
7646 Copy(PL_bufptr, PL_tokenbuf, olen, char);
7650 Perl_croak(aTHX_ "CORE::%" UTF8f " is not a keyword",
7651 UTF8fARG(UTF, len, PL_tokenbuf));
7654 else if (tmp == KEY_require || tmp == KEY_do
7656 /* that's a way to remember we saw "CORE::" */
7668 LOP(OP_ACCEPT,XTERM);
7671 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
7676 LOP(OP_ATAN2,XTERM);
7682 LOP(OP_BINMODE,XTERM);
7685 LOP(OP_BLESS,XTERM);
7694 /* We have to disambiguate the two senses of
7695 "continue". If the next token is a '{' then
7696 treat it as the start of a continue block;
7697 otherwise treat it as a control operator.
7707 (void)gv_fetchpvs("ENV", GV_ADD|GV_NOTQUAL, SVt_PVHV);
7717 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7726 if (!PL_cryptseen) {
7727 PL_cryptseen = TRUE;
7731 LOP(OP_CRYPT,XTERM);
7734 LOP(OP_CHMOD,XTERM);
7737 LOP(OP_CHOWN,XTERM);
7740 LOP(OP_CONNECT,XTERM);
7760 d = scan_word(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
7762 if (len && (len != 4 || strNE(PL_tokenbuf+1, "CORE"))
7763 && !keyword(PL_tokenbuf + 1, len, 0)) {
7764 SSize_t off = s-SvPVX(PL_linestr);
7766 s = SvPVX(PL_linestr)+off;
7768 force_ident_maybe_lex('&');
7773 if (orig_keyword == KEY_do) {
7782 PL_hints |= HINT_BLOCK_SCOPE;
7792 Perl_populate_isa(aTHX_ STR_WITH_LEN("AnyDBM_File::ISA"),
7793 STR_WITH_LEN("NDBM_File::"),
7794 STR_WITH_LEN("DB_File::"),
7795 STR_WITH_LEN("GDBM_File::"),
7796 STR_WITH_LEN("SDBM_File::"),
7797 STR_WITH_LEN("ODBM_File::"),
7799 LOP(OP_DBMOPEN,XTERM);
7811 pl_yylval.ival = CopLINE(PL_curcop);
7815 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7827 if (*s == '{') { /* block eval */
7828 PL_expect = XTERMBLOCK;
7829 UNIBRACK(OP_ENTERTRY);
7831 else { /* string eval */
7833 UNIBRACK(OP_ENTEREVAL);
7838 UNIBRACK(-OP_ENTEREVAL);
7852 case KEY_endhostent:
7858 case KEY_endservent:
7861 case KEY_endprotoent:
7872 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
7874 pl_yylval.ival = CopLINE(PL_curcop);
7876 if ( PL_expect == XSTATE
7877 && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
7880 SSize_t s_off = s - SvPVX(PL_linestr);
7882 if ((PL_bufend - p) >= 3
7883 && strEQs(p, "my") && isSPACE(*(p + 2)))
7887 else if ((PL_bufend - p) >= 4
7888 && strEQs(p, "our") && isSPACE(*(p + 3)))
7891 /* skip optional package name, as in "for my abc $x (..)" */
7892 if (isIDFIRST_lazy_if_safe(p, PL_bufend, UTF)) {
7893 p = scan_word(p, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
7896 if (*p != '$' && *p != '\\')
7897 Perl_croak(aTHX_ "Missing $ on loop variable");
7899 /* The buffer may have been reallocated, update s */
7900 s = SvPVX(PL_linestr) + s_off;
7905 LOP(OP_FORMLINE,XTERM);
7914 LOP(OP_FCNTL,XTERM);
7920 LOP(OP_FLOCK,XTERM);
7923 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7928 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7933 LOP(OP_GREPSTART, XREF);
7950 case KEY_getpriority:
7951 LOP(OP_GETPRIORITY,XTERM);
7953 case KEY_getprotobyname:
7956 case KEY_getprotobynumber:
7957 LOP(OP_GPBYNUMBER,XTERM);
7959 case KEY_getprotoent:
7971 case KEY_getpeername:
7972 UNI(OP_GETPEERNAME);
7974 case KEY_gethostbyname:
7977 case KEY_gethostbyaddr:
7978 LOP(OP_GHBYADDR,XTERM);
7980 case KEY_gethostent:
7983 case KEY_getnetbyname:
7986 case KEY_getnetbyaddr:
7987 LOP(OP_GNBYADDR,XTERM);
7992 case KEY_getservbyname:
7993 LOP(OP_GSBYNAME,XTERM);
7995 case KEY_getservbyport:
7996 LOP(OP_GSBYPORT,XTERM);
7998 case KEY_getservent:
8001 case KEY_getsockname:
8002 UNI(OP_GETSOCKNAME);
8004 case KEY_getsockopt:
8005 LOP(OP_GSOCKOPT,XTERM);
8020 pl_yylval.ival = CopLINE(PL_curcop);
8021 Perl_ck_warner_d(aTHX_
8022 packWARN(WARN_EXPERIMENTAL__SMARTMATCH),
8023 "given is experimental");
8028 orig_keyword==KEY_glob ? -OP_GLOB : OP_GLOB,
8036 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8038 pl_yylval.ival = CopLINE(PL_curcop);
8042 LOP(OP_INDEX,XTERM);
8048 LOP(OP_IOCTL,XTERM);
8075 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8080 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8094 LOP(OP_LISTEN,XTERM);
8103 s = scan_pat(s,OP_MATCH);
8104 TERM(sublex_start());
8107 LOP(OP_MAPSTART, XREF);
8110 LOP(OP_MKDIR,XTERM);
8113 LOP(OP_MSGCTL,XTERM);
8116 LOP(OP_MSGGET,XTERM);
8119 LOP(OP_MSGRCV,XTERM);
8122 LOP(OP_MSGSND,XTERM);
8129 yyerror(Perl_form(aTHX_
8130 "Can't redeclare \"%s\" in \"%s\"",
8131 tmp == KEY_my ? "my" :
8132 tmp == KEY_state ? "state" : "our",
8133 PL_in_my == KEY_my ? "my" :
8134 PL_in_my == KEY_state ? "state" : "our"));
8136 PL_in_my = (U16)tmp;
8138 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
8139 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
8140 if (len == 3 && strEQs(PL_tokenbuf, "sub"))
8142 PL_in_my_stash = find_in_my_stash(PL_tokenbuf, len);
8143 if (!PL_in_my_stash) {
8147 len = my_snprintf(tmpbuf, sizeof(tmpbuf), "No such class %.1000s", PL_tokenbuf);
8148 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(tmpbuf));
8149 yyerror_pv(tmpbuf, UTF ? SVf_UTF8 : 0);
8152 else if (*s == '\\') {
8153 if (!FEATURE_MYREF_IS_ENABLED)
8154 Perl_croak(aTHX_ "The experimental declared_refs "
8155 "feature is not enabled");
8156 Perl_ck_warner_d(aTHX_
8157 packWARN(WARN_EXPERIMENTAL__DECLARED_REFS),
8158 "Declaring references is experimental");
8166 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8171 s = tokenize_use(0, s);
8175 if (*s == '(' || (s = skipspace(s), *s == '('))
8178 if (!PL_lex_allbrackets
8179 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
8181 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
8188 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
8190 d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE,
8192 for (t=d; isSPACE(*t);)
8194 if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE)
8196 && !(t[0] == '=' && t[1] == '>')
8197 && !(t[0] == ':' && t[1] == ':')
8198 && !keyword(s, d-s, 0)
8200 Perl_warner(aTHX_ packWARN(WARN_PRECEDENCE),
8201 "Precedence problem: open %" UTF8f " should be open(%" UTF8f ")",
8202 UTF8fARG(UTF, d-s, s), UTF8fARG(UTF, d-s, s));
8208 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
8210 pl_yylval.ival = OP_OR;
8220 LOP(OP_OPEN_DIR,XTERM);
8223 checkcomma(s,PL_tokenbuf,"filehandle");
8227 checkcomma(s,PL_tokenbuf,"filehandle");
8246 s = force_word(s,BAREWORD,FALSE,TRUE);
8248 s = force_strict_version(s);
8252 LOP(OP_PIPE_OP,XTERM);
8255 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8258 COPLINE_SET_FROM_MULTI_END;
8259 pl_yylval.ival = OP_CONST;
8260 TERM(sublex_start());
8267 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8270 COPLINE_SET_FROM_MULTI_END;
8271 PL_expect = XOPERATOR;
8272 if (SvCUR(PL_lex_stuff)) {
8273 int warned_comma = !ckWARN(WARN_QW);
8274 int warned_comment = warned_comma;
8275 d = SvPV_force(PL_lex_stuff, len);
8277 for (; isSPACE(*d) && len; --len, ++d)
8282 if (!warned_comma || !warned_comment) {
8283 for (; !isSPACE(*d) && len; --len, ++d) {
8284 if (!warned_comma && *d == ',') {
8285 Perl_warner(aTHX_ packWARN(WARN_QW),
8286 "Possible attempt to separate words with commas");
8289 else if (!warned_comment && *d == '#') {
8290 Perl_warner(aTHX_ packWARN(WARN_QW),
8291 "Possible attempt to put comments in qw() list");
8297 for (; !isSPACE(*d) && len; --len, ++d)
8300 sv = newSVpvn_utf8(b, d-b, DO_UTF8(PL_lex_stuff));
8301 words = op_append_elem(OP_LIST, words,
8302 newSVOP(OP_CONST, 0, tokeq(sv)));
8307 words = newNULLLIST();
8308 SvREFCNT_dec_NN(PL_lex_stuff);
8309 PL_lex_stuff = NULL;
8310 PL_expect = XOPERATOR;
8311 pl_yylval.opval = sawparens(words);
8316 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8319 pl_yylval.ival = OP_STRINGIFY;
8320 if (SvIVX(PL_lex_stuff) == '\'')
8321 SvIV_set(PL_lex_stuff, 0); /* qq'$foo' should interpolate */
8322 TERM(sublex_start());
8325 s = scan_pat(s,OP_QR);
8326 TERM(sublex_start());
8329 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8332 pl_yylval.ival = OP_BACKTICK;
8333 TERM(sublex_start());
8341 s = force_version(s, FALSE);
8343 else if (*s != 'v' || !isDIGIT(s[1])
8344 || (s = force_version(s, TRUE), *s == 'v'))
8346 *PL_tokenbuf = '\0';
8347 s = force_word(s,BAREWORD,TRUE,TRUE);
8348 if (isIDFIRST_lazy_if_safe(PL_tokenbuf,
8349 PL_tokenbuf + sizeof(PL_tokenbuf),
8352 gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf),
8353 GV_ADD | (UTF ? SVf_UTF8 : 0));
8356 yyerror("<> at require-statement should be quotes");
8358 if (orig_keyword == KEY_require) {
8364 PL_expect = PL_nexttoke ? XOPERATOR : XTERM;
8366 PL_last_uni = PL_oldbufptr;
8367 PL_last_lop_op = OP_REQUIRE;
8369 return REPORT( (int)REQUIRE );
8378 LOP(OP_RENAME,XTERM);
8387 LOP(OP_RINDEX,XTERM);
8396 UNIDOR(OP_READLINE);
8399 UNIDOR(OP_BACKTICK);
8408 LOP(OP_REVERSE,XTERM);
8411 UNIDOR(OP_READLINK);
8418 if (pl_yylval.opval)
8419 TERM(sublex_start());
8421 TOKEN(1); /* force error */
8424 checkcomma(s,PL_tokenbuf,"filehandle");
8434 LOP(OP_SELECT,XTERM);
8440 LOP(OP_SEMCTL,XTERM);
8443 LOP(OP_SEMGET,XTERM);
8446 LOP(OP_SEMOP,XTERM);
8452 LOP(OP_SETPGRP,XTERM);
8454 case KEY_setpriority:
8455 LOP(OP_SETPRIORITY,XTERM);
8457 case KEY_sethostent:
8463 case KEY_setservent:
8466 case KEY_setprotoent:
8476 LOP(OP_SEEKDIR,XTERM);
8478 case KEY_setsockopt:
8479 LOP(OP_SSOCKOPT,XTERM);
8485 LOP(OP_SHMCTL,XTERM);
8488 LOP(OP_SHMGET,XTERM);
8491 LOP(OP_SHMREAD,XTERM);
8494 LOP(OP_SHMWRITE,XTERM);
8497 LOP(OP_SHUTDOWN,XTERM);
8506 LOP(OP_SOCKET,XTERM);
8508 case KEY_socketpair:
8509 LOP(OP_SOCKPAIR,XTERM);
8512 checkcomma(s,PL_tokenbuf,"subroutine name");
8515 s = force_word(s,BAREWORD,TRUE,TRUE);
8519 LOP(OP_SPLIT,XTERM);
8522 LOP(OP_SPRINTF,XTERM);
8525 LOP(OP_SPLICE,XTERM);
8540 LOP(OP_SUBSTR,XTERM);
8546 char * const tmpbuf = PL_tokenbuf + 1;
8547 expectation attrful;
8548 bool have_name, have_proto;
8549 const int key = tmp;
8550 SV *format_name = NULL;
8552 SSize_t off = s-SvPVX(PL_linestr);
8554 d = SvPVX(PL_linestr)+off;
8556 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
8558 || (*s == ':' && s[1] == ':'))
8562 attrful = XATTRBLOCK;
8563 d = scan_word(s, tmpbuf, sizeof PL_tokenbuf - 1, TRUE,
8565 if (key == KEY_format)
8566 format_name = S_newSV_maybe_utf8(aTHX_ s, d - s);
8568 if (memchr(tmpbuf, ':', len) || key != KEY_sub
8570 PL_tokenbuf, len + 1, 0
8572 sv_setpvn(PL_subname, tmpbuf, len);
8574 sv_setsv(PL_subname,PL_curstname);
8575 sv_catpvs(PL_subname,"::");
8576 sv_catpvn(PL_subname,tmpbuf,len);
8578 if (SvUTF8(PL_linestr))
8579 SvUTF8_on(PL_subname);
8586 if (key == KEY_my || key == KEY_our || key==KEY_state)
8589 /* diag_listed_as: Missing name in "%s sub" */
8591 "Missing name in \"%s\"", PL_bufptr);
8593 PL_expect = XTERMBLOCK;
8594 attrful = XATTRTERM;
8595 sv_setpvs(PL_subname,"?");
8599 if (key == KEY_format) {
8601 NEXTVAL_NEXTTOKE.opval
8602 = newSVOP(OP_CONST,0, format_name);
8603 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
8604 force_next(BAREWORD);
8609 /* Look for a prototype */
8610 if (*s == '(' && !FEATURE_SIGNATURES_IS_ENABLED) {
8611 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8612 COPLINE_SET_FROM_MULTI_END;
8614 Perl_croak(aTHX_ "Prototype not terminated");
8615 (void)validate_proto(PL_subname, PL_lex_stuff, ckWARN(WARN_ILLEGALPROTO));
8623 if (*s == ':' && s[1] != ':')
8624 PL_expect = attrful;
8625 else if ((*s != '{' && *s != '(') && key != KEY_format) {
8626 assert(key == KEY_sub || key == KEY_AUTOLOAD ||
8627 key == KEY_DESTROY || key == KEY_BEGIN ||
8628 key == KEY_UNITCHECK || key == KEY_CHECK ||
8629 key == KEY_INIT || key == KEY_END ||
8630 key == KEY_my || key == KEY_state ||
8633 Perl_croak(aTHX_ "Illegal declaration of anonymous subroutine");
8634 else if (*s != ';' && *s != '}')
8635 Perl_croak(aTHX_ "Illegal declaration of subroutine %" SVf, SVfARG(PL_subname));
8639 NEXTVAL_NEXTTOKE.opval =
8640 newSVOP(OP_CONST, 0, PL_lex_stuff);
8641 PL_lex_stuff = NULL;
8646 sv_setpvs(PL_subname, "__ANON__");
8648 sv_setpvs(PL_subname, "__ANON__::__ANON__");
8651 force_ident_maybe_lex('&');
8656 LOP(OP_SYSTEM,XREF);
8659 LOP(OP_SYMLINK,XTERM);
8662 LOP(OP_SYSCALL,XTERM);
8665 LOP(OP_SYSOPEN,XTERM);
8668 LOP(OP_SYSSEEK,XTERM);
8671 LOP(OP_SYSREAD,XTERM);
8674 LOP(OP_SYSWRITE,XTERM);
8679 TERM(sublex_start());
8700 LOP(OP_TRUNCATE,XTERM);
8712 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8714 pl_yylval.ival = CopLINE(PL_curcop);
8718 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8720 pl_yylval.ival = CopLINE(PL_curcop);
8724 LOP(OP_UNLINK,XTERM);
8730 LOP(OP_UNPACK,XTERM);
8733 LOP(OP_UTIME,XTERM);
8739 LOP(OP_UNSHIFT,XTERM);
8742 s = tokenize_use(1, s);
8752 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8754 pl_yylval.ival = CopLINE(PL_curcop);
8755 Perl_ck_warner_d(aTHX_
8756 packWARN(WARN_EXPERIMENTAL__SMARTMATCH),
8757 "when is experimental");
8761 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8763 pl_yylval.ival = CopLINE(PL_curcop);
8767 PL_hints |= HINT_BLOCK_SCOPE;
8774 LOP(OP_WAITPID,XTERM);
8780 /* Make sure $^L is defined. 0x0C is CTRL-L on ASCII platforms, and
8781 * we use the same number on EBCDIC */
8782 gv_fetchpvs("\x0C", GV_ADD|GV_NOTQUAL, SVt_PV);
8786 if (PL_expect == XOPERATOR) {
8787 if (*s == '=' && !PL_lex_allbrackets
8788 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
8798 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
8800 pl_yylval.ival = OP_XOR;
8809 Looks up an identifier in the pad or in a package
8811 is_sig indicates that this is a subroutine signature variable
8812 rather than a plain pad var.
8815 PRIVATEREF if this is a lexical name.
8816 BAREWORD if this belongs to a package.
8819 if we're in a my declaration
8820 croak if they tried to say my($foo::bar)
8821 build the ops for a my() declaration
8822 if it's an access to a my() variable
8823 build ops for access to a my() variable
8824 if in a dq string, and they've said @foo and we can't find @foo
8826 build ops for a bareword
8830 S_pending_ident(pTHX)
8833 const char pit = (char)pl_yylval.ival;
8834 const STRLEN tokenbuf_len = strlen(PL_tokenbuf);
8835 /* All routes through this function want to know if there is a colon. */
8836 const char *const has_colon = (const char*) memchr (PL_tokenbuf, ':', tokenbuf_len);
8838 DEBUG_T({ PerlIO_printf(Perl_debug_log,
8839 "### Pending identifier '%s'\n", PL_tokenbuf); });
8841 /* if we're in a my(), we can't allow dynamics here.
8842 $foo'bar has already been turned into $foo::bar, so
8843 just check for colons.
8845 if it's a legal name, the OP is a PADANY.
8848 if (PL_in_my == KEY_our) { /* "our" is merely analogous to "my" */
8850 yyerror_pv(Perl_form(aTHX_ "No package name allowed for "
8851 "%se %s in \"our\"",
8852 *PL_tokenbuf=='&' ?"subroutin":"variabl",
8853 PL_tokenbuf), UTF ? SVf_UTF8 : 0);
8854 tmp = allocmy(PL_tokenbuf, tokenbuf_len, UTF ? SVf_UTF8 : 0);
8859 /* "my" variable %s can't be in a package */
8860 /* PL_no_myglob is constant */
8861 GCC_DIAG_IGNORE(-Wformat-nonliteral);
8862 yyerror_pv(Perl_form(aTHX_ PL_no_myglob,
8863 PL_in_my == KEY_my ? "my" : "state",
8864 *PL_tokenbuf == '&' ? "subroutin" : "variabl",
8866 UTF ? SVf_UTF8 : 0);
8870 if (PL_in_my == KEY_sigvar) {
8871 /* A signature 'padop' needs in addition, an op_first to
8872 * point to a child sigdefelem, and an extra field to hold
8873 * the signature index. We can achieve both by using an
8874 * UNOP_AUX and (ab)using the op_aux field to hold the
8875 * index. If we ever need more fields, use a real malloced
8876 * aux strut instead.
8878 o = newUNOP_AUX(OP_ARGELEM, 0, NULL,
8879 INT2PTR(UNOP_AUX_item *,
8880 (PL_parser->sig_elems)));
8881 o->op_private |= ( PL_tokenbuf[0] == '$' ? OPpARGELEM_SV
8882 : PL_tokenbuf[0] == '@' ? OPpARGELEM_AV
8886 o = newOP(OP_PADANY, 0);
8887 o->op_targ = allocmy(PL_tokenbuf, tokenbuf_len,
8888 UTF ? SVf_UTF8 : 0);
8889 if (PL_in_my == KEY_sigvar)
8892 pl_yylval.opval = o;
8898 build the ops for accesses to a my() variable.
8903 tmp = pad_findmy_pvn(PL_tokenbuf, tokenbuf_len,
8905 if (tmp != NOT_IN_PAD) {
8906 /* might be an "our" variable" */
8907 if (PAD_COMPNAME_FLAGS_isOUR(tmp)) {
8908 /* build ops for a bareword */
8909 HV * const stash = PAD_COMPNAME_OURSTASH(tmp);
8910 HEK * const stashname = HvNAME_HEK(stash);
8911 SV * const sym = newSVhek(stashname);
8912 sv_catpvs(sym, "::");
8913 sv_catpvn_flags(sym, PL_tokenbuf+1, tokenbuf_len - 1, (UTF ? SV_CATUTF8 : SV_CATBYTES ));
8914 pl_yylval.opval = newSVOP(OP_CONST, 0, sym);
8915 pl_yylval.opval->op_private = OPpCONST_ENTERED;
8919 ((PL_tokenbuf[0] == '$') ? SVt_PV
8920 : (PL_tokenbuf[0] == '@') ? SVt_PVAV
8925 pl_yylval.opval = newOP(OP_PADANY, 0);
8926 pl_yylval.opval->op_targ = tmp;
8932 Whine if they've said @foo or @foo{key} in a doublequoted string,
8933 and @foo (or %foo) isn't a variable we can find in the symbol
8936 if (ckWARN(WARN_AMBIGUOUS)
8938 && PL_lex_state != LEX_NORMAL
8939 && !PL_lex_brackets)
8941 GV *const gv = gv_fetchpvn_flags(PL_tokenbuf + 1, tokenbuf_len - 1,
8942 ( UTF ? SVf_UTF8 : 0 ) | GV_ADDMG,
8944 if ((!gv || ((PL_tokenbuf[0] == '@') ? !GvAV(gv) : !GvHV(gv)))
8947 /* Downgraded from fatal to warning 20000522 mjd */
8948 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
8949 "Possible unintended interpolation of %" UTF8f
8951 UTF8fARG(UTF, tokenbuf_len, PL_tokenbuf));
8955 /* build ops for a bareword */
8956 pl_yylval.opval = newSVOP(OP_CONST, 0,
8957 newSVpvn_flags(PL_tokenbuf + 1,
8959 UTF ? SVf_UTF8 : 0 ));
8960 pl_yylval.opval->op_private = OPpCONST_ENTERED;
8962 gv_fetchpvn_flags(PL_tokenbuf+1, tokenbuf_len - 1,
8963 (PL_in_eval ? GV_ADDMULTI : GV_ADD)
8964 | ( UTF ? SVf_UTF8 : 0 ),
8965 ((PL_tokenbuf[0] == '$') ? SVt_PV
8966 : (PL_tokenbuf[0] == '@') ? SVt_PVAV
8972 S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
8974 PERL_ARGS_ASSERT_CHECKCOMMA;
8976 if (*s == ' ' && s[1] == '(') { /* XXX gotta be a better way */
8977 if (ckWARN(WARN_SYNTAX)) {
8980 for (w = s+2; *w && level; w++) {
8988 /* the list of chars below is for end of statements or
8989 * block / parens, boolean operators (&&, ||, //) and branch
8990 * constructs (or, and, if, until, unless, while, err, for).
8991 * Not a very solid hack... */
8992 if (!*w || !strchr(";&/|})]oaiuwef!=", *w))
8993 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
8994 "%s (...) interpreted as function",name);
8997 while (s < PL_bufend && isSPACE(*s))
9001 while (s < PL_bufend && isSPACE(*s))
9003 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
9004 const char * const w = s;
9005 s += UTF ? UTF8SKIP(s) : 1;
9006 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
9007 s += UTF ? UTF8SKIP(s) : 1;
9008 while (s < PL_bufend && isSPACE(*s))
9012 if (keyword(w, s - w, 0))
9015 gv = gv_fetchpvn_flags(w, s - w, ( UTF ? SVf_UTF8 : 0 ), SVt_PVCV);
9016 if (gv && GvCVu(gv))
9021 Copy(w, tmpbuf+1, s - w, char);
9023 off = pad_findmy_pvn(tmpbuf, s-w+1, 0);
9024 if (off != NOT_IN_PAD) return;
9026 Perl_croak(aTHX_ "No comma allowed after %s", what);
9031 /* S_new_constant(): do any overload::constant lookup.
9033 Either returns sv, or mortalizes/frees sv and returns a new SV*.
9034 Best used as sv=new_constant(..., sv, ...).
9035 If s, pv are NULL, calls subroutine with one argument,
9036 and <type> is used with error messages only.
9037 <type> is assumed to be well formed UTF-8 */
9040 S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
9041 SV *sv, SV *pv, const char *type, STRLEN typelen)
9044 HV * table = GvHV(PL_hintgv); /* ^H */
9049 const char *why1 = "", *why2 = "", *why3 = "";
9051 PERL_ARGS_ASSERT_NEW_CONSTANT;
9052 /* We assume that this is true: */
9053 if (*key == 'c') { assert (strEQ(key, "charnames")); }
9056 /* charnames doesn't work well if there have been errors found */
9057 if (PL_error_count > 0 && *key == 'c')
9059 SvREFCNT_dec_NN(sv);
9060 return &PL_sv_undef;
9063 sv_2mortal(sv); /* Parent created it permanently */
9065 || ! (PL_hints & HINT_LOCALIZE_HH)
9066 || ! (cvp = hv_fetch(table, key, keylen, FALSE))
9071 /* Here haven't found what we're looking for. If it is charnames,
9072 * perhaps it needs to be loaded. Try doing that before giving up */
9074 Perl_load_module(aTHX_
9076 newSVpvs("_charnames"),
9077 /* version parameter; no need to specify it, as if
9078 * we get too early a version, will fail anyway,
9079 * not being able to find '_charnames' */
9084 assert(sp == PL_stack_sp);
9085 table = GvHV(PL_hintgv);
9087 && (PL_hints & HINT_LOCALIZE_HH)
9088 && (cvp = hv_fetch(table, key, keylen, FALSE))
9094 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
9095 msg = Perl_form(aTHX_
9096 "Constant(%.*s) unknown",
9097 (int)(type ? typelen : len),
9103 why3 = "} is not defined";
9106 msg = Perl_form(aTHX_
9107 /* The +3 is for '\N{'; -4 for that, plus '}' */
9108 "Unknown charname '%.*s'", (int)typelen - 4, type + 3
9112 msg = Perl_form(aTHX_ "Constant(%.*s): %s%s%s",
9113 (int)(type ? typelen : len),
9114 (type ? type: s), why1, why2, why3);
9117 yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
9118 return SvREFCNT_inc_simple_NN(sv);
9123 pv = newSVpvn_flags(s, len, SVs_TEMP);
9125 typesv = newSVpvn_flags(type, typelen, SVs_TEMP);
9127 typesv = &PL_sv_undef;
9129 PUSHSTACKi(PERLSI_OVERLOAD);
9141 call_sv(cv, G_SCALAR | ( PL_in_eval ? 0 : G_EVAL));
9145 /* Check the eval first */
9146 if (!PL_in_eval && ((errsv = ERRSV), SvTRUE_NN(errsv))) {
9148 const char * errstr;
9149 sv_catpvs(errsv, "Propagated");
9150 errstr = SvPV_const(errsv, errlen);
9151 yyerror_pvn(errstr, errlen, 0); /* Duplicates the message inside eval */
9153 res = SvREFCNT_inc_simple_NN(sv);
9157 SvREFCNT_inc_simple_void_NN(res);
9166 why1 = "Call to &{$^H{";
9168 why3 = "}} did not return a defined value";
9170 (void)sv_2mortal(sv);
9177 PERL_STATIC_INLINE void
9178 S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
9179 bool is_utf8, bool check_dollar)
9181 PERL_ARGS_ASSERT_PARSE_IDENT;
9183 while (*s < PL_bufend) {
9185 Perl_croak(aTHX_ "%s", ident_too_long);
9186 if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) {
9187 /* The UTF-8 case must come first, otherwise things
9188 * like c\N{COMBINING TILDE} would start failing, as the
9189 * isWORDCHAR_A case below would gobble the 'c' up.
9192 char *t = *s + UTF8SKIP(*s);
9193 while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) {
9196 if (*d + (t - *s) > e)
9197 Perl_croak(aTHX_ "%s", ident_too_long);
9198 Copy(*s, *d, t - *s, char);
9202 else if ( isWORDCHAR_A(**s) ) {
9205 } while (isWORDCHAR_A(**s) && *d < e);
9207 else if ( allow_package
9209 && isIDFIRST_lazy_if_safe((*s)+1, PL_bufend, is_utf8))
9215 else if (allow_package && **s == ':' && (*s)[1] == ':'
9216 /* Disallow things like Foo::$bar. For the curious, this is
9217 * the code path that triggers the "Bad name after" warning
9218 * when looking for barewords.
9220 && !(check_dollar && (*s)[2] == '$')) {
9230 /* Returns a NUL terminated string, with the length of the string written to
9234 S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
9237 char * const e = d + destlen - 3; /* two-character token, ending NUL */
9238 bool is_utf8 = cBOOL(UTF);
9240 PERL_ARGS_ASSERT_SCAN_WORD;
9242 parse_ident(&s, &d, e, allow_package, is_utf8, TRUE);
9248 /* Is the byte 'd' a legal single character identifier name? 'u' is true
9249 * iff Unicode semantics are to be used. The legal ones are any of:
9250 * a) all ASCII characters except:
9251 * 1) control and space-type ones, like NUL, SOH, \t, and SPACE;
9253 * The final case currently doesn't get this far in the program, so we
9254 * don't test for it. If that were to change, it would be ok to allow it.
9255 * b) When not under Unicode rules, any upper Latin1 character
9256 * c) Otherwise, when unicode rules are used, all XIDS characters.
9258 * Because all ASCII characters have the same representation whether
9259 * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
9260 * '{' without knowing if is UTF-8 or not. */
9261 #define VALID_LEN_ONE_IDENT(s, e, is_utf8) \
9262 (isGRAPH_A(*(s)) || ((is_utf8) \
9263 ? isIDFIRST_utf8_safe(s, e) \
9265 && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
9268 S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
9270 I32 herelines = PL_parser->herelines;
9271 SSize_t bracket = -1;
9274 char * const e = d + destlen - 3; /* two-character token, ending NUL */
9275 bool is_utf8 = cBOOL(UTF);
9276 I32 orig_copline = 0, tmp_copline = 0;
9278 PERL_ARGS_ASSERT_SCAN_IDENT;
9280 if (isSPACE(*s) || !*s)
9283 while (isDIGIT(*s)) {
9285 Perl_croak(aTHX_ "%s", ident_too_long);
9289 else { /* See if it is a "normal" identifier */
9290 parse_ident(&s, &d, e, 1, is_utf8, FALSE);
9295 /* Either a digit variable, or parse_ident() found an identifier
9296 (anything valid as a bareword), so job done and return. */
9297 if (PL_lex_state != LEX_NORMAL)
9298 PL_lex_state = LEX_INTERPENDMAYBE;
9302 /* Here, it is not a run-of-the-mill identifier name */
9304 if (*s == '$' && s[1]
9305 && ( isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8)
9306 || isDIGIT_A((U8)s[1])
9309 || strEQs(s+1,"::")) )
9311 /* Dereferencing a value in a scalar variable.
9312 The alternatives are different syntaxes for a scalar variable.
9313 Using ' as a leading package separator isn't allowed. :: is. */
9316 /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} */
9318 bracket = s - SvPVX(PL_linestr);
9320 orig_copline = CopLINE(PL_curcop);
9321 if (s < PL_bufend && isSPACE(*s)) {
9325 if ((s <= PL_bufend - (is_utf8)
9328 && VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
9331 const STRLEN skip = UTF8SKIP(s);
9334 for ( i = 0; i < skip; i++ )
9342 /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
9343 if (*d == '^' && *s && isCONTROLVAR(*s)) {
9347 /* Warn about ambiguous code after unary operators if {...} notation isn't
9348 used. There's no difference in ambiguity; it's merely a heuristic
9349 about when not to warn. */
9350 else if (ck_uni && bracket == -1)
9352 if (bracket != -1) {
9355 /* If we were processing {...} notation then... */
9356 if (isIDFIRST_lazy_if_safe(d, e, is_utf8)
9357 || (!isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */
9360 /* note we have to check for a normal identifier first,
9361 * as it handles utf8 symbols, and only after that has
9362 * been ruled out can we look at the caret words */
9363 if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) {
9364 /* if it starts as a valid identifier, assume that it is one.
9365 (the later check for } being at the expected point will trap
9366 cases where this doesn't pan out.) */
9367 d += is_utf8 ? UTF8SKIP(d) : 1;
9368 parse_ident(&s, &d, e, 1, is_utf8, TRUE);
9371 else { /* caret word: ${^Foo} ${^CAPTURE[0]} */
9373 while (isWORDCHAR(*s) && d < e) {
9377 Perl_croak(aTHX_ "%s", ident_too_long);
9380 tmp_copline = CopLINE(PL_curcop);
9381 if (s < PL_bufend && isSPACE(*s)) {
9384 if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) {
9385 /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */
9386 if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) {
9387 const char * const brack =
9389 ((*s == '[') ? "[...]" : "{...}");
9390 orig_copline = CopLINE(PL_curcop);
9391 CopLINE_set(PL_curcop, tmp_copline);
9392 /* diag_listed_as: Ambiguous use of %c{%s[...]} resolved to %c%s[...] */
9393 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
9394 "Ambiguous use of %c{%s%s} resolved to %c%s%s",
9395 funny, dest, brack, funny, dest, brack);
9396 CopLINE_set(PL_curcop, orig_copline);
9399 PL_lex_brackstack[PL_lex_brackets++] = (char)(XOPERATOR | XFAKEBRACK);
9400 PL_lex_allbrackets++;
9406 tmp_copline = CopLINE(PL_curcop);
9407 if ((skip = s < PL_bufend && isSPACE(*s)))
9408 /* Avoid incrementing line numbers or resetting PL_linestart,
9409 in case we have to back up. */
9414 /* Expect to find a closing } after consuming any trailing whitespace.
9417 /* Now increment line numbers if applicable. */
9421 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
9422 PL_lex_state = LEX_INTERPEND;
9425 if (PL_lex_state == LEX_NORMAL) {
9426 if (ckWARN(WARN_AMBIGUOUS)
9427 && (keyword(dest, d - dest, 0)
9428 || get_cvn_flags(dest, d - dest, is_utf8
9432 SV *tmp = newSVpvn_flags( dest, d - dest,
9433 SVs_TEMP | (is_utf8 ? SVf_UTF8 : 0) );
9436 orig_copline = CopLINE(PL_curcop);
9437 CopLINE_set(PL_curcop, tmp_copline);
9438 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
9439 "Ambiguous use of %c{%" SVf "} resolved to %c%" SVf,
9440 funny, SVfARG(tmp), funny, SVfARG(tmp));
9441 CopLINE_set(PL_curcop, orig_copline);
9446 /* Didn't find the closing } at the point we expected, so restore
9447 state such that the next thing to process is the opening { and */
9448 s = SvPVX(PL_linestr) + bracket; /* let the parser handle it */
9449 CopLINE_set(PL_curcop, orig_copline);
9450 PL_parser->herelines = herelines;
9454 else if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets && !intuit_more(s))
9455 PL_lex_state = LEX_INTERPEND;
9460 S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset, unsigned int * x_mod_count) {
9462 /* Adds, subtracts to/from 'pmfl' based on the next regex modifier flag
9463 * found in the parse starting at 's', based on the subset that are valid
9464 * in this context input to this routine in 'valid_flags'. Advances s.
9465 * Returns TRUE if the input should be treated as a valid flag, so the next
9466 * char may be as well; otherwise FALSE. 'charset' should point to a NUL
9467 * upon first call on the current regex. This routine will set it to any
9468 * charset modifier found. The caller shouldn't change it. This way,
9469 * another charset modifier encountered in the parse can be detected as an
9470 * error, as we have decided to allow only one */
9473 STRLEN charlen = UTF ? UTF8SKIP(*s) : 1;
9475 if ( charlen != 1 || ! strchr(valid_flags, c) ) {
9476 if (isWORDCHAR_lazy_if_safe( *s, PL_bufend, UTF)) {
9477 yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", (int)charlen, *s),
9478 UTF ? SVf_UTF8 : 0);
9480 /* Pretend that it worked, so will continue processing before
9489 CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, *x_mod_count);
9490 case GLOBAL_PAT_MOD: *pmfl |= PMf_GLOBAL; break;
9491 case CONTINUE_PAT_MOD: *pmfl |= PMf_CONTINUE; break;
9492 case ONCE_PAT_MOD: *pmfl |= PMf_KEEP; break;
9493 case KEEPCOPY_PAT_MOD: *pmfl |= RXf_PMf_KEEPCOPY; break;
9494 case NONDESTRUCT_PAT_MOD: *pmfl |= PMf_NONDESTRUCT; break;
9495 case LOCALE_PAT_MOD:
9497 goto multiple_charsets;
9499 set_regex_charset(pmfl, REGEX_LOCALE_CHARSET);
9502 case UNICODE_PAT_MOD:
9504 goto multiple_charsets;
9506 set_regex_charset(pmfl, REGEX_UNICODE_CHARSET);
9509 case ASCII_RESTRICT_PAT_MOD:
9511 set_regex_charset(pmfl, REGEX_ASCII_RESTRICTED_CHARSET);
9515 /* Error if previous modifier wasn't an 'a', but if it was, see
9516 * if, and accept, a second occurrence (only) */
9518 || get_regex_charset(*pmfl)
9519 != REGEX_ASCII_RESTRICTED_CHARSET)
9521 goto multiple_charsets;
9523 set_regex_charset(pmfl, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
9527 case DEPENDS_PAT_MOD:
9529 goto multiple_charsets;
9531 set_regex_charset(pmfl, REGEX_DEPENDS_CHARSET);
9540 if (*charset != c) {
9541 yyerror(Perl_form(aTHX_ "Regexp modifiers \"/%c\" and \"/%c\" are mutually exclusive", *charset, c));
9543 else if (c == 'a') {
9544 /* diag_listed_as: Regexp modifier "/%c" may appear a maximum of twice */
9545 yyerror("Regexp modifier \"/a\" may appear a maximum of twice");
9548 yyerror(Perl_form(aTHX_ "Regexp modifier \"/%c\" may not appear twice", c));
9551 /* Pretend that it worked, so will continue processing before dieing */
9557 S_scan_pat(pTHX_ char *start, I32 type)
9561 const char * const valid_flags =
9562 (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS);
9563 char charset = '\0'; /* character set modifier */
9564 unsigned int x_mod_count = 0;
9566 PERL_ARGS_ASSERT_SCAN_PAT;
9568 s = scan_str(start,TRUE,FALSE, (PL_in_eval & EVAL_RE_REPARSING), NULL);
9570 Perl_croak(aTHX_ "Search pattern not terminated");
9572 pm = (PMOP*)newPMOP(type, 0);
9573 if (PL_multi_open == '?') {
9574 /* This is the only point in the code that sets PMf_ONCE: */
9575 pm->op_pmflags |= PMf_ONCE;
9577 /* Hence it's safe to do this bit of PMOP book-keeping here, which
9578 allows us to restrict the list needed by reset to just the ??
9580 assert(type != OP_TRANS);
9582 MAGIC *mg = mg_find((const SV *)PL_curstash, PERL_MAGIC_symtab);
9585 mg = sv_magicext(MUTABLE_SV(PL_curstash), 0, PERL_MAGIC_symtab, 0, 0,
9588 elements = mg->mg_len / sizeof(PMOP**);
9589 Renewc(mg->mg_ptr, elements + 1, PMOP*, char);
9590 ((PMOP**)mg->mg_ptr) [elements++] = pm;
9591 mg->mg_len = elements * sizeof(PMOP**);
9592 PmopSTASH_set(pm,PL_curstash);
9596 /* if qr/...(?{..}).../, then need to parse the pattern within a new
9597 * anon CV. False positives like qr/[(?{]/ are harmless */
9599 if (type == OP_QR) {
9601 char *e, *p = SvPV(PL_lex_stuff, len);
9603 for (; p < e; p++) {
9604 if (p[0] == '(' && p[1] == '?'
9605 && (p[2] == '{' || (p[2] == '?' && p[3] == '{')))
9607 pm->op_pmflags |= PMf_HAS_CV;
9611 pm->op_pmflags |= PMf_IS_QR;
9614 while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags),
9615 &s, &charset, &x_mod_count))
9617 /* issue a warning if /c is specified,but /g is not */
9618 if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL))
9620 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
9621 "Use of /c modifier is meaningless without /g" );
9624 PL_lex_op = (OP*)pm;
9625 pl_yylval.ival = OP_MATCH;
9630 S_scan_subst(pTHX_ char *start)
9636 line_t linediff = 0;
9638 char charset = '\0'; /* character set modifier */
9639 unsigned int x_mod_count = 0;
9642 PERL_ARGS_ASSERT_SCAN_SUBST;
9644 pl_yylval.ival = OP_NULL;
9646 s = scan_str(start, TRUE, FALSE, FALSE, &t);
9649 Perl_croak(aTHX_ "Substitution pattern not terminated");
9653 first_start = PL_multi_start;
9654 first_line = CopLINE(PL_curcop);
9655 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
9657 SvREFCNT_dec_NN(PL_lex_stuff);
9658 PL_lex_stuff = NULL;
9659 Perl_croak(aTHX_ "Substitution replacement not terminated");
9661 PL_multi_start = first_start; /* so whole substitution is taken together */
9663 pm = (PMOP*)newPMOP(OP_SUBST, 0);
9667 if (*s == EXEC_PAT_MOD) {
9671 else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags),
9672 &s, &charset, &x_mod_count))
9678 if ((pm->op_pmflags & PMf_CONTINUE)) {
9679 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
9683 SV * const repl = newSVpvs("");
9686 pm->op_pmflags |= PMf_EVAL;
9689 sv_catpvs(repl, "eval ");
9691 sv_catpvs(repl, "do ");
9693 sv_catpvs(repl, "{");
9694 sv_catsv(repl, PL_parser->lex_sub_repl);
9695 sv_catpvs(repl, "}");
9696 SvREFCNT_dec(PL_parser->lex_sub_repl);
9697 PL_parser->lex_sub_repl = repl;
9702 linediff = CopLINE(PL_curcop) - first_line;
9704 CopLINE_set(PL_curcop, first_line);
9706 if (linediff || es) {
9707 /* the IVX field indicates that the replacement string is a s///e;
9708 * the NVX field indicates how many src code lines the replacement
9710 sv_upgrade(PL_parser->lex_sub_repl, SVt_PVNV);
9711 ((XPVNV*)SvANY(PL_parser->lex_sub_repl))->xnv_u.xnv_lines = 0;
9712 ((XPVIV*)SvANY(PL_parser->lex_sub_repl))->xiv_u.xivu_eval_seen =
9716 PL_lex_op = (OP*)pm;
9717 pl_yylval.ival = OP_SUBST;
9722 S_scan_trans(pTHX_ char *start)
9729 bool nondestruct = 0;
9732 PERL_ARGS_ASSERT_SCAN_TRANS;
9734 pl_yylval.ival = OP_NULL;
9736 s = scan_str(start,FALSE,FALSE,FALSE,&t);
9738 Perl_croak(aTHX_ "Transliteration pattern not terminated");
9742 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
9744 SvREFCNT_dec_NN(PL_lex_stuff);
9745 PL_lex_stuff = NULL;
9746 Perl_croak(aTHX_ "Transliteration replacement not terminated");
9749 complement = del = squash = 0;
9753 complement = OPpTRANS_COMPLEMENT;
9756 del = OPpTRANS_DELETE;
9759 squash = OPpTRANS_SQUASH;
9771 o = newPVOP(nondestruct ? OP_TRANSR : OP_TRANS, 0, (char*)NULL);
9772 o->op_private &= ~OPpTRANS_ALL;
9773 o->op_private |= del|squash|complement|
9774 (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
9775 (DO_UTF8(PL_parser->lex_sub_repl) ? OPpTRANS_TO_UTF : 0);
9778 pl_yylval.ival = nondestruct ? OP_TRANSR : OP_TRANS;
9785 Takes a pointer to the first < in <<FOO.
9786 Returns a pointer to the byte following <<FOO.
9788 This function scans a heredoc, which involves different methods
9789 depending on whether we are in a string eval, quoted construct, etc.
9790 This is because PL_linestr could containing a single line of input, or
9791 a whole string being evalled, or the contents of the current quote-
9794 The two basic methods are:
9795 - Steal lines from the input stream
9796 - Scan the heredoc in PL_linestr and remove it therefrom
9798 In a file scope or filtered eval, the first method is used; in a
9799 string eval, the second.
9801 In a quote-like operator, we have to choose between the two,
9802 depending on where we can find a newline. We peek into outer lex-
9803 ing scopes until we find one with a newline in it. If we reach the
9804 outermost lexing scope and it is a file, we use the stream method.
9805 Otherwise it is treated as an eval.
9809 S_scan_heredoc(pTHX_ char *s)
9811 I32 op_type = OP_SCALAR;
9820 bool indented = FALSE;
9821 const bool infile = PL_rsfp || PL_parser->filtered;
9822 const line_t origline = CopLINE(PL_curcop);
9823 LEXSHARED *shared = PL_parser->lex_shared;
9825 PERL_ARGS_ASSERT_SCAN_HEREDOC;
9828 d = PL_tokenbuf + 1;
9829 e = PL_tokenbuf + sizeof PL_tokenbuf - 1;
9830 *PL_tokenbuf = '\n';
9836 while (SPACE_OR_TAB(*peek))
9838 if (*peek == '`' || *peek == '\'' || *peek =='"') {
9841 s = delimcpy(d, e, s, PL_bufend, term, &len);
9843 Perl_croak(aTHX_ "Unterminated delimiter for here document");
9849 /* <<\FOO is equivalent to <<'FOO' */
9853 if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
9854 Perl_croak(aTHX_ "Use of bare << to mean <<\"\" is forbidden");
9857 isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))
9859 peek += UTF ? UTF8SKIP(peek) : 1;
9861 len = (peek - s >= e - d) ? (e - d) : (peek - s);
9862 Copy(s, d, len, char);
9866 if (d >= PL_tokenbuf + sizeof PL_tokenbuf - 1)
9867 Perl_croak(aTHX_ "Delimiter for here document is too long");
9870 len = d - PL_tokenbuf;
9872 #ifndef PERL_STRICT_CR
9873 d = strchr(s, '\r');
9875 char * const olds = s;
9877 while (s < PL_bufend) {
9883 else if (*s == '\n' && s[1] == '\r') { /* \015\013 on a mac? */
9892 SvCUR_set(PL_linestr, PL_bufend - SvPVX_const(PL_linestr));
9897 tmpstr = newSV_type(SVt_PVIV);
9901 SvIV_set(tmpstr, -1);
9903 else if (term == '`') {
9904 op_type = OP_BACKTICK;
9905 SvIV_set(tmpstr, '\\');
9908 PL_multi_start = origline + 1 + PL_parser->herelines;
9909 PL_multi_open = PL_multi_close = '<';
9910 /* inside a string eval or quote-like operator */
9911 if (!infile || PL_lex_inwhat) {
9914 char * const olds = s;
9915 PERL_CONTEXT * const cx = CX_CUR();
9916 /* These two fields are not set until an inner lexing scope is
9917 entered. But we need them set here. */
9918 shared->ls_bufptr = s;
9919 shared->ls_linestr = PL_linestr;
9921 /* Look for a newline. If the current buffer does not have one,
9922 peek into the line buffer of the parent lexing scope, going
9923 up as many levels as necessary to find one with a newline
9926 while (!(s = (char *)memchr(
9927 (void *)shared->ls_bufptr, '\n',
9928 SvEND(shared->ls_linestr)-shared->ls_bufptr
9930 shared = shared->ls_prev;
9931 /* shared is only null if we have gone beyond the outermost
9932 lexing scope. In a file, we will have broken out of the
9933 loop in the previous iteration. In an eval, the string buf-
9934 fer ends with "\n;", so the while condition above will have
9935 evaluated to false. So shared can never be null. Or so you
9936 might think. Odd syntax errors like s;@{<<; can gobble up
9937 the implicit semicolon at the end of a flie, causing the
9938 file handle to be closed even when we are not in a string
9939 eval. So shared may be null in that case.
9940 (Closing '}' here to balance the earlier open brace for
9941 editors that look for matched pairs.) */
9942 if (UNLIKELY(!shared))
9944 /* A LEXSHARED struct with a null ls_prev pointer is the outer-
9945 most lexing scope. In a file, shared->ls_linestr at that
9946 level is just one line, so there is no body to steal. */
9947 if (infile && !shared->ls_prev) {
9952 else { /* eval or we've already hit EOF */
9953 s = (char*)memchr((void*)s, '\n', PL_bufend - s);
9957 linestr = shared->ls_linestr;
9958 bufend = SvEND(linestr);
9963 while (s < bufend - len + 1) {
9965 ++PL_parser->herelines;
9967 if (memEQ(s, PL_tokenbuf + 1, len - 1)) {
9971 /* Only valid if it's preceded by whitespace only */
9972 while (backup != myolds && --backup >= myolds) {
9973 if (! SPACE_OR_TAB(*backup)) {
9980 /* No whitespace or all! */
9981 if (backup == s || *backup == '\n') {
9982 Newxz(indent, indent_len + 1, char);
9983 memcpy(indent, backup + 1, indent_len);
9984 s--; /* before our delimiter */
9985 PL_parser->herelines--; /* this line doesn't count */
9991 while (s < bufend - len + 1
9992 && memNE(s,PL_tokenbuf,len) )
9995 ++PL_parser->herelines;
9999 if (s >= bufend - len + 1) {
10002 sv_setpvn(tmpstr,d+1,s-d);
10004 /* the preceding stmt passes a newline */
10005 PL_parser->herelines++;
10007 /* s now points to the newline after the heredoc terminator.
10008 d points to the newline before the body of the heredoc.
10011 /* We are going to modify linestr in place here, so set
10012 aside copies of the string if necessary for re-evals or
10014 /* See the Paranoia note in case LEX_INTERPEND in yylex, for why we
10015 check shared->re_eval_str. */
10016 if (shared->re_eval_start || shared->re_eval_str) {
10017 /* Set aside the rest of the regexp */
10018 if (!shared->re_eval_str)
10019 shared->re_eval_str =
10020 newSVpvn(shared->re_eval_start,
10021 bufend - shared->re_eval_start);
10022 shared->re_eval_start -= s-d;
10024 if (cxstack_ix >= 0
10025 && CxTYPE(cx) == CXt_EVAL
10026 && CxOLD_OP_TYPE(cx) == OP_ENTEREVAL
10027 && cx->blk_eval.cur_text == linestr)
10029 cx->blk_eval.cur_text = newSVsv(linestr);
10030 cx->blk_u16 |= 0x40; /* indicate cur_text is ref counted */
10032 /* Copy everything from s onwards back to d. */
10033 Move(s,d,bufend-s + 1,char);
10034 SvCUR_set(linestr, SvCUR(linestr) - (s-d));
10035 /* Setting PL_bufend only applies when we have not dug deeper
10036 into other scopes, because sublex_done sets PL_bufend to
10037 SvEND(PL_linestr). */
10038 if (shared == PL_parser->lex_shared) PL_bufend = SvEND(linestr);
10044 char *oldbufptr_save;
10045 char *oldoldbufptr_save;
10047 SvPVCLEAR(tmpstr); /* avoid "uninitialized" warning */
10048 term = PL_tokenbuf[1];
10050 linestr_save = PL_linestr; /* must restore this afterwards */
10051 d = s; /* and this */
10052 oldbufptr_save = PL_oldbufptr;
10053 oldoldbufptr_save = PL_oldoldbufptr;
10054 PL_linestr = newSVpvs("");
10055 PL_bufend = SvPVX(PL_linestr);
10057 PL_bufptr = PL_bufend;
10058 CopLINE_set(PL_curcop,
10059 origline + 1 + PL_parser->herelines);
10060 if (!lex_next_chunk(LEX_NO_TERM)
10061 && (!SvCUR(tmpstr) || SvEND(tmpstr)[-1] != '\n')) {
10062 /* Simply freeing linestr_save might seem simpler here, as it
10063 does not matter what PL_linestr points to, since we are
10064 about to croak; but in a quote-like op, linestr_save
10065 will have been prospectively freed already, via
10066 SAVEFREESV(PL_linestr) in sublex_push, so it’s easier to
10067 restore PL_linestr. */
10068 SvREFCNT_dec_NN(PL_linestr);
10069 PL_linestr = linestr_save;
10070 PL_oldbufptr = oldbufptr_save;
10071 PL_oldoldbufptr = oldoldbufptr_save;
10074 CopLINE_set(PL_curcop, origline);
10075 if (!SvCUR(PL_linestr) || PL_bufend[-1] != '\n') {
10076 s = lex_grow_linestr(SvLEN(PL_linestr) + 3);
10077 /* ^That should be enough to avoid this needing to grow: */
10078 sv_catpvs(PL_linestr, "\n\0");
10079 assert(s == SvPVX(PL_linestr));
10080 PL_bufend = SvEND(PL_linestr);
10083 PL_parser->herelines++;
10084 PL_last_lop = PL_last_uni = NULL;
10085 #ifndef PERL_STRICT_CR
10086 if (PL_bufend - PL_linestart >= 2) {
10087 if ( (PL_bufend[-2] == '\r' && PL_bufend[-1] == '\n')
10088 || (PL_bufend[-2] == '\n' && PL_bufend[-1] == '\r'))
10090 PL_bufend[-2] = '\n';
10092 SvCUR_set(PL_linestr, PL_bufend - SvPVX_const(PL_linestr));
10094 else if (PL_bufend[-1] == '\r')
10095 PL_bufend[-1] = '\n';
10097 else if (PL_bufend - PL_linestart == 1 && PL_bufend[-1] == '\r')
10098 PL_bufend[-1] = '\n';
10100 if (indented && (PL_bufend-s) >= len) {
10101 char * found = ninstr(s, PL_bufend, (PL_tokenbuf + 1), (PL_tokenbuf +1 + len));
10104 char *backup = found;
10107 /* Only valid if it's preceded by whitespace only */
10108 while (backup != s && --backup >= s) {
10109 if (! SPACE_OR_TAB(*backup)) {
10115 /* All whitespace or none! */
10116 if (backup == found || SPACE_OR_TAB(*backup)) {
10117 Newxz(indent, indent_len + 1, char);
10118 memcpy(indent, backup, indent_len);
10119 SvREFCNT_dec(PL_linestr);
10120 PL_linestr = linestr_save;
10121 PL_linestart = SvPVX(linestr_save);
10122 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
10123 PL_oldbufptr = oldbufptr_save;
10124 PL_oldoldbufptr = oldoldbufptr_save;
10130 /* Didn't find it */
10131 sv_catsv(tmpstr,PL_linestr);
10133 if (*s == term && PL_bufend-s >= len
10134 && memEQ(s,PL_tokenbuf + 1,len))
10136 SvREFCNT_dec(PL_linestr);
10137 PL_linestr = linestr_save;
10138 PL_linestart = SvPVX(linestr_save);
10139 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
10140 PL_oldbufptr = oldbufptr_save;
10141 PL_oldoldbufptr = oldoldbufptr_save;
10145 sv_catsv(tmpstr,PL_linestr);
10150 PL_multi_end = origline + PL_parser->herelines;
10151 if (indented && indent) {
10152 STRLEN linecount = 1;
10153 STRLEN herelen = SvCUR(tmpstr);
10154 char *ss = SvPVX(tmpstr);
10155 char *se = ss + herelen;
10156 SV *newstr = newSV(herelen+1);
10159 /* Trim leading whitespace */
10161 /* newline only? Copy and move on */
10163 sv_catpv(newstr,"\n");
10167 /* Found our indentation? Strip it */
10168 } else if (se - ss >= indent_len
10169 && memEQ(ss, indent, indent_len))
10175 while ((ss + le) < se && *(ss + le) != '\n')
10178 sv_catpvn(newstr, ss, le);
10182 /* Line doesn't begin with our indentation? Croak */
10185 "Indentation on line %d of here-doc doesn't match delimiter",
10190 /* avoid sv_setsv() as we dont wan't to COW here */
10191 sv_setpvn(tmpstr,SvPVX(newstr),SvCUR(newstr));
10193 SvREFCNT_dec_NN(newstr);
10195 if (SvCUR(tmpstr) + 5 < SvLEN(tmpstr)) {
10196 SvPV_shrink_to_cur(tmpstr);
10199 if (UTF && is_utf8_string((U8*)SvPVX_const(tmpstr), SvCUR(tmpstr)))
10202 PL_lex_stuff = tmpstr;
10203 pl_yylval.ival = op_type;
10207 SvREFCNT_dec(tmpstr);
10208 CopLINE_set(PL_curcop, origline);
10209 missingterm(PL_tokenbuf + 1);
10212 /* scan_inputsymbol
10213 takes: position of first '<' in input buffer
10214 returns: position of first char following the matching '>' in
10216 side-effects: pl_yylval and lex_op are set.
10221 <<>> read from ARGV without magic open
10222 <FH> read from filehandle
10223 <pkg::FH> read from package qualified filehandle
10224 <pkg'FH> read from package qualified filehandle
10225 <$fh> read from filehandle in $fh
10226 <*.h> filename glob
10231 S_scan_inputsymbol(pTHX_ char *start)
10233 char *s = start; /* current position in buffer */
10236 bool nomagicopen = FALSE;
10237 char *d = PL_tokenbuf; /* start of temp holding space */
10238 const char * const e = PL_tokenbuf + sizeof PL_tokenbuf; /* end of temp holding space */
10240 PERL_ARGS_ASSERT_SCAN_INPUTSYMBOL;
10242 end = strchr(s, '\n');
10245 if (s[1] == '<' && s[2] == '>' && s[3] == '>') {
10246 nomagicopen = TRUE;
10252 s = delimcpy(d, e, s + 1, end, '>', &len); /* extract until > */
10254 /* die if we didn't have space for the contents of the <>,
10255 or if it didn't end, or if we see a newline
10258 if (len >= (I32)sizeof PL_tokenbuf)
10259 Perl_croak(aTHX_ "Excessively long <> operator");
10261 Perl_croak(aTHX_ "Unterminated <> operator");
10266 Remember, only scalar variables are interpreted as filehandles by
10267 this code. Anything more complex (e.g., <$fh{$num}>) will be
10268 treated as a glob() call.
10269 This code makes use of the fact that except for the $ at the front,
10270 a scalar variable and a filehandle look the same.
10272 if (*d == '$' && d[1]) d++;
10274 /* allow <Pkg'VALUE> or <Pkg::VALUE> */
10275 while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == '\'' || *d == ':') {
10276 d += UTF ? UTF8SKIP(d) : 1;
10279 /* If we've tried to read what we allow filehandles to look like, and
10280 there's still text left, then it must be a glob() and not a getline.
10281 Use scan_str to pull out the stuff between the <> and treat it
10282 as nothing more than a string.
10285 if (d - PL_tokenbuf != len) {
10286 pl_yylval.ival = OP_GLOB;
10287 s = scan_str(start,FALSE,FALSE,FALSE,NULL);
10289 Perl_croak(aTHX_ "Glob not terminated");
10293 bool readline_overriden = FALSE;
10295 /* we're in a filehandle read situation */
10298 /* turn <> into <ARGV> */
10300 Copy("ARGV",d,5,char);
10302 /* Check whether readline() is overriden */
10303 if ((gv_readline = gv_override("readline",8)))
10304 readline_overriden = TRUE;
10306 /* if <$fh>, create the ops to turn the variable into a
10310 /* try to find it in the pad for this block, otherwise find
10311 add symbol table ops
10313 const PADOFFSET tmp = pad_findmy_pvn(d, len, 0);
10314 if (tmp != NOT_IN_PAD) {
10315 if (PAD_COMPNAME_FLAGS_isOUR(tmp)) {
10316 HV * const stash = PAD_COMPNAME_OURSTASH(tmp);
10317 HEK * const stashname = HvNAME_HEK(stash);
10318 SV * const sym = sv_2mortal(newSVhek(stashname));
10319 sv_catpvs(sym, "::");
10320 sv_catpv(sym, d+1);
10325 OP * const o = newOP(OP_PADSV, 0);
10327 PL_lex_op = readline_overriden
10328 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
10329 op_append_elem(OP_LIST, o,
10330 newCVREF(0, newGVOP(OP_GV,0,gv_readline))))
10331 : newUNOP(OP_READLINE, 0, o);
10339 GV_ADDMULTI | ( UTF ? SVf_UTF8 : 0 ),
10341 PL_lex_op = readline_overriden
10342 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
10343 op_append_elem(OP_LIST,
10344 newUNOP(OP_RV2SV, 0, newGVOP(OP_GV, 0, gv)),
10345 newCVREF(0, newGVOP(OP_GV, 0, gv_readline))))
10346 : newUNOP(OP_READLINE, 0,
10347 newUNOP(OP_RV2SV, 0,
10348 newGVOP(OP_GV, 0, gv)));
10350 /* we created the ops in PL_lex_op, so make pl_yylval.ival a null op */
10351 pl_yylval.ival = OP_NULL;
10354 /* If it's none of the above, it must be a literal filehandle
10355 (<Foo::BAR> or <FOO>) so build a simple readline OP */
10357 GV * const gv = gv_fetchpv(d, GV_ADD | ( UTF ? SVf_UTF8 : 0 ), SVt_PVIO);
10358 PL_lex_op = readline_overriden
10359 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
10360 op_append_elem(OP_LIST,
10361 newGVOP(OP_GV, 0, gv),
10362 newCVREF(0, newGVOP(OP_GV, 0, gv_readline))))
10363 : newUNOP(OP_READLINE, nomagicopen ? OPf_SPECIAL : 0, newGVOP(OP_GV, 0, gv));
10364 pl_yylval.ival = OP_NULL;
10374 start position in buffer
10375 keep_bracketed_quoted preserve \ quoting of embedded delimiters, but
10376 only if they are of the open/close form
10377 keep_delims preserve the delimiters around the string
10378 re_reparse compiling a run-time /(?{})/:
10379 collapse // to /, and skip encoding src
10380 delimp if non-null, this is set to the position of
10381 the closing delimiter, or just after it if
10382 the closing and opening delimiters differ
10383 (i.e., the opening delimiter of a substitu-
10385 returns: position to continue reading from buffer
10386 side-effects: multi_start, multi_close, lex_repl or lex_stuff, and
10387 updates the read buffer.
10389 This subroutine pulls a string out of the input. It is called for:
10390 q single quotes q(literal text)
10391 ' single quotes 'literal text'
10392 qq double quotes qq(interpolate $here please)
10393 " double quotes "interpolate $here please"
10394 qx backticks qx(/bin/ls -l)
10395 ` backticks `/bin/ls -l`
10396 qw quote words @EXPORT_OK = qw( func() $spam )
10397 m// regexp match m/this/
10398 s/// regexp substitute s/this/that/
10399 tr/// string transliterate tr/this/that/
10400 y/// string transliterate y/this/that/
10401 ($*@) sub prototypes sub foo ($)
10402 (stuff) sub attr parameters sub foo : attr(stuff)
10403 <> readline or globs <FOO>, <>, <$fh>, or <*.c>
10405 In most of these cases (all but <>, patterns and transliterate)
10406 yylex() calls scan_str(). m// makes yylex() call scan_pat() which
10407 calls scan_str(). s/// makes yylex() call scan_subst() which calls
10408 scan_str(). tr/// and y/// make yylex() call scan_trans() which
10411 It skips whitespace before the string starts, and treats the first
10412 character as the delimiter. If the delimiter is one of ([{< then
10413 the corresponding "close" character )]}> is used as the closing
10414 delimiter. It allows quoting of delimiters, and if the string has
10415 balanced delimiters ([{<>}]) it allows nesting.
10417 On success, the SV with the resulting string is put into lex_stuff or,
10418 if that is already non-NULL, into lex_repl. The second case occurs only
10419 when parsing the RHS of the special constructs s/// and tr/// (y///).
10420 For convenience, the terminating delimiter character is stuffed into
10425 S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re_reparse,
10429 SV *sv; /* scalar value: string */
10430 const char *tmps; /* temp string, used for delimiter matching */
10431 char *s = start; /* current position in the buffer */
10432 char term; /* terminating character */
10433 char *to; /* current position in the sv's data */
10434 I32 brackets = 1; /* bracket nesting level */
10435 bool has_utf8 = FALSE; /* is there any utf8 content? */
10436 IV termcode; /* terminating char. code */
10437 U8 termstr[UTF8_MAXBYTES]; /* terminating string */
10438 STRLEN termlen; /* length of terminating string */
10441 /* The delimiters that have a mirror-image closing one */
10442 const char * opening_delims = "([{<";
10443 const char * closing_delims = ")]}>";
10445 const char * non_grapheme_msg = "Use of unassigned code point or"
10446 " non-standalone grapheme for a delimiter"
10447 " will be a fatal error starting in Perl"
10449 /* The only non-UTF character that isn't a stand alone grapheme is
10450 * white-space, hence can't be a delimiter. So can skip for non-UTF-8 */
10451 bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED);
10453 PERL_ARGS_ASSERT_SCAN_STR;
10455 /* skip space before the delimiter */
10460 /* mark where we are, in case we need to report errors */
10463 /* after skipping whitespace, the next character is the terminator */
10465 if (!UTF || UTF8_IS_INVARIANT(term)) {
10466 termcode = termstr[0] = term;
10470 termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen);
10471 if (check_grapheme) {
10472 if ( UNLIKELY(UNICODE_IS_SUPER(termcode))
10473 || UNLIKELY(UNICODE_IS_NONCHAR(termcode)))
10475 /* These are considered graphemes, and since the ending
10476 * delimiter will be the same, we don't have to check the other
10478 check_grapheme = FALSE;
10480 else if (UNLIKELY(! _is_grapheme((U8 *) start,
10485 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), "%s", non_grapheme_msg);
10487 /* Don't have to check the other end, as have already warned at
10489 check_grapheme = FALSE;
10493 Copy(s, termstr, termlen, U8);
10496 /* mark where we are */
10497 PL_multi_start = CopLINE(PL_curcop);
10498 PL_multi_open = termcode;
10499 herelines = PL_parser->herelines;
10501 /* If the delimiter has a mirror-image closing one, get it */
10502 if (term && (tmps = strchr(opening_delims, term))) {
10503 termcode = termstr[0] = term = closing_delims[tmps - opening_delims];
10506 PL_multi_close = termcode;
10508 if (PL_multi_open == PL_multi_close) {
10509 keep_bracketed_quoted = FALSE;
10512 /* create a new SV to hold the contents. 79 is the SV's initial length.
10513 What a random number. */
10514 sv = newSV_type(SVt_PVIV);
10516 SvIV_set(sv, termcode);
10517 (void)SvPOK_only(sv); /* validate pointer */
10519 /* move past delimiter and try to read a complete string */
10521 sv_catpvn(sv, s, termlen);
10524 /* extend sv if need be */
10525 SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1);
10526 /* set 'to' to the next character in the sv's string */
10527 to = SvPVX(sv)+SvCUR(sv);
10529 /* if open delimiter is the close delimiter read unbridle */
10530 if (PL_multi_open == PL_multi_close) {
10531 for (; s < PL_bufend; s++,to++) {
10532 /* embedded newlines increment the current line number */
10533 if (*s == '\n' && !PL_rsfp && !PL_parser->filtered)
10534 COPLINE_INC_WITH_HERELINES;
10535 /* handle quoted delimiters */
10536 if (*s == '\\' && s+1 < PL_bufend && term != '\\') {
10537 if (!keep_bracketed_quoted
10539 || (re_reparse && s[1] == '\\'))
10542 else /* any other quotes are simply copied straight through */
10545 /* terminate when run out of buffer (the for() condition), or
10546 have found the terminator */
10547 else if (*s == term) { /* First byte of terminator matches */
10548 if (termlen == 1) /* If is the only byte, are done */
10551 /* If the remainder of the terminator matches, also are
10552 * done, after checking that is a separate grapheme */
10553 if ( s + termlen <= PL_bufend
10554 && memEQ(s + 1, (char*)termstr + 1, termlen - 1))
10556 if ( check_grapheme
10557 && UNLIKELY(! _is_grapheme((U8 *) start,
10562 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
10563 "%s", non_grapheme_msg);
10568 else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) {
10576 /* if the terminator isn't the same as the start character (e.g.,
10577 matched brackets), we have to allow more in the quoting, and
10578 be prepared for nested brackets.
10581 /* read until we run out of string, or we find the terminator */
10582 for (; s < PL_bufend; s++,to++) {
10583 /* embedded newlines increment the line count */
10584 if (*s == '\n' && !PL_rsfp && !PL_parser->filtered)
10585 COPLINE_INC_WITH_HERELINES;
10586 /* backslashes can escape the open or closing characters */
10587 if (*s == '\\' && s+1 < PL_bufend) {
10588 if (!keep_bracketed_quoted
10589 && ( ((UV)s[1] == PL_multi_open)
10590 || ((UV)s[1] == PL_multi_close) ))
10597 /* allow nested opens and closes */
10598 else if ((UV)*s == PL_multi_close && --brackets <= 0)
10600 else if ((UV)*s == PL_multi_open)
10602 else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
10607 /* terminate the copied string and update the sv's end-of-string */
10609 SvCUR_set(sv, to - SvPVX_const(sv));
10612 * this next chunk reads more into the buffer if we're not done yet
10616 break; /* handle case where we are done yet :-) */
10618 #ifndef PERL_STRICT_CR
10619 if (to - SvPVX_const(sv) >= 2) {
10620 if ( (to[-2] == '\r' && to[-1] == '\n')
10621 || (to[-2] == '\n' && to[-1] == '\r'))
10625 SvCUR_set(sv, to - SvPVX_const(sv));
10627 else if (to[-1] == '\r')
10630 else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r')
10634 /* if we're out of file, or a read fails, bail and reset the current
10635 line marker so we can report where the unterminated string began
10637 COPLINE_INC_WITH_HERELINES;
10638 PL_bufptr = PL_bufend;
10639 if (!lex_next_chunk(0)) {
10641 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
10644 s = start = PL_bufptr;
10647 /* at this point, we have successfully read the delimited string */
10650 sv_catpvn(sv, s, termlen);
10656 PL_multi_end = CopLINE(PL_curcop);
10657 CopLINE_set(PL_curcop, PL_multi_start);
10658 PL_parser->herelines = herelines;
10660 /* if we allocated too much space, give some back */
10661 if (SvCUR(sv) + 5 < SvLEN(sv)) {
10662 SvLEN_set(sv, SvCUR(sv) + 1);
10663 SvPV_renew(sv, SvLEN(sv));
10666 /* decide whether this is the first or second quoted string we've read
10671 PL_parser->lex_sub_repl = sv;
10674 if (delimp) *delimp = PL_multi_open == PL_multi_close ? s-termlen : s;
10680 takes: pointer to position in buffer
10681 returns: pointer to new position in buffer
10682 side-effects: builds ops for the constant in pl_yylval.op
10684 Read a number in any of the formats that Perl accepts:
10686 \d(_?\d)*(\.(\d(_?\d)*)?)?[Ee][\+\-]?(\d(_?\d)*) 12 12.34 12.
10687 \.\d(_?\d)*[Ee][\+\-]?(\d(_?\d)*) .34
10688 0b[01](_?[01])* binary integers
10689 0[0-7](_?[0-7])* octal integers
10690 0x[0-9A-Fa-f](_?[0-9A-Fa-f])* hexadecimal integers
10691 0x[0-9A-Fa-f](_?[0-9A-Fa-f])*(?:\.\d*)?p[+-]?[0-9]+ hexadecimal floats
10693 Like most scan_ routines, it uses the PL_tokenbuf buffer to hold the
10696 If it reads a number without a decimal point or an exponent, it will
10697 try converting the number to an integer and see if it can do so
10698 without loss of precision.
10702 Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
10704 const char *s = start; /* current position in buffer */
10705 char *d; /* destination in temp buffer */
10706 char *e; /* end of temp buffer */
10707 NV nv; /* number read, as a double */
10708 SV *sv = NULL; /* place to put the converted number */
10709 bool floatit; /* boolean: int or float? */
10710 const char *lastub = NULL; /* position of last underbar */
10711 static const char* const number_too_long = "Number too long";
10712 bool warned_about_underscore = 0;
10713 #define WARN_ABOUT_UNDERSCORE() \
10715 if (!warned_about_underscore) { \
10716 warned_about_underscore = 1; \
10717 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \
10718 "Misplaced _ in number"); \
10721 /* Hexadecimal floating point.
10723 * In many places (where we have quads and NV is IEEE 754 double)
10724 * we can fit the mantissa bits of a NV into an unsigned quad.
10725 * (Note that UVs might not be quads even when we have quads.)
10726 * This will not work everywhere, though (either no quads, or
10727 * using long doubles), in which case we have to resort to NV,
10728 * which will probably mean horrible loss of precision due to
10729 * multiple fp operations. */
10730 bool hexfp = FALSE;
10731 int total_bits = 0;
10732 int significant_bits = 0;
10733 #if NVSIZE == 8 && defined(HAS_QUAD) && defined(Uquad_t)
10734 # define HEXFP_UQUAD
10735 Uquad_t hexfp_uquad = 0;
10736 int hexfp_frac_bits = 0;
10741 NV hexfp_mult = 1.0;
10742 UV high_non_zero = 0; /* highest digit */
10743 int non_zero_integer_digits = 0;
10745 PERL_ARGS_ASSERT_SCAN_NUM;
10747 /* We use the first character to decide what type of number this is */
10751 Perl_croak(aTHX_ "panic: scan_num, *s=%d", *s);
10753 /* if it starts with a 0, it could be an octal number, a decimal in
10754 0.13 disguise, or a hexadecimal number, or a binary number. */
10758 u holds the "number so far"
10759 shift the power of 2 of the base
10760 (hex == 4, octal == 3, binary == 1)
10761 overflowed was the number more than we can hold?
10763 Shift is used when we add a digit. It also serves as an "are
10764 we in octal/hex/binary?" indicator to disallow hex characters
10765 when in octal mode.
10770 bool overflowed = FALSE;
10771 bool just_zero = TRUE; /* just plain 0 or binary number? */
10772 static const NV nvshift[5] = { 1.0, 2.0, 4.0, 8.0, 16.0 };
10773 static const char* const bases[5] =
10774 { "", "binary", "", "octal", "hexadecimal" };
10775 static const char* const Bases[5] =
10776 { "", "Binary", "", "Octal", "Hexadecimal" };
10777 static const char* const maxima[5] =
10779 "0b11111111111111111111111111111111",
10783 const char *base, *Base, *max;
10785 /* check for hex */
10786 if (isALPHA_FOLD_EQ(s[1], 'x')) {
10790 } else if (isALPHA_FOLD_EQ(s[1], 'b')) {
10795 /* check for a decimal in disguise */
10796 else if (s[1] == '.' || isALPHA_FOLD_EQ(s[1], 'e'))
10798 /* so it must be octal */
10805 WARN_ABOUT_UNDERSCORE();
10809 base = bases[shift];
10810 Base = Bases[shift];
10811 max = maxima[shift];
10813 /* read the rest of the number */
10815 /* x is used in the overflow test,
10816 b is the digit we're adding on. */
10821 /* if we don't mention it, we're done */
10825 /* _ are ignored -- but warned about if consecutive */
10827 if (lastub && s == lastub + 1)
10828 WARN_ABOUT_UNDERSCORE();
10832 /* 8 and 9 are not octal */
10833 case '8': case '9':
10835 yyerror(Perl_form(aTHX_ "Illegal octal digit '%c'", *s));
10839 case '2': case '3': case '4':
10840 case '5': case '6': case '7':
10842 yyerror(Perl_form(aTHX_ "Illegal binary digit '%c'", *s));
10845 case '0': case '1':
10846 b = *s++ & 15; /* ASCII digit -> value of digit */
10850 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
10851 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
10852 /* make sure they said 0x */
10855 b = (*s++ & 7) + 9;
10857 /* Prepare to put the digit we have onto the end
10858 of the number so far. We check for overflows.
10864 x = u << shift; /* make room for the digit */
10866 total_bits += shift;
10868 if ((x >> shift) != u
10869 && !(PL_hints & HINT_NEW_BINARY)) {
10872 Perl_ck_warner_d(aTHX_ packWARN(WARN_OVERFLOW),
10873 "Integer overflow in %s number",
10876 u = x | b; /* add the digit to the end */
10879 n *= nvshift[shift];
10880 /* If an NV has not enough bits in its
10881 * mantissa to represent an UV this summing of
10882 * small low-order numbers is a waste of time
10883 * (because the NV cannot preserve the
10884 * low-order bits anyway): we could just
10885 * remember when did we overflow and in the
10886 * end just multiply n by the right
10891 if (high_non_zero == 0 && b > 0)
10895 non_zero_integer_digits++;
10897 /* this could be hexfp, but peek ahead
10898 * to avoid matching ".." */
10899 if (UNLIKELY(HEXFP_PEEK(s))) {
10907 /* if we get here, we had success: make a scalar value from
10912 /* final misplaced underbar check */
10914 WARN_ABOUT_UNDERSCORE();
10916 if (UNLIKELY(HEXFP_PEEK(s))) {
10917 /* Do sloppy (on the underbars) but quick detection
10918 * (and value construction) for hexfp, the decimal
10919 * detection will shortly be more thorough with the
10920 * underbar checks. */
10922 significant_bits = non_zero_integer_digits * shift;
10925 #else /* HEXFP_NV */
10928 /* Ignore the leading zero bits of
10929 * the high (first) non-zero digit. */
10930 if (high_non_zero) {
10931 if (high_non_zero < 0x8)
10932 significant_bits--;
10933 if (high_non_zero < 0x4)
10934 significant_bits--;
10935 if (high_non_zero < 0x2)
10936 significant_bits--;
10943 bool accumulate = TRUE;
10944 for (h++; (isXDIGIT(*h) || *h == '_'); h++) {
10945 if (isXDIGIT(*h)) {
10946 U8 b = XDIGIT_VALUE(*h);
10947 significant_bits += shift;
10950 if (significant_bits < NV_MANT_DIG) {
10951 /* We are in the long "run" of xdigits,
10952 * accumulate the full four bits. */
10953 hexfp_uquad <<= shift;
10955 hexfp_frac_bits += shift;
10957 /* We are at a hexdigit either at,
10958 * or straddling, the edge of mantissa.
10959 * We will try grabbing as many as
10960 * possible bits. */
10962 significant_bits - NV_MANT_DIG;
10965 hexfp_uquad <<= tail;
10966 hexfp_uquad |= b >> (shift - tail);
10967 hexfp_frac_bits += tail;
10969 /* Ignore the trailing zero bits
10970 * of the last non-zero xdigit.
10972 * The assumption here is that if
10973 * one has input of e.g. the xdigit
10974 * eight (0x8), there is only one
10975 * bit being input, not the full
10976 * four bits. Conversely, if one
10977 * specifies a zero xdigit, the
10978 * assumption is that one really
10979 * wants all those bits to be zero. */
10981 if ((b & 0x1) == 0x0) {
10982 significant_bits--;
10983 if ((b & 0x2) == 0x0) {
10984 significant_bits--;
10985 if ((b & 0x4) == 0x0) {
10986 significant_bits--;
10992 accumulate = FALSE;
10995 /* Keep skipping the xdigits, and
10996 * accumulating the significant bits,
10997 * but do not shift the uquad
10998 * (which would catastrophically drop
10999 * high-order bits) or accumulate the
11000 * xdigits anymore. */
11002 #else /* HEXFP_NV */
11006 hexfp_nv += b * nv_mult;
11008 accumulate = FALSE;
11012 if (significant_bits >= NV_MANT_DIG)
11013 accumulate = FALSE;
11017 if ((total_bits > 0 || significant_bits > 0) &&
11018 isALPHA_FOLD_EQ(*h, 'p')) {
11019 bool negexp = FALSE;
11023 else if (*h == '-') {
11029 while (isDIGIT(*h) || *h == '_') {
11032 hexfp_exp += *h - '0';
11035 && -hexfp_exp < NV_MIN_EXP - 1) {
11036 /* NOTE: this means that the exponent
11037 * underflow warning happens for
11038 * the IEEE 754 subnormals (denormals),
11039 * because DBL_MIN_EXP etc are the lowest
11040 * possible binary (or, rather, DBL_RADIX-base)
11041 * exponent for normals, not subnormals.
11043 * This may or may not be a good thing. */
11044 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
11045 "Hexadecimal float: exponent underflow");
11051 && hexfp_exp > NV_MAX_EXP - 1) {
11052 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
11053 "Hexadecimal float: exponent overflow");
11061 hexfp_exp = -hexfp_exp;
11063 hexfp_exp -= hexfp_frac_bits;
11065 hexfp_mult = Perl_pow(2.0, hexfp_exp);
11073 if (n > 4294967295.0)
11074 Perl_ck_warner(aTHX_ packWARN(WARN_PORTABLE),
11075 "%s number > %s non-portable",
11081 if (u > 0xffffffff)
11082 Perl_ck_warner(aTHX_ packWARN(WARN_PORTABLE),
11083 "%s number > %s non-portable",
11088 if (just_zero && (PL_hints & HINT_NEW_INTEGER))
11089 sv = new_constant(start, s - start, "integer",
11090 sv, NULL, NULL, 0);
11091 else if (PL_hints & HINT_NEW_BINARY)
11092 sv = new_constant(start, s - start, "binary", sv, NULL, NULL, 0);
11097 handle decimal numbers.
11098 we're also sent here when we read a 0 as the first digit
11100 case '1': case '2': case '3': case '4': case '5':
11101 case '6': case '7': case '8': case '9': case '.':
11104 e = PL_tokenbuf + sizeof PL_tokenbuf - 6; /* room for various punctuation */
11113 /* read next group of digits and _ and copy into d */
11116 || UNLIKELY(hexfp && isXDIGIT(*s)))
11118 /* skip underscores, checking for misplaced ones
11122 if (lastub && s == lastub + 1)
11123 WARN_ABOUT_UNDERSCORE();
11127 /* check for end of fixed-length buffer */
11129 Perl_croak(aTHX_ "%s", number_too_long);
11130 /* if we're ok, copy the character */
11135 /* final misplaced underbar check */
11136 if (lastub && s == lastub + 1)
11137 WARN_ABOUT_UNDERSCORE();
11139 /* read a decimal portion if there is one. avoid
11140 3..5 being interpreted as the number 3. followed
11143 if (*s == '.' && s[1] != '.') {
11148 WARN_ABOUT_UNDERSCORE();
11152 /* copy, ignoring underbars, until we run out of digits.
11156 || UNLIKELY(hexfp && isXDIGIT(*s));
11159 /* fixed length buffer check */
11161 Perl_croak(aTHX_ "%s", number_too_long);
11163 if (lastub && s == lastub + 1)
11164 WARN_ABOUT_UNDERSCORE();
11170 /* fractional part ending in underbar? */
11172 WARN_ABOUT_UNDERSCORE();
11173 if (*s == '.' && isDIGIT(s[1])) {
11174 /* oops, it's really a v-string, but without the "v" */
11180 /* read exponent part, if present */
11181 if ((isALPHA_FOLD_EQ(*s, 'e')
11182 || UNLIKELY(hexfp && isALPHA_FOLD_EQ(*s, 'p')))
11183 && strchr("+-0123456789_", s[1]))
11187 /* regardless of whether user said 3E5 or 3e5, use lower 'e',
11188 ditto for p (hexfloats) */
11189 if ((isALPHA_FOLD_EQ(*s, 'e'))) {
11190 /* At least some Mach atof()s don't grok 'E' */
11193 else if (UNLIKELY(hexfp && (isALPHA_FOLD_EQ(*s, 'p')))) {
11200 /* stray preinitial _ */
11202 WARN_ABOUT_UNDERSCORE();
11206 /* allow positive or negative exponent */
11207 if (*s == '+' || *s == '-')
11210 /* stray initial _ */
11212 WARN_ABOUT_UNDERSCORE();
11216 /* read digits of exponent */
11217 while (isDIGIT(*s) || *s == '_') {
11220 Perl_croak(aTHX_ "%s", number_too_long);
11224 if (((lastub && s == lastub + 1)
11225 || (!isDIGIT(s[1]) && s[1] != '_')))
11226 WARN_ABOUT_UNDERSCORE();
11234 We try to do an integer conversion first if no characters
11235 indicating "float" have been found.
11240 const int flags = grok_number (PL_tokenbuf, d - PL_tokenbuf, &uv);
11242 if (flags == IS_NUMBER_IN_UV) {
11244 sv = newSViv(uv); /* Prefer IVs over UVs. */
11247 } else if (flags == (IS_NUMBER_IN_UV | IS_NUMBER_NEG)) {
11248 if (uv <= (UV) IV_MIN)
11249 sv = newSViv(-(IV)uv);
11256 STORE_LC_NUMERIC_UNDERLYING_SET_STANDARD();
11257 /* terminate the string */
11259 if (UNLIKELY(hexfp)) {
11260 # ifdef NV_MANT_DIG
11261 if (significant_bits > NV_MANT_DIG)
11262 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
11263 "Hexadecimal float: mantissa overflow");
11266 nv = hexfp_uquad * hexfp_mult;
11267 #else /* HEXFP_NV */
11268 nv = hexfp_nv * hexfp_mult;
11271 nv = Atof(PL_tokenbuf);
11273 RESTORE_LC_NUMERIC_UNDERLYING();
11278 ? (PL_hints & HINT_NEW_FLOAT) : (PL_hints & HINT_NEW_INTEGER) ) {
11279 const char *const key = floatit ? "float" : "integer";
11280 const STRLEN keylen = floatit ? 5 : 7;
11281 sv = S_new_constant(aTHX_ PL_tokenbuf, d - PL_tokenbuf,
11282 key, keylen, sv, NULL, NULL, 0);
11286 /* if it starts with a v, it could be a v-string */
11289 sv = newSV(5); /* preallocate storage space */
11290 ENTER_with_name("scan_vstring");
11292 s = scan_vstring(s, PL_bufend, sv);
11293 SvREFCNT_inc_simple_void_NN(sv);
11294 LEAVE_with_name("scan_vstring");
11298 /* make the op for the constant and return */
11301 lvalp->opval = newSVOP(OP_CONST, 0, sv);
11303 lvalp->opval = NULL;
11309 S_scan_formline(pTHX_ char *s)
11311 SV * const stuff = newSVpvs("");
11312 bool needargs = FALSE;
11313 bool eofmt = FALSE;
11315 PERL_ARGS_ASSERT_SCAN_FORMLINE;
11317 while (!needargs) {
11321 #ifdef PERL_STRICT_CR
11322 while (SPACE_OR_TAB(*t))
11325 while (SPACE_OR_TAB(*t) || *t == '\r')
11328 if (*t == '\n' || t == PL_bufend) {
11333 eol = (char *) memchr(s,'\n',PL_bufend-s);
11338 for (t = s; t < eol; t++) {
11339 if (*t == '~' && t[1] == '~' && SvCUR(stuff)) {
11341 goto enough; /* ~~ must be first line in formline */
11343 if (*t == '@' || *t == '^')
11347 sv_catpvn(stuff, s, eol-s);
11348 #ifndef PERL_STRICT_CR
11349 if (eol-s > 1 && eol[-2] == '\r' && eol[-1] == '\n') {
11350 char *end = SvPVX(stuff) + SvCUR(stuff);
11353 SvCUR_set(stuff, SvCUR(stuff) - 1);
11361 if ((PL_rsfp || PL_parser->filtered)
11362 && PL_parser->form_lex_state == LEX_NORMAL) {
11364 PL_bufptr = PL_bufend;
11365 COPLINE_INC_WITH_HERELINES;
11366 got_some = lex_next_chunk(0);
11367 CopLINE_dec(PL_curcop);
11375 if (!SvCUR(stuff) || needargs)
11376 PL_lex_state = PL_parser->form_lex_state;
11377 if (SvCUR(stuff)) {
11378 PL_expect = XSTATE;
11380 const char *s2 = s;
11381 while (isSPACE(*s2) && *s2 != '\n')
11384 PL_expect = XTERMBLOCK;
11385 NEXTVAL_NEXTTOKE.ival = 0;
11388 NEXTVAL_NEXTTOKE.ival = 0;
11389 force_next(FORMLBRACK);
11392 if (UTF && is_utf8_string((U8*)SvPVX_const(stuff), SvCUR(stuff)))
11395 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0, stuff);
11399 SvREFCNT_dec(stuff);
11401 PL_lex_formbrack = 0;
11407 Perl_start_subparse(pTHX_ I32 is_format, U32 flags)
11409 const I32 oldsavestack_ix = PL_savestack_ix;
11410 CV* const outsidecv = PL_compcv;
11412 SAVEI32(PL_subline);
11413 save_item(PL_subname);
11414 SAVESPTR(PL_compcv);
11416 PL_compcv = MUTABLE_CV(newSV_type(is_format ? SVt_PVFM : SVt_PVCV));
11417 CvFLAGS(PL_compcv) |= flags;
11419 PL_subline = CopLINE(PL_curcop);
11420 CvPADLIST(PL_compcv) = pad_new(padnew_SAVE|padnew_SAVESUB);
11421 CvOUTSIDE(PL_compcv) = MUTABLE_CV(SvREFCNT_inc_simple(outsidecv));
11422 CvOUTSIDE_SEQ(PL_compcv) = PL_cop_seqmax;
11423 if (outsidecv && CvPADLIST(outsidecv))
11424 CvPADLIST(PL_compcv)->xpadl_outid = CvPADLIST(outsidecv)->xpadl_id;
11426 return oldsavestack_ix;
11430 S_yywarn(pTHX_ const char *const s, U32 flags)
11432 PERL_ARGS_ASSERT_YYWARN;
11434 PL_in_eval |= EVAL_WARNONLY;
11435 yyerror_pv(s, flags);
11440 Perl_abort_execution(pTHX_ const char * const msg, const char * const name)
11442 PERL_ARGS_ASSERT_ABORT_EXECUTION;
11445 Perl_croak(aTHX_ "%s%s had compilation errors.\n", msg, name);
11448 "%sExecution of %s aborted due to compilation errors.\n", msg, name);
11450 NOT_REACHED; /* NOTREACHED */
11456 /* Called, after at least one error has been found, to abort the parse now,
11457 * instead of trying to forge ahead */
11459 yyerror_pvn(NULL, 0, 0);
11463 Perl_yyerror(pTHX_ const char *const s)
11465 PERL_ARGS_ASSERT_YYERROR;
11466 return yyerror_pvn(s, strlen(s), 0);
11470 Perl_yyerror_pv(pTHX_ const char *const s, U32 flags)
11472 PERL_ARGS_ASSERT_YYERROR_PV;
11473 return yyerror_pvn(s, strlen(s), flags);
11477 Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags)
11479 const char *context = NULL;
11482 SV * const where_sv = newSVpvs_flags("", SVs_TEMP);
11483 int yychar = PL_parser->yychar;
11485 /* Output error message 's' with length 'len'. 'flags' are SV flags that
11486 * apply. If the number of errors found is large enough, it abandons
11487 * parsing. If 's' is NULL, there is no message, and it abandons
11488 * processing unconditionally */
11491 if (!yychar || (yychar == ';' && !PL_rsfp))
11492 sv_catpvs(where_sv, "at EOF");
11493 else if ( PL_oldoldbufptr
11494 && PL_bufptr > PL_oldoldbufptr
11495 && PL_bufptr - PL_oldoldbufptr < 200
11496 && PL_oldoldbufptr != PL_oldbufptr
11497 && PL_oldbufptr != PL_bufptr)
11501 The code below is removed for NetWare because it
11502 abends/crashes on NetWare when the script has error such as
11503 not having the closing quotes like:
11504 if ($var eq "value)
11505 Checking of white spaces is anyway done in NetWare code.
11508 while (isSPACE(*PL_oldoldbufptr))
11511 context = PL_oldoldbufptr;
11512 contlen = PL_bufptr - PL_oldoldbufptr;
11514 else if ( PL_oldbufptr
11515 && PL_bufptr > PL_oldbufptr
11516 && PL_bufptr - PL_oldbufptr < 200
11517 && PL_oldbufptr != PL_bufptr) {
11520 The code below is removed for NetWare because it
11521 abends/crashes on NetWare when the script has error such as
11522 not having the closing quotes like:
11523 if ($var eq "value)
11524 Checking of white spaces is anyway done in NetWare code.
11527 while (isSPACE(*PL_oldbufptr))
11530 context = PL_oldbufptr;
11531 contlen = PL_bufptr - PL_oldbufptr;
11533 else if (yychar > 255)
11534 sv_catpvs(where_sv, "next token ???");
11535 else if (yychar == YYEMPTY) {
11536 if (PL_lex_state == LEX_NORMAL)
11537 sv_catpvs(where_sv, "at end of line");
11538 else if (PL_lex_inpat)
11539 sv_catpvs(where_sv, "within pattern");
11541 sv_catpvs(where_sv, "within string");
11544 sv_catpvs(where_sv, "next char ");
11546 Perl_sv_catpvf(aTHX_ where_sv, "^%c", toCTRL(yychar));
11547 else if (isPRINT_LC(yychar)) {
11548 const char string = yychar;
11549 sv_catpvn(where_sv, &string, 1);
11552 Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255);
11554 msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP);
11555 Perl_sv_catpvf(aTHX_ msg, " at %s line %" IVdf ", ",
11556 OutCopFILE(PL_curcop),
11557 (IV)(PL_parser->preambling == NOLINE
11558 ? CopLINE(PL_curcop)
11559 : PL_parser->preambling));
11561 Perl_sv_catpvf(aTHX_ msg, "near \"%" UTF8f "\"\n",
11562 UTF8fARG(UTF, contlen, context));
11564 Perl_sv_catpvf(aTHX_ msg, "%" SVf "\n", SVfARG(where_sv));
11565 if ( PL_multi_start < PL_multi_end
11566 && (U32)(CopLINE(PL_curcop) - PL_multi_end) <= 1)
11568 Perl_sv_catpvf(aTHX_ msg,
11569 " (Might be a runaway multi-line %c%c string starting on"
11570 " line %" IVdf ")\n",
11571 (int)PL_multi_open,(int)PL_multi_close,(IV)PL_multi_start);
11574 if (PL_in_eval & EVAL_WARNONLY) {
11575 PL_in_eval &= ~EVAL_WARNONLY;
11576 Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%" SVf, SVfARG(msg));
11582 if (s == NULL || PL_error_count >= 10) {
11583 const char * msg = "";
11584 const char * const name = OutCopFILE(PL_curcop);
11587 SV * errsv = ERRSV;
11588 if (SvCUR(errsv)) {
11589 msg = Perl_form(aTHX_ "%" SVf, SVfARG(errsv));
11594 abort_execution(msg, name);
11597 Perl_croak(aTHX_ "%s%s has too many errors.\n", msg, name);
11601 PL_in_my_stash = NULL;
11606 S_swallow_bom(pTHX_ U8 *s)
11608 const STRLEN slen = SvCUR(PL_linestr);
11610 PERL_ARGS_ASSERT_SWALLOW_BOM;
11614 if (s[1] == 0xFE) {
11615 /* UTF-16 little-endian? (or UTF-32LE?) */
11616 if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */
11617 /* diag_listed_as: Unsupported script encoding %s */
11618 Perl_croak(aTHX_ "Unsupported script encoding UTF-32LE");
11619 #ifndef PERL_NO_UTF16_FILTER
11620 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (BOM)\n");
11622 if (PL_bufend > (char*)s) {
11623 s = add_utf16_textfilter(s, TRUE);
11626 /* diag_listed_as: Unsupported script encoding %s */
11627 Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE");
11632 if (s[1] == 0xFF) { /* UTF-16 big-endian? */
11633 #ifndef PERL_NO_UTF16_FILTER
11634 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n");
11636 if (PL_bufend > (char *)s) {
11637 s = add_utf16_textfilter(s, FALSE);
11640 /* diag_listed_as: Unsupported script encoding %s */
11641 Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE");
11645 case BOM_UTF8_FIRST_BYTE: {
11646 const STRLEN len = sizeof(BOM_UTF8_TAIL) - 1; /* Exclude trailing NUL */
11647 if (slen > len && memEQ(s+1, BOM_UTF8_TAIL, len)) {
11648 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n");
11649 s += len + 1; /* UTF-8 */
11656 if (s[2] == 0xFE && s[3] == 0xFF) {
11657 /* UTF-32 big-endian */
11658 /* diag_listed_as: Unsupported script encoding %s */
11659 Perl_croak(aTHX_ "Unsupported script encoding UTF-32BE");
11662 else if (s[2] == 0 && s[3] != 0) {
11665 * are a good indicator of UTF-16BE. */
11666 #ifndef PERL_NO_UTF16_FILTER
11667 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n");
11668 s = add_utf16_textfilter(s, FALSE);
11670 /* diag_listed_as: Unsupported script encoding %s */
11671 Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE");
11678 if (slen > 3 && s[1] == 0 && s[2] != 0 && s[3] == 0) {
11681 * are a good indicator of UTF-16LE. */
11682 #ifndef PERL_NO_UTF16_FILTER
11683 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n");
11684 s = add_utf16_textfilter(s, TRUE);
11686 /* diag_listed_as: Unsupported script encoding %s */
11687 Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE");
11695 #ifndef PERL_NO_UTF16_FILTER
11697 S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
11699 SV *const filter = FILTER_DATA(idx);
11700 /* We re-use this each time round, throwing the contents away before we
11702 SV *const utf16_buffer = MUTABLE_SV(IoTOP_GV(filter));
11703 SV *const utf8_buffer = filter;
11704 IV status = IoPAGE(filter);
11705 const bool reverse = cBOOL(IoLINES(filter));
11708 PERL_ARGS_ASSERT_UTF16_TEXTFILTER;
11710 /* As we're automatically added, at the lowest level, and hence only called
11711 from this file, we can be sure that we're not called in block mode. Hence
11712 don't bother writing code to deal with block mode. */
11714 Perl_croak(aTHX_ "panic: utf16_textfilter called in block mode (for %d characters)", maxlen);
11717 Perl_croak(aTHX_ "panic: utf16_textfilter called after error (status=%" IVdf ")", status);
11719 DEBUG_P(PerlIO_printf(Perl_debug_log,
11720 "utf16_textfilter(%p,%ce): idx=%d maxlen=%d status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n",
11721 FPTR2DPTR(void *, S_utf16_textfilter),
11722 reverse ? 'l' : 'b', idx, maxlen, status,
11723 (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer)));
11730 /* First, look in our buffer of existing UTF-8 data: */
11731 char *nl = (char *)memchr(SvPVX(utf8_buffer), '\n', SvCUR(utf8_buffer));
11735 } else if (status == 0) {
11737 IoPAGE(filter) = 0;
11738 nl = SvEND(utf8_buffer);
11741 STRLEN got = nl - SvPVX(utf8_buffer);
11742 /* Did we have anything to append? */
11744 sv_catpvn(sv, SvPVX(utf8_buffer), got);
11745 /* Everything else in this code works just fine if SVp_POK isn't
11746 set. This, however, needs it, and we need it to work, else
11747 we loop infinitely because the buffer is never consumed. */
11748 sv_chop(utf8_buffer, nl);
11752 /* OK, not a complete line there, so need to read some more UTF-16.
11753 Read an extra octect if the buffer currently has an odd number. */
11757 if (SvCUR(utf16_buffer) >= 2) {
11758 /* Location of the high octet of the last complete code point.
11759 Gosh, UTF-16 is a pain. All the benefits of variable length,
11760 *coupled* with all the benefits of partial reads and
11762 const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
11763 + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
11765 if (*last_hi < 0xd8 || *last_hi > 0xdb) {
11769 /* We have the first half of a surrogate. Read more. */
11770 DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
11773 status = FILTER_READ(idx + 1, utf16_buffer,
11774 160 + (SvCUR(utf16_buffer) & 1));
11775 DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%" IVdf " SvCUR(sv)=%" UVuf "\n", status, (UV)SvCUR(utf16_buffer)));
11776 DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
11779 IoPAGE(filter) = status;
11784 chars = SvCUR(utf16_buffer) >> 1;
11785 have = SvCUR(utf8_buffer);
11786 SvGROW(utf8_buffer, have + chars * 3 + 1);
11789 end = utf16_to_utf8_reversed((U8*)SvPVX(utf16_buffer),
11790 (U8*)SvPVX_const(utf8_buffer) + have,
11791 chars * 2, &newlen);
11793 end = utf16_to_utf8((U8*)SvPVX(utf16_buffer),
11794 (U8*)SvPVX_const(utf8_buffer) + have,
11795 chars * 2, &newlen);
11797 SvCUR_set(utf8_buffer, have + newlen);
11800 /* No need to keep this SV "well-formed" with a '\0' after the end, as
11801 it's private to us, and utf16_to_utf8{,reversed} take a
11802 (pointer,length) pair, rather than a NUL-terminated string. */
11803 if(SvCUR(utf16_buffer) & 1) {
11804 *SvPVX(utf16_buffer) = SvEND(utf16_buffer)[-1];
11805 SvCUR_set(utf16_buffer, 1);
11807 SvCUR_set(utf16_buffer, 0);
11810 DEBUG_P(PerlIO_printf(Perl_debug_log,
11811 "utf16_textfilter: returns, status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n",
11813 (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer)));
11814 DEBUG_P({ sv_dump(utf8_buffer); sv_dump(sv);});
11819 S_add_utf16_textfilter(pTHX_ U8 *const s, bool reversed)
11821 SV *filter = filter_add(S_utf16_textfilter, NULL);
11823 PERL_ARGS_ASSERT_ADD_UTF16_TEXTFILTER;
11825 IoTOP_GV(filter) = MUTABLE_GV(newSVpvn((char *)s, PL_bufend - (char*)s));
11827 IoLINES(filter) = reversed;
11828 IoPAGE(filter) = 1; /* Not EOF */
11830 /* Sadly, we have to return a valid pointer, come what may, so we have to
11831 ignore any error return from this. */
11832 SvCUR_set(PL_linestr, 0);
11833 if (FILTER_READ(0, PL_linestr, 0)) {
11834 SvUTF8_on(PL_linestr);
11836 SvUTF8_on(PL_linestr);
11838 PL_bufend = SvEND(PL_linestr);
11839 return (U8*)SvPVX(PL_linestr);
11844 Returns a pointer to the next character after the parsed
11845 vstring, as well as updating the passed in sv.
11847 Function must be called like
11849 sv = sv_2mortal(newSV(5));
11850 s = scan_vstring(s,e,sv);
11852 where s and e are the start and end of the string.
11853 The sv should already be large enough to store the vstring
11854 passed in, for performance reasons.
11856 This function may croak if fatal warnings are enabled in the
11857 calling scope, hence the sv_2mortal in the example (to prevent
11858 a leak). Make sure to do SvREFCNT_inc afterwards if you use
11864 Perl_scan_vstring(pTHX_ const char *s, const char *const e, SV *sv)
11866 const char *pos = s;
11867 const char *start = s;
11869 PERL_ARGS_ASSERT_SCAN_VSTRING;
11871 if (*pos == 'v') pos++; /* get past 'v' */
11872 while (pos < e && (isDIGIT(*pos) || *pos == '_'))
11874 if ( *pos != '.') {
11875 /* this may not be a v-string if followed by => */
11876 const char *next = pos;
11877 while (next < e && isSPACE(*next))
11879 if ((e - next) >= 2 && *next == '=' && next[1] == '>' ) {
11880 /* return string not v-string */
11881 sv_setpvn(sv,(char *)s,pos-s);
11882 return (char *)pos;
11886 if (!isALPHA(*pos)) {
11887 U8 tmpbuf[UTF8_MAXBYTES+1];
11890 s++; /* get past 'v' */
11895 /* this is atoi() that tolerates underscores */
11898 const char *end = pos;
11900 while (--end >= s) {
11902 const UV orev = rev;
11903 rev += (*end - '0') * mult;
11906 /* diag_listed_as: Integer overflow in %s number */
11907 Perl_ck_warner_d(aTHX_ packWARN(WARN_OVERFLOW),
11908 "Integer overflow in decimal number");
11912 /* Append native character for the rev point */
11913 tmpend = uvchr_to_utf8(tmpbuf, rev);
11914 sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
11915 if (!UVCHR_IS_INVARIANT(rev))
11917 if (pos + 1 < e && *pos == '.' && isDIGIT(pos[1]))
11923 while (pos < e && (isDIGIT(*pos) || *pos == '_'))
11927 sv_magic(sv,NULL,PERL_MAGIC_vstring,(const char*)start, pos-start);
11934 Perl_keyword_plugin_standard(pTHX_
11935 char *keyword_ptr, STRLEN keyword_len, OP **op_ptr)
11937 PERL_ARGS_ASSERT_KEYWORD_PLUGIN_STANDARD;
11938 PERL_UNUSED_CONTEXT;
11939 PERL_UNUSED_ARG(keyword_ptr);
11940 PERL_UNUSED_ARG(keyword_len);
11941 PERL_UNUSED_ARG(op_ptr);
11942 return KEYWORD_PLUGIN_DECLINE;
11945 #define parse_recdescent(g,p) S_parse_recdescent(aTHX_ g,p)
11947 S_parse_recdescent(pTHX_ int gramtype, I32 fakeeof)
11949 SAVEI32(PL_lex_brackets);
11950 if (PL_lex_brackets > 100)
11951 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
11952 PL_lex_brackstack[PL_lex_brackets++] = XFAKEEOF;
11953 SAVEI32(PL_lex_allbrackets);
11954 PL_lex_allbrackets = 0;
11955 SAVEI8(PL_lex_fakeeof);
11956 PL_lex_fakeeof = (U8)fakeeof;
11957 if(yyparse(gramtype) && !PL_parser->error_count)
11958 qerror(Perl_mess(aTHX_ "Parse error"));
11961 #define parse_recdescent_for_op(g,p) S_parse_recdescent_for_op(aTHX_ g,p)
11963 S_parse_recdescent_for_op(pTHX_ int gramtype, I32 fakeeof)
11967 SAVEVPTR(PL_eval_root);
11968 PL_eval_root = NULL;
11969 parse_recdescent(gramtype, fakeeof);
11975 #define parse_expr(p,f) S_parse_expr(aTHX_ p,f)
11977 S_parse_expr(pTHX_ I32 fakeeof, U32 flags)
11980 if (flags & ~PARSE_OPTIONAL)
11981 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_expr");
11982 exprop = parse_recdescent_for_op(GRAMEXPR, fakeeof);
11983 if (!exprop && !(flags & PARSE_OPTIONAL)) {
11984 if (!PL_parser->error_count)
11985 qerror(Perl_mess(aTHX_ "Parse error"));
11986 exprop = newOP(OP_NULL, 0);
11992 =for apidoc Amx|OP *|parse_arithexpr|U32 flags
11994 Parse a Perl arithmetic expression. This may contain operators of precedence
11995 down to the bit shift operators. The expression must be followed (and thus
11996 terminated) either by a comparison or lower-precedence operator or by
11997 something that would normally terminate an expression such as semicolon.
11998 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
11999 otherwise it is mandatory. It is up to the caller to ensure that the
12000 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
12001 the source of the code to be parsed and the lexical context for the
12004 The op tree representing the expression is returned. If an optional
12005 expression is absent, a null pointer is returned, otherwise the pointer
12008 If an error occurs in parsing or compilation, in most cases a valid op
12009 tree is returned anyway. The error is reflected in the parser state,
12010 normally resulting in a single exception at the top level of parsing
12011 which covers all the compilation errors that occurred. Some compilation
12012 errors, however, will throw an exception immediately.
12018 Perl_parse_arithexpr(pTHX_ U32 flags)
12020 return parse_expr(LEX_FAKEEOF_COMPARE, flags);
12024 =for apidoc Amx|OP *|parse_termexpr|U32 flags
12026 Parse a Perl term expression. This may contain operators of precedence
12027 down to the assignment operators. The expression must be followed (and thus
12028 terminated) either by a comma or lower-precedence operator or by
12029 something that would normally terminate an expression such as semicolon.
12030 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
12031 otherwise it is mandatory. It is up to the caller to ensure that the
12032 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
12033 the source of the code to be parsed and the lexical context for the
12036 The op tree representing the expression is returned. If an optional
12037 expression is absent, a null pointer is returned, otherwise the pointer
12040 If an error occurs in parsing or compilation, in most cases a valid op
12041 tree is returned anyway. The error is reflected in the parser state,
12042 normally resulting in a single exception at the top level of parsing
12043 which covers all the compilation errors that occurred. Some compilation
12044 errors, however, will throw an exception immediately.
12050 Perl_parse_termexpr(pTHX_ U32 flags)
12052 return parse_expr(LEX_FAKEEOF_COMMA, flags);
12056 =for apidoc Amx|OP *|parse_listexpr|U32 flags
12058 Parse a Perl list expression. This may contain operators of precedence
12059 down to the comma operator. The expression must be followed (and thus
12060 terminated) either by a low-precedence logic operator such as C<or> or by
12061 something that would normally terminate an expression such as semicolon.
12062 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
12063 otherwise it is mandatory. It is up to the caller to ensure that the
12064 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
12065 the source of the code to be parsed and the lexical context for the
12068 The op tree representing the expression is returned. If an optional
12069 expression is absent, a null pointer is returned, otherwise the pointer
12072 If an error occurs in parsing or compilation, in most cases a valid op
12073 tree is returned anyway. The error is reflected in the parser state,
12074 normally resulting in a single exception at the top level of parsing
12075 which covers all the compilation errors that occurred. Some compilation
12076 errors, however, will throw an exception immediately.
12082 Perl_parse_listexpr(pTHX_ U32 flags)
12084 return parse_expr(LEX_FAKEEOF_LOWLOGIC, flags);
12088 =for apidoc Amx|OP *|parse_fullexpr|U32 flags
12090 Parse a single complete Perl expression. This allows the full
12091 expression grammar, including the lowest-precedence operators such
12092 as C<or>. The expression must be followed (and thus terminated) by a
12093 token that an expression would normally be terminated by: end-of-file,
12094 closing bracketing punctuation, semicolon, or one of the keywords that
12095 signals a postfix expression-statement modifier. If C<flags> has the
12096 C<PARSE_OPTIONAL> bit set, then the expression is optional, otherwise it is
12097 mandatory. It is up to the caller to ensure that the dynamic parser
12098 state (L</PL_parser> et al) is correctly set to reflect the source of
12099 the code to be parsed and the lexical context for the expression.
12101 The op tree representing the expression is returned. If an optional
12102 expression is absent, a null pointer is returned, otherwise the pointer
12105 If an error occurs in parsing or compilation, in most cases a valid op
12106 tree is returned anyway. The error is reflected in the parser state,
12107 normally resulting in a single exception at the top level of parsing
12108 which covers all the compilation errors that occurred. Some compilation
12109 errors, however, will throw an exception immediately.
12115 Perl_parse_fullexpr(pTHX_ U32 flags)
12117 return parse_expr(LEX_FAKEEOF_NONEXPR, flags);
12121 =for apidoc Amx|OP *|parse_block|U32 flags
12123 Parse a single complete Perl code block. This consists of an opening
12124 brace, a sequence of statements, and a closing brace. The block
12125 constitutes a lexical scope, so C<my> variables and various compile-time
12126 effects can be contained within it. It is up to the caller to ensure
12127 that the dynamic parser state (L</PL_parser> et al) is correctly set to
12128 reflect the source of the code to be parsed and the lexical context for
12131 The op tree representing the code block is returned. This is always a
12132 real op, never a null pointer. It will normally be a C<lineseq> list,
12133 including C<nextstate> or equivalent ops. No ops to construct any kind
12134 of runtime scope are included by virtue of it being a block.
12136 If an error occurs in parsing or compilation, in most cases a valid op
12137 tree (most likely null) is returned anyway. The error is reflected in
12138 the parser state, normally resulting in a single exception at the top
12139 level of parsing which covers all the compilation errors that occurred.
12140 Some compilation errors, however, will throw an exception immediately.
12142 The C<flags> parameter is reserved for future use, and must always
12149 Perl_parse_block(pTHX_ U32 flags)
12152 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_block");
12153 return parse_recdescent_for_op(GRAMBLOCK, LEX_FAKEEOF_NEVER);
12157 =for apidoc Amx|OP *|parse_barestmt|U32 flags
12159 Parse a single unadorned Perl statement. This may be a normal imperative
12160 statement or a declaration that has compile-time effect. It does not
12161 include any label or other affixture. It is up to the caller to ensure
12162 that the dynamic parser state (L</PL_parser> et al) is correctly set to
12163 reflect the source of the code to be parsed and the lexical context for
12166 The op tree representing the statement is returned. This may be a
12167 null pointer if the statement is null, for example if it was actually
12168 a subroutine definition (which has compile-time side effects). If not
12169 null, it will be ops directly implementing the statement, suitable to
12170 pass to L</newSTATEOP>. It will not normally include a C<nextstate> or
12171 equivalent op (except for those embedded in a scope contained entirely
12172 within the statement).
12174 If an error occurs in parsing or compilation, in most cases a valid op
12175 tree (most likely null) is returned anyway. The error is reflected in
12176 the parser state, normally resulting in a single exception at the top
12177 level of parsing which covers all the compilation errors that occurred.
12178 Some compilation errors, however, will throw an exception immediately.
12180 The C<flags> parameter is reserved for future use, and must always
12187 Perl_parse_barestmt(pTHX_ U32 flags)
12190 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_barestmt");
12191 return parse_recdescent_for_op(GRAMBARESTMT, LEX_FAKEEOF_NEVER);
12195 =for apidoc Amx|SV *|parse_label|U32 flags
12197 Parse a single label, possibly optional, of the type that may prefix a
12198 Perl statement. It is up to the caller to ensure that the dynamic parser
12199 state (L</PL_parser> et al) is correctly set to reflect the source of
12200 the code to be parsed. If C<flags> has the C<PARSE_OPTIONAL> bit set, then the
12201 label is optional, otherwise it is mandatory.
12203 The name of the label is returned in the form of a fresh scalar. If an
12204 optional label is absent, a null pointer is returned.
12206 If an error occurs in parsing, which can only occur if the label is
12207 mandatory, a valid label is returned anyway. The error is reflected in
12208 the parser state, normally resulting in a single exception at the top
12209 level of parsing which covers all the compilation errors that occurred.
12215 Perl_parse_label(pTHX_ U32 flags)
12217 if (flags & ~PARSE_OPTIONAL)
12218 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_label");
12220 PL_parser->yychar = yylex();
12221 if (PL_parser->yychar == LABEL) {
12222 char * const lpv = pl_yylval.pval;
12223 STRLEN llen = strlen(lpv);
12224 PL_parser->yychar = YYEMPTY;
12225 return newSVpvn_flags(lpv, llen, lpv[llen+1] ? SVf_UTF8 : 0);
12232 STRLEN wlen, bufptr_pos;
12235 if (!isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
12237 t = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &wlen);
12238 if (word_takes_any_delimiter(s, wlen))
12240 bufptr_pos = s - SvPVX(PL_linestr);
12242 lex_read_space(LEX_KEEP_PREVIOUS);
12244 s = SvPVX(PL_linestr) + bufptr_pos;
12245 if (t[0] == ':' && t[1] != ':') {
12246 PL_oldoldbufptr = PL_oldbufptr;
12249 return newSVpvn_flags(s, wlen, UTF ? SVf_UTF8 : 0);
12253 if (flags & PARSE_OPTIONAL) {
12256 qerror(Perl_mess(aTHX_ "Parse error"));
12257 return newSVpvs("x");
12264 =for apidoc Amx|OP *|parse_fullstmt|U32 flags
12266 Parse a single complete Perl statement. This may be a normal imperative
12267 statement or a declaration that has compile-time effect, and may include
12268 optional labels. It is up to the caller to ensure that the dynamic
12269 parser state (L</PL_parser> et al) is correctly set to reflect the source
12270 of the code to be parsed and the lexical context for the statement.
12272 The op tree representing the statement is returned. This may be a
12273 null pointer if the statement is null, for example if it was actually
12274 a subroutine definition (which has compile-time side effects). If not
12275 null, it will be the result of a L</newSTATEOP> call, normally including
12276 a C<nextstate> or equivalent op.
12278 If an error occurs in parsing or compilation, in most cases a valid op
12279 tree (most likely null) is returned anyway. The error is reflected in
12280 the parser state, normally resulting in a single exception at the top
12281 level of parsing which covers all the compilation errors that occurred.
12282 Some compilation errors, however, will throw an exception immediately.
12284 The C<flags> parameter is reserved for future use, and must always
12291 Perl_parse_fullstmt(pTHX_ U32 flags)
12294 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_fullstmt");
12295 return parse_recdescent_for_op(GRAMFULLSTMT, LEX_FAKEEOF_NEVER);
12299 =for apidoc Amx|OP *|parse_stmtseq|U32 flags
12301 Parse a sequence of zero or more Perl statements. These may be normal
12302 imperative statements, including optional labels, or declarations
12303 that have compile-time effect, or any mixture thereof. The statement
12304 sequence ends when a closing brace or end-of-file is encountered in a
12305 place where a new statement could have validly started. It is up to
12306 the caller to ensure that the dynamic parser state (L</PL_parser> et al)
12307 is correctly set to reflect the source of the code to be parsed and the
12308 lexical context for the statements.
12310 The op tree representing the statement sequence is returned. This may
12311 be a null pointer if the statements were all null, for example if there
12312 were no statements or if there were only subroutine definitions (which
12313 have compile-time side effects). If not null, it will be a C<lineseq>
12314 list, normally including C<nextstate> or equivalent ops.
12316 If an error occurs in parsing or compilation, in most cases a valid op
12317 tree is returned anyway. The error is reflected in the parser state,
12318 normally resulting in a single exception at the top level of parsing
12319 which covers all the compilation errors that occurred. Some compilation
12320 errors, however, will throw an exception immediately.
12322 The C<flags> parameter is reserved for future use, and must always
12329 Perl_parse_stmtseq(pTHX_ U32 flags)
12334 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_stmtseq");
12335 stmtseqop = parse_recdescent_for_op(GRAMSTMTSEQ, LEX_FAKEEOF_CLOSING);
12336 c = lex_peek_unichar(0);
12337 if (c != -1 && c != /*{*/'}')
12338 qerror(Perl_mess(aTHX_ "Parse error"));
12343 * ex: set ts=8 sts=4 sw=4 et: