3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "dquote_inline.h"
43 #define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
98 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
100 #define XENUMMASK 0x3f
101 #define XFAKEEOF 0x40
102 #define XFAKEBRACK 0x80
104 #ifdef USE_UTF8_SCRIPTS
105 # define UTF cBOOL(!IN_BYTES)
107 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
110 /* The maximum number of characters preceding the unrecognized one to display */
111 #define UNRECOGNIZED_PRECEDE_COUNT 10
113 /* In variables named $^X, these are the legal values for X.
114 * 1999-02-27 mjd-perl-patch@plover.com */
115 #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
117 #define SPACE_OR_TAB(c) isBLANK_A(c)
119 #define HEXFP_PEEK(s) \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
124 /* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
126 * can get by with a single comparison (if the compiler is smart enough).
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
132 /* #define LEX_NOTPARSING 11 is done in perl.h. */
134 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
135 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
140 /* at end of code, eg "$x" followed by: */
141 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146 #define LEX_INTERPCONST 2 /* NOT USED */
147 #define LEX_FORMLINE 1 /* expecting a format line */
151 static const char* const lex_state_names[] = {
166 #include "keywords.h"
168 /* CLINE is a macro that ensures PL_copline has a sane value */
170 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
173 * Convenience functions to return different tokens and prime the
174 * lexer for the next token. They all take an argument.
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
187 * FUN0OP : zero-argument function, with its op created in this file
188 * FUN1 : not used, except for not, which isn't a UNIOP
189 * BOop : bitwise or or xor
191 * BCop : bitwise complement
192 * SHop : shift operator
193 * PWop : power operator
194 * PMop : pattern-matching operator
195 * Aop : addition-level operator
196 * AopNOASSIGN : addition-level operator that is never part of .=
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
199 * Rop : relational operator <= != gt
201 * Also see LOP and lop() below.
204 #ifdef DEBUGGING /* Serve -DT. */
205 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
207 # define REPORT(retval) (retval)
210 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
212 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
213 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
217 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
218 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
222 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
224 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
225 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
226 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
228 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
232 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
233 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
234 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
235 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
236 #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237 #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
239 /* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
244 #define UNI3(f,x,have_x) { \
245 pl_yylval.ival = f; \
246 if (have_x) PL_expect = x; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
251 return REPORT( (int)FUNC1 ); \
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 #define UNI(f) UNI3(f,XTERM,1)
256 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
257 #define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
262 #define UNIBRACK(f) UNI3(f,0,0)
264 /* grandfather return to old style */
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
275 #define COPLINE_INC_WITH_HERELINES \
277 CopLINE_inc(PL_curcop); \
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
282 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284 #define COPLINE_SET_FROM_MULTI_END \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
288 PL_parser->herelines = 0; \
294 /* how to interpret the pl_yylval associated with the token */
298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
303 static struct debug_tokens {
305 enum token_type type;
307 } const debug_tokens[] =
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
382 { 0, TOKENTYPE_NONE, NULL }
385 /* dump the returned token in rv, plus any optional arg in pl_yylval */
388 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
390 PERL_ARGS_ASSERT_TOKEREPORT;
393 const char *name = NULL;
394 enum token_type type = TOKENTYPE_NONE;
395 const struct debug_tokens *p;
396 SV* const report = newSVpvs("<== ");
398 for (p = debug_tokens; p->token; p++) {
399 if (p->token == (int)rv) {
406 Perl_sv_catpv(aTHX_ report, name);
407 else if (isGRAPH(rv))
409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
411 sv_catpvs(report, " (pending identifier)");
414 sv_catpvs(report, "EOF");
416 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
421 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
425 PL_op_name[lvalp->ival]);
428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
430 case TOKENTYPE_OPVAL:
432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
435 Perl_sv_catpvf(aTHX_ report, " %s",
436 SvPEEK(cSVOPx_sv(lvalp->opval)));
441 sv_catpvs(report, "(opval=null)");
444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
450 /* print the buffer with suitable escapes */
453 S_printbuf(pTHX_ const char *const fmt, const char *const s)
455 SV* const tmp = newSVpvs("");
457 PERL_ARGS_ASSERT_PRINTBUF;
459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
468 S_deprecate_commaless_var_list(pTHX) {
470 deprecate_fatal_in("5.28", "Use of comma-less variable list is deprecated");
471 return REPORT(','); /* grandfather non-comma-format format */
477 * This subroutine looks for an '=' next to the operator that has just been
478 * parsed and turns it into an ASSIGNOP if it finds one.
482 S_ao(pTHX_ int toketype)
484 if (*PL_bufptr == '=') {
486 if (toketype == ANDAND)
487 pl_yylval.ival = OP_ANDASSIGN;
488 else if (toketype == OROR)
489 pl_yylval.ival = OP_ORASSIGN;
490 else if (toketype == DORDOR)
491 pl_yylval.ival = OP_DORASSIGN;
494 return REPORT(toketype);
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
509 * PL_bufptr is expected to point to the start of the thing that was found,
510 * and s after the next token or partial token.
514 S_no_op(pTHX_ const char *const what, char *s)
516 char * const oldbp = PL_bufptr;
517 const bool is_first = (PL_oldbufptr == PL_linestart);
519 PERL_ARGS_ASSERT_NO_OP;
525 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
526 if (ckWARN_d(WARN_SYNTAX)) {
528 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
529 "\t(Missing semicolon on previous line?)\n");
530 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
535 for (t = PL_oldoldbufptr;
536 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
537 t += UTF ? UTF8SKIP(t) : 1)
541 if (t < PL_bufptr && isSPACE(*t))
542 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
543 "\t(Do you need to predeclare %" UTF8f "?)\n",
544 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
548 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
549 "\t(Missing operator before %" UTF8f "?)\n",
550 UTF8fARG(UTF, s - oldbp, oldbp));
558 * Complain about missing quote/regexp/heredoc terminator.
559 * If it's called with NULL then it cauterizes the line buffer.
560 * If we're in a delimited string and the delimiter is a control
561 * character, it's reformatted into a two-char sequence like ^C.
566 S_missingterm(pTHX_ char *s)
568 char tmpbuf[UTF8_MAXBYTES + 1];
573 char * const nl = strrchr(s,'\n');
578 else if (PL_multi_close < 32) {
580 tmpbuf[1] = (char)toCTRL(PL_multi_close);
585 if (LIKELY(PL_multi_close < 256)) {
586 *tmpbuf = (char)PL_multi_close;
591 *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
595 q = strchr(s,'"') ? '\'' : '"';
596 sv = sv_2mortal(newSVpv(s,0));
599 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf
600 "%c anywhere before EOF",q,SVfARG(sv),q);
606 * Check whether the named feature is enabled.
609 Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
611 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
613 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
615 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
617 if (namelen > MAX_FEATURE_LEN)
619 memcpy(&he_name[8], name, namelen);
621 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
622 REFCOUNTED_HE_EXISTS));
626 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
627 * utf16-to-utf8-reversed.
630 #ifdef PERL_CR_FILTER
634 const char *s = SvPVX_const(sv);
635 const char * const e = s + SvCUR(sv);
637 PERL_ARGS_ASSERT_STRIP_RETURN;
639 /* outer loop optimized to do nothing if there are no CR-LFs */
641 if (*s++ == '\r' && *s == '\n') {
642 /* hit a CR-LF, need to copy the rest */
646 if (*s == '\r' && s[1] == '\n')
657 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
659 const I32 count = FILTER_READ(idx+1, sv, maxlen);
660 if (count > 0 && !maxlen)
667 =for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
669 Creates and initialises a new lexer/parser state object, supplying
670 a context in which to lex and parse from a new source of Perl code.
671 A pointer to the new state object is placed in L</PL_parser>. An entry
672 is made on the save stack so that upon unwinding the new state object
673 will be destroyed and the former value of L</PL_parser> will be restored.
674 Nothing else need be done to clean up the parsing context.
676 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
677 non-null, provides a string (in SV form) containing code to be parsed.
678 A copy of the string is made, so subsequent modification of C<line>
679 does not affect parsing. C<rsfp>, if non-null, provides an input stream
680 from which code will be read to be parsed. If both are non-null, the
681 code in C<line> comes first and must consist of complete lines of input,
682 and C<rsfp> supplies the remainder of the source.
684 The C<flags> parameter is reserved for future use. Currently it is only
685 used by perl internally, so extensions should always pass zero.
690 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
691 can share filters with the current parser.
692 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
693 caller, hence isn't owned by the parser, so shouldn't be closed on parser
694 destruction. This is used to handle the case of defaulting to reading the
695 script from the standard input because no filename was given on the command
696 line (without getting confused by situation where STDIN has been closed, so
697 the script handle is opened on fd 0) */
700 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
702 const char *s = NULL;
703 yy_parser *parser, *oparser;
704 if (flags && flags & ~LEX_START_FLAGS)
705 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
707 /* create and initialise a parser */
709 Newxz(parser, 1, yy_parser);
710 parser->old_parser = oparser = PL_parser;
713 parser->stack = NULL;
714 parser->stack_max1 = NULL;
717 /* on scope exit, free this parser and restore any outer one */
719 parser->saved_curcop = PL_curcop;
721 /* initialise lexer state */
723 parser->nexttoke = 0;
724 parser->error_count = oparser ? oparser->error_count : 0;
725 parser->copline = parser->preambling = NOLINE;
726 parser->lex_state = LEX_NORMAL;
727 parser->expect = XSTATE;
729 parser->rsfp_filters =
730 !(flags & LEX_START_SAME_FILTER) || !oparser
732 : MUTABLE_AV(SvREFCNT_inc(
733 oparser->rsfp_filters
734 ? oparser->rsfp_filters
735 : (oparser->rsfp_filters = newAV())
738 Newx(parser->lex_brackstack, 120, char);
739 Newx(parser->lex_casestack, 12, char);
740 *parser->lex_casestack = '\0';
741 Newxz(parser->lex_shared, 1, LEXSHARED);
745 s = SvPV_const(line, len);
746 parser->linestr = flags & LEX_START_COPIED
747 ? SvREFCNT_inc_simple_NN(line)
748 : newSVpvn_flags(s, len, SvUTF8(line));
750 sv_catpvs(parser->linestr, "\n;");
752 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
754 parser->oldoldbufptr =
757 parser->linestart = SvPVX(parser->linestr);
758 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
759 parser->last_lop = parser->last_uni = NULL;
761 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
762 |LEX_DONT_CLOSE_RSFP));
763 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
764 |LEX_DONT_CLOSE_RSFP));
766 parser->in_pod = parser->filtered = 0;
770 /* delete a parser object */
773 Perl_parser_free(pTHX_ const yy_parser *parser)
775 PERL_ARGS_ASSERT_PARSER_FREE;
777 PL_curcop = parser->saved_curcop;
778 SvREFCNT_dec(parser->linestr);
780 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
781 PerlIO_clearerr(parser->rsfp);
782 else if (parser->rsfp && (!parser->old_parser
783 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
784 PerlIO_close(parser->rsfp);
785 SvREFCNT_dec(parser->rsfp_filters);
786 SvREFCNT_dec(parser->lex_stuff);
787 SvREFCNT_dec(parser->lex_sub_repl);
789 Safefree(parser->lex_brackstack);
790 Safefree(parser->lex_casestack);
791 Safefree(parser->lex_shared);
792 PL_parser = parser->old_parser;
797 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
799 I32 nexttoke = parser->nexttoke;
800 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
802 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
803 && parser->nextval[nexttoke].opval
804 && parser->nextval[nexttoke].opval->op_slabbed
805 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
806 op_free(parser->nextval[nexttoke].opval);
807 parser->nextval[nexttoke].opval = NULL;
814 =for apidoc AmxU|SV *|PL_parser-E<gt>linestr
816 Buffer scalar containing the chunk currently under consideration of the
817 text currently being lexed. This is always a plain string scalar (for
818 which C<SvPOK> is true). It is not intended to be used as a scalar by
819 normal scalar means; instead refer to the buffer directly by the pointer
820 variables described below.
822 The lexer maintains various C<char*> pointers to things in the
823 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
824 reallocated, all of these pointers must be updated. Don't attempt to
825 do this manually, but rather use L</lex_grow_linestr> if you need to
826 reallocate the buffer.
828 The content of the text chunk in the buffer is commonly exactly one
829 complete line of input, up to and including a newline terminator,
830 but there are situations where it is otherwise. The octets of the
831 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
832 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
833 flag on this scalar, which may disagree with it.
835 For direct examination of the buffer, the variable
836 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
837 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
838 of these pointers is usually preferable to examination of the scalar
839 through normal scalar means.
841 =for apidoc AmxU|char *|PL_parser-E<gt>bufend
843 Direct pointer to the end of the chunk of text currently being lexed, the
844 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
845 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
846 always located at the end of the buffer, and does not count as part of
847 the buffer's contents.
849 =for apidoc AmxU|char *|PL_parser-E<gt>bufptr
851 Points to the current position of lexing inside the lexer buffer.
852 Characters around this point may be freely examined, within
853 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
854 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
855 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
857 Lexing code (whether in the Perl core or not) moves this pointer past
858 the characters that it consumes. It is also expected to perform some
859 bookkeeping whenever a newline character is consumed. This movement
860 can be more conveniently performed by the function L</lex_read_to>,
861 which handles newlines appropriately.
863 Interpretation of the buffer's octets can be abstracted out by
864 using the slightly higher-level functions L</lex_peek_unichar> and
865 L</lex_read_unichar>.
867 =for apidoc AmxU|char *|PL_parser-E<gt>linestart
869 Points to the start of the current line inside the lexer buffer.
870 This is useful for indicating at which column an error occurred, and
871 not much else. This must be updated by any lexing code that consumes
872 a newline; the function L</lex_read_to> handles this detail.
878 =for apidoc Amx|bool|lex_bufutf8
880 Indicates whether the octets in the lexer buffer
881 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
882 of Unicode characters. If not, they should be interpreted as Latin-1
883 characters. This is analogous to the C<SvUTF8> flag for scalars.
885 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
886 contains valid UTF-8. Lexing code must be robust in the face of invalid
889 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
890 is significant, but not the whole story regarding the input character
891 encoding. Normally, when a file is being read, the scalar contains octets
892 and its C<SvUTF8> flag is off, but the octets should be interpreted as
893 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
894 however, the scalar may have the C<SvUTF8> flag on, and in this case its
895 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
896 is in effect. This logic may change in the future; use this function
897 instead of implementing the logic yourself.
903 Perl_lex_bufutf8(pTHX)
909 =for apidoc Amx|char *|lex_grow_linestr|STRLEN len
911 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
912 at least C<len> octets (including terminating C<NUL>). Returns a
913 pointer to the reallocated buffer. This is necessary before making
914 any direct modification of the buffer that would increase its length.
915 L</lex_stuff_pvn> provides a more convenient way to insert text into
918 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
919 this function updates all of the lexer's variables that point directly
926 Perl_lex_grow_linestr(pTHX_ STRLEN len)
930 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
931 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
934 linestr = PL_parser->linestr;
935 buf = SvPVX(linestr);
936 if (len <= SvLEN(linestr))
939 /* Is the lex_shared linestr SV the same as the current linestr SV?
940 * Only in this case does re_eval_start need adjusting, since it
941 * points within lex_shared->ls_linestr's buffer */
942 current = ( !PL_parser->lex_shared->ls_linestr
943 || linestr == PL_parser->lex_shared->ls_linestr);
945 bufend_pos = PL_parser->bufend - buf;
946 bufptr_pos = PL_parser->bufptr - buf;
947 oldbufptr_pos = PL_parser->oldbufptr - buf;
948 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
949 linestart_pos = PL_parser->linestart - buf;
950 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
951 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
952 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
953 PL_parser->lex_shared->re_eval_start - buf : 0;
955 buf = sv_grow(linestr, len);
957 PL_parser->bufend = buf + bufend_pos;
958 PL_parser->bufptr = buf + bufptr_pos;
959 PL_parser->oldbufptr = buf + oldbufptr_pos;
960 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
961 PL_parser->linestart = buf + linestart_pos;
962 if (PL_parser->last_uni)
963 PL_parser->last_uni = buf + last_uni_pos;
964 if (PL_parser->last_lop)
965 PL_parser->last_lop = buf + last_lop_pos;
966 if (current && PL_parser->lex_shared->re_eval_start)
967 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
972 =for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
974 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
975 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
976 reallocating the buffer if necessary. This means that lexing code that
977 runs later will see the characters as if they had appeared in the input.
978 It is not recommended to do this as part of normal parsing, and most
979 uses of this facility run the risk of the inserted characters being
980 interpreted in an unintended manner.
982 The string to be inserted is represented by C<len> octets starting
983 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
984 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
985 The characters are recoded for the lexer buffer, according to how the
986 buffer is currently being interpreted (L</lex_bufutf8>). If a string
987 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
988 function is more convenient.
994 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
998 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
999 if (flags & ~(LEX_STUFF_UTF8))
1000 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1002 if (flags & LEX_STUFF_UTF8) {
1005 STRLEN highhalf = 0; /* Count of variants */
1006 const char *p, *e = pv+len;
1007 for (p = pv; p != e; p++) {
1008 if (! UTF8_IS_INVARIANT(*p)) {
1014 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1015 bufptr = PL_parser->bufptr;
1016 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1017 SvCUR_set(PL_parser->linestr,
1018 SvCUR(PL_parser->linestr) + len+highhalf);
1019 PL_parser->bufend += len+highhalf;
1020 for (p = pv; p != e; p++) {
1022 if (! UTF8_IS_INVARIANT(c)) {
1023 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1024 *bufptr++ = UTF8_TWO_BYTE_LO(c);
1026 *bufptr++ = (char)c;
1031 if (flags & LEX_STUFF_UTF8) {
1032 STRLEN highhalf = 0;
1033 const char *p, *e = pv+len;
1034 for (p = pv; p != e; p++) {
1036 if (UTF8_IS_ABOVE_LATIN1(c)) {
1037 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1038 "non-Latin-1 character into Latin-1 input");
1039 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1042 } else if (! UTF8_IS_INVARIANT(c)) {
1043 _force_out_malformed_utf8_message((U8 *) p, (U8 *) e,
1045 1 /* 1 means die */ );
1046 NOT_REACHED; /* NOTREACHED */
1051 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1052 bufptr = PL_parser->bufptr;
1053 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1054 SvCUR_set(PL_parser->linestr,
1055 SvCUR(PL_parser->linestr) + len-highhalf);
1056 PL_parser->bufend += len-highhalf;
1059 if (UTF8_IS_INVARIANT(*p)) {
1065 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1071 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1072 bufptr = PL_parser->bufptr;
1073 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1074 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1075 PL_parser->bufend += len;
1076 Copy(pv, bufptr, len, char);
1082 =for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1084 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1085 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1086 reallocating the buffer if necessary. This means that lexing code that
1087 runs later will see the characters as if they had appeared in the input.
1088 It is not recommended to do this as part of normal parsing, and most
1089 uses of this facility run the risk of the inserted characters being
1090 interpreted in an unintended manner.
1092 The string to be inserted is represented by octets starting at C<pv>
1093 and continuing to the first nul. These octets are interpreted as either
1094 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1095 in C<flags>. The characters are recoded for the lexer buffer, according
1096 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1097 If it is not convenient to nul-terminate a string to be inserted, the
1098 L</lex_stuff_pvn> function is more appropriate.
1104 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1106 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1107 lex_stuff_pvn(pv, strlen(pv), flags);
1111 =for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1113 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1114 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1115 reallocating the buffer if necessary. This means that lexing code that
1116 runs later will see the characters as if they had appeared in the input.
1117 It is not recommended to do this as part of normal parsing, and most
1118 uses of this facility run the risk of the inserted characters being
1119 interpreted in an unintended manner.
1121 The string to be inserted is the string value of C<sv>. The characters
1122 are recoded for the lexer buffer, according to how the buffer is currently
1123 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1124 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1125 need to construct a scalar.
1131 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1135 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1137 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1139 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1143 =for apidoc Amx|void|lex_unstuff|char *ptr
1145 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1146 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1147 This hides the discarded text from any lexing code that runs later,
1148 as if the text had never appeared.
1150 This is not the normal way to consume lexed text. For that, use
1157 Perl_lex_unstuff(pTHX_ char *ptr)
1161 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1162 buf = PL_parser->bufptr;
1164 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1167 bufend = PL_parser->bufend;
1169 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1170 unstuff_len = ptr - buf;
1171 Move(ptr, buf, bufend+1-ptr, char);
1172 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1173 PL_parser->bufend = bufend - unstuff_len;
1177 =for apidoc Amx|void|lex_read_to|char *ptr
1179 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1180 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1181 performing the correct bookkeeping whenever a newline character is passed.
1182 This is the normal way to consume lexed text.
1184 Interpretation of the buffer's octets can be abstracted out by
1185 using the slightly higher-level functions L</lex_peek_unichar> and
1186 L</lex_read_unichar>.
1192 Perl_lex_read_to(pTHX_ char *ptr)
1195 PERL_ARGS_ASSERT_LEX_READ_TO;
1196 s = PL_parser->bufptr;
1197 if (ptr < s || ptr > PL_parser->bufend)
1198 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1199 for (; s != ptr; s++)
1201 COPLINE_INC_WITH_HERELINES;
1202 PL_parser->linestart = s+1;
1204 PL_parser->bufptr = ptr;
1208 =for apidoc Amx|void|lex_discard_to|char *ptr
1210 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1211 up to C<ptr>. The remaining content of the buffer will be moved, and
1212 all pointers into the buffer updated appropriately. C<ptr> must not
1213 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1214 it is not permitted to discard text that has yet to be lexed.
1216 Normally it is not necessarily to do this directly, because it suffices to
1217 use the implicit discarding behaviour of L</lex_next_chunk> and things
1218 based on it. However, if a token stretches across multiple lines,
1219 and the lexing code has kept multiple lines of text in the buffer for
1220 that purpose, then after completion of the token it would be wise to
1221 explicitly discard the now-unneeded earlier lines, to avoid future
1222 multi-line tokens growing the buffer without bound.
1228 Perl_lex_discard_to(pTHX_ char *ptr)
1232 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1233 buf = SvPVX(PL_parser->linestr);
1235 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1238 if (ptr > PL_parser->bufptr)
1239 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1240 discard_len = ptr - buf;
1241 if (PL_parser->oldbufptr < ptr)
1242 PL_parser->oldbufptr = ptr;
1243 if (PL_parser->oldoldbufptr < ptr)
1244 PL_parser->oldoldbufptr = ptr;
1245 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1246 PL_parser->last_uni = NULL;
1247 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1248 PL_parser->last_lop = NULL;
1249 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1250 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1251 PL_parser->bufend -= discard_len;
1252 PL_parser->bufptr -= discard_len;
1253 PL_parser->oldbufptr -= discard_len;
1254 PL_parser->oldoldbufptr -= discard_len;
1255 if (PL_parser->last_uni)
1256 PL_parser->last_uni -= discard_len;
1257 if (PL_parser->last_lop)
1258 PL_parser->last_lop -= discard_len;
1262 =for apidoc Amx|bool|lex_next_chunk|U32 flags
1264 Reads in the next chunk of text to be lexed, appending it to
1265 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1266 looked to the end of the current chunk and wants to know more. It is
1267 usual, but not necessary, for lexing to have consumed the entirety of
1268 the current chunk at this time.
1270 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1271 chunk (i.e., the current chunk has been entirely consumed), normally the
1272 current chunk will be discarded at the same time that the new chunk is
1273 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1274 will not be discarded. If the current chunk has not been entirely
1275 consumed, then it will not be discarded regardless of the flag.
1277 Returns true if some new text was added to the buffer, or false if the
1278 buffer has reached the end of the input text.
1283 #define LEX_FAKE_EOF 0x80000000
1284 #define LEX_NO_TERM 0x40000000 /* here-doc */
1287 Perl_lex_next_chunk(pTHX_ U32 flags)
1291 STRLEN old_bufend_pos, new_bufend_pos;
1292 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1293 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1294 bool got_some_for_debugger = 0;
1296 const U8* first_bad_char_loc;
1298 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1299 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1300 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1302 linestr = PL_parser->linestr;
1303 buf = SvPVX(linestr);
1304 if (!(flags & LEX_KEEP_PREVIOUS)
1305 && PL_parser->bufptr == PL_parser->bufend)
1307 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1309 if (PL_parser->last_uni != PL_parser->bufend)
1310 PL_parser->last_uni = NULL;
1311 if (PL_parser->last_lop != PL_parser->bufend)
1312 PL_parser->last_lop = NULL;
1313 last_uni_pos = last_lop_pos = 0;
1317 old_bufend_pos = PL_parser->bufend - buf;
1318 bufptr_pos = PL_parser->bufptr - buf;
1319 oldbufptr_pos = PL_parser->oldbufptr - buf;
1320 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1321 linestart_pos = PL_parser->linestart - buf;
1322 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1323 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1325 if (flags & LEX_FAKE_EOF) {
1327 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1329 } else if (filter_gets(linestr, old_bufend_pos)) {
1331 got_some_for_debugger = 1;
1332 } else if (flags & LEX_NO_TERM) {
1335 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1338 /* End of real input. Close filehandle (unless it was STDIN),
1339 * then add implicit termination.
1341 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1342 PerlIO_clearerr(PL_parser->rsfp);
1343 else if (PL_parser->rsfp)
1344 (void)PerlIO_close(PL_parser->rsfp);
1345 PL_parser->rsfp = NULL;
1346 PL_parser->in_pod = PL_parser->filtered = 0;
1347 if (!PL_in_eval && PL_minus_p) {
1349 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1350 PL_minus_n = PL_minus_p = 0;
1351 } else if (!PL_in_eval && PL_minus_n) {
1352 sv_catpvs(linestr, /*{*/";}");
1355 sv_catpvs(linestr, ";");
1358 buf = SvPVX(linestr);
1359 new_bufend_pos = SvCUR(linestr);
1360 PL_parser->bufend = buf + new_bufend_pos;
1361 PL_parser->bufptr = buf + bufptr_pos;
1363 if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr,
1364 PL_parser->bufend - PL_parser->bufptr,
1365 &first_bad_char_loc))
1367 _force_out_malformed_utf8_message(first_bad_char_loc,
1368 (U8 *) PL_parser->bufend,
1370 1 /* 1 means die */ );
1371 NOT_REACHED; /* NOTREACHED */
1374 PL_parser->oldbufptr = buf + oldbufptr_pos;
1375 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1376 PL_parser->linestart = buf + linestart_pos;
1377 if (PL_parser->last_uni)
1378 PL_parser->last_uni = buf + last_uni_pos;
1379 if (PL_parser->last_lop)
1380 PL_parser->last_lop = buf + last_lop_pos;
1381 if (PL_parser->preambling != NOLINE) {
1382 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1383 PL_parser->preambling = NOLINE;
1385 if ( got_some_for_debugger
1386 && PERLDB_LINE_OR_SAVESRC
1387 && PL_curstash != PL_debstash)
1389 /* debugger active and we're not compiling the debugger code,
1390 * so store the line into the debugger's array of lines
1392 update_debugger_info(NULL, buf+old_bufend_pos,
1393 new_bufend_pos-old_bufend_pos);
1399 =for apidoc Amx|I32|lex_peek_unichar|U32 flags
1401 Looks ahead one (Unicode) character in the text currently being lexed.
1402 Returns the codepoint (unsigned integer value) of the next character,
1403 or -1 if lexing has reached the end of the input text. To consume the
1404 peeked character, use L</lex_read_unichar>.
1406 If the next character is in (or extends into) the next chunk of input
1407 text, the next chunk will be read in. Normally the current chunk will be
1408 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1409 bit set, then the current chunk will not be discarded.
1411 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1412 is encountered, an exception is generated.
1418 Perl_lex_peek_unichar(pTHX_ U32 flags)
1422 if (flags & ~(LEX_KEEP_PREVIOUS))
1423 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1424 s = PL_parser->bufptr;
1425 bufend = PL_parser->bufend;
1431 if (!lex_next_chunk(flags))
1433 s = PL_parser->bufptr;
1434 bufend = PL_parser->bufend;
1437 if (UTF8_IS_INVARIANT(head))
1439 if (UTF8_IS_START(head)) {
1440 len = UTF8SKIP(&head);
1441 while ((STRLEN)(bufend-s) < len) {
1442 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1444 s = PL_parser->bufptr;
1445 bufend = PL_parser->bufend;
1448 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1449 if (retlen == (STRLEN)-1) {
1450 _force_out_malformed_utf8_message((U8 *) s,
1453 1 /* 1 means die */ );
1454 NOT_REACHED; /* NOTREACHED */
1459 if (!lex_next_chunk(flags))
1461 s = PL_parser->bufptr;
1468 =for apidoc Amx|I32|lex_read_unichar|U32 flags
1470 Reads the next (Unicode) character in the text currently being lexed.
1471 Returns the codepoint (unsigned integer value) of the character read,
1472 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1473 if lexing has reached the end of the input text. To non-destructively
1474 examine the next character, use L</lex_peek_unichar> instead.
1476 If the next character is in (or extends into) the next chunk of input
1477 text, the next chunk will be read in. Normally the current chunk will be
1478 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1479 bit set, then the current chunk will not be discarded.
1481 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1482 is encountered, an exception is generated.
1488 Perl_lex_read_unichar(pTHX_ U32 flags)
1491 if (flags & ~(LEX_KEEP_PREVIOUS))
1492 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1493 c = lex_peek_unichar(flags);
1496 COPLINE_INC_WITH_HERELINES;
1498 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1500 ++(PL_parser->bufptr);
1506 =for apidoc Amx|void|lex_read_space|U32 flags
1508 Reads optional spaces, in Perl style, in the text currently being
1509 lexed. The spaces may include ordinary whitespace characters and
1510 Perl-style comments. C<#line> directives are processed if encountered.
1511 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1512 at a non-space character (or the end of the input text).
1514 If spaces extend into the next chunk of input text, the next chunk will
1515 be read in. Normally the current chunk will be discarded at the same
1516 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1517 chunk will not be discarded.
1522 #define LEX_NO_INCLINE 0x40000000
1523 #define LEX_NO_NEXT_CHUNK 0x80000000
1526 Perl_lex_read_space(pTHX_ U32 flags)
1529 const bool can_incline = !(flags & LEX_NO_INCLINE);
1530 bool need_incline = 0;
1531 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1532 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1533 s = PL_parser->bufptr;
1534 bufend = PL_parser->bufend;
1540 } while (!(c == '\n' || (c == 0 && s == bufend)));
1541 } else if (c == '\n') {
1544 PL_parser->linestart = s;
1550 } else if (isSPACE(c)) {
1552 } else if (c == 0 && s == bufend) {
1555 if (flags & LEX_NO_NEXT_CHUNK)
1557 PL_parser->bufptr = s;
1558 l = CopLINE(PL_curcop);
1559 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1560 got_more = lex_next_chunk(flags);
1561 CopLINE_set(PL_curcop, l);
1562 s = PL_parser->bufptr;
1563 bufend = PL_parser->bufend;
1566 if (can_incline && need_incline && PL_parser->rsfp) {
1576 PL_parser->bufptr = s;
1581 =for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1583 This function performs syntax checking on a prototype, C<proto>.
1584 If C<warn> is true, any illegal characters or mismatched brackets
1585 will trigger illegalproto warnings, declaring that they were
1586 detected in the prototype for C<name>.
1588 The return value is C<true> if this is a valid prototype, and
1589 C<false> if it is not, regardless of whether C<warn> was C<true> or
1592 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1599 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1601 STRLEN len, origlen;
1603 bool bad_proto = FALSE;
1604 bool in_brackets = FALSE;
1605 bool after_slash = FALSE;
1606 char greedy_proto = ' ';
1607 bool proto_after_greedy_proto = FALSE;
1608 bool must_be_last = FALSE;
1609 bool underscore = FALSE;
1610 bool bad_proto_after_underscore = FALSE;
1612 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1617 p = SvPV(proto, len);
1619 for (; len--; p++) {
1622 proto_after_greedy_proto = TRUE;
1624 if (!strchr(";@%", *p))
1625 bad_proto_after_underscore = TRUE;
1628 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1635 in_brackets = FALSE;
1636 else if ((*p == '@' || *p == '%')
1640 must_be_last = TRUE;
1649 after_slash = FALSE;
1654 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1657 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1658 origlen, UNI_DISPLAY_ISPRINT)
1659 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1661 if (proto_after_greedy_proto)
1662 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1663 "Prototype after '%c' for %" SVf " : %s",
1664 greedy_proto, SVfARG(name), p);
1666 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1667 "Missing ']' in prototype for %" SVf " : %s",
1670 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1671 "Illegal character in prototype for %" SVf " : %s",
1673 if (bad_proto_after_underscore)
1674 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1675 "Illegal character after '_' in prototype for %" SVf " : %s",
1679 return (! (proto_after_greedy_proto || bad_proto) );
1684 * This subroutine has nothing to do with tilting, whether at windmills
1685 * or pinball tables. Its name is short for "increment line". It
1686 * increments the current line number in CopLINE(PL_curcop) and checks
1687 * to see whether the line starts with a comment of the form
1688 * # line 500 "foo.pm"
1689 * If so, it sets the current line number and file to the values in the comment.
1693 S_incline(pTHX_ const char *s)
1701 PERL_ARGS_ASSERT_INCLINE;
1703 COPLINE_INC_WITH_HERELINES;
1704 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1705 && s+1 == PL_bufend && *s == ';') {
1706 /* fake newline in string eval */
1707 CopLINE_dec(PL_curcop);
1712 while (SPACE_OR_TAB(*s))
1714 if (strEQs(s, "line"))
1718 if (SPACE_OR_TAB(*s))
1722 while (SPACE_OR_TAB(*s))
1730 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1732 while (SPACE_OR_TAB(*s))
1734 if (*s == '"' && (t = strchr(s+1, '"'))) {
1740 while (*t && !isSPACE(*t))
1744 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1746 if (*e != '\n' && *e != '\0')
1747 return; /* false alarm */
1749 if (!grok_atoUV(n, &uv, &e))
1751 line_num = ((line_t)uv) - 1;
1754 const STRLEN len = t - s;
1756 if (!PL_rsfp && !PL_parser->filtered) {
1757 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1758 * to *{"::_<newfilename"} */
1759 /* However, the long form of evals is only turned on by the
1760 debugger - usually they're "(eval %lu)" */
1761 GV * const cfgv = CopFILEGV(PL_curcop);
1764 STRLEN tmplen2 = len;
1768 if (tmplen2 + 2 <= sizeof smallbuf)
1771 Newx(tmpbuf2, tmplen2 + 2, char);
1776 memcpy(tmpbuf2 + 2, s, tmplen2);
1779 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1781 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1782 /* adjust ${"::_<newfilename"} to store the new file name */
1783 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1784 /* The line number may differ. If that is the case,
1785 alias the saved lines that are in the array.
1786 Otherwise alias the whole array. */
1787 if (CopLINE(PL_curcop) == line_num) {
1788 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1789 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1791 else if (GvAV(cfgv)) {
1792 AV * const av = GvAV(cfgv);
1793 const I32 start = CopLINE(PL_curcop)+1;
1794 I32 items = AvFILLp(av) - start;
1796 AV * const av2 = GvAVn(gv2);
1797 SV **svp = AvARRAY(av) + start;
1798 I32 l = (I32)line_num+1;
1800 av_store(av2, l++, SvREFCNT_inc(*svp++));
1805 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1808 CopFILE_free(PL_curcop);
1809 CopFILE_setn(PL_curcop, s, len);
1811 CopLINE_set(PL_curcop, line_num);
1815 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1817 AV *av = CopFILEAVx(PL_curcop);
1820 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1822 sv = *av_fetch(av, 0, 1);
1823 SvUPGRADE(sv, SVt_PVMG);
1825 if (!SvPOK(sv)) SvPVCLEAR(sv);
1827 sv_catsv(sv, orig_sv);
1829 sv_catpvn(sv, buf, len);
1834 if (PL_parser->preambling == NOLINE)
1835 av_store(av, CopLINE(PL_curcop), sv);
1841 * Called to gobble the appropriate amount and type of whitespace.
1842 * Skips comments as well.
1843 * Returns the next character after the whitespace that is skipped.
1846 * Same thing, but look ahead without incrementing line numbers or
1847 * adjusting PL_linestart.
1850 #define skipspace(s) skipspace_flags(s, 0)
1851 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1854 S_skipspace_flags(pTHX_ char *s, U32 flags)
1856 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1857 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1858 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1861 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1863 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1864 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1865 LEX_NO_NEXT_CHUNK : 0));
1867 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1868 if (PL_linestart > PL_bufptr)
1869 PL_bufptr = PL_linestart;
1877 * Check the unary operators to ensure there's no ambiguity in how they're
1878 * used. An ambiguous piece of code would be:
1880 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1881 * the +5 is its argument.
1890 if (PL_oldoldbufptr != PL_last_uni)
1892 while (isSPACE(*PL_last_uni))
1895 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
1896 s += UTF ? UTF8SKIP(s) : 1;
1897 if ((t = strchr(s, '(')) && t < PL_bufptr)
1900 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1901 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
1902 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1906 * LOP : macro to build a list operator. Its behaviour has been replaced
1907 * with a subroutine, S_lop() for which LOP is just another name.
1910 #define LOP(f,x) return lop(f,x,s)
1914 * Build a list operator (or something that might be one). The rules:
1915 * - if we have a next token, then it's a list operator (no parens) for
1916 * which the next token has already been parsed; e.g.,
1919 * - if the next thing is an opening paren, then it's a function
1920 * - else it's a list operator
1924 S_lop(pTHX_ I32 f, U8 x, char *s)
1926 PERL_ARGS_ASSERT_LOP;
1931 PL_last_lop = PL_oldbufptr;
1932 PL_last_lop_op = (OPCODE)f;
1937 return REPORT(FUNC);
1940 return REPORT(FUNC);
1943 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1944 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
1945 return REPORT(LSTOP);
1951 * When the lexer realizes it knows the next token (for instance,
1952 * it is reordering tokens for the parser) then it can call S_force_next
1953 * to know what token to return the next time the lexer is called. Caller
1954 * will need to set PL_nextval[] and possibly PL_expect to ensure
1955 * the lexer handles the token correctly.
1959 S_force_next(pTHX_ I32 type)
1963 PerlIO_printf(Perl_debug_log, "### forced token:\n");
1964 tokereport(type, &NEXTVAL_NEXTTOKE);
1967 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
1968 PL_nexttype[PL_nexttoke] = type;
1975 * This subroutine handles postfix deref syntax after the arrow has already
1976 * been emitted. @* $* etc. are emitted as two separate token right here.
1977 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1978 * only the first, leaving yylex to find the next.
1982 S_postderef(pTHX_ int const funny, char const next)
1984 assert(funny == DOLSHARP || strchr("$@%&*", funny));
1986 PL_expect = XOPERATOR;
1987 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
1988 assert('@' == funny || '$' == funny || DOLSHARP == funny);
1989 PL_lex_state = LEX_INTERPEND;
1991 force_next(POSTJOIN);
1997 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1998 && !PL_lex_brackets)
2000 PL_expect = XOPERATOR;
2009 int yyc = PL_parser->yychar;
2010 if (yyc != YYEMPTY) {
2012 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2013 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
2014 PL_lex_allbrackets--;
2016 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2017 } else if (yyc == '('/*)*/) {
2018 PL_lex_allbrackets--;
2023 PL_parser->yychar = YYEMPTY;
2028 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2030 SV * const sv = newSVpvn_utf8(start, len,
2033 && !is_utf8_invariant_string((const U8*)start, len)
2034 && is_utf8_string((const U8*)start, len));
2040 * When the lexer knows the next thing is a word (for instance, it has
2041 * just seen -> and it knows that the next char is a word char, then
2042 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2046 * char *start : buffer position (must be within PL_linestr)
2047 * int token : PL_next* will be this type of bare word
2048 * (e.g., METHOD,BAREWORD)
2049 * int check_keyword : if true, Perl checks to make sure the word isn't
2050 * a keyword (do this if the word is a label, e.g. goto FOO)
2051 * int allow_pack : if true, : characters will also be allowed (require,
2052 * use, etc. do this)
2056 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2061 PERL_ARGS_ASSERT_FORCE_WORD;
2063 start = skipspace(start);
2065 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2066 || (allow_pack && *s == ':' && s[1] == ':') )
2068 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2069 if (check_keyword) {
2070 char *s2 = PL_tokenbuf;
2072 if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
2074 if (keyword(s2, len2, 0))
2077 if (token == METHOD) {
2082 PL_expect = XOPERATOR;
2085 NEXTVAL_NEXTTOKE.opval
2086 = newSVOP(OP_CONST,0,
2087 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2088 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2096 * Called when the lexer wants $foo *foo &foo etc, but the program
2097 * text only contains the "foo" portion. The first argument is a pointer
2098 * to the "foo", and the second argument is the type symbol to prefix.
2099 * Forces the next token to be a "BAREWORD".
2100 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2104 S_force_ident(pTHX_ const char *s, int kind)
2106 PERL_ARGS_ASSERT_FORCE_IDENT;
2109 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2110 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2111 UTF ? SVf_UTF8 : 0));
2112 NEXTVAL_NEXTTOKE.opval = o;
2113 force_next(BAREWORD);
2115 o->op_private = OPpCONST_ENTERED;
2116 /* XXX see note in pp_entereval() for why we forgo typo
2117 warnings if the symbol must be introduced in an eval.
2119 gv_fetchpvn_flags(s, len,
2120 (PL_in_eval ? GV_ADDMULTI
2121 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2122 kind == '$' ? SVt_PV :
2123 kind == '@' ? SVt_PVAV :
2124 kind == '%' ? SVt_PVHV :
2132 S_force_ident_maybe_lex(pTHX_ char pit)
2134 NEXTVAL_NEXTTOKE.ival = pit;
2139 Perl_str_to_version(pTHX_ SV *sv)
2144 const char *start = SvPV_const(sv,len);
2145 const char * const end = start + len;
2146 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
2148 PERL_ARGS_ASSERT_STR_TO_VERSION;
2150 while (start < end) {
2154 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2159 retval += ((NV)n)/nshift;
2168 * Forces the next token to be a version number.
2169 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2170 * and if "guessing" is TRUE, then no new token is created (and the caller
2171 * must use an alternative parsing method).
2175 S_force_version(pTHX_ char *s, int guessing)
2180 PERL_ARGS_ASSERT_FORCE_VERSION;
2188 while (isDIGIT(*d) || *d == '_' || *d == '.')
2190 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2192 s = scan_num(s, &pl_yylval);
2193 version = pl_yylval.opval;
2194 ver = cSVOPx(version)->op_sv;
2195 if (SvPOK(ver) && !SvNIOK(ver)) {
2196 SvUPGRADE(ver, SVt_PVNV);
2197 SvNV_set(ver, str_to_version(ver));
2198 SvNOK_on(ver); /* hint that it is a version */
2201 else if (guessing) {
2206 /* NOTE: The parser sees the package name and the VERSION swapped */
2207 NEXTVAL_NEXTTOKE.opval = version;
2208 force_next(BAREWORD);
2214 * S_force_strict_version
2215 * Forces the next token to be a version number using strict syntax rules.
2219 S_force_strict_version(pTHX_ char *s)
2222 const char *errstr = NULL;
2224 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2226 while (isSPACE(*s)) /* leading whitespace */
2229 if (is_STRICT_VERSION(s,&errstr)) {
2231 s = (char *)scan_version(s, ver, 0);
2232 version = newSVOP(OP_CONST, 0, ver);
2234 else if ((*s != ';' && *s != '{' && *s != '}' )
2235 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2239 yyerror(errstr); /* version required */
2243 /* NOTE: The parser sees the package name and the VERSION swapped */
2244 NEXTVAL_NEXTTOKE.opval = version;
2245 force_next(BAREWORD);
2252 * Tokenize a quoted string passed in as an SV. It finds the next
2253 * chunk, up to end of string or a backslash. It may make a new
2254 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2259 S_tokeq(pTHX_ SV *sv)
2266 PERL_ARGS_ASSERT_TOKEQ;
2270 assert (!SvIsCOW(sv));
2271 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2275 /* This is relying on the SV being "well formed" with a trailing '\0' */
2276 while (s < send && !(*s == '\\' && s[1] == '\\'))
2281 if ( PL_hints & HINT_NEW_STRING ) {
2282 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2283 SVs_TEMP | SvUTF8(sv));
2287 if (s + 1 < send && (s[1] == '\\'))
2288 s++; /* all that, just for this */
2293 SvCUR_set(sv, d - SvPVX_const(sv));
2295 if ( PL_hints & HINT_NEW_STRING )
2296 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
2301 * Now come three functions related to double-quote context,
2302 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2303 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2304 * interact with PL_lex_state, and create fake ( ... ) argument lists
2305 * to handle functions and concatenation.
2309 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2314 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2316 * Pattern matching will set PL_lex_op to the pattern-matching op to
2317 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2319 * OP_CONST is easy--just make the new op and return.
2321 * Everything else becomes a FUNC.
2323 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2324 * had an OP_CONST. This just sets us up for a
2325 * call to S_sublex_push().
2329 S_sublex_start(pTHX)
2331 const I32 op_type = pl_yylval.ival;
2333 if (op_type == OP_NULL) {
2334 pl_yylval.opval = PL_lex_op;
2338 if (op_type == OP_CONST) {
2339 SV *sv = PL_lex_stuff;
2340 PL_lex_stuff = NULL;
2343 if (SvTYPE(sv) == SVt_PVIV) {
2344 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2346 const char * const p = SvPV_const(sv, len);
2347 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2351 pl_yylval.opval = newSVOP(op_type, 0, sv);
2355 PL_parser->lex_super_state = PL_lex_state;
2356 PL_parser->lex_sub_inwhat = (U16)op_type;
2357 PL_parser->lex_sub_op = PL_lex_op;
2358 PL_lex_state = LEX_INTERPPUSH;
2362 pl_yylval.opval = PL_lex_op;
2372 * Create a new scope to save the lexing state. The scope will be
2373 * ended in S_sublex_done. Returns a '(', starting the function arguments
2374 * to the uc, lc, etc. found before.
2375 * Sets PL_lex_state to LEX_INTERPCONCAT.
2382 const bool is_heredoc = PL_multi_close == '<';
2385 PL_lex_state = PL_parser->lex_super_state;
2386 SAVEI8(PL_lex_dojoin);
2387 SAVEI32(PL_lex_brackets);
2388 SAVEI32(PL_lex_allbrackets);
2389 SAVEI32(PL_lex_formbrack);
2390 SAVEI8(PL_lex_fakeeof);
2391 SAVEI32(PL_lex_casemods);
2392 SAVEI32(PL_lex_starts);
2393 SAVEI8(PL_lex_state);
2394 SAVESPTR(PL_lex_repl);
2395 SAVEVPTR(PL_lex_inpat);
2396 SAVEI16(PL_lex_inwhat);
2399 SAVECOPLINE(PL_curcop);
2400 SAVEI32(PL_multi_end);
2401 SAVEI32(PL_parser->herelines);
2402 PL_parser->herelines = 0;
2404 SAVEIV(PL_multi_close);
2405 SAVEPPTR(PL_bufptr);
2406 SAVEPPTR(PL_bufend);
2407 SAVEPPTR(PL_oldbufptr);
2408 SAVEPPTR(PL_oldoldbufptr);
2409 SAVEPPTR(PL_last_lop);
2410 SAVEPPTR(PL_last_uni);
2411 SAVEPPTR(PL_linestart);
2412 SAVESPTR(PL_linestr);
2413 SAVEGENERICPV(PL_lex_brackstack);
2414 SAVEGENERICPV(PL_lex_casestack);
2415 SAVEGENERICPV(PL_parser->lex_shared);
2416 SAVEBOOL(PL_parser->lex_re_reparsing);
2417 SAVEI32(PL_copline);
2419 /* The here-doc parser needs to be able to peek into outer lexing
2420 scopes to find the body of the here-doc. So we put PL_linestr and
2421 PL_bufptr into lex_shared, to ‘share’ those values.
2423 PL_parser->lex_shared->ls_linestr = PL_linestr;
2424 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2426 PL_linestr = PL_lex_stuff;
2427 PL_lex_repl = PL_parser->lex_sub_repl;
2428 PL_lex_stuff = NULL;
2429 PL_parser->lex_sub_repl = NULL;
2431 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2432 set for an inner quote-like operator and then an error causes scope-
2433 popping. We must not have a PL_lex_stuff value left dangling, as
2434 that breaks assumptions elsewhere. See bug #123617. */
2435 SAVEGENERICSV(PL_lex_stuff);
2436 SAVEGENERICSV(PL_parser->lex_sub_repl);
2438 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2439 = SvPVX(PL_linestr);
2440 PL_bufend += SvCUR(PL_linestr);
2441 PL_last_lop = PL_last_uni = NULL;
2442 SAVEFREESV(PL_linestr);
2443 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2445 PL_lex_dojoin = FALSE;
2446 PL_lex_brackets = PL_lex_formbrack = 0;
2447 PL_lex_allbrackets = 0;
2448 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2449 Newx(PL_lex_brackstack, 120, char);
2450 Newx(PL_lex_casestack, 12, char);
2451 PL_lex_casemods = 0;
2452 *PL_lex_casestack = '\0';
2454 PL_lex_state = LEX_INTERPCONCAT;
2456 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2457 PL_copline = NOLINE;
2459 Newxz(shared, 1, LEXSHARED);
2460 shared->ls_prev = PL_parser->lex_shared;
2461 PL_parser->lex_shared = shared;
2463 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2464 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2465 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2466 PL_lex_inpat = PL_parser->lex_sub_op;
2468 PL_lex_inpat = NULL;
2470 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2471 PL_in_eval &= ~EVAL_RE_REPARSING;
2478 * Restores lexer state after a S_sublex_push.
2484 if (!PL_lex_starts++) {
2485 SV * const sv = newSVpvs("");
2486 if (SvUTF8(PL_linestr))
2488 PL_expect = XOPERATOR;
2489 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2493 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2494 PL_lex_state = LEX_INTERPCASEMOD;
2498 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2499 assert(PL_lex_inwhat != OP_TRANSR);
2501 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2502 PL_linestr = PL_lex_repl;
2504 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2505 PL_bufend += SvCUR(PL_linestr);
2506 PL_last_lop = PL_last_uni = NULL;
2507 PL_lex_dojoin = FALSE;
2508 PL_lex_brackets = 0;
2509 PL_lex_allbrackets = 0;
2510 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2511 PL_lex_casemods = 0;
2512 *PL_lex_casestack = '\0';
2514 if (SvEVALED(PL_lex_repl)) {
2515 PL_lex_state = LEX_INTERPNORMAL;
2517 /* we don't clear PL_lex_repl here, so that we can check later
2518 whether this is an evalled subst; that means we rely on the
2519 logic to ensure sublex_done() is called again only via the
2520 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2523 PL_lex_state = LEX_INTERPCONCAT;
2526 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2527 CopLINE(PL_curcop) +=
2528 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2529 + PL_parser->herelines;
2530 PL_parser->herelines = 0;
2535 const line_t l = CopLINE(PL_curcop);
2537 if (PL_multi_close == '<')
2538 PL_parser->herelines += l - PL_multi_end;
2539 PL_bufend = SvPVX(PL_linestr);
2540 PL_bufend += SvCUR(PL_linestr);
2541 PL_expect = XOPERATOR;
2547 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2549 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2550 * interior, hence to the "}". Finds what the name resolves to, returning
2551 * an SV* containing it; NULL if no valid one found */
2553 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
2560 const U8* first_bad_char_loc;
2561 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2563 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2566 deprecate_fatal_in("5.28", "Unknown charname '' is deprecated");
2570 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2572 &first_bad_char_loc))
2574 _force_out_malformed_utf8_message(first_bad_char_loc,
2575 (U8 *) PL_parser->bufend,
2577 0 /* 0 means don't die */ );
2578 yyerror_pv(Perl_form(aTHX_
2579 "Malformed UTF-8 character immediately after '%.*s'",
2580 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
2585 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2586 /* include the <}> */
2587 e - backslash_ptr + 1);
2589 SvREFCNT_dec_NN(res);
2593 /* See if the charnames handler is the Perl core's, and if so, we can skip
2594 * the validation needed for a user-supplied one, as Perl's does its own
2596 table = GvHV(PL_hintgv); /* ^H */
2597 cvp = hv_fetchs(table, "charnames", FALSE);
2598 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2599 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2601 const char * const name = HvNAME(stash);
2602 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2603 && strEQ(name, "_charnames")) {
2608 /* Here, it isn't Perl's charname handler. We can't rely on a
2609 * user-supplied handler to validate the input name. For non-ut8 input,
2610 * look to see that the first character is legal. Then loop through the
2611 * rest checking that each is a continuation */
2613 /* This code makes the reasonable assumption that the only Latin1-range
2614 * characters that begin a character name alias are alphabetic, otherwise
2615 * would have to create a isCHARNAME_BEGIN macro */
2618 if (! isALPHAU(*s)) {
2623 if (! isCHARNAME_CONT(*s)) {
2626 if (*s == ' ' && *(s-1) == ' ') {
2633 /* Similarly for utf8. For invariants can check directly; for other
2634 * Latin1, can calculate their code point and check; otherwise use a
2636 if (UTF8_IS_INVARIANT(*s)) {
2637 if (! isALPHAU(*s)) {
2641 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2642 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2648 if (! PL_utf8_charname_begin) {
2649 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2650 PL_utf8_charname_begin = _core_swash_init("utf8",
2651 "_Perl_Charname_Begin",
2653 1, 0, NULL, &flags);
2655 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2662 if (UTF8_IS_INVARIANT(*s)) {
2663 if (! isCHARNAME_CONT(*s)) {
2666 if (*s == ' ' && *(s-1) == ' ') {
2671 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2672 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2679 if (! PL_utf8_charname_continue) {
2680 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2681 PL_utf8_charname_continue = _core_swash_init("utf8",
2682 "_Perl_Charname_Continue",
2684 1, 0, NULL, &flags);
2686 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2693 if (*(s-1) == ' ') {
2696 "charnames alias definitions may not contain trailing "
2697 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2698 (int)(s - backslash_ptr + 1), backslash_ptr,
2699 (int)(e - s + 1), s + 1
2701 UTF ? SVf_UTF8 : 0);
2705 if (SvUTF8(res)) { /* Don't accept malformed input */
2706 const U8* first_bad_char_loc;
2708 const char* const str = SvPV_const(res, len);
2709 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2710 _force_out_malformed_utf8_message(first_bad_char_loc,
2711 (U8 *) PL_parser->bufend,
2713 0 /* 0 means don't die */ );
2716 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2717 (int) (e - backslash_ptr + 1), backslash_ptr,
2718 (int) ((char *) first_bad_char_loc - str), str
2729 /* The final %.*s makes sure that should the trailing NUL be missing
2730 * that this print won't run off the end of the string */
2733 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2734 (int)(s - backslash_ptr + 1), backslash_ptr,
2735 (int)(e - s + 1), s + 1
2737 UTF ? SVf_UTF8 : 0);
2744 "charnames alias definitions may not contain a sequence of "
2745 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2746 (int)(s - backslash_ptr + 1), backslash_ptr,
2747 (int)(e - s + 1), s + 1
2749 UTF ? SVf_UTF8 : 0);
2756 Extracts the next constant part of a pattern, double-quoted string,
2757 or transliteration. This is terrifying code.
2759 For example, in parsing the double-quoted string "ab\x63$d", it would
2760 stop at the '$' and return an OP_CONST containing 'abc'.
2762 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2763 processing a pattern (PL_lex_inpat is true), a transliteration
2764 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2766 Returns a pointer to the character scanned up to. If this is
2767 advanced from the start pointer supplied (i.e. if anything was
2768 successfully parsed), will leave an OP_CONST for the substring scanned
2769 in pl_yylval. Caller must intuit reason for not parsing further
2770 by looking at the next characters herself.
2774 \N{FOO} => \N{U+hex_for_character_FOO}
2775 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2778 all other \-char, including \N and \N{ apart from \N{ABC}
2781 @ and $ where it appears to be a var, but not for $ as tail anchor
2785 In transliterations:
2786 characters are VERY literal, except for - not at the start or end
2787 of the string, which indicates a range. However some backslash sequences
2788 are recognized: \r, \n, and the like
2789 \007 \o{}, \x{}, \N{}
2790 If all elements in the transliteration are below 256,
2791 scan_const expands the range to the full set of intermediate
2792 characters. If the range is in utf8, the hyphen is replaced with
2793 a certain range mark which will be handled by pmtrans() in op.c.
2795 In double-quoted strings:
2797 all those recognized in transliterations
2798 deprecated backrefs: \1 (in substitution replacements)
2799 case and quoting: \U \Q \E
2802 scan_const does *not* construct ops to handle interpolated strings.
2803 It stops processing as soon as it finds an embedded $ or @ variable
2804 and leaves it to the caller to work out what's going on.
2806 embedded arrays (whether in pattern or not) could be:
2807 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2809 $ in double-quoted strings must be the symbol of an embedded scalar.
2811 $ in pattern could be $foo or could be tail anchor. Assumption:
2812 it's a tail anchor if $ is the last thing in the string, or if it's
2813 followed by one of "()| \r\n\t"
2815 \1 (backreferences) are turned into $1 in substitutions
2817 The structure of the code is
2818 while (there's a character to process) {
2819 handle transliteration ranges
2820 skip regexp comments /(?#comment)/ and codes /(?{code})/
2821 skip #-initiated comments in //x patterns
2822 check for embedded arrays
2823 check for embedded scalars
2825 deprecate \1 in substitution replacements
2826 handle string-changing backslashes \l \U \Q \E, etc.
2827 switch (what was escaped) {
2828 handle \- in a transliteration (becomes a literal -)
2829 if a pattern and not \N{, go treat as regular character
2830 handle \132 (octal characters)
2831 handle \x15 and \x{1234} (hex characters)
2832 handle \N{name} (named characters, also \N{3,5} in a pattern)
2833 handle \cV (control characters)
2834 handle printf-style backslashes (\f, \r, \n, etc)
2837 } (end if backslash)
2838 handle regular character
2839 } (end while character to read)
2844 S_scan_const(pTHX_ char *start)
2846 char *send = PL_bufend; /* end of the constant */
2847 SV *sv = newSV(send - start); /* sv for the constant. See note below
2849 char *s = start; /* start of the constant */
2850 char *d = SvPVX(sv); /* destination for copies */
2851 bool dorange = FALSE; /* are we in a translit range? */
2852 bool didrange = FALSE; /* did we just finish a range? */
2853 bool in_charclass = FALSE; /* within /[...]/ */
2854 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2855 bool has_above_latin1 = FALSE; /* does something require special
2856 handling in tr/// ? */
2857 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2858 UTF8? But, this can show as true
2859 when the source isn't utf8, as for
2860 example when it is entirely composed
2862 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2863 number of characters found so far
2864 that will expand (into 2 bytes)
2865 should we have to convert to
2867 SV *res; /* result from charnames */
2868 STRLEN offset_to_max; /* The offset in the output to where the range
2869 high-end character is temporarily placed */
2871 /* Note on sizing: The scanned constant is placed into sv, which is
2872 * initialized by newSV() assuming one byte of output for every byte of
2873 * input. This routine expects newSV() to allocate an extra byte for a
2874 * trailing NUL, which this routine will append if it gets to the end of
2875 * the input. There may be more bytes of input than output (eg., \N{LATIN
2876 * CAPITAL LETTER A}), or more output than input if the constant ends up
2877 * recoded to utf8, but each time a construct is found that might increase
2878 * the needed size, SvGROW() is called. Its size parameter each time is
2879 * based on the best guess estimate at the time, namely the length used so
2880 * far, plus the length the current construct will occupy, plus room for
2881 * the trailing NUL, plus one byte for every input byte still unscanned */
2883 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2886 int backslash_N = 0; /* ? was the character from \N{} */
2887 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2888 platform-specific like \x65 */
2891 PERL_ARGS_ASSERT_SCAN_CONST;
2893 assert(PL_lex_inwhat != OP_TRANSR);
2894 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2895 /* If we are doing a trans and we know we want UTF8 set expectation */
2896 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2897 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2900 /* Protect sv from errors and fatal warnings. */
2901 ENTER_with_name("scan_const");
2905 || dorange /* Handle tr/// range at right edge of input */
2908 /* get transliterations out of the way (they're most literal) */
2909 if (PL_lex_inwhat == OP_TRANS) {
2911 /* But there isn't any special handling necessary unless there is a
2912 * range, so for most cases we just drop down and handle the value
2913 * as any other. There are two exceptions.
2915 * 1. A minus sign indicates that we are actually going to have
2916 * a range. In this case, skip the '-', set a flag, then drop
2917 * down to handle what should be the end range value.
2918 * 2. After we've handled that value, the next time through, that
2919 * flag is set and we fix up the range.
2921 * Ranges entirely within Latin1 are expanded out entirely, in
2922 * order to make the transliteration a simple table look-up.
2923 * Ranges that extend above Latin1 have to be done differently, so
2924 * there is no advantage to expanding them here, so they are
2925 * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte
2926 * signifies a hyphen without any possible ambiguity. On EBCDIC
2927 * machines, if the range is expressed as Unicode, the Latin1
2928 * portion is expanded out even if the range extends above
2929 * Latin1. This is because each code point in it has to be
2930 * processed here individually to get its native translation */
2934 /* Here, we don't think we're in a range. If we've processed
2935 * at least one character, then see if this next one is a '-',
2936 * indicating the previous one was the start of a range. But
2937 * don't bother if we're too close to the end for the minus to
2939 if (*s != '-' || s >= send - 1 || s == start) {
2941 /* A regular character. Process like any other, but first
2942 * clear any flags */
2946 non_portable_endpoint = 0;
2949 /* The tests here and the following 'else' for being above
2950 * Latin1 suffice to find all such occurences in the
2951 * constant, except those added by a backslash escape
2952 * sequence, like \x{100}. And all those set
2953 * 'has_above_latin1' as appropriate */
2954 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2955 has_above_latin1 = TRUE;
2958 /* Drops down to generic code to process current byte */
2961 if (didrange) { /* Something like y/A-C-Z// */
2962 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
2967 s++; /* Skip past the minus */
2969 /* d now points to where the end-range character will be
2970 * placed. Save it so won't have to go finding it later,
2971 * and drop down to get that character. (Actually we
2972 * instead save the offset, to handle the case where a
2973 * realloc in the meantime could change the actual
2974 * pointer). We'll finish processing the range the next
2975 * time through the loop */
2976 offset_to_max = d - SvPVX_const(sv);
2978 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2979 has_above_latin1 = TRUE;
2982 } /* End of not a range */
2984 /* Here we have parsed a range. Now must handle it. At this
2986 * 'sv' is a SV* that contains the output string we are
2987 * constructing. The final two characters in that string
2988 * are the range start and range end, in order.
2989 * 'd' points to just beyond the range end in the 'sv' string,
2990 * where we would next place something
2991 * 'offset_to_max' is the offset in 'sv' at which the character
2992 * before 'd' begins.
2994 const char * max_ptr = SvPVX_const(sv) + offset_to_max;
2995 const char * min_ptr;
2997 IV range_max; /* last character in range */
3001 bool convert_unicode;
3002 IV real_range_max = 0;
3005 /* Get the range-ends code point values. */
3007 /* We know the utf8 is valid, because we just constructed
3008 * it ourselves in previous loop iterations */
3009 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3010 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3011 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3014 min_ptr = max_ptr - 1;
3015 range_min = * (U8*) min_ptr;
3016 range_max = * (U8*) max_ptr;
3020 /* On EBCDIC platforms, we may have to deal with portable
3021 * ranges. These happen if at least one range endpoint is a
3022 * Unicode value (\N{...}), or if the range is a subset of
3023 * [A-Z] or [a-z], and both ends are literal characters,
3024 * like 'A', and not like \x{C1} */
3026 cBOOL(backslash_N) /* \N{} forces Unicode, hence
3028 || ( ! non_portable_endpoint
3029 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3030 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3031 if (convert_unicode) {
3033 /* Special handling is needed for these portable ranges.
3034 * They are defined to all be in Unicode terms, which
3035 * include all Unicode code points between the end points.
3036 * Convert to Unicode to get the Unicode range. Later we
3037 * will convert each code point in the range back to
3039 range_min = NATIVE_TO_UNI(range_min);
3040 range_max = NATIVE_TO_UNI(range_max);
3044 if (range_min > range_max) {
3046 if (convert_unicode) {
3047 /* Need to convert back to native for meaningful
3048 * messages for this platform */
3049 range_min = UNI_TO_NATIVE(range_min);
3050 range_max = UNI_TO_NATIVE(range_max);
3054 /* Use the characters themselves for the error message if
3055 * ASCII printables; otherwise some visible representation
3057 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3059 "Invalid range \"%c-%c\" in transliteration operator",
3060 (char)range_min, (char)range_max);
3063 else if (convert_unicode) {
3064 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3066 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04" UVXf "}\""
3067 " in transliteration operator",
3068 range_min, range_max);
3072 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3074 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3075 " in transliteration operator",
3076 range_min, range_max);
3082 /* If everything in the transliteration is below 256, we
3083 * can avoid special handling later. A translation table
3084 * of each of those bytes is created. And so we expand out
3085 * all ranges to their constituent code points. But if
3086 * we've encountered something above 255, the expanding
3087 * won't help, so skip doing that. But if it's EBCDIC, we
3088 * may have to look at each character below 256 if we have
3089 * to convert to/from Unicode values */
3090 if ( has_above_latin1
3092 && (range_min > 255 || ! convert_unicode)
3095 /* Move the high character one byte to the right; then
3096 * insert between it and the range begin, an illegal
3097 * byte which serves to indicate this is a range (using
3098 * a '-' could be ambiguous). */
3100 while (e-- > max_ptr) {
3103 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3107 /* Here, we're going to expand out the range. For EBCDIC
3108 * the range can extend above 255 (not so in ASCII), so
3109 * for EBCDIC, split it into the parts above and below
3112 if (range_max > 255) {
3113 real_range_max = range_max;
3119 /* Here we need to expand out the string to contain each
3120 * character in the range. Grow the output to handle this */
3122 save_offset = min_ptr - SvPVX_const(sv);
3124 /* The base growth is the number of code points in the range */
3125 grow = range_max - range_min + 1;
3128 /* But if the output is UTF-8, some of those characters may
3129 * need two bytes (since the maximum range value here is
3130 * 255, the max bytes per character is two). On ASCII
3131 * platforms, it's not much trouble to get an accurate
3132 * count of what's needed. But on EBCDIC, the ones that
3133 * need 2 bytes are scattered around, so just use a worst
3134 * case value instead of calculating for that platform. */
3138 /* Only those above 127 require 2 bytes. This may be
3139 * everything in the range, or not */
3140 if (range_min > 127) {
3143 else if (range_max > 127) {
3144 grow += range_max - 127;
3149 /* Subtract 3 for the bytes that were already accounted for
3150 * (min, max, and the hyphen) */
3151 d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3);
3154 /* Here, we expand out the range. */
3155 if (convert_unicode) {
3158 /* Recall that the min and max are now in Unicode terms, so
3159 * we have to convert each character to its native
3162 for (i = range_min; i <= range_max; i++) {
3163 append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
3168 for (i = range_min; i <= range_max; i++) {
3169 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3175 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3179 /* Here, no conversions are necessary, which means that the
3180 * first character in the range is already in 'd' and
3181 * valid, so we can skip overwriting it */
3184 for (i = range_min + 1; i <= range_max; i++) {
3185 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3190 for (i = range_min + 1; i <= range_max; i++) {
3197 /* If the original range extended above 255, add in that portion. */
3198 if (real_range_max) {
3199 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3200 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3201 if (real_range_max > 0x101)
3202 *d++ = (char) ILLEGAL_UTF8_BYTE;
3203 if (real_range_max > 0x100)
3204 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3209 /* mark the range as done, and continue */
3213 non_portable_endpoint = 0;
3217 } /* End of is a range */
3218 } /* End of transliteration. Joins main code after these else's */
3219 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3222 while (s1 >= start && *s1-- == '\\')
3225 in_charclass = TRUE;
3227 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3230 while (s1 >= start && *s1-- == '\\')
3233 in_charclass = FALSE;
3236 /* skip for regexp comments /(?#comment)/, except for the last
3237 * char, which will be done separately.
3238 * Stop on (?{..}) and friends */
3240 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3242 while (s+1 < send && *s != ')')
3245 else if (!PL_lex_casemods
3246 && ( s[2] == '{' /* This should match regcomp.c */
3247 || (s[2] == '?' && s[3] == '{')))
3253 /* likewise skip #-initiated comments in //x patterns */
3257 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3259 while (s < send && *s != '\n')
3263 /* no further processing of single-quoted regex */
3264 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3265 goto default_action;
3267 /* check for embedded arrays
3268 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3270 else if (*s == '@' && s[1]) {
3272 ? isIDFIRST_utf8_safe(s+1, send)
3273 : isWORDCHAR_A(s[1]))
3277 if (strchr(":'{$", s[1]))
3279 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3280 break; /* in regexp, neither @+ nor @- are interpolated */
3283 /* check for embedded scalars. only stop if we're sure it's a
3286 else if (*s == '$') {
3287 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3289 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
3291 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3292 "Possible unintended interpolation of $\\ in regex");
3294 break; /* in regexp, $ might be tail anchor */
3298 /* End of else if chain - OP_TRANS rejoin rest */
3300 if (UNLIKELY(s >= send)) {
3306 if (*s == '\\' && s+1 < send) {
3307 char* e; /* Can be used for ending '}', etc. */
3311 /* warn on \1 - \9 in substitution replacements, but note that \11
3312 * is an octal; and \19 is \1 followed by '9' */
3313 if (PL_lex_inwhat == OP_SUBST
3319 /* diag_listed_as: \%d better written as $%d */
3320 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3325 /* string-change backslash escapes */
3326 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
3330 /* In a pattern, process \N, but skip any other backslash escapes.
3331 * This is because we don't want to translate an escape sequence
3332 * into a meta symbol and have the regex compiler use the meta
3333 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3334 * in spite of this, we do have to process \N here while the proper
3335 * charnames handler is in scope. See bugs #56444 and #62056.
3337 * There is a complication because \N in a pattern may also stand
3338 * for 'match a non-nl', and not mean a charname, in which case its
3339 * processing should be deferred to the regex compiler. To be a
3340 * charname it must be followed immediately by a '{', and not look
3341 * like \N followed by a curly quantifier, i.e., not something like
3342 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3344 else if (PL_lex_inpat
3347 || regcurly(s + 1)))
3350 goto default_action;
3356 if ((isALPHANUMERIC(*s)))
3357 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3358 "Unrecognized escape \\%c passed through",
3360 /* default action is to copy the quoted character */
3361 goto default_action;
3364 /* eg. \132 indicates the octal constant 0132 */
3365 case '0': case '1': case '2': case '3':
3366 case '4': case '5': case '6': case '7':
3368 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
3370 uv = grok_oct(s, &len, &flags, NULL);
3372 if (len < 3 && s < send && isDIGIT(*s)
3373 && ckWARN(WARN_MISC))
3375 Perl_warner(aTHX_ packWARN(WARN_MISC),
3376 "%s", form_short_octal_warning(s, len));
3379 goto NUM_ESCAPE_INSERT;
3381 /* eg. \o{24} indicates the octal constant \024 */
3386 bool valid = grok_bslash_o(&s, &uv, &error,
3387 TRUE, /* Output warning */
3388 FALSE, /* Not strict */
3389 TRUE, /* Output warnings for
3394 uv = 0; /* drop through to ensure range ends are set */
3396 goto NUM_ESCAPE_INSERT;
3399 /* eg. \x24 indicates the hex constant 0x24 */
3404 bool valid = grok_bslash_x(&s, &uv, &error,
3405 TRUE, /* Output warning */
3406 FALSE, /* Not strict */
3407 TRUE, /* Output warnings for
3412 uv = 0; /* drop through to ensure range ends are set */
3417 /* Insert oct or hex escaped character. */
3419 /* Here uv is the ordinal of the next character being added */
3420 if (UVCHR_IS_INVARIANT(uv)) {
3424 if (!has_utf8 && uv > 255) {
3426 /* Here, 'uv' won't fit unless we convert to UTF-8.
3427 * If we've only seen invariants so far, all we have to
3428 * do is turn on the flag */
3429 if (utf8_variant_count == 0) {
3433 SvCUR_set(sv, d - SvPVX_const(sv));
3437 sv_utf8_upgrade_flags_grow(
3439 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3441 /* Since we're having to grow here,
3442 * make sure we have enough room for
3443 * this escape and a NUL, so the
3444 * code immediately below won't have
3445 * to actually grow again */
3447 + (STRLEN)(send - s) + 1);
3448 d = SvPVX(sv) + SvCUR(sv);
3451 has_above_latin1 = TRUE;
3457 utf8_variant_count++;
3460 /* Usually, there will already be enough room in 'sv'
3461 * since such escapes are likely longer than any UTF-8
3462 * sequence they can end up as. This isn't the case on
3463 * EBCDIC where \x{40000000} contains 12 bytes, and the
3464 * UTF-8 for it contains 14. And, we have to allow for
3465 * a trailing NUL. It probably can't happen on ASCII
3466 * platforms, but be safe. See Note on sizing above. */
3467 const STRLEN needed = d - SvPVX(sv)
3471 if (UNLIKELY(needed > SvLEN(sv))) {
3472 SvCUR_set(sv, d - SvPVX_const(sv));
3473 d = SvCUR(sv) + SvGROW(sv, needed);
3476 d = (char*)uvchr_to_utf8((U8*)d, uv);
3477 if (PL_lex_inwhat == OP_TRANS
3478 && PL_parser->lex_sub_op)
3480 PL_parser->lex_sub_op->op_private |=
3481 (PL_lex_repl ? OPpTRANS_FROM_UTF
3487 non_portable_endpoint++;
3492 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3493 * named character, like \N{LATIN SMALL LETTER A}, or a named
3494 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3495 * GRAVE} (except y/// can't handle the latter, croaking). For
3496 * convenience all three forms are referred to as "named
3497 * characters" below.
3499 * For patterns, \N also can mean to match a non-newline. Code
3500 * before this 'switch' statement should already have handled
3501 * this situation, and hence this code only has to deal with
3502 * the named character cases.
3504 * For non-patterns, the named characters are converted to
3505 * their string equivalents. In patterns, named characters are
3506 * not converted to their ultimate forms for the same reasons
3507 * that other escapes aren't. Instead, they are converted to
3508 * the \N{U+...} form to get the value from the charnames that
3509 * is in effect right now, while preserving the fact that it
3510 * was a named character, so that the regex compiler knows
3513 * The structure of this section of code (besides checking for
3514 * errors and upgrading to utf8) is:
3515 * If the named character is of the form \N{U+...}, pass it
3516 * through if a pattern; otherwise convert the code point
3518 * Otherwise must be some \N{NAME}: convert to
3519 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3521 * Transliteration is an exception. The conversion to utf8 is
3522 * only done if the code point requires it to be representable.
3524 * Here, 's' points to the 'N'; the test below is guaranteed to
3525 * succeed if we are being called on a pattern, as we already
3526 * know from a test above that the next character is a '{'. A
3527 * non-pattern \N must mean 'named character', which requires
3531 yyerror("Missing braces on \\N{}");
3536 /* If there is no matching '}', it is an error. */
3537 if (! (e = strchr(s, '}'))) {
3538 if (! PL_lex_inpat) {
3539 yyerror("Missing right brace on \\N{}");
3541 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3546 /* Here it looks like a named character */
3548 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3549 s += 2; /* Skip to next char after the 'U+' */
3552 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3553 /* Check the syntax. */
3556 if (!isXDIGIT(*s)) {
3559 "Invalid hexadecimal number in \\N{U+...}"
3567 else if ((*s == '.' || *s == '_')
3573 /* Pass everything through unchanged.
3574 * +1 is for the '}' */
3575 Copy(orig_s, d, e - orig_s + 1, char);
3576 d += e - orig_s + 1;
3578 else { /* Not a pattern: convert the hex to string */
3579 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3580 | PERL_SCAN_SILENT_ILLDIGIT
3581 | PERL_SCAN_DISALLOW_PREFIX;
3583 uv = grok_hex(s, &len, &flags, NULL);
3584 if (len == 0 || (len != (STRLEN)(e - s)))
3587 /* For non-tr///, if the destination is not in utf8,
3588 * unconditionally recode it to be so. This is
3589 * because \N{} implies Unicode semantics, and scalars
3590 * have to be in utf8 to guarantee those semantics.
3591 * tr/// doesn't care about Unicode rules, so no need
3592 * there to upgrade to UTF-8 for small enough code
3594 if (! has_utf8 && ( uv > 0xFF
3595 || PL_lex_inwhat != OP_TRANS))
3597 /* See Note on sizing above. */
3598 const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
3600 SvCUR_set(sv, d - SvPVX_const(sv));
3604 if (utf8_variant_count == 0) {
3606 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3609 sv_utf8_upgrade_flags_grow(
3611 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3613 d = SvPVX(sv) + SvCUR(sv);
3617 has_above_latin1 = TRUE;
3620 /* Add the (Unicode) code point to the output. */
3621 if (! has_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3622 *d++ = (char) LATIN1_TO_NATIVE(uv);
3625 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3629 else /* Here is \N{NAME} but not \N{U+...}. */
3630 if ((res = get_and_check_backslash_N_name(s, e)))
3633 const char *str = SvPV_const(res, len);
3636 if (! len) { /* The name resolved to an empty string */
3637 Copy("\\N{}", d, 4, char);
3641 /* In order to not lose information for the regex
3642 * compiler, pass the result in the specially made
3643 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3644 * the code points in hex of each character
3645 * returned by charnames */
3647 const char *str_end = str + len;
3648 const STRLEN off = d - SvPVX_const(sv);
3650 if (! SvUTF8(res)) {
3651 /* For the non-UTF-8 case, we can determine the
3652 * exact length needed without having to parse
3653 * through the string. Each character takes up
3654 * 2 hex digits plus either a trailing dot or
3656 const char initial_text[] = "\\N{U+";
3657 const STRLEN initial_len = sizeof(initial_text)
3659 d = off + SvGROW(sv, off
3662 /* +1 for trailing NUL */
3665 + (STRLEN)(send - e));
3666 Copy(initial_text, d, initial_len, char);
3668 while (str < str_end) {
3671 my_snprintf(hex_string,
3675 /* The regex compiler is
3676 * expecting Unicode, not
3678 NATIVE_TO_LATIN1(*str));
3679 PERL_MY_SNPRINTF_POST_GUARD(len,
3680 sizeof(hex_string));
3681 Copy(hex_string, d, 3, char);
3685 d--; /* Below, we will overwrite the final
3686 dot with a right brace */
3689 STRLEN char_length; /* cur char's byte length */
3691 /* and the number of bytes after this is
3692 * translated into hex digits */
3693 STRLEN output_length;
3695 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3696 * for max('U+', '.'); and 1 for NUL */
3697 char hex_string[2 * UTF8_MAXBYTES + 5];
3699 /* Get the first character of the result. */
3700 U32 uv = utf8n_to_uvchr((U8 *) str,
3704 /* Convert first code point to Unicode hex,
3705 * including the boiler plate before it. */
3707 my_snprintf(hex_string, sizeof(hex_string),
3709 (unsigned int) NATIVE_TO_UNI(uv));
3711 /* Make sure there is enough space to hold it */
3712 d = off + SvGROW(sv, off
3714 + (STRLEN)(send - e)
3715 + 2); /* '}' + NUL */
3717 Copy(hex_string, d, output_length, char);
3720 /* For each subsequent character, append dot and
3721 * its Unicode code point in hex */
3722 while ((str += char_length) < str_end) {
3723 const STRLEN off = d - SvPVX_const(sv);
3724 U32 uv = utf8n_to_uvchr((U8 *) str,
3729 my_snprintf(hex_string,
3732 (unsigned int) NATIVE_TO_UNI(uv));
3734 d = off + SvGROW(sv, off
3736 + (STRLEN)(send - e)
3737 + 2); /* '}' + NUL */
3738 Copy(hex_string, d, output_length, char);
3743 *d++ = '}'; /* Done. Add the trailing brace */
3746 else { /* Here, not in a pattern. Convert the name to a
3749 if (PL_lex_inwhat == OP_TRANS) {
3750 str = SvPV_const(res, len);
3751 if (len > ((SvUTF8(res))
3755 yyerror(Perl_form(aTHX_
3756 "%.*s must not be a named sequence"
3757 " in transliteration operator",
3758 /* +1 to include the "}" */
3759 (int) (e + 1 - start), start));
3760 goto end_backslash_N;
3763 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
3764 has_above_latin1 = TRUE;
3768 else if (! SvUTF8(res)) {
3769 /* Make sure \N{} return is UTF-8. This is because
3770 * \N{} implies Unicode semantics, and scalars have
3771 * to be in utf8 to guarantee those semantics; but
3772 * not needed in tr/// */
3773 sv_utf8_upgrade_flags(res, 0);
3774 str = SvPV_const(res, len);
3777 /* Upgrade destination to be utf8 if this new
3779 if (! has_utf8 && SvUTF8(res)) {
3780 /* See Note on sizing above. */
3781 const STRLEN extra = len + (send - s) + 1;
3783 SvCUR_set(sv, d - SvPVX_const(sv));
3787 if (utf8_variant_count == 0) {
3789 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3792 sv_utf8_upgrade_flags_grow(sv,
3793 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3795 d = SvPVX(sv) + SvCUR(sv);
3798 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3800 /* See Note on sizing above. (NOTE: SvCUR() is not
3801 * set correctly here). */
3802 const STRLEN extra = len + (send - e) + 1;
3803 const STRLEN off = d - SvPVX_const(sv);
3804 d = off + SvGROW(sv, off + extra);
3806 Copy(str, d, len, char);
3812 } /* End \N{NAME} */
3816 backslash_N++; /* \N{} is defined to be Unicode */
3818 s = e + 1; /* Point to just after the '}' */
3821 /* \c is a control character */
3825 *d++ = grok_bslash_c(*s++, 1);
3828 yyerror("Missing control char name in \\c");
3831 non_portable_endpoint++;
3835 /* printf-style backslashes, formfeeds, newlines, etc */
3861 } /* end if (backslash) */
3864 /* Just copy the input to the output, though we may have to convert
3867 * If the input has the same representation in UTF-8 as not, it will be
3868 * a single byte, and we don't care about UTF8ness; just copy the byte */
3869 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
3872 else if (! this_utf8 && ! has_utf8) {
3873 /* If neither source nor output is UTF-8, is also a single byte,
3874 * just copy it; but this byte counts should we later have to
3875 * convert to UTF-8 */
3877 utf8_variant_count++;
3879 else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */
3880 const STRLEN len = UTF8SKIP(s);
3882 /* We expect the source to have already been checked for
3884 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
3886 Copy(s, d, len, U8);
3890 else { /* UTF8ness matters and doesn't match, need to convert */
3892 const UV nextuv = (this_utf8)
3893 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3895 STRLEN need = UVCHR_SKIP(nextuv);
3898 SvCUR_set(sv, d - SvPVX_const(sv));
3902 /* See Note on sizing above. */
3903 need += (STRLEN)(send - s) + 1;
3905 if (utf8_variant_count == 0) {
3907 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
3910 sv_utf8_upgrade_flags_grow(sv,
3911 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3913 d = SvPVX(sv) + SvCUR(sv);
3916 } else if (need > len) {
3917 /* encoded value larger than old, may need extra space (NOTE:
3918 * SvCUR() is not set correctly here). See Note on sizing
3920 const STRLEN extra = need + (send - s) + 1;
3921 const STRLEN off = d - SvPVX_const(sv);
3922 d = off + SvGROW(sv, off + extra);
3926 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
3928 } /* while loop to process each character */
3930 /* terminate the string and set up the sv */
3932 SvCUR_set(sv, d - SvPVX_const(sv));
3933 if (SvCUR(sv) >= SvLEN(sv))
3934 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %" UVuf
3935 " >= %" UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
3940 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
3941 PL_parser->lex_sub_op->op_private |=
3942 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3946 /* shrink the sv if we allocated more than we used */
3947 if (SvCUR(sv) + 5 < SvLEN(sv)) {
3948 SvPV_shrink_to_cur(sv);
3951 /* return the substring (via pl_yylval) only if we parsed anything */
3954 for (; s2 < s; s2++) {
3956 COPLINE_INC_WITH_HERELINES;
3958 SvREFCNT_inc_simple_void_NN(sv);
3959 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3960 && ! PL_parser->lex_re_reparsing)
3962 const char *const key = PL_lex_inpat ? "qr" : "q";
3963 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3967 if (PL_lex_inwhat == OP_TRANS) {
3970 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3973 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3981 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3984 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
3986 LEAVE_with_name("scan_const");
3991 * Returns TRUE if there's more to the expression (e.g., a subscript),
3994 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3996 * ->[ and ->{ return TRUE
3997 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
3998 * { and [ outside a pattern are always subscripts, so return TRUE
3999 * if we're outside a pattern and it's not { or [, then return FALSE
4000 * if we're in a pattern and the first char is a {
4001 * {4,5} (any digits around the comma) returns FALSE
4002 * if we're in a pattern and the first char is a [
4004 * [SOMETHING] has a funky algorithm to decide whether it's a
4005 * character class or not. It has to deal with things like
4006 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4007 * anything else returns TRUE
4010 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4013 S_intuit_more(pTHX_ char *s)
4015 PERL_ARGS_ASSERT_INTUIT_MORE;
4017 if (PL_lex_brackets)
4019 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4021 if (*s == '-' && s[1] == '>'
4022 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4023 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4024 ||(s[2] == '@' && strchr("*[{",s[3])) ))
4026 if (*s != '{' && *s != '[')
4031 /* In a pattern, so maybe we have {n,m}. */
4039 /* On the other hand, maybe we have a character class */
4042 if (*s == ']' || *s == '^')
4045 /* this is terrifying, and it works */
4048 const char * const send = strchr(s,']');
4049 unsigned char un_char, last_un_char;
4050 char tmpbuf[sizeof PL_tokenbuf * 4];
4052 if (!send) /* has to be an expression */
4054 weight = 2; /* let's weigh the evidence */
4058 else if (isDIGIT(*s)) {
4060 if (isDIGIT(s[1]) && s[2] == ']')
4066 Zero(seen,256,char);
4068 for (; s < send; s++) {
4069 last_un_char = un_char;
4070 un_char = (unsigned char)*s;
4075 weight -= seen[un_char] * 10;
4076 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4078 char *tmp = PL_bufend;
4079 PL_bufend = (char*)send;
4080 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4082 len = (int)strlen(tmpbuf);
4083 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4084 UTF ? SVf_UTF8 : 0, SVt_PV))
4091 && strchr("[#!%*<>()-=",s[1]))
4093 if (/*{*/ strchr("])} =",s[2]))
4102 if (strchr("wds]",s[1]))
4104 else if (seen[(U8)'\''] || seen[(U8)'"'])
4106 else if (strchr("rnftbxcav",s[1]))
4108 else if (isDIGIT(s[1])) {
4110 while (s[1] && isDIGIT(s[1]))
4120 if (strchr("aA01! ",last_un_char))
4122 if (strchr("zZ79~",s[1]))
4124 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4125 weight -= 5; /* cope with negative subscript */
4128 if (!isWORDCHAR(last_un_char)
4129 && !(last_un_char == '$' || last_un_char == '@'
4130 || last_un_char == '&')
4131 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4135 if (keyword(d, s - d, 0))
4138 if (un_char == last_un_char + 1)
4140 weight -= seen[un_char];
4145 if (weight >= 0) /* probably a character class */
4155 * Does all the checking to disambiguate
4157 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4158 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
4160 * First argument is the stuff after the first token, e.g. "bar".
4162 * Not a method if foo is a filehandle.
4163 * Not a method if foo is a subroutine prototyped to take a filehandle.
4164 * Not a method if it's really "Foo $bar"
4165 * Method if it's "foo $bar"
4166 * Not a method if it's really "print foo $bar"
4167 * Method if it's really "foo package::" (interpreted as package->foo)
4168 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4169 * Not a method if bar is a filehandle or package, but is quoted with
4174 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4176 char *s = start + (*start == '$');
4177 char tmpbuf[sizeof PL_tokenbuf];
4180 /* Mustn't actually add anything to a symbol table.
4181 But also don't want to "initialise" any placeholder
4182 constants that might already be there into full
4183 blown PVGVs with attached PVCV. */
4185 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4187 PERL_ARGS_ASSERT_INTUIT_METHOD;
4189 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4191 if (cv && SvPOK(cv)) {
4192 const char *proto = CvPROTO(cv);
4194 while (*proto && (isSPACE(*proto) || *proto == ';'))
4201 if (*start == '$') {
4202 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4203 || isUPPER(*PL_tokenbuf))
4208 return *s == '(' ? FUNCMETH : METHOD;
4211 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
4212 /* start is the beginning of the possible filehandle/object,
4213 * and s is the end of it
4214 * tmpbuf is a copy of it (but with single quotes as double colons)
4217 if (!keyword(tmpbuf, len, 0)) {
4218 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4223 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4224 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4226 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4227 && (!isGV(indirgv) || GvCVu(indirgv)))
4229 /* filehandle or package name makes it a method */
4230 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4232 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4233 return 0; /* no assumptions -- "=>" quotes bareword */
4235 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4236 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4237 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4239 force_next(BAREWORD);
4241 return *s == '(' ? FUNCMETH : METHOD;
4247 /* Encoded script support. filter_add() effectively inserts a
4248 * 'pre-processing' function into the current source input stream.
4249 * Note that the filter function only applies to the current source file
4250 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4252 * The datasv parameter (which may be NULL) can be used to pass
4253 * private data to this instance of the filter. The filter function
4254 * can recover the SV using the FILTER_DATA macro and use it to
4255 * store private buffers and state information.
4257 * The supplied datasv parameter is upgraded to a PVIO type
4258 * and the IoDIRP/IoANY field is used to store the function pointer,
4259 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4260 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4261 * private use must be set using malloc'd pointers.
4265 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4273 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4274 Perl_croak(aTHX_ "Source filters apply only to byte streams");
4276 if (!PL_rsfp_filters)
4277 PL_rsfp_filters = newAV();
4280 SvUPGRADE(datasv, SVt_PVIO);
4281 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
4282 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
4283 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
4284 FPTR2DPTR(void *, IoANY(datasv)),
4285 SvPV_nolen(datasv)));
4286 av_unshift(PL_rsfp_filters, 1);
4287 av_store(PL_rsfp_filters, 0, datasv) ;
4289 !PL_parser->filtered
4290 && PL_parser->lex_flags & LEX_EVALBYTES
4291 && PL_bufptr < PL_bufend
4293 const char *s = PL_bufptr;
4294 while (s < PL_bufend) {