3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "dquote_inline.h"
43 #define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
98 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
100 #define XENUMMASK 0x3f
101 #define XFAKEEOF 0x40
102 #define XFAKEBRACK 0x80
104 #ifdef USE_UTF8_SCRIPTS
105 # define UTF cBOOL(!IN_BYTES)
107 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
110 /* The maximum number of characters preceding the unrecognized one to display */
111 #define UNRECOGNIZED_PRECEDE_COUNT 10
113 /* In variables named $^X, these are the legal values for X.
114 * 1999-02-27 mjd-perl-patch@plover.com */
115 #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
117 #define SPACE_OR_TAB(c) isBLANK_A(c)
119 #define HEXFP_PEEK(s) \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
124 /* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
126 * can get by with a single comparison (if the compiler is smart enough).
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
132 /* #define LEX_NOTPARSING 11 is done in perl.h. */
134 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
135 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
140 /* at end of code, eg "$x" followed by: */
141 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146 #define LEX_INTERPCONST 2 /* NOT USED */
147 #define LEX_FORMLINE 1 /* expecting a format line */
151 static const char* const lex_state_names[] = {
166 #include "keywords.h"
168 /* CLINE is a macro that ensures PL_copline has a sane value */
170 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
173 * Convenience functions to return different tokens and prime the
174 * lexer for the next token. They all take an argument.
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
187 * FUN0OP : zero-argument function, with its op created in this file
188 * FUN1 : not used, except for not, which isn't a UNIOP
189 * BOop : bitwise or or xor
191 * BCop : bitwise complement
192 * SHop : shift operator
193 * PWop : power operator
194 * PMop : pattern-matching operator
195 * Aop : addition-level operator
196 * AopNOASSIGN : addition-level operator that is never part of .=
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
199 * Rop : relational operator <= != gt
201 * Also see LOP and lop() below.
204 #ifdef DEBUGGING /* Serve -DT. */
205 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
207 # define REPORT(retval) (retval)
210 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
212 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
213 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
217 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
218 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
222 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
224 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
225 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
226 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
228 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
232 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
233 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
234 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
235 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
236 #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237 #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
239 /* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
244 #define UNI3(f,x,have_x) { \
245 pl_yylval.ival = f; \
246 if (have_x) PL_expect = x; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
251 return REPORT( (int)FUNC1 ); \
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 #define UNI(f) UNI3(f,XTERM,1)
256 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
257 #define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
262 #define UNIBRACK(f) UNI3(f,0,0)
264 /* grandfather return to old style */
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
275 #define COPLINE_INC_WITH_HERELINES \
277 CopLINE_inc(PL_curcop); \
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
282 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284 #define COPLINE_SET_FROM_MULTI_END \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
288 PL_parser->herelines = 0; \
294 /* how to interpret the pl_yylval associated with the token */
298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
303 static struct debug_tokens {
305 enum token_type type;
307 } const debug_tokens[] =
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
382 { 0, TOKENTYPE_NONE, NULL }
385 /* dump the returned token in rv, plus any optional arg in pl_yylval */
388 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
390 PERL_ARGS_ASSERT_TOKEREPORT;
393 const char *name = NULL;
394 enum token_type type = TOKENTYPE_NONE;
395 const struct debug_tokens *p;
396 SV* const report = newSVpvs("<== ");
398 for (p = debug_tokens; p->token; p++) {
399 if (p->token == (int)rv) {
406 Perl_sv_catpv(aTHX_ report, name);
407 else if (isGRAPH(rv))
409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
411 sv_catpvs(report, " (pending identifier)");
414 sv_catpvs(report, "EOF");
416 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
421 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
425 PL_op_name[lvalp->ival]);
428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
430 case TOKENTYPE_OPVAL:
432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
435 Perl_sv_catpvf(aTHX_ report, " %s",
436 SvPEEK(cSVOPx_sv(lvalp->opval)));
441 sv_catpvs(report, "(opval=null)");
444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
450 /* print the buffer with suitable escapes */
453 S_printbuf(pTHX_ const char *const fmt, const char *const s)
455 SV* const tmp = newSVpvs("");
457 PERL_ARGS_ASSERT_PRINTBUF;
459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
468 S_deprecate_commaless_var_list(pTHX) {
470 deprecate_fatal_in("5.28", "Use of comma-less variable list is deprecated");
471 return REPORT(','); /* grandfather non-comma-format format */
477 * This subroutine looks for an '=' next to the operator that has just been
478 * parsed and turns it into an ASSIGNOP if it finds one.
482 S_ao(pTHX_ int toketype)
484 if (*PL_bufptr == '=') {
486 if (toketype == ANDAND)
487 pl_yylval.ival = OP_ANDASSIGN;
488 else if (toketype == OROR)
489 pl_yylval.ival = OP_ORASSIGN;
490 else if (toketype == DORDOR)
491 pl_yylval.ival = OP_DORASSIGN;
494 return REPORT(toketype);
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
509 * PL_bufptr is expected to point to the start of the thing that was found,
510 * and s after the next token or partial token.
514 S_no_op(pTHX_ const char *const what, char *s)
516 char * const oldbp = PL_bufptr;
517 const bool is_first = (PL_oldbufptr == PL_linestart);
519 PERL_ARGS_ASSERT_NO_OP;
525 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
526 if (ckWARN_d(WARN_SYNTAX)) {
528 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
529 "\t(Missing semicolon on previous line?)\n");
530 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
535 for (t = PL_oldoldbufptr;
536 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
537 t += UTF ? UTF8SKIP(t) : 1)
541 if (t < PL_bufptr && isSPACE(*t))
542 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
543 "\t(Do you need to predeclare %" UTF8f "?)\n",
544 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
548 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
549 "\t(Missing operator before %" UTF8f "?)\n",
550 UTF8fARG(UTF, s - oldbp, oldbp));
558 * Complain about missing quote/regexp/heredoc terminator.
559 * If it's called with NULL then it cauterizes the line buffer.
560 * If we're in a delimited string and the delimiter is a control
561 * character, it's reformatted into a two-char sequence like ^C.
566 S_missingterm(pTHX_ char *s)
568 char tmpbuf[UTF8_MAXBYTES + 1];
573 char * const nl = strrchr(s,'\n');
578 else if (PL_multi_close < 32) {
580 tmpbuf[1] = (char)toCTRL(PL_multi_close);
585 if (LIKELY(PL_multi_close < 256)) {
586 *tmpbuf = (char)PL_multi_close;
591 *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
595 q = strchr(s,'"') ? '\'' : '"';
596 sv = sv_2mortal(newSVpv(s,0));
599 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf
600 "%c anywhere before EOF",q,SVfARG(sv),q);
606 * Check whether the named feature is enabled.
609 Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
611 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
613 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
615 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
617 if (namelen > MAX_FEATURE_LEN)
619 memcpy(&he_name[8], name, namelen);
621 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
622 REFCOUNTED_HE_EXISTS));
626 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
627 * utf16-to-utf8-reversed.
630 #ifdef PERL_CR_FILTER
634 const char *s = SvPVX_const(sv);
635 const char * const e = s + SvCUR(sv);
637 PERL_ARGS_ASSERT_STRIP_RETURN;
639 /* outer loop optimized to do nothing if there are no CR-LFs */
641 if (*s++ == '\r' && *s == '\n') {
642 /* hit a CR-LF, need to copy the rest */
646 if (*s == '\r' && s[1] == '\n')
657 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
659 const I32 count = FILTER_READ(idx+1, sv, maxlen);
660 if (count > 0 && !maxlen)
667 =for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
669 Creates and initialises a new lexer/parser state object, supplying
670 a context in which to lex and parse from a new source of Perl code.
671 A pointer to the new state object is placed in L</PL_parser>. An entry
672 is made on the save stack so that upon unwinding the new state object
673 will be destroyed and the former value of L</PL_parser> will be restored.
674 Nothing else need be done to clean up the parsing context.
676 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
677 non-null, provides a string (in SV form) containing code to be parsed.
678 A copy of the string is made, so subsequent modification of C<line>
679 does not affect parsing. C<rsfp>, if non-null, provides an input stream
680 from which code will be read to be parsed. If both are non-null, the
681 code in C<line> comes first and must consist of complete lines of input,
682 and C<rsfp> supplies the remainder of the source.
684 The C<flags> parameter is reserved for future use. Currently it is only
685 used by perl internally, so extensions should always pass zero.
690 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
691 can share filters with the current parser.
692 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
693 caller, hence isn't owned by the parser, so shouldn't be closed on parser
694 destruction. This is used to handle the case of defaulting to reading the
695 script from the standard input because no filename was given on the command
696 line (without getting confused by situation where STDIN has been closed, so
697 the script handle is opened on fd 0) */
700 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
702 const char *s = NULL;
703 yy_parser *parser, *oparser;
704 if (flags && flags & ~LEX_START_FLAGS)
705 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
707 /* create and initialise a parser */
709 Newxz(parser, 1, yy_parser);
710 parser->old_parser = oparser = PL_parser;
713 parser->stack = NULL;
714 parser->stack_max1 = NULL;
717 /* on scope exit, free this parser and restore any outer one */
719 parser->saved_curcop = PL_curcop;
721 /* initialise lexer state */
723 parser->nexttoke = 0;
724 parser->error_count = oparser ? oparser->error_count : 0;
725 parser->copline = parser->preambling = NOLINE;
726 parser->lex_state = LEX_NORMAL;
727 parser->expect = XSTATE;
729 parser->rsfp_filters =
730 !(flags & LEX_START_SAME_FILTER) || !oparser
732 : MUTABLE_AV(SvREFCNT_inc(
733 oparser->rsfp_filters
734 ? oparser->rsfp_filters
735 : (oparser->rsfp_filters = newAV())
738 Newx(parser->lex_brackstack, 120, char);
739 Newx(parser->lex_casestack, 12, char);
740 *parser->lex_casestack = '\0';
741 Newxz(parser->lex_shared, 1, LEXSHARED);
745 s = SvPV_const(line, len);
746 parser->linestr = flags & LEX_START_COPIED
747 ? SvREFCNT_inc_simple_NN(line)
748 : newSVpvn_flags(s, len, SvUTF8(line));
750 sv_catpvs(parser->linestr, "\n;");
752 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
754 parser->oldoldbufptr =
757 parser->linestart = SvPVX(parser->linestr);
758 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
759 parser->last_lop = parser->last_uni = NULL;
761 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
762 |LEX_DONT_CLOSE_RSFP));
763 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
764 |LEX_DONT_CLOSE_RSFP));
766 parser->in_pod = parser->filtered = 0;
770 /* delete a parser object */
773 Perl_parser_free(pTHX_ const yy_parser *parser)
775 PERL_ARGS_ASSERT_PARSER_FREE;
777 PL_curcop = parser->saved_curcop;
778 SvREFCNT_dec(parser->linestr);
780 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
781 PerlIO_clearerr(parser->rsfp);
782 else if (parser->rsfp && (!parser->old_parser
783 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
784 PerlIO_close(parser->rsfp);
785 SvREFCNT_dec(parser->rsfp_filters);
786 SvREFCNT_dec(parser->lex_stuff);
787 SvREFCNT_dec(parser->lex_sub_repl);
789 Safefree(parser->lex_brackstack);
790 Safefree(parser->lex_casestack);
791 Safefree(parser->lex_shared);
792 PL_parser = parser->old_parser;
797 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
799 I32 nexttoke = parser->nexttoke;
800 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
802 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
803 && parser->nextval[nexttoke].opval
804 && parser->nextval[nexttoke].opval->op_slabbed
805 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
806 op_free(parser->nextval[nexttoke].opval);
807 parser->nextval[nexttoke].opval = NULL;
814 =for apidoc AmxU|SV *|PL_parser-E<gt>linestr
816 Buffer scalar containing the chunk currently under consideration of the
817 text currently being lexed. This is always a plain string scalar (for
818 which C<SvPOK> is true). It is not intended to be used as a scalar by
819 normal scalar means; instead refer to the buffer directly by the pointer
820 variables described below.
822 The lexer maintains various C<char*> pointers to things in the
823 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
824 reallocated, all of these pointers must be updated. Don't attempt to
825 do this manually, but rather use L</lex_grow_linestr> if you need to
826 reallocate the buffer.
828 The content of the text chunk in the buffer is commonly exactly one
829 complete line of input, up to and including a newline terminator,
830 but there are situations where it is otherwise. The octets of the
831 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
832 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
833 flag on this scalar, which may disagree with it.
835 For direct examination of the buffer, the variable
836 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
837 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
838 of these pointers is usually preferable to examination of the scalar
839 through normal scalar means.
841 =for apidoc AmxU|char *|PL_parser-E<gt>bufend
843 Direct pointer to the end of the chunk of text currently being lexed, the
844 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
845 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
846 always located at the end of the buffer, and does not count as part of
847 the buffer's contents.
849 =for apidoc AmxU|char *|PL_parser-E<gt>bufptr
851 Points to the current position of lexing inside the lexer buffer.
852 Characters around this point may be freely examined, within
853 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
854 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
855 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
857 Lexing code (whether in the Perl core or not) moves this pointer past
858 the characters that it consumes. It is also expected to perform some
859 bookkeeping whenever a newline character is consumed. This movement
860 can be more conveniently performed by the function L</lex_read_to>,
861 which handles newlines appropriately.
863 Interpretation of the buffer's octets can be abstracted out by
864 using the slightly higher-level functions L</lex_peek_unichar> and
865 L</lex_read_unichar>.
867 =for apidoc AmxU|char *|PL_parser-E<gt>linestart
869 Points to the start of the current line inside the lexer buffer.
870 This is useful for indicating at which column an error occurred, and
871 not much else. This must be updated by any lexing code that consumes
872 a newline; the function L</lex_read_to> handles this detail.
878 =for apidoc Amx|bool|lex_bufutf8
880 Indicates whether the octets in the lexer buffer
881 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
882 of Unicode characters. If not, they should be interpreted as Latin-1
883 characters. This is analogous to the C<SvUTF8> flag for scalars.
885 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
886 contains valid UTF-8. Lexing code must be robust in the face of invalid
889 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
890 is significant, but not the whole story regarding the input character
891 encoding. Normally, when a file is being read, the scalar contains octets
892 and its C<SvUTF8> flag is off, but the octets should be interpreted as
893 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
894 however, the scalar may have the C<SvUTF8> flag on, and in this case its
895 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
896 is in effect. This logic may change in the future; use this function
897 instead of implementing the logic yourself.
903 Perl_lex_bufutf8(pTHX)
909 =for apidoc Amx|char *|lex_grow_linestr|STRLEN len
911 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
912 at least C<len> octets (including terminating C<NUL>). Returns a
913 pointer to the reallocated buffer. This is necessary before making
914 any direct modification of the buffer that would increase its length.
915 L</lex_stuff_pvn> provides a more convenient way to insert text into
918 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
919 this function updates all of the lexer's variables that point directly
926 Perl_lex_grow_linestr(pTHX_ STRLEN len)
930 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
931 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
934 linestr = PL_parser->linestr;
935 buf = SvPVX(linestr);
936 if (len <= SvLEN(linestr))
939 /* Is the lex_shared linestr SV the same as the current linestr SV?
940 * Only in this case does re_eval_start need adjusting, since it
941 * points within lex_shared->ls_linestr's buffer */
942 current = ( !PL_parser->lex_shared->ls_linestr
943 || linestr == PL_parser->lex_shared->ls_linestr);
945 bufend_pos = PL_parser->bufend - buf;
946 bufptr_pos = PL_parser->bufptr - buf;
947 oldbufptr_pos = PL_parser->oldbufptr - buf;
948 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
949 linestart_pos = PL_parser->linestart - buf;
950 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
951 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
952 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
953 PL_parser->lex_shared->re_eval_start - buf : 0;
955 buf = sv_grow(linestr, len);
957 PL_parser->bufend = buf + bufend_pos;
958 PL_parser->bufptr = buf + bufptr_pos;
959 PL_parser->oldbufptr = buf + oldbufptr_pos;
960 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
961 PL_parser->linestart = buf + linestart_pos;
962 if (PL_parser->last_uni)
963 PL_parser->last_uni = buf + last_uni_pos;
964 if (PL_parser->last_lop)
965 PL_parser->last_lop = buf + last_lop_pos;
966 if (current && PL_parser->lex_shared->re_eval_start)
967 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
972 =for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
974 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
975 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
976 reallocating the buffer if necessary. This means that lexing code that
977 runs later will see the characters as if they had appeared in the input.
978 It is not recommended to do this as part of normal parsing, and most
979 uses of this facility run the risk of the inserted characters being
980 interpreted in an unintended manner.
982 The string to be inserted is represented by C<len> octets starting
983 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
984 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
985 The characters are recoded for the lexer buffer, according to how the
986 buffer is currently being interpreted (L</lex_bufutf8>). If a string
987 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
988 function is more convenient.
994 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
998 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
999 if (flags & ~(LEX_STUFF_UTF8))
1000 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1002 if (flags & LEX_STUFF_UTF8) {
1005 STRLEN highhalf = 0; /* Count of variants */
1006 const char *p, *e = pv+len;
1007 for (p = pv; p != e; p++) {
1008 if (! UTF8_IS_INVARIANT(*p)) {
1014 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1015 bufptr = PL_parser->bufptr;
1016 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1017 SvCUR_set(PL_parser->linestr,
1018 SvCUR(PL_parser->linestr) + len+highhalf);
1019 PL_parser->bufend += len+highhalf;
1020 for (p = pv; p != e; p++) {
1022 if (! UTF8_IS_INVARIANT(c)) {
1023 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1024 *bufptr++ = UTF8_TWO_BYTE_LO(c);
1026 *bufptr++ = (char)c;
1031 if (flags & LEX_STUFF_UTF8) {
1032 STRLEN highhalf = 0;
1033 const char *p, *e = pv+len;
1034 for (p = pv; p != e; p++) {
1036 if (UTF8_IS_ABOVE_LATIN1(c)) {
1037 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1038 "non-Latin-1 character into Latin-1 input");
1039 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1042 } else if (! UTF8_IS_INVARIANT(c)) {
1043 _force_out_malformed_utf8_message((U8 *) p, (U8 *) e,
1045 1 /* 1 means die */ );
1046 NOT_REACHED; /* NOTREACHED */
1051 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1052 bufptr = PL_parser->bufptr;
1053 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1054 SvCUR_set(PL_parser->linestr,
1055 SvCUR(PL_parser->linestr) + len-highhalf);
1056 PL_parser->bufend += len-highhalf;
1059 if (UTF8_IS_INVARIANT(*p)) {
1065 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1071 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1072 bufptr = PL_parser->bufptr;
1073 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1074 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1075 PL_parser->bufend += len;
1076 Copy(pv, bufptr, len, char);
1082 =for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1084 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1085 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1086 reallocating the buffer if necessary. This means that lexing code that
1087 runs later will see the characters as if they had appeared in the input.
1088 It is not recommended to do this as part of normal parsing, and most
1089 uses of this facility run the risk of the inserted characters being
1090 interpreted in an unintended manner.
1092 The string to be inserted is represented by octets starting at C<pv>
1093 and continuing to the first nul. These octets are interpreted as either
1094 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1095 in C<flags>. The characters are recoded for the lexer buffer, according
1096 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1097 If it is not convenient to nul-terminate a string to be inserted, the
1098 L</lex_stuff_pvn> function is more appropriate.
1104 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1106 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1107 lex_stuff_pvn(pv, strlen(pv), flags);
1111 =for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1113 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1114 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1115 reallocating the buffer if necessary. This means that lexing code that
1116 runs later will see the characters as if they had appeared in the input.
1117 It is not recommended to do this as part of normal parsing, and most
1118 uses of this facility run the risk of the inserted characters being
1119 interpreted in an unintended manner.
1121 The string to be inserted is the string value of C<sv>. The characters
1122 are recoded for the lexer buffer, according to how the buffer is currently
1123 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1124 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1125 need to construct a scalar.
1131 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1135 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1137 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1139 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1143 =for apidoc Amx|void|lex_unstuff|char *ptr
1145 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1146 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1147 This hides the discarded text from any lexing code that runs later,
1148 as if the text had never appeared.
1150 This is not the normal way to consume lexed text. For that, use
1157 Perl_lex_unstuff(pTHX_ char *ptr)
1161 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1162 buf = PL_parser->bufptr;
1164 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1167 bufend = PL_parser->bufend;
1169 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1170 unstuff_len = ptr - buf;
1171 Move(ptr, buf, bufend+1-ptr, char);
1172 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1173 PL_parser->bufend = bufend - unstuff_len;
1177 =for apidoc Amx|void|lex_read_to|char *ptr
1179 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1180 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1181 performing the correct bookkeeping whenever a newline character is passed.
1182 This is the normal way to consume lexed text.
1184 Interpretation of the buffer's octets can be abstracted out by
1185 using the slightly higher-level functions L</lex_peek_unichar> and
1186 L</lex_read_unichar>.
1192 Perl_lex_read_to(pTHX_ char *ptr)
1195 PERL_ARGS_ASSERT_LEX_READ_TO;
1196 s = PL_parser->bufptr;
1197 if (ptr < s || ptr > PL_parser->bufend)
1198 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1199 for (; s != ptr; s++)
1201 COPLINE_INC_WITH_HERELINES;
1202 PL_parser->linestart = s+1;
1204 PL_parser->bufptr = ptr;
1208 =for apidoc Amx|void|lex_discard_to|char *ptr
1210 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1211 up to C<ptr>. The remaining content of the buffer will be moved, and
1212 all pointers into the buffer updated appropriately. C<ptr> must not
1213 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1214 it is not permitted to discard text that has yet to be lexed.
1216 Normally it is not necessarily to do this directly, because it suffices to
1217 use the implicit discarding behaviour of L</lex_next_chunk> and things
1218 based on it. However, if a token stretches across multiple lines,
1219 and the lexing code has kept multiple lines of text in the buffer for
1220 that purpose, then after completion of the token it would be wise to
1221 explicitly discard the now-unneeded earlier lines, to avoid future
1222 multi-line tokens growing the buffer without bound.
1228 Perl_lex_discard_to(pTHX_ char *ptr)
1232 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1233 buf = SvPVX(PL_parser->linestr);
1235 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1238 if (ptr > PL_parser->bufptr)
1239 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1240 discard_len = ptr - buf;
1241 if (PL_parser->oldbufptr < ptr)
1242 PL_parser->oldbufptr = ptr;
1243 if (PL_parser->oldoldbufptr < ptr)
1244 PL_parser->oldoldbufptr = ptr;
1245 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1246 PL_parser->last_uni = NULL;
1247 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1248 PL_parser->last_lop = NULL;
1249 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1250 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1251 PL_parser->bufend -= discard_len;
1252 PL_parser->bufptr -= discard_len;
1253 PL_parser->oldbufptr -= discard_len;
1254 PL_parser->oldoldbufptr -= discard_len;
1255 if (PL_parser->last_uni)
1256 PL_parser->last_uni -= discard_len;
1257 if (PL_parser->last_lop)
1258 PL_parser->last_lop -= discard_len;
1262 =for apidoc Amx|bool|lex_next_chunk|U32 flags
1264 Reads in the next chunk of text to be lexed, appending it to
1265 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1266 looked to the end of the current chunk and wants to know more. It is
1267 usual, but not necessary, for lexing to have consumed the entirety of
1268 the current chunk at this time.
1270 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1271 chunk (i.e., the current chunk has been entirely consumed), normally the
1272 current chunk will be discarded at the same time that the new chunk is
1273 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1274 will not be discarded. If the current chunk has not been entirely
1275 consumed, then it will not be discarded regardless of the flag.
1277 Returns true if some new text was added to the buffer, or false if the
1278 buffer has reached the end of the input text.
1283 #define LEX_FAKE_EOF 0x80000000
1284 #define LEX_NO_TERM 0x40000000 /* here-doc */
1287 Perl_lex_next_chunk(pTHX_ U32 flags)
1291 STRLEN old_bufend_pos, new_bufend_pos;
1292 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1293 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1294 bool got_some_for_debugger = 0;
1296 const U8* first_bad_char_loc;
1298 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1299 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1300 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1302 linestr = PL_parser->linestr;
1303 buf = SvPVX(linestr);
1304 if (!(flags & LEX_KEEP_PREVIOUS)
1305 && PL_parser->bufptr == PL_parser->bufend)
1307 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1309 if (PL_parser->last_uni != PL_parser->bufend)
1310 PL_parser->last_uni = NULL;
1311 if (PL_parser->last_lop != PL_parser->bufend)
1312 PL_parser->last_lop = NULL;
1313 last_uni_pos = last_lop_pos = 0;
1317 old_bufend_pos = PL_parser->bufend - buf;
1318 bufptr_pos = PL_parser->bufptr - buf;
1319 oldbufptr_pos = PL_parser->oldbufptr - buf;
1320 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1321 linestart_pos = PL_parser->linestart - buf;
1322 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1323 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1325 if (flags & LEX_FAKE_EOF) {
1327 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1329 } else if (filter_gets(linestr, old_bufend_pos)) {
1331 got_some_for_debugger = 1;
1332 } else if (flags & LEX_NO_TERM) {
1335 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1338 /* End of real input. Close filehandle (unless it was STDIN),
1339 * then add implicit termination.
1341 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1342 PerlIO_clearerr(PL_parser->rsfp);
1343 else if (PL_parser->rsfp)
1344 (void)PerlIO_close(PL_parser->rsfp);
1345 PL_parser->rsfp = NULL;
1346 PL_parser->in_pod = PL_parser->filtered = 0;
1347 if (!PL_in_eval && PL_minus_p) {
1349 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1350 PL_minus_n = PL_minus_p = 0;
1351 } else if (!PL_in_eval && PL_minus_n) {
1352 sv_catpvs(linestr, /*{*/";}");
1355 sv_catpvs(linestr, ";");
1358 buf = SvPVX(linestr);
1359 new_bufend_pos = SvCUR(linestr);
1360 PL_parser->bufend = buf + new_bufend_pos;
1361 PL_parser->bufptr = buf + bufptr_pos;
1363 if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr,
1364 PL_parser->bufend - PL_parser->bufptr,
1365 &first_bad_char_loc))
1367 _force_out_malformed_utf8_message(first_bad_char_loc,
1368 (U8 *) PL_parser->bufend,
1370 1 /* 1 means die */ );
1371 NOT_REACHED; /* NOTREACHED */
1374 PL_parser->oldbufptr = buf + oldbufptr_pos;
1375 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1376 PL_parser->linestart = buf + linestart_pos;
1377 if (PL_parser->last_uni)
1378 PL_parser->last_uni = buf + last_uni_pos;
1379 if (PL_parser->last_lop)
1380 PL_parser->last_lop = buf + last_lop_pos;
1381 if (PL_parser->preambling != NOLINE) {
1382 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1383 PL_parser->preambling = NOLINE;
1385 if ( got_some_for_debugger
1386 && PERLDB_LINE_OR_SAVESRC
1387 && PL_curstash != PL_debstash)
1389 /* debugger active and we're not compiling the debugger code,
1390 * so store the line into the debugger's array of lines
1392 update_debugger_info(NULL, buf+old_bufend_pos,
1393 new_bufend_pos-old_bufend_pos);
1399 =for apidoc Amx|I32|lex_peek_unichar|U32 flags
1401 Looks ahead one (Unicode) character in the text currently being lexed.
1402 Returns the codepoint (unsigned integer value) of the next character,
1403 or -1 if lexing has reached the end of the input text. To consume the
1404 peeked character, use L</lex_read_unichar>.
1406 If the next character is in (or extends into) the next chunk of input
1407 text, the next chunk will be read in. Normally the current chunk will be
1408 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1409 bit set, then the current chunk will not be discarded.
1411 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1412 is encountered, an exception is generated.
1418 Perl_lex_peek_unichar(pTHX_ U32 flags)
1422 if (flags & ~(LEX_KEEP_PREVIOUS))
1423 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1424 s = PL_parser->bufptr;
1425 bufend = PL_parser->bufend;
1431 if (!lex_next_chunk(flags))
1433 s = PL_parser->bufptr;
1434 bufend = PL_parser->bufend;
1437 if (UTF8_IS_INVARIANT(head))
1439 if (UTF8_IS_START(head)) {
1440 len = UTF8SKIP(&head);
1441 while ((STRLEN)(bufend-s) < len) {
1442 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1444 s = PL_parser->bufptr;
1445 bufend = PL_parser->bufend;
1448 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1449 if (retlen == (STRLEN)-1) {
1450 _force_out_malformed_utf8_message((U8 *) s,
1453 1 /* 1 means die */ );
1454 NOT_REACHED; /* NOTREACHED */
1459 if (!lex_next_chunk(flags))
1461 s = PL_parser->bufptr;
1468 =for apidoc Amx|I32|lex_read_unichar|U32 flags
1470 Reads the next (Unicode) character in the text currently being lexed.
1471 Returns the codepoint (unsigned integer value) of the character read,
1472 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1473 if lexing has reached the end of the input text. To non-destructively
1474 examine the next character, use L</lex_peek_unichar> instead.
1476 If the next character is in (or extends into) the next chunk of input
1477 text, the next chunk will be read in. Normally the current chunk will be
1478 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1479 bit set, then the current chunk will not be discarded.
1481 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1482 is encountered, an exception is generated.
1488 Perl_lex_read_unichar(pTHX_ U32 flags)
1491 if (flags & ~(LEX_KEEP_PREVIOUS))
1492 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1493 c = lex_peek_unichar(flags);
1496 COPLINE_INC_WITH_HERELINES;
1498 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1500 ++(PL_parser->bufptr);
1506 =for apidoc Amx|void|lex_read_space|U32 flags
1508 Reads optional spaces, in Perl style, in the text currently being
1509 lexed. The spaces may include ordinary whitespace characters and
1510 Perl-style comments. C<#line> directives are processed if encountered.
1511 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1512 at a non-space character (or the end of the input text).
1514 If spaces extend into the next chunk of input text, the next chunk will
1515 be read in. Normally the current chunk will be discarded at the same
1516 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1517 chunk will not be discarded.
1522 #define LEX_NO_INCLINE 0x40000000
1523 #define LEX_NO_NEXT_CHUNK 0x80000000
1526 Perl_lex_read_space(pTHX_ U32 flags)
1529 const bool can_incline = !(flags & LEX_NO_INCLINE);
1530 bool need_incline = 0;
1531 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1532 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1533 s = PL_parser->bufptr;
1534 bufend = PL_parser->bufend;
1540 } while (!(c == '\n' || (c == 0 && s == bufend)));
1541 } else if (c == '\n') {
1544 PL_parser->linestart = s;
1550 } else if (isSPACE(c)) {
1552 } else if (c == 0 && s == bufend) {
1555 if (flags & LEX_NO_NEXT_CHUNK)
1557 PL_parser->bufptr = s;
1558 l = CopLINE(PL_curcop);
1559 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1560 got_more = lex_next_chunk(flags);
1561 CopLINE_set(PL_curcop, l);
1562 s = PL_parser->bufptr;
1563 bufend = PL_parser->bufend;
1566 if (can_incline && need_incline && PL_parser->rsfp) {
1576 PL_parser->bufptr = s;
1581 =for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1583 This function performs syntax checking on a prototype, C<proto>.
1584 If C<warn> is true, any illegal characters or mismatched brackets
1585 will trigger illegalproto warnings, declaring that they were
1586 detected in the prototype for C<name>.
1588 The return value is C<true> if this is a valid prototype, and
1589 C<false> if it is not, regardless of whether C<warn> was C<true> or
1592 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1599 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1601 STRLEN len, origlen;
1603 bool bad_proto = FALSE;
1604 bool in_brackets = FALSE;
1605 bool after_slash = FALSE;
1606 char greedy_proto = ' ';
1607 bool proto_after_greedy_proto = FALSE;
1608 bool must_be_last = FALSE;
1609 bool underscore = FALSE;
1610 bool bad_proto_after_underscore = FALSE;
1612 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1617 p = SvPV(proto, len);
1619 for (; len--; p++) {
1622 proto_after_greedy_proto = TRUE;
1624 if (!strchr(";@%", *p))
1625 bad_proto_after_underscore = TRUE;
1628 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1635 in_brackets = FALSE;
1636 else if ((*p == '@' || *p == '%')
1640 must_be_last = TRUE;
1649 after_slash = FALSE;
1654 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1657 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1658 origlen, UNI_DISPLAY_ISPRINT)
1659 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1661 if (proto_after_greedy_proto)
1662 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1663 "Prototype after '%c' for %" SVf " : %s",
1664 greedy_proto, SVfARG(name), p);
1666 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1667 "Missing ']' in prototype for %" SVf " : %s",
1670 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1671 "Illegal character in prototype for %" SVf " : %s",
1673 if (bad_proto_after_underscore)
1674 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1675 "Illegal character after '_' in prototype for %" SVf " : %s",
1679 return (! (proto_after_greedy_proto || bad_proto) );
1684 * This subroutine has nothing to do with tilting, whether at windmills
1685 * or pinball tables. Its name is short for "increment line". It
1686 * increments the current line number in CopLINE(PL_curcop) and checks
1687 * to see whether the line starts with a comment of the form
1688 * # line 500 "foo.pm"
1689 * If so, it sets the current line number and file to the values in the comment.
1693 S_incline(pTHX_ const char *s)
1701 PERL_ARGS_ASSERT_INCLINE;
1703 COPLINE_INC_WITH_HERELINES;
1704 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1705 && s+1 == PL_bufend && *s == ';') {
1706 /* fake newline in string eval */
1707 CopLINE_dec(PL_curcop);
1712 while (SPACE_OR_TAB(*s))
1714 if (strEQs(s, "line"))
1718 if (SPACE_OR_TAB(*s))
1722 while (SPACE_OR_TAB(*s))
1730 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1732 while (SPACE_OR_TAB(*s))
1734 if (*s == '"' && (t = strchr(s+1, '"'))) {
1740 while (*t && !isSPACE(*t))
1744 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1746 if (*e != '\n' && *e != '\0')
1747 return; /* false alarm */
1749 if (!grok_atoUV(n, &uv, &e))
1751 line_num = ((line_t)uv) - 1;
1754 const STRLEN len = t - s;
1756 if (!PL_rsfp && !PL_parser->filtered) {
1757 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1758 * to *{"::_<newfilename"} */
1759 /* However, the long form of evals is only turned on by the
1760 debugger - usually they're "(eval %lu)" */
1761 GV * const cfgv = CopFILEGV(PL_curcop);
1764 STRLEN tmplen2 = len;
1768 if (tmplen2 + 2 <= sizeof smallbuf)
1771 Newx(tmpbuf2, tmplen2 + 2, char);
1776 memcpy(tmpbuf2 + 2, s, tmplen2);
1779 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1781 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1782 /* adjust ${"::_<newfilename"} to store the new file name */
1783 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1784 /* The line number may differ. If that is the case,
1785 alias the saved lines that are in the array.
1786 Otherwise alias the whole array. */
1787 if (CopLINE(PL_curcop) == line_num) {
1788 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1789 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1791 else if (GvAV(cfgv)) {
1792 AV * const av = GvAV(cfgv);
1793 const I32 start = CopLINE(PL_curcop)+1;
1794 I32 items = AvFILLp(av) - start;
1796 AV * const av2 = GvAVn(gv2);
1797 SV **svp = AvARRAY(av) + start;
1798 I32 l = (I32)line_num+1;
1800 av_store(av2, l++, SvREFCNT_inc(*svp++));
1805 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1808 CopFILE_free(PL_curcop);
1809 CopFILE_setn(PL_curcop, s, len);
1811 CopLINE_set(PL_curcop, line_num);
1815 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1817 AV *av = CopFILEAVx(PL_curcop);
1820 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1822 sv = *av_fetch(av, 0, 1);
1823 SvUPGRADE(sv, SVt_PVMG);
1825 if (!SvPOK(sv)) SvPVCLEAR(sv);
1827 sv_catsv(sv, orig_sv);
1829 sv_catpvn(sv, buf, len);
1834 if (PL_parser->preambling == NOLINE)
1835 av_store(av, CopLINE(PL_curcop), sv);
1841 * Called to gobble the appropriate amount and type of whitespace.
1842 * Skips comments as well.
1843 * Returns the next character after the whitespace that is skipped.
1846 * Same thing, but look ahead without incrementing line numbers or
1847 * adjusting PL_linestart.
1850 #define skipspace(s) skipspace_flags(s, 0)
1851 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1854 S_skipspace_flags(pTHX_ char *s, U32 flags)
1856 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1857 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1858 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1861 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1863 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1864 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1865 LEX_NO_NEXT_CHUNK : 0));
1867 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1868 if (PL_linestart > PL_bufptr)
1869 PL_bufptr = PL_linestart;
1877 * Check the unary operators to ensure there's no ambiguity in how they're
1878 * used. An ambiguous piece of code would be:
1880 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1881 * the +5 is its argument.
1890 if (PL_oldoldbufptr != PL_last_uni)
1892 while (isSPACE(*PL_last_uni))
1895 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
1896 s += UTF ? UTF8SKIP(s) : 1;
1897 if ((t = strchr(s, '(')) && t < PL_bufptr)
1900 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1901 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
1902 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1906 * LOP : macro to build a list operator. Its behaviour has been replaced
1907 * with a subroutine, S_lop() for which LOP is just another name.
1910 #define LOP(f,x) return lop(f,x,s)
1914 * Build a list operator (or something that might be one). The rules:
1915 * - if we have a next token, then it's a list operator (no parens) for
1916 * which the next token has already been parsed; e.g.,
1919 * - if the next thing is an opening paren, then it's a function
1920 * - else it's a list operator
1924 S_lop(pTHX_ I32 f, U8 x, char *s)
1926 PERL_ARGS_ASSERT_LOP;
1931 PL_last_lop = PL_oldbufptr;
1932 PL_last_lop_op = (OPCODE)f;
1937 return REPORT(FUNC);
1940 return REPORT(FUNC);
1943 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1944 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
1945 return REPORT(LSTOP);
1951 * When the lexer realizes it knows the next token (for instance,
1952 * it is reordering tokens for the parser) then it can call S_force_next
1953 * to know what token to return the next time the lexer is called. Caller
1954 * will need to set PL_nextval[] and possibly PL_expect to ensure
1955 * the lexer handles the token correctly.
1959 S_force_next(pTHX_ I32 type)
1963 PerlIO_printf(Perl_debug_log, "### forced token:\n");
1964 tokereport(type, &NEXTVAL_NEXTTOKE);
1967 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
1968 PL_nexttype[PL_nexttoke] = type;
1975 * This subroutine handles postfix deref syntax after the arrow has already
1976 * been emitted. @* $* etc. are emitted as two separate token right here.
1977 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1978 * only the first, leaving yylex to find the next.
1982 S_postderef(pTHX_ int const funny, char const next)
1984 assert(funny == DOLSHARP || strchr("$@%&*", funny));
1986 PL_expect = XOPERATOR;
1987 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
1988 assert('@' == funny || '$' == funny || DOLSHARP == funny);
1989 PL_lex_state = LEX_INTERPEND;
1991 force_next(POSTJOIN);
1997 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1998 && !PL_lex_brackets)
2000 PL_expect = XOPERATOR;
2009 int yyc = PL_parser->yychar;
2010 if (yyc != YYEMPTY) {
2012 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2013 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
2014 PL_lex_allbrackets--;
2016 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2017 } else if (yyc == '('/*)*/) {
2018 PL_lex_allbrackets--;
2023 PL_parser->yychar = YYEMPTY;
2028 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2030 SV * const sv = newSVpvn_utf8(start, len,
2033 && !is_utf8_invariant_string((const U8*)start, len)
2034 && is_utf8_string((const U8*)start, len));
2040 * When the lexer knows the next thing is a word (for instance, it has
2041 * just seen -> and it knows that the next char is a word char, then
2042 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2046 * char *start : buffer position (must be within PL_linestr)
2047 * int token : PL_next* will be this type of bare word
2048 * (e.g., METHOD,BAREWORD)
2049 * int check_keyword : if true, Perl checks to make sure the word isn't
2050 * a keyword (do this if the word is a label, e.g. goto FOO)
2051 * int allow_pack : if true, : characters will also be allowed (require,
2052 * use, etc. do this)
2056 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2061 PERL_ARGS_ASSERT_FORCE_WORD;
2063 start = skipspace(start);
2065 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2066 || (allow_pack && *s == ':' && s[1] == ':') )
2068 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2069 if (check_keyword) {
2070 char *s2 = PL_tokenbuf;
2072 if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
2074 if (keyword(s2, len2, 0))
2077 if (token == METHOD) {
2082 PL_expect = XOPERATOR;
2085 NEXTVAL_NEXTTOKE.opval
2086 = newSVOP(OP_CONST,0,
2087 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2088 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2096 * Called when the lexer wants $foo *foo &foo etc, but the program
2097 * text only contains the "foo" portion. The first argument is a pointer
2098 * to the "foo", and the second argument is the type symbol to prefix.
2099 * Forces the next token to be a "BAREWORD".
2100 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2104 S_force_ident(pTHX_ const char *s, int kind)
2106 PERL_ARGS_ASSERT_FORCE_IDENT;
2109 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2110 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2111 UTF ? SVf_UTF8 : 0));
2112 NEXTVAL_NEXTTOKE.opval = o;
2113 force_next(BAREWORD);
2115 o->op_private = OPpCONST_ENTERED;
2116 /* XXX see note in pp_entereval() for why we forgo typo
2117 warnings if the symbol must be introduced in an eval.
2119 gv_fetchpvn_flags(s, len,
2120 (PL_in_eval ? GV_ADDMULTI
2121 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2122 kind == '$' ? SVt_PV :
2123 kind == '@' ? SVt_PVAV :
2124 kind == '%' ? SVt_PVHV :
2132 S_force_ident_maybe_lex(pTHX_ char pit)
2134 NEXTVAL_NEXTTOKE.ival = pit;
2139 Perl_str_to_version(pTHX_ SV *sv)
2144 const char *start = SvPV_const(sv,len);
2145 const char * const end = start + len;
2146 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
2148 PERL_ARGS_ASSERT_STR_TO_VERSION;
2150 while (start < end) {
2154 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2159 retval += ((NV)n)/nshift;
2168 * Forces the next token to be a version number.
2169 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2170 * and if "guessing" is TRUE, then no new token is created (and the caller
2171 * must use an alternative parsing method).
2175 S_force_version(pTHX_ char *s, int guessing)
2180 PERL_ARGS_ASSERT_FORCE_VERSION;
2188 while (isDIGIT(*d) || *d == '_' || *d == '.')
2190 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2192 s = scan_num(s, &pl_yylval);
2193 version = pl_yylval.opval;
2194 ver = cSVOPx(version)->op_sv;
2195 if (SvPOK(ver) && !SvNIOK(ver)) {
2196 SvUPGRADE(ver, SVt_PVNV);
2197 SvNV_set(ver, str_to_version(ver));
2198 SvNOK_on(ver); /* hint that it is a version */
2201 else if (guessing) {
2206 /* NOTE: The parser sees the package name and the VERSION swapped */
2207 NEXTVAL_NEXTTOKE.opval = version;
2208 force_next(BAREWORD);
2214 * S_force_strict_version
2215 * Forces the next token to be a version number using strict syntax rules.
2219 S_force_strict_version(pTHX_ char *s)
2222 const char *errstr = NULL;
2224 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2226 while (isSPACE(*s)) /* leading whitespace */
2229 if (is_STRICT_VERSION(s,&errstr)) {
2231 s = (char *)scan_version(s, ver, 0);
2232 version = newSVOP(OP_CONST, 0, ver);
2234 else if ((*s != ';' && *s != '{' && *s != '}' )
2235 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2239 yyerror(errstr); /* version required */
2243 /* NOTE: The parser sees the package name and the VERSION swapped */
2244 NEXTVAL_NEXTTOKE.opval = version;
2245 force_next(BAREWORD);
2252 * Tokenize a quoted string passed in as an SV. It finds the next
2253 * chunk, up to end of string or a backslash. It may make a new
2254 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2259 S_tokeq(pTHX_ SV *sv)
2266 PERL_ARGS_ASSERT_TOKEQ;
2270 assert (!SvIsCOW(sv));
2271 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2275 /* This is relying on the SV being "well formed" with a trailing '\0' */
2276 while (s < send && !(*s == '\\' && s[1] == '\\'))
2281 if ( PL_hints & HINT_NEW_STRING ) {
2282 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2283 SVs_TEMP | SvUTF8(sv));
2287 if (s + 1 < send && (s[1] == '\\'))
2288 s++; /* all that, just for this */
2293 SvCUR_set(sv, d - SvPVX_const(sv));
2295 if ( PL_hints & HINT_NEW_STRING )
2296 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
2301 * Now come three functions related to double-quote context,
2302 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2303 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2304 * interact with PL_lex_state, and create fake ( ... ) argument lists
2305 * to handle functions and concatenation.
2309 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2314 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2316 * Pattern matching will set PL_lex_op to the pattern-matching op to
2317 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2319 * OP_CONST is easy--just make the new op and return.
2321 * Everything else becomes a FUNC.
2323 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2324 * had an OP_CONST. This just sets us up for a
2325 * call to S_sublex_push().
2329 S_sublex_start(pTHX)
2331 const I32 op_type = pl_yylval.ival;
2333 if (op_type == OP_NULL) {
2334 pl_yylval.opval = PL_lex_op;
2338 if (op_type == OP_CONST) {
2339 SV *sv = PL_lex_stuff;
2340 PL_lex_stuff = NULL;
2343 if (SvTYPE(sv) == SVt_PVIV) {
2344 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2346 const char * const p = SvPV_const(sv, len);
2347 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2351 pl_yylval.opval = newSVOP(op_type, 0, sv);
2355 PL_parser->lex_super_state = PL_lex_state;
2356 PL_parser->lex_sub_inwhat = (U16)op_type;
2357 PL_parser->lex_sub_op = PL_lex_op;
2358 PL_lex_state = LEX_INTERPPUSH;
2362 pl_yylval.opval = PL_lex_op;
2372 * Create a new scope to save the lexing state. The scope will be
2373 * ended in S_sublex_done. Returns a '(', starting the function arguments
2374 * to the uc, lc, etc. found before.
2375 * Sets PL_lex_state to LEX_INTERPCONCAT.
2382 const bool is_heredoc = PL_multi_close == '<';
2385 PL_lex_state = PL_parser->lex_super_state;
2386 SAVEI8(PL_lex_dojoin);
2387 SAVEI32(PL_lex_brackets);
2388 SAVEI32(PL_lex_allbrackets);
2389 SAVEI32(PL_lex_formbrack);
2390 SAVEI8(PL_lex_fakeeof);
2391 SAVEI32(PL_lex_casemods);
2392 SAVEI32(PL_lex_starts);
2393 SAVEI8(PL_lex_state);
2394 SAVESPTR(PL_lex_repl);
2395 SAVEVPTR(PL_lex_inpat);
2396 SAVEI16(PL_lex_inwhat);
2399 SAVECOPLINE(PL_curcop);
2400 SAVEI32(PL_multi_end);
2401 SAVEI32(PL_parser->herelines);
2402 PL_parser->herelines = 0;
2404 SAVEIV(PL_multi_close);
2405 SAVEPPTR(PL_bufptr);
2406 SAVEPPTR(PL_bufend);
2407 SAVEPPTR(PL_oldbufptr);
2408 SAVEPPTR(PL_oldoldbufptr);
2409 SAVEPPTR(PL_last_lop);
2410 SAVEPPTR(PL_last_uni);
2411 SAVEPPTR(PL_linestart);
2412 SAVESPTR(PL_linestr);
2413 SAVEGENERICPV(PL_lex_brackstack);
2414 SAVEGENERICPV(PL_lex_casestack);
2415 SAVEGENERICPV(PL_parser->lex_shared);
2416 SAVEBOOL(PL_parser->lex_re_reparsing);
2417 SAVEI32(PL_copline);
2419 /* The here-doc parser needs to be able to peek into outer lexing
2420 scopes to find the body of the here-doc. So we put PL_linestr and
2421 PL_bufptr into lex_shared, to ‘share’ those values.
2423 PL_parser->lex_shared->ls_linestr = PL_linestr;
2424 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2426 PL_linestr = PL_lex_stuff;
2427 PL_lex_repl = PL_parser->lex_sub_repl;
2428 PL_lex_stuff = NULL;
2429 PL_parser->lex_sub_repl = NULL;
2431 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2432 set for an inner quote-like operator and then an error causes scope-
2433 popping. We must not have a PL_lex_stuff value left dangling, as
2434 that breaks assumptions elsewhere. See bug #123617. */
2435 SAVEGENERICSV(PL_lex_stuff);
2436 SAVEGENERICSV(PL_parser->lex_sub_repl);
2438 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2439 = SvPVX(PL_linestr);
2440 PL_bufend += SvCUR(PL_linestr);
2441 PL_last_lop = PL_last_uni = NULL;
2442 SAVEFREESV(PL_linestr);
2443 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2445 PL_lex_dojoin = FALSE;
2446 PL_lex_brackets = PL_lex_formbrack = 0;
2447 PL_lex_allbrackets = 0;
2448 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2449 Newx(PL_lex_brackstack, 120, char);
2450 Newx(PL_lex_casestack, 12, char);
2451 PL_lex_casemods = 0;
2452 *PL_lex_casestack = '\0';
2454 PL_lex_state = LEX_INTERPCONCAT;
2456 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2457 PL_copline = NOLINE;
2459 Newxz(shared, 1, LEXSHARED);
2460 shared->ls_prev = PL_parser->lex_shared;
2461 PL_parser->lex_shared = shared;
2463 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2464 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2465 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2466 PL_lex_inpat = PL_parser->lex_sub_op;
2468 PL_lex_inpat = NULL;
2470 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2471 PL_in_eval &= ~EVAL_RE_REPARSING;
2478 * Restores lexer state after a S_sublex_push.
2484 if (!PL_lex_starts++) {
2485 SV * const sv = newSVpvs("");
2486 if (SvUTF8(PL_linestr))
2488 PL_expect = XOPERATOR;
2489 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2493 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2494 PL_lex_state = LEX_INTERPCASEMOD;
2498 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2499 assert(PL_lex_inwhat != OP_TRANSR);
2501 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2502 PL_linestr = PL_lex_repl;
2504 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2505 PL_bufend += SvCUR(PL_linestr);
2506 PL_last_lop = PL_last_uni = NULL;
2507 PL_lex_dojoin = FALSE;
2508 PL_lex_brackets = 0;
2509 PL_lex_allbrackets = 0;
2510 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2511 PL_lex_casemods = 0;
2512 *PL_lex_casestack = '\0';
2514 if (SvEVALED(PL_lex_repl)) {
2515 PL_lex_state = LEX_INTERPNORMAL;
2517 /* we don't clear PL_lex_repl here, so that we can check later
2518 whether this is an evalled subst; that means we rely on the
2519 logic to ensure sublex_done() is called again only via the
2520 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2523 PL_lex_state = LEX_INTERPCONCAT;
2526 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2527 CopLINE(PL_curcop) +=
2528 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2529 + PL_parser->herelines;
2530 PL_parser->herelines = 0;
2535 const line_t l = CopLINE(PL_curcop);
2537 if (PL_multi_close == '<')
2538 PL_parser->herelines += l - PL_multi_end;
2539 PL_bufend = SvPVX(PL_linestr);
2540 PL_bufend += SvCUR(PL_linestr);
2541 PL_expect = XOPERATOR;
2547 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2549 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2550 * interior, hence to the "}". Finds what the name resolves to, returning
2551 * an SV* containing it; NULL if no valid one found */
2553 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
2560 const U8* first_bad_char_loc;
2561 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2563 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2566 deprecate_fatal_in("5.28", "Unknown charname '' is deprecated");
2570 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2572 &first_bad_char_loc))
2574 _force_out_malformed_utf8_message(first_bad_char_loc,
2575 (U8 *) PL_parser->bufend,
2577 0 /* 0 means don't die */ );
2578 yyerror_pv(Perl_form(aTHX_
2579 "Malformed UTF-8 character immediately after '%.*s'",
2580 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
2585 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2586 /* include the <}> */
2587 e - backslash_ptr + 1);
2589 SvREFCNT_dec_NN(res);
2593 /* See if the charnames handler is the Perl core's, and if so, we can skip
2594 * the validation needed for a user-supplied one, as Perl's does its own
2596 table = GvHV(PL_hintgv); /* ^H */
2597 cvp = hv_fetchs(table, "charnames", FALSE);
2598 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2599 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2601 const char * const name = HvNAME(stash);
2602 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2603 && strEQ(name, "_charnames")) {
2608 /* Here, it isn't Perl's charname handler. We can't rely on a
2609 * user-supplied handler to validate the input name. For non-ut8 input,
2610 * look to see that the first character is legal. Then loop through the
2611 * rest checking that each is a continuation */
2613 /* This code makes the reasonable assumption that the only Latin1-range
2614 * characters that begin a character name alias are alphabetic, otherwise
2615 * would have to create a isCHARNAME_BEGIN macro */
2618 if (! isALPHAU(*s)) {
2623 if (! isCHARNAME_CONT(*s)) {
2626 if (*s == ' ' && *(s-1) == ' ') {
2633 /* Similarly for utf8. For invariants can check directly; for other
2634 * Latin1, can calculate their code point and check; otherwise use a
2636 if (UTF8_IS_INVARIANT(*s)) {
2637 if (! isALPHAU(*s)) {
2641 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2642 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2648 if (! PL_utf8_charname_begin) {
2649 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2650 PL_utf8_charname_begin = _core_swash_init("utf8",
2651 "_Perl_Charname_Begin",
2653 1, 0, NULL, &flags);
2655 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2662 if (UTF8_IS_INVARIANT(*s)) {
2663 if (! isCHARNAME_CONT(*s)) {
2666 if (*s == ' ' && *(s-1) == ' ') {
2671 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2672 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2679 if (! PL_utf8_charname_continue) {
2680 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2681 PL_utf8_charname_continue = _core_swash_init("utf8",
2682 "_Perl_Charname_Continue",
2684 1, 0, NULL, &flags);
2686 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2693 if (*(s-1) == ' ') {
2696 "charnames alias definitions may not contain trailing "
2697 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2698 (int)(s - backslash_ptr + 1), backslash_ptr,
2699 (int)(e - s + 1), s + 1
2701 UTF ? SVf_UTF8 : 0);
2705 if (SvUTF8(res)) { /* Don't accept malformed input */
2706 const U8* first_bad_char_loc;
2708 const char* const str = SvPV_const(res, len);
2709 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2710 _force_out_malformed_utf8_message(first_bad_char_loc,
2711 (U8 *) PL_parser->bufend,
2713 0 /* 0 means don't die */ );
2716 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2717 (int) (e - backslash_ptr + 1), backslash_ptr,
2718 (int) ((char *) first_bad_char_loc - str), str
2729 /* The final %.*s makes sure that should the trailing NUL be missing
2730 * that this print won't run off the end of the string */
2733 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2734 (int)(s - backslash_ptr + 1), backslash_ptr,
2735 (int)(e - s + 1), s + 1
2737 UTF ? SVf_UTF8 : 0);
2744 "charnames alias definitions may not contain a sequence of "
2745 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2746 (int)(s - backslash_ptr + 1), backslash_ptr,
2747 (int)(e - s + 1), s + 1
2749 UTF ? SVf_UTF8 : 0);
2756 Extracts the next constant part of a pattern, double-quoted string,
2757 or transliteration. This is terrifying code.
2759 For example, in parsing the double-quoted string "ab\x63$d", it would
2760 stop at the '$' and return an OP_CONST containing 'abc'.
2762 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2763 processing a pattern (PL_lex_inpat is true), a transliteration
2764 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2766 Returns a pointer to the character scanned up to. If this is
2767 advanced from the start pointer supplied (i.e. if anything was
2768 successfully parsed), will leave an OP_CONST for the substring scanned
2769 in pl_yylval. Caller must intuit reason for not parsing further
2770 by looking at the next characters herself.
2774 \N{FOO} => \N{U+hex_for_character_FOO}
2775 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2778 all other \-char, including \N and \N{ apart from \N{ABC}
2781 @ and $ where it appears to be a var, but not for $ as tail anchor
2785 In transliterations:
2786 characters are VERY literal, except for - not at the start or end
2787 of the string, which indicates a range. However some backslash sequences
2788 are recognized: \r, \n, and the like
2789 \007 \o{}, \x{}, \N{}
2790 If all elements in the transliteration are below 256,
2791 scan_const expands the range to the full set of intermediate
2792 characters. If the range is in utf8, the hyphen is replaced with
2793 a certain range mark which will be handled by pmtrans() in op.c.
2795 In double-quoted strings:
2797 all those recognized in transliterations
2798 deprecated backrefs: \1 (in substitution replacements)
2799 case and quoting: \U \Q \E
2802 scan_const does *not* construct ops to handle interpolated strings.
2803 It stops processing as soon as it finds an embedded $ or @ variable
2804 and leaves it to the caller to work out what's going on.
2806 embedded arrays (whether in pattern or not) could be:
2807 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2809 $ in double-quoted strings must be the symbol of an embedded scalar.
2811 $ in pattern could be $foo or could be tail anchor. Assumption:
2812 it's a tail anchor if $ is the last thing in the string, or if it's
2813 followed by one of "()| \r\n\t"
2815 \1 (backreferences) are turned into $1 in substitutions
2817 The structure of the code is
2818 while (there's a character to process) {
2819 handle transliteration ranges
2820 skip regexp comments /(?#comment)/ and codes /(?{code})/
2821 skip #-initiated comments in //x patterns
2822 check for embedded arrays
2823 check for embedded scalars
2825 deprecate \1 in substitution replacements
2826 handle string-changing backslashes \l \U \Q \E, etc.
2827 switch (what was escaped) {
2828 handle \- in a transliteration (becomes a literal -)
2829 if a pattern and not \N{, go treat as regular character
2830 handle \132 (octal characters)
2831 handle \x15 and \x{1234} (hex characters)
2832 handle \N{name} (named characters, also \N{3,5} in a pattern)
2833 handle \cV (control characters)
2834 handle printf-style backslashes (\f, \r, \n, etc)
2837 } (end if backslash)
2838 handle regular character
2839 } (end while character to read)
2844 S_scan_const(pTHX_ char *start)
2846 char *send = PL_bufend; /* end of the constant */
2847 SV *sv = newSV(send - start); /* sv for the constant. See note below
2849 char *s = start; /* start of the constant */
2850 char *d = SvPVX(sv); /* destination for copies */
2851 bool dorange = FALSE; /* are we in a translit range? */
2852 bool didrange = FALSE; /* did we just finish a range? */
2853 bool in_charclass = FALSE; /* within /[...]/ */
2854 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2855 bool has_above_latin1 = FALSE; /* does something require special
2856 handling in tr/// ? */
2857 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2858 UTF8? But, this can show as true
2859 when the source isn't utf8, as for
2860 example when it is entirely composed
2862 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2863 number of characters found so far
2864 that will expand (into 2 bytes)
2865 should we have to convert to
2867 SV *res; /* result from charnames */
2868 STRLEN offset_to_max; /* The offset in the output to where the range
2869 high-end character is temporarily placed */
2871 /* Note on sizing: The scanned constant is placed into sv, which is
2872 * initialized by newSV() assuming one byte of output for every byte of
2873 * input. This routine expects newSV() to allocate an extra byte for a
2874 * trailing NUL, which this routine will append if it gets to the end of
2875 * the input. There may be more bytes of input than output (eg., \N{LATIN
2876 * CAPITAL LETTER A}), or more output than input if the constant ends up
2877 * recoded to utf8, but each time a construct is found that might increase
2878 * the needed size, SvGROW() is called. Its size parameter each time is
2879 * based on the best guess estimate at the time, namely the length used so
2880 * far, plus the length the current construct will occupy, plus room for
2881 * the trailing NUL, plus one byte for every input byte still unscanned */
2883 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2886 int backslash_N = 0; /* ? was the character from \N{} */
2887 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2888 platform-specific like \x65 */
2891 PERL_ARGS_ASSERT_SCAN_CONST;
2893 assert(PL_lex_inwhat != OP_TRANSR);
2894 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2895 /* If we are doing a trans and we know we want UTF8 set expectation */
2896 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2897 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2900 /* Protect sv from errors and fatal warnings. */
2901 ENTER_with_name("scan_const");
2905 || dorange /* Handle tr/// range at right edge of input */
2908 /* get transliterations out of the way (they're most literal) */
2909 if (PL_lex_inwhat == OP_TRANS) {
2911 /* But there isn't any special handling necessary unless there is a
2912 * range, so for most cases we just drop down and handle the value
2913 * as any other. There are two exceptions.
2915 * 1. A hyphen indicates that we are actually going to have a
2916 * range. In this case, skip the '-', set a flag, then drop
2917 * down to handle what should be the end range value.
2918 * 2. After we've handled that value, the next time through, that
2919 * flag is set and we fix up the range.
2921 * Ranges entirely within Latin1 are expanded out entirely, in
2922 * order to make the transliteration a simple table look-up.
2923 * Ranges that extend above Latin1 have to be done differently, so
2924 * there is no advantage to expanding them here, so they are
2925 * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte
2926 * signifies a hyphen without any possible ambiguity. On EBCDIC
2927 * machines, if the range is expressed as Unicode, the Latin1
2928 * portion is expanded out even if the range extends above
2929 * Latin1. This is because each code point in it has to be
2930 * processed here individually to get its native translation */
2934 /* Here, we don't think we're in a range. If the new character
2935 * is not a hyphen; or if it is a hyphen, but it's too close to
2936 * either edge to indicate a range, then it's a regular
2938 if (*s != '-' || s >= send - 1 || s == start) {
2940 /* A regular character. Process like any other, but first
2941 * clear any flags */
2945 non_portable_endpoint = 0;
2948 /* The tests here for being above Latin1 and similar ones
2949 * in the following 'else' suffice to find all such
2950 * occurences in the constant, except those added by a
2951 * backslash escape sequence, like \x{100}. And all those
2952 * set 'has_above_latin1' as appropriate */
2953 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2954 has_above_latin1 = TRUE;
2957 /* Drops down to generic code to process current byte */
2959 else { /* Is a '-' in the context where it means a range */
2960 if (didrange) { /* Something like y/A-C-Z// */
2961 Perl_croak(aTHX_ "Ambiguous range in transliteration"
2967 s++; /* Skip past the hyphen */
2969 /* d now points to where the end-range character will be
2970 * placed. Save it so won't have to go finding it later,
2971 * and drop down to get that character. (Actually we
2972 * instead save the offset, to handle the case where a
2973 * realloc in the meantime could change the actual
2974 * pointer). We'll finish processing the range the next
2975 * time through the loop */
2976 offset_to_max = d - SvPVX_const(sv);
2978 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2979 has_above_latin1 = TRUE;
2982 /* Drops down to generic code to process current byte */
2984 } /* End of not a range */
2986 /* Here we have parsed a range. Now must handle it. At this
2988 * 'sv' is a SV* that contains the output string we are
2989 * constructing. The final two characters in that string
2990 * are the range start and range end, in order.
2991 * 'd' points to just beyond the range end in the 'sv' string,
2992 * where we would next place something
2993 * 'offset_to_max' is the offset in 'sv' at which the character
2994 * (the range's maximum end point) before 'd' begins.
2996 char * max_ptr = SvPVX(sv) + offset_to_max;
2997 const char * min_ptr;
2999 IV range_max; /* last character in range */
3003 bool convert_unicode;
3004 IV real_range_max = 0;
3007 /* Get the code point values of the range ends. */
3009 /* We know the utf8 is valid, because we just constructed
3010 * it ourselves in previous loop iterations */
3011 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3012 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3013 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3016 min_ptr = max_ptr - 1;
3017 range_min = * (U8*) min_ptr;
3018 range_max = * (U8*) max_ptr;
3021 /* If the range is just a single code point, like tr/a-a/.../,
3022 * that code point is already in the output, twice. We can
3023 * just back up over the second instance and avoid all the rest
3024 * of the work. But if it is a variant character, it's been
3025 * counted twice, so decrement */
3026 if (UNLIKELY(range_max == range_min)) {
3028 if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3029 utf8_variant_count--;
3035 /* On EBCDIC platforms, we may have to deal with portable
3036 * ranges. These happen if at least one range endpoint is a
3037 * Unicode value (\N{...}), or if the range is a subset of
3038 * [A-Z] or [a-z], and both ends are literal characters,
3039 * like 'A', and not like \x{C1} */
3041 cBOOL(backslash_N) /* \N{} forces Unicode,
3042 hence portable range */
3043 || ( ! non_portable_endpoint
3044 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3045 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3046 if (convert_unicode) {
3048 /* Special handling is needed for these portable ranges.
3049 * They are defined to be in Unicode terms, which includes
3050 * all the Unicode code points between the end points.
3051 * Convert to Unicode to get the Unicode range. Later we
3052 * will convert each code point in the range back to
3054 range_min = NATIVE_TO_UNI(range_min);
3055 range_max = NATIVE_TO_UNI(range_max);
3059 if (range_min > range_max) {
3061 if (convert_unicode) {
3062 /* Need to convert back to native for meaningful
3063 * messages for this platform */
3064 range_min = UNI_TO_NATIVE(range_min);
3065 range_max = UNI_TO_NATIVE(range_max);
3069 /* Use the characters themselves for the error message if
3070 * ASCII printables; otherwise some visible representation
3072 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3074 "Invalid range \"%c-%c\" in transliteration operator",
3075 (char)range_min, (char)range_max);
3078 else if (convert_unicode) {
3079 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3081 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3082 UVXf "}\" in transliteration operator",
3083 range_min, range_max);
3087 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3089 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3090 " in transliteration operator",
3091 range_min, range_max);
3097 /* If everything in the transliteration is below 256, we
3098 * can avoid special handling later. A translation table
3099 * for each of those bytes is created by op.c. So we
3100 * expand out all ranges to their constituent code points.
3101 * But if we've encountered something above 255, the
3102 * expanding won't help, so skip doing that. But if it's
3103 * EBCDIC, we may have to look at each character below 256
3104 * if we have to convert to/from Unicode values */
3105 if ( has_above_latin1
3107 && (range_min > 255 || ! convert_unicode)
3110 /* Move the high character one byte to the right; then
3111 * insert between it and the range begin, an illegal
3112 * byte which serves to indicate this is a range (using
3113 * a '-' would be ambiguous). */
3115 while (e-- > max_ptr) {
3118 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3122 /* Here, we're going to expand out the range. For EBCDIC
3123 * the range can extend above 255 (not so in ASCII), so
3124 * for EBCDIC, split it into the parts above and below
3127 if (range_max > 255) {
3128 real_range_max = range_max;
3134 /* Here we need to expand out the string to contain each
3135 * character in the range. Grow the output to handle this */
3137 save_offset = min_ptr - SvPVX_const(sv);
3139 /* The base growth is the number of code points in the range */
3140 grow = range_max - range_min + 1;
3143 /* But if the output is UTF-8, some of those characters may
3144 * need two bytes (since the maximum range value here is
3145 * 255, the max bytes per character is two). On ASCII
3146 * platforms, it's not much trouble to get an accurate
3147 * count of what's needed. But on EBCDIC, the ones that
3148 * need 2 bytes are scattered around, so just use a worst
3149 * case value instead of calculating for that platform. */
3153 /* Only those above 127 require 2 bytes. This may be
3154 * everything in the range, or not */
3155 if (range_min > 127) {
3158 else if (range_max > 127) {
3159 grow += range_max - 127;
3164 /* Subtract 3 for the bytes that were already accounted for
3165 * (min, max, and the hyphen) */
3166 d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3);
3169 /* Here, we expand out the range. */
3170 if (convert_unicode) {
3173 /* Recall that the min and max are now in Unicode terms, so
3174 * we have to convert each character to its native
3177 for (i = range_min; i <= range_max; i++) {
3178 append_utf8_from_native_byte(
3179 LATIN1_TO_NATIVE((U8) i),
3184 for (i = range_min; i <= range_max; i++) {
3185 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3191 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3195 /* Here, no conversions are necessary, which means that the
3196 * first character in the range is already in 'd' and
3197 * valid, so we can skip overwriting it */
3200 for (i = range_min + 1; i <= range_max; i++) {
3201 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3206 for (i = range_min + 1; i <= range_max; i++) {
3213 /* If the original range extended above 255, add in that
3215 if (real_range_max) {
3216 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3217 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3218 if (real_range_max > 0x100) {
3219 if (real_range_max > 0x101) {
3220 *d++ = (char) ILLEGAL_UTF8_BYTE;
3222 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3228 /* mark the range as done, and continue */
3232 non_portable_endpoint = 0;
3236 } /* End of is a range */
3237 } /* End of transliteration. Joins main code after these else's */
3238 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3241 while (s1 >= start && *s1-- == '\\')
3244 in_charclass = TRUE;
3246 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3249 while (s1 >= start && *s1-- == '\\')
3252 in_charclass = FALSE;
3254 /* skip for regexp comments /(?#comment)/, except for the last
3255 * char, which will be done separately. Stop on (?{..}) and
3257 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3259 while (s+1 < send && *s != ')')
3262 else if (!PL_lex_casemods
3263 && ( s[2] == '{' /* This should match regcomp.c */
3264 || (s[2] == '?' && s[3] == '{')))
3269 /* likewise skip #-initiated comments in //x patterns */
3273 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3275 while (s < send && *s != '\n')
3278 /* no further processing of single-quoted regex */
3279 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3280 goto default_action;
3282 /* check for embedded arrays
3283 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3285 else if (*s == '@' && s[1]) {
3287 ? isIDFIRST_utf8_safe(s+1, send)
3288 : isWORDCHAR_A(s[1]))
3292 if (strchr(":'{$", s[1]))
3294 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3295 break; /* in regexp, neither @+ nor @- are interpolated */
3297 /* check for embedded scalars. only stop if we're sure it's a
3299 else if (*s == '$') {
3300 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3302 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
3304 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3305 "Possible unintended interpolation of $\\ in regex");
3307 break; /* in regexp, $ might be tail anchor */
3311 /* End of else if chain - OP_TRANS rejoin rest */
3313 if (UNLIKELY(s >= send)) {
3319 if (*s == '\\' && s+1 < send) {
3320 char* e; /* Can be used for ending '}', etc. */
3324 /* warn on \1 - \9 in substitution replacements, but note that \11
3325 * is an octal; and \19 is \1 followed by '9' */
3326 if (PL_lex_inwhat == OP_SUBST
3332 /* diag_listed_as: \%d better written as $%d */
3333 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3338 /* string-change backslash escapes */
3339 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
3343 /* In a pattern, process \N, but skip any other backslash escapes.
3344 * This is because we don't want to translate an escape sequence
3345 * into a meta symbol and have the regex compiler use the meta
3346 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3347 * in spite of this, we do have to process \N here while the proper
3348 * charnames handler is in scope. See bugs #56444 and #62056.
3350 * There is a complication because \N in a pattern may also stand
3351 * for 'match a non-nl', and not mean a charname, in which case its
3352 * processing should be deferred to the regex compiler. To be a
3353 * charname it must be followed immediately by a '{', and not look
3354 * like \N followed by a curly quantifier, i.e., not something like
3355 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3357 else if (PL_lex_inpat
3360 || regcurly(s + 1)))
3363 goto default_action;
3369 if ((isALPHANUMERIC(*s)))
3370 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3371 "Unrecognized escape \\%c passed through",
3373 /* default action is to copy the quoted character */
3374 goto default_action;
3377 /* eg. \132 indicates the octal constant 0132 */
3378 case '0': case '1': case '2': case '3':
3379 case '4': case '5': case '6': case '7':
3381 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
3383 uv = grok_oct(s, &len, &flags, NULL);
3385 if (len < 3 && s < send && isDIGIT(*s)
3386 && ckWARN(WARN_MISC))
3388 Perl_warner(aTHX_ packWARN(WARN_MISC),
3389 "%s", form_short_octal_warning(s, len));
3392 goto NUM_ESCAPE_INSERT;
3394 /* eg. \o{24} indicates the octal constant \024 */
3399 bool valid = grok_bslash_o(&s, &uv, &error,
3400 TRUE, /* Output warning */
3401 FALSE, /* Not strict */
3402 TRUE, /* Output warnings for
3407 uv = 0; /* drop through to ensure range ends are set */
3409 goto NUM_ESCAPE_INSERT;
3412 /* eg. \x24 indicates the hex constant 0x24 */
3417 bool valid = grok_bslash_x(&s, &uv, &error,
3418 TRUE, /* Output warning */
3419 FALSE, /* Not strict */
3420 TRUE, /* Output warnings for
3425 uv = 0; /* drop through to ensure range ends are set */
3430 /* Insert oct or hex escaped character. */
3432 /* Here uv is the ordinal of the next character being added */
3433 if (UVCHR_IS_INVARIANT(uv)) {
3437 if (!has_utf8 && uv > 255) {
3439 /* Here, 'uv' won't fit unless we convert to UTF-8.
3440 * If we've only seen invariants so far, all we have to
3441 * do is turn on the flag */
3442 if (utf8_variant_count == 0) {
3446 SvCUR_set(sv, d - SvPVX_const(sv));
3450 sv_utf8_upgrade_flags_grow(
3452 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3454 /* Since we're having to grow here,
3455 * make sure we have enough room for
3456 * this escape and a NUL, so the
3457 * code immediately below won't have
3458 * to actually grow again */
3460 + (STRLEN)(send - s) + 1);
3461 d = SvPVX(sv) + SvCUR(sv);
3464 has_above_latin1 = TRUE;
3470 utf8_variant_count++;
3473 /* Usually, there will already be enough room in 'sv'
3474 * since such escapes are likely longer than any UTF-8
3475 * sequence they can end up as. This isn't the case on
3476 * EBCDIC where \x{40000000} contains 12 bytes, and the
3477 * UTF-8 for it contains 14. And, we have to allow for
3478 * a trailing NUL. It probably can't happen on ASCII
3479 * platforms, but be safe. See Note on sizing above. */
3480 const STRLEN needed = d - SvPVX(sv)
3484 if (UNLIKELY(needed > SvLEN(sv))) {
3485 SvCUR_set(sv, d - SvPVX_const(sv));
3486 d = SvCUR(sv) + SvGROW(sv, needed);
3489 d = (char*)uvchr_to_utf8((U8*)d, uv);
3490 if (PL_lex_inwhat == OP_TRANS
3491 && PL_parser->lex_sub_op)
3493 PL_parser->lex_sub_op->op_private |=
3494 (PL_lex_repl ? OPpTRANS_FROM_UTF
3500 non_portable_endpoint++;
3505 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3506 * named character, like \N{LATIN SMALL LETTER A}, or a named
3507 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3508 * GRAVE} (except y/// can't handle the latter, croaking). For
3509 * convenience all three forms are referred to as "named
3510 * characters" below.
3512 * For patterns, \N also can mean to match a non-newline. Code
3513 * before this 'switch' statement should already have handled
3514 * this situation, and hence this code only has to deal with
3515 * the named character cases.
3517 * For non-patterns, the named characters are converted to
3518 * their string equivalents. In patterns, named characters are
3519 * not converted to their ultimate forms for the same reasons
3520 * that other escapes aren't. Instead, they are converted to
3521 * the \N{U+...} form to get the value from the charnames that
3522 * is in effect right now, while preserving the fact that it
3523 * was a named character, so that the regex compiler knows
3526 * The structure of this section of code (besides checking for
3527 * errors and upgrading to utf8) is:
3528 * If the named character is of the form \N{U+...}, pass it
3529 * through if a pattern; otherwise convert the code point
3531 * Otherwise must be some \N{NAME}: convert to
3532 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3534 * Transliteration is an exception. The conversion to utf8 is
3535 * only done if the code point requires it to be representable.
3537 * Here, 's' points to the 'N'; the test below is guaranteed to
3538 * succeed if we are being called on a pattern, as we already
3539 * know from a test above that the next character is a '{'. A
3540 * non-pattern \N must mean 'named character', which requires
3544 yyerror("Missing braces on \\N{}");
3549 /* If there is no matching '}', it is an error. */
3550 if (! (e = strchr(s, '}'))) {
3551 if (! PL_lex_inpat) {
3552 yyerror("Missing right brace on \\N{}");
3554 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3559 /* Here it looks like a named character */
3561 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3562 s += 2; /* Skip to next char after the 'U+' */
3565 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3566 /* Check the syntax. */
3569 if (!isXDIGIT(*s)) {
3572 "Invalid hexadecimal number in \\N{U+...}"
3580 else if ((*s == '.' || *s == '_')
3586 /* Pass everything through unchanged.
3587 * +1 is for the '}' */
3588 Copy(orig_s, d, e - orig_s + 1, char);
3589 d += e - orig_s + 1;
3591 else { /* Not a pattern: convert the hex to string */
3592 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3593 | PERL_SCAN_SILENT_ILLDIGIT
3594 | PERL_SCAN_DISALLOW_PREFIX;
3596 uv = grok_hex(s, &len, &flags, NULL);
3597 if (len == 0 || (len != (STRLEN)(e - s)))
3600 /* For non-tr///, if the destination is not in utf8,
3601 * unconditionally recode it to be so. This is
3602 * because \N{} implies Unicode semantics, and scalars
3603 * have to be in utf8 to guarantee those semantics.
3604 * tr/// doesn't care about Unicode rules, so no need
3605 * there to upgrade to UTF-8 for small enough code
3607 if (! has_utf8 && ( uv > 0xFF
3608 || PL_lex_inwhat != OP_TRANS))
3610 /* See Note on sizing above. */
3611 const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
3613 SvCUR_set(sv, d - SvPVX_const(sv));
3617 if (utf8_variant_count == 0) {
3619 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3622 sv_utf8_upgrade_flags_grow(
3624 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3626 d = SvPVX(sv) + SvCUR(sv);
3630 has_above_latin1 = TRUE;
3633 /* Add the (Unicode) code point to the output. */
3634 if (! has_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3635 *d++ = (char) LATIN1_TO_NATIVE(uv);
3638 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3642 else /* Here is \N{NAME} but not \N{U+...}. */
3643 if ((res = get_and_check_backslash_N_name(s, e)))
3646 const char *str = SvPV_const(res, len);
3649 if (! len) { /* The name resolved to an empty string */
3650 Copy("\\N{}", d, 4, char);
3654 /* In order to not lose information for the regex
3655 * compiler, pass the result in the specially made
3656 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3657 * the code points in hex of each character
3658 * returned by charnames */
3660 const char *str_end = str + len;
3661 const STRLEN off = d - SvPVX_const(sv);
3663 if (! SvUTF8(res)) {
3664 /* For the non-UTF-8 case, we can determine the
3665 * exact length needed without having to parse
3666 * through the string. Each character takes up
3667 * 2 hex digits plus either a trailing dot or
3669 const char initial_text[] = "\\N{U+";
3670 const STRLEN initial_len = sizeof(initial_text)
3672 d = off + SvGROW(sv, off
3675 /* +1 for trailing NUL */
3678 + (STRLEN)(send - e));
3679 Copy(initial_text, d, initial_len, char);
3681 while (str < str_end) {
3684 my_snprintf(hex_string,
3688 /* The regex compiler is
3689 * expecting Unicode, not
3691 NATIVE_TO_LATIN1(*str));
3692 PERL_MY_SNPRINTF_POST_GUARD(len,
3693 sizeof(hex_string));
3694 Copy(hex_string, d, 3, char);
3698 d--; /* Below, we will overwrite the final
3699 dot with a right brace */
3702 STRLEN char_length; /* cur char's byte length */
3704 /* and the number of bytes after this is
3705 * translated into hex digits */
3706 STRLEN output_length;
3708 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3709 * for max('U+', '.'); and 1 for NUL */
3710 char hex_string[2 * UTF8_MAXBYTES + 5];
3712 /* Get the first character of the result. */
3713 U32 uv = utf8n_to_uvchr((U8 *) str,
3717 /* Convert first code point to Unicode hex,
3718 * including the boiler plate before it. */
3720 my_snprintf(hex_string, sizeof(hex_string),
3722 (unsigned int) NATIVE_TO_UNI(uv));
3724 /* Make sure there is enough space to hold it */
3725 d = off + SvGROW(sv, off
3727 + (STRLEN)(send - e)
3728 + 2); /* '}' + NUL */
3730 Copy(hex_string, d, output_length, char);
3733 /* For each subsequent character, append dot and
3734 * its Unicode code point in hex */
3735 while ((str += char_length) < str_end) {
3736 const STRLEN off = d - SvPVX_const(sv);
3737 U32 uv = utf8n_to_uvchr((U8 *) str,
3742 my_snprintf(hex_string,
3745 (unsigned int) NATIVE_TO_UNI(uv));
3747 d = off + SvGROW(sv, off
3749 + (STRLEN)(send - e)
3750 + 2); /* '}' + NUL */
3751 Copy(hex_string, d, output_length, char);
3756 *d++ = '}'; /* Done. Add the trailing brace */
3759 else { /* Here, not in a pattern. Convert the name to a
3762 if (PL_lex_inwhat == OP_TRANS) {
3763 str = SvPV_const(res, len);
3764 if (len > ((SvUTF8(res))
3768 yyerror(Perl_form(aTHX_
3769 "%.*s must not be a named sequence"
3770 " in transliteration operator",
3771 /* +1 to include the "}" */
3772 (int) (e + 1 - start), start));
3773 goto end_backslash_N;
3776 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
3777 has_above_latin1 = TRUE;
3781 else if (! SvUTF8(res)) {
3782 /* Make sure \N{} return is UTF-8. This is because
3783 * \N{} implies Unicode semantics, and scalars have
3784 * to be in utf8 to guarantee those semantics; but
3785 * not needed in tr/// */
3786 sv_utf8_upgrade_flags(res, 0);
3787 str = SvPV_const(res, len);
3790 /* Upgrade destination to be utf8 if this new
3792 if (! has_utf8 && SvUTF8(res)) {
3793 /* See Note on sizing above. */
3794 const STRLEN extra = len + (send - s) + 1;
3796 SvCUR_set(sv, d - SvPVX_const(sv));
3800 if (utf8_variant_count == 0) {
3802 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3805 sv_utf8_upgrade_flags_grow(sv,
3806 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3808 d = SvPVX(sv) + SvCUR(sv);
3811 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3813 /* See Note on sizing above. (NOTE: SvCUR() is not
3814 * set correctly here). */
3815 const STRLEN extra = len + (send - e) + 1;
3816 const STRLEN off = d - SvPVX_const(sv);
3817 d = off + SvGROW(sv, off + extra);
3819 Copy(str, d, len, char);
3825 } /* End \N{NAME} */
3829 backslash_N++; /* \N{} is defined to be Unicode */
3831 s = e + 1; /* Point to just after the '}' */
3834 /* \c is a control character */
3838 *d++ = grok_bslash_c(*s++, 1);
3841 yyerror("Missing control char name in \\c");
3844 non_portable_endpoint++;
3848 /* printf-style backslashes, formfeeds, newlines, etc */
3874 } /* end if (backslash) */
3877 /* Just copy the input to the output, though we may have to convert
3880 * If the input has the same representation in UTF-8 as not, it will be
3881 * a single byte, and we don't care about UTF8ness; just copy the byte */
3882 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
3885 else if (! this_utf8 && ! has_utf8) {
3886 /* If neither source nor output is UTF-8, is also a single byte,
3887 * just copy it; but this byte counts should we later have to
3888 * convert to UTF-8 */
3890 utf8_variant_count++;
3892 else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */
3893 const STRLEN len = UTF8SKIP(s);
3895 /* We expect the source to have already been checked for
3897 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
3899 Copy(s, d, len, U8);
3903 else { /* UTF8ness matters and doesn't match, need to convert */
3905 const UV nextuv = (this_utf8)
3906 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3908 STRLEN need = UVCHR_SKIP(nextuv);
3911 SvCUR_set(sv, d - SvPVX_const(sv));
3915 /* See Note on sizing above. */
3916 need += (STRLEN)(send - s) + 1;
3918 if (utf8_variant_count == 0) {
3920 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
3923 sv_utf8_upgrade_flags_grow(sv,
3924 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3926 d = SvPVX(sv) + SvCUR(sv);
3929 } else if (need > len) {
3930 /* encoded value larger than old, may need extra space (NOTE:
3931 * SvCUR() is not set correctly here). See Note on sizing
3933 const STRLEN extra = need + (send - s) + 1;
3934 const STRLEN off = d - SvPVX_const(sv);
3935 d = off + SvGROW(sv, off + extra);
3939 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
3941 } /* while loop to process each character */
3943 /* terminate the string and set up the sv */
3945 SvCUR_set(sv, d - SvPVX_const(sv));
3946 if (SvCUR(sv) >= SvLEN(sv))
3947 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %" UVuf
3948 " >= %" UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
3953 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
3954 PL_parser->lex_sub_op->op_private |=
3955 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3959 /* shrink the sv if we allocated more than we used */
3960 if (SvCUR(sv) + 5 < SvLEN(sv)) {
3961 SvPV_shrink_to_cur(sv);
3964 /* return the substring (via pl_yylval) only if we parsed anything */
3967 for (; s2 < s; s2++) {
3969 COPLINE_INC_WITH_HERELINES;
3971 SvREFCNT_inc_simple_void_NN(sv);
3972 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3973 && ! PL_parser->lex_re_reparsing)
3975 const char *const key = PL_lex_inpat ? "qr" : "q";
3976 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3980 if (PL_lex_inwhat == OP_TRANS) {
3983 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3986 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3994 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3997 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
3999 LEAVE_with_name("scan_const");
4004 * Returns TRUE if there's more to the expression (e.g., a subscript),
4007 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
4009 * ->[ and ->{ return TRUE
4010 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
4011 * { and [ outside a pattern are always subscripts, so return TRUE
4012 * if we're outside a pattern and it's not { or [, then return FALSE
4013 * if we're in a pattern and the first char is a {
4014 * {4,5} (any digits around the comma) returns FALSE
4015 * if we're in a pattern and the first char is a [
4017 * [SOMETHING] has a funky algorithm to decide whether it's a
4018 * character class or not. It has to deal with things like
4019 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4020 * anything else returns TRUE
4023 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4026 S_intuit_more(pTHX_ char *s)
4028 PERL_ARGS_ASSERT_INTUIT_MORE;
4030 if (PL_lex_brackets)
4032 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4034 if (*s == '-' && s[1] == '>'
4035 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4036 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4037 ||(s[2] == '@' && strchr("*[{",s[3])) ))
4039 if (*s != '{' && *s != '[')
4044 /* In a pattern, so maybe we have {n,m}. */
4052 /* On the other hand, maybe we have a character class */
4055 if (*s == ']' || *s == '^')
4058 /* this is terrifying, and it works */
4061 const char * const send = strchr(s,']');
4062 unsigned char un_char, last_un_char;
4063 char tmpbuf[sizeof PL_tokenbuf * 4];
4065 if (!send) /* has to be an expression */
4067 weight = 2; /* let's weigh the evidence */
4071 else if (isDIGIT(*s)) {
4073 if (isDIGIT(s[1]) && s[2] == ']')
4079 Zero(seen,256,char);
4081 for (; s < send; s++) {
4082 last_un_char = un_char;
4083 un_char = (unsigned char)*s;
4088 weight -= seen[un_char] * 10;
4089 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4091 char *tmp = PL_bufend;
4092 PL_bufend = (char*)send;
4093 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4095 len = (int)strlen(tmpbuf);
4096 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4097 UTF ? SVf_UTF8 : 0, SVt_PV))
4104 && strchr("[#!%*<>()-=",s[1]))
4106 if (/*{*/ strchr("])} =",s[2]))
4115 if (strchr("wds]",s[1]))
4117 else if (seen[(U8)'\''] || seen[(U8)'"'])
4119 else if (strchr("rnftbxcav",s[1]))
4121 else if (isDIGIT(s[1])) {
4123 while (s[1] && isDIGIT(s[1]))
4133 if (strchr("aA01! ",last_un_char))
4135 if (strchr("zZ79~",s[1]))
4137 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4138 weight -= 5; /* cope with negative subscript */
4141 if (!isWORDCHAR(last_un_char)
4142 && !(last_un_char == '$' || last_un_char == '@'
4143 || last_un_char == '&')
4144 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4148 if (keyword(d, s - d, 0))
4151 if (un_char == last_un_char + 1)
4153 weight -= seen[un_char];
4158 if (weight >= 0) /* probably a character class */
4168 * Does all the checking to disambiguate
4170 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4171 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
4173 * First argument is the stuff after the first token, e.g. "bar".
4175 * Not a method if foo is a filehandle.
4176 * Not a method if foo is a subroutine prototyped to take a filehandle.
4177 * Not a method if it's really "Foo $bar"
4178 * Method if it's "foo $bar"
4179 * Not a method if it's really "print foo $bar"
4180 * Method if it's really "foo package::" (interpreted as package->foo)
4181 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4182 * Not a method if bar is a filehandle or package, but is quoted with
4187 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4189 char *s = start + (*start == '$');
4190 char tmpbuf[sizeof PL_tokenbuf];
4193 /* Mustn't actually add anything to a symbol table.
4194 But also don't want to "initialise" any placeholder
4195 constants that might already be there into full
4196 blown PVGVs with attached PVCV. */
4198 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4200 PERL_ARGS_ASSERT_INTUIT_METHOD;
4202 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4204 if (cv && SvPOK(cv)) {
4205 const char *proto = CvPROTO(cv);
4207 while (*proto && (isSPACE(*proto) || *proto == ';'))
4214 if (*start == '$') {
4215 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4216 || isUPPER(*PL_tokenbuf))
4221 return *s == '(' ? FUNCMETH : METHOD;
4224 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
4225 /* start is the beginning of the possible filehandle/object,
4226 * and s is the end of it
4227 * tmpbuf is a copy of it (but with single quotes as double colons)
4230 if (!keyword(tmpbuf, len, 0)) {
4231 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4236 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4237 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4239 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4240 && (!isGV(indirgv) || GvCVu(indirgv)))
4242 /* filehandle or package name makes it a method */
4243 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4245 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4246 return 0; /* no assumptions -- "=>" quotes bareword */
4248 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4249 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4250 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4252 force_next(BAREWORD);
4254 return *s == '(' ? FUNCMETH : METHOD;
4260 /* Encoded script support. filter_add() effectively inserts a
4261 * 'pre-processing' function into the current source input stream.
4262 * Note that the filter function only applies to the current source file
4263 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4265 * The datasv parameter (which may be NULL) can be used to pass
4266 * private data to this instance of the filter. The filter function
4267 * can recover the SV using the FILTER_DATA macro and use it to
4268 * store private buffers and state information.
4270 * The supplied datasv parameter is upgraded to a PVIO type
4271 * and the IoDIRP/IoANY field is used to store the function pointer,
4272 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4273 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4274 * private use must be set using malloc'd pointers.
4278 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4286 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4287 Perl_croak(aTHX_ "Source&nbs