3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "dquote_inline.h"
43 #define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
98 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
100 #define XENUMMASK 0x3f
101 #define XFAKEEOF 0x40
102 #define XFAKEBRACK 0x80
104 #ifdef USE_UTF8_SCRIPTS
105 # define UTF cBOOL(!IN_BYTES)
107 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
110 /* The maximum number of characters preceding the unrecognized one to display */
111 #define UNRECOGNIZED_PRECEDE_COUNT 10
113 /* In variables named $^X, these are the legal values for X.
114 * 1999-02-27 mjd-perl-patch@plover.com */
115 #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
117 #define SPACE_OR_TAB(c) isBLANK_A(c)
119 #define HEXFP_PEEK(s) \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
124 /* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
126 * can get by with a single comparison (if the compiler is smart enough).
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
132 /* #define LEX_NOTPARSING 11 is done in perl.h. */
134 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
135 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
140 /* at end of code, eg "$x" followed by: */
141 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146 #define LEX_INTERPCONST 2 /* NOT USED */
147 #define LEX_FORMLINE 1 /* expecting a format line */
151 static const char* const lex_state_names[] = {
166 #include "keywords.h"
168 /* CLINE is a macro that ensures PL_copline has a sane value */
170 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
173 * Convenience functions to return different tokens and prime the
174 * lexer for the next token. They all take an argument.
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
187 * FUN0OP : zero-argument function, with its op created in this file
188 * FUN1 : not used, except for not, which isn't a UNIOP
189 * BOop : bitwise or or xor
191 * BCop : bitwise complement
192 * SHop : shift operator
193 * PWop : power operator
194 * PMop : pattern-matching operator
195 * Aop : addition-level operator
196 * AopNOASSIGN : addition-level operator that is never part of .=
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
199 * Rop : relational operator <= != gt
201 * Also see LOP and lop() below.
204 #ifdef DEBUGGING /* Serve -DT. */
205 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
207 # define REPORT(retval) (retval)
210 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
212 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
213 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
217 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
218 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
222 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
224 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
225 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
226 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
228 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
232 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
233 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
234 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
235 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
236 #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237 #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
239 /* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
244 #define UNI3(f,x,have_x) { \
245 pl_yylval.ival = f; \
246 if (have_x) PL_expect = x; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
251 return REPORT( (int)FUNC1 ); \
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 #define UNI(f) UNI3(f,XTERM,1)
256 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
257 #define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
262 #define UNIBRACK(f) UNI3(f,0,0)
264 /* grandfather return to old style */
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
275 #define COPLINE_INC_WITH_HERELINES \
277 CopLINE_inc(PL_curcop); \
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
282 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284 #define COPLINE_SET_FROM_MULTI_END \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
288 PL_parser->herelines = 0; \
294 /* how to interpret the pl_yylval associated with the token */
298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
303 static struct debug_tokens {
305 enum token_type type;
307 } const debug_tokens[] =
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
382 { 0, TOKENTYPE_NONE, NULL }
385 /* dump the returned token in rv, plus any optional arg in pl_yylval */
388 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
390 PERL_ARGS_ASSERT_TOKEREPORT;
393 const char *name = NULL;
394 enum token_type type = TOKENTYPE_NONE;
395 const struct debug_tokens *p;
396 SV* const report = newSVpvs("<== ");
398 for (p = debug_tokens; p->token; p++) {
399 if (p->token == (int)rv) {
406 Perl_sv_catpv(aTHX_ report, name);
407 else if (isGRAPH(rv))
409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
411 sv_catpvs(report, " (pending identifier)");
414 sv_catpvs(report, "EOF");
416 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
421 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
425 PL_op_name[lvalp->ival]);
428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
430 case TOKENTYPE_OPVAL:
432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
435 Perl_sv_catpvf(aTHX_ report, " %s",
436 SvPEEK(cSVOPx_sv(lvalp->opval)));
441 sv_catpvs(report, "(opval=null)");
444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
450 /* print the buffer with suitable escapes */
453 S_printbuf(pTHX_ const char *const fmt, const char *const s)
455 SV* const tmp = newSVpvs("");
457 PERL_ARGS_ASSERT_PRINTBUF;
459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
468 S_deprecate_commaless_var_list(pTHX) {
470 deprecate_fatal_in("5.28", "Use of comma-less variable list is deprecated");
471 return REPORT(','); /* grandfather non-comma-format format */
477 * This subroutine looks for an '=' next to the operator that has just been
478 * parsed and turns it into an ASSIGNOP if it finds one.
482 S_ao(pTHX_ int toketype)
484 if (*PL_bufptr == '=') {
486 if (toketype == ANDAND)
487 pl_yylval.ival = OP_ANDASSIGN;
488 else if (toketype == OROR)
489 pl_yylval.ival = OP_ORASSIGN;
490 else if (toketype == DORDOR)
491 pl_yylval.ival = OP_DORASSIGN;
494 return REPORT(toketype);
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
509 * PL_bufptr is expected to point to the start of the thing that was found,
510 * and s after the next token or partial token.
514 S_no_op(pTHX_ const char *const what, char *s)
516 char * const oldbp = PL_bufptr;
517 const bool is_first = (PL_oldbufptr == PL_linestart);
519 PERL_ARGS_ASSERT_NO_OP;
525 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
526 if (ckWARN_d(WARN_SYNTAX)) {
528 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
529 "\t(Missing semicolon on previous line?)\n");
530 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
535 for (t = PL_oldoldbufptr;
536 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
537 t += UTF ? UTF8SKIP(t) : 1)
541 if (t < PL_bufptr && isSPACE(*t))
542 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
543 "\t(Do you need to predeclare %" UTF8f "?)\n",
544 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
548 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
549 "\t(Missing operator before %" UTF8f "?)\n",
550 UTF8fARG(UTF, s - oldbp, oldbp));
558 * Complain about missing quote/regexp/heredoc terminator.
559 * If it's called with NULL then it cauterizes the line buffer.
560 * If we're in a delimited string and the delimiter is a control
561 * character, it's reformatted into a two-char sequence like ^C.
566 S_missingterm(pTHX_ char *s)
568 char tmpbuf[UTF8_MAXBYTES + 1];
573 char * const nl = strrchr(s,'\n');
578 else if (PL_multi_close < 32) {
580 tmpbuf[1] = (char)toCTRL(PL_multi_close);
585 if (LIKELY(PL_multi_close < 256)) {
586 *tmpbuf = (char)PL_multi_close;
591 *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
595 q = strchr(s,'"') ? '\'' : '"';
596 sv = sv_2mortal(newSVpv(s,0));
599 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf
600 "%c anywhere before EOF",q,SVfARG(sv),q);
606 * Check whether the named feature is enabled.
609 Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
611 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
613 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
615 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
617 if (namelen > MAX_FEATURE_LEN)
619 memcpy(&he_name[8], name, namelen);
621 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
622 REFCOUNTED_HE_EXISTS));
626 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
627 * utf16-to-utf8-reversed.
630 #ifdef PERL_CR_FILTER
634 const char *s = SvPVX_const(sv);
635 const char * const e = s + SvCUR(sv);
637 PERL_ARGS_ASSERT_STRIP_RETURN;
639 /* outer loop optimized to do nothing if there are no CR-LFs */
641 if (*s++ == '\r' && *s == '\n') {
642 /* hit a CR-LF, need to copy the rest */
646 if (*s == '\r' && s[1] == '\n')
657 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
659 const I32 count = FILTER_READ(idx+1, sv, maxlen);
660 if (count > 0 && !maxlen)
667 =for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
669 Creates and initialises a new lexer/parser state object, supplying
670 a context in which to lex and parse from a new source of Perl code.
671 A pointer to the new state object is placed in L</PL_parser>. An entry
672 is made on the save stack so that upon unwinding, the new state object
673 will be destroyed and the former value of L</PL_parser> will be restored.
674 Nothing else need be done to clean up the parsing context.
676 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
677 non-null, provides a string (in SV form) containing code to be parsed.
678 A copy of the string is made, so subsequent modification of C<line>
679 does not affect parsing. C<rsfp>, if non-null, provides an input stream
680 from which code will be read to be parsed. If both are non-null, the
681 code in C<line> comes first and must consist of complete lines of input,
682 and C<rsfp> supplies the remainder of the source.
684 The C<flags> parameter is reserved for future use. Currently it is only
685 used by perl internally, so extensions should always pass zero.
690 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
691 can share filters with the current parser.
692 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
693 caller, hence isn't owned by the parser, so shouldn't be closed on parser
694 destruction. This is used to handle the case of defaulting to reading the
695 script from the standard input because no filename was given on the command
696 line (without getting confused by situation where STDIN has been closed, so
697 the script handle is opened on fd 0) */
700 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
702 const char *s = NULL;
703 yy_parser *parser, *oparser;
705 if (flags && flags & ~LEX_START_FLAGS)
706 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
708 /* create and initialise a parser */
710 Newxz(parser, 1, yy_parser);
711 parser->old_parser = oparser = PL_parser;
714 parser->stack = NULL;
715 parser->stack_max1 = NULL;
718 /* on scope exit, free this parser and restore any outer one */
720 parser->saved_curcop = PL_curcop;
722 /* initialise lexer state */
724 parser->nexttoke = 0;
725 parser->error_count = oparser ? oparser->error_count : 0;
726 parser->copline = parser->preambling = NOLINE;
727 parser->lex_state = LEX_NORMAL;
728 parser->expect = XSTATE;
730 parser->rsfp_filters =
731 !(flags & LEX_START_SAME_FILTER) || !oparser
733 : MUTABLE_AV(SvREFCNT_inc(
734 oparser->rsfp_filters
735 ? oparser->rsfp_filters
736 : (oparser->rsfp_filters = newAV())
739 Newx(parser->lex_brackstack, 120, char);
740 Newx(parser->lex_casestack, 12, char);
741 *parser->lex_casestack = '\0';
742 Newxz(parser->lex_shared, 1, LEXSHARED);
746 const U8* first_bad_char_loc;
748 s = SvPV_const(line, len);
750 if (SvUTF8(line) && ! is_utf8_string_loc((U8 *) s,
752 &first_bad_char_loc))
754 _force_out_malformed_utf8_message(first_bad_char_loc,
755 (U8 *) s + SvCUR(line),
757 1 /* 1 means die */ );
758 NOT_REACHED; /* NOTREACHED */
761 parser->linestr = flags & LEX_START_COPIED
762 ? SvREFCNT_inc_simple_NN(line)
763 : newSVpvn_flags(s, len, SvUTF8(line));
765 sv_catpvs(parser->linestr, "\n;");
767 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
770 parser->oldoldbufptr =
773 parser->linestart = SvPVX(parser->linestr);
774 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
775 parser->last_lop = parser->last_uni = NULL;
777 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
778 |LEX_DONT_CLOSE_RSFP));
779 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
780 |LEX_DONT_CLOSE_RSFP));
782 parser->in_pod = parser->filtered = 0;
786 /* delete a parser object */
789 Perl_parser_free(pTHX_ const yy_parser *parser)
791 PERL_ARGS_ASSERT_PARSER_FREE;
793 PL_curcop = parser->saved_curcop;
794 SvREFCNT_dec(parser->linestr);
796 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
797 PerlIO_clearerr(parser->rsfp);
798 else if (parser->rsfp && (!parser->old_parser
799 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
800 PerlIO_close(parser->rsfp);
801 SvREFCNT_dec(parser->rsfp_filters);
802 SvREFCNT_dec(parser->lex_stuff);
803 SvREFCNT_dec(parser->lex_sub_repl);
805 Safefree(parser->lex_brackstack);
806 Safefree(parser->lex_casestack);
807 Safefree(parser->lex_shared);
808 PL_parser = parser->old_parser;
813 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
815 I32 nexttoke = parser->nexttoke;
816 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
818 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
819 && parser->nextval[nexttoke].opval
820 && parser->nextval[nexttoke].opval->op_slabbed
821 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
822 op_free(parser->nextval[nexttoke].opval);
823 parser->nextval[nexttoke].opval = NULL;
830 =for apidoc AmxU|SV *|PL_parser-E<gt>linestr
832 Buffer scalar containing the chunk currently under consideration of the
833 text currently being lexed. This is always a plain string scalar (for
834 which C<SvPOK> is true). It is not intended to be used as a scalar by
835 normal scalar means; instead refer to the buffer directly by the pointer
836 variables described below.
838 The lexer maintains various C<char*> pointers to things in the
839 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
840 reallocated, all of these pointers must be updated. Don't attempt to
841 do this manually, but rather use L</lex_grow_linestr> if you need to
842 reallocate the buffer.
844 The content of the text chunk in the buffer is commonly exactly one
845 complete line of input, up to and including a newline terminator,
846 but there are situations where it is otherwise. The octets of the
847 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
848 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
849 flag on this scalar, which may disagree with it.
851 For direct examination of the buffer, the variable
852 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
853 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
854 of these pointers is usually preferable to examination of the scalar
855 through normal scalar means.
857 =for apidoc AmxU|char *|PL_parser-E<gt>bufend
859 Direct pointer to the end of the chunk of text currently being lexed, the
860 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
861 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
862 always located at the end of the buffer, and does not count as part of
863 the buffer's contents.
865 =for apidoc AmxU|char *|PL_parser-E<gt>bufptr
867 Points to the current position of lexing inside the lexer buffer.
868 Characters around this point may be freely examined, within
869 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
870 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
871 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
873 Lexing code (whether in the Perl core or not) moves this pointer past
874 the characters that it consumes. It is also expected to perform some
875 bookkeeping whenever a newline character is consumed. This movement
876 can be more conveniently performed by the function L</lex_read_to>,
877 which handles newlines appropriately.
879 Interpretation of the buffer's octets can be abstracted out by
880 using the slightly higher-level functions L</lex_peek_unichar> and
881 L</lex_read_unichar>.
883 =for apidoc AmxU|char *|PL_parser-E<gt>linestart
885 Points to the start of the current line inside the lexer buffer.
886 This is useful for indicating at which column an error occurred, and
887 not much else. This must be updated by any lexing code that consumes
888 a newline; the function L</lex_read_to> handles this detail.
894 =for apidoc Amx|bool|lex_bufutf8
896 Indicates whether the octets in the lexer buffer
897 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
898 of Unicode characters. If not, they should be interpreted as Latin-1
899 characters. This is analogous to the C<SvUTF8> flag for scalars.
901 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
902 contains valid UTF-8. Lexing code must be robust in the face of invalid
905 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
906 is significant, but not the whole story regarding the input character
907 encoding. Normally, when a file is being read, the scalar contains octets
908 and its C<SvUTF8> flag is off, but the octets should be interpreted as
909 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
910 however, the scalar may have the C<SvUTF8> flag on, and in this case its
911 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
912 is in effect. This logic may change in the future; use this function
913 instead of implementing the logic yourself.
919 Perl_lex_bufutf8(pTHX)
925 =for apidoc Amx|char *|lex_grow_linestr|STRLEN len
927 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
928 at least C<len> octets (including terminating C<NUL>). Returns a
929 pointer to the reallocated buffer. This is necessary before making
930 any direct modification of the buffer that would increase its length.
931 L</lex_stuff_pvn> provides a more convenient way to insert text into
934 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
935 this function updates all of the lexer's variables that point directly
942 Perl_lex_grow_linestr(pTHX_ STRLEN len)
946 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
947 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
950 linestr = PL_parser->linestr;
951 buf = SvPVX(linestr);
952 if (len <= SvLEN(linestr))
955 /* Is the lex_shared linestr SV the same as the current linestr SV?
956 * Only in this case does re_eval_start need adjusting, since it
957 * points within lex_shared->ls_linestr's buffer */
958 current = ( !PL_parser->lex_shared->ls_linestr
959 || linestr == PL_parser->lex_shared->ls_linestr);
961 bufend_pos = PL_parser->bufend - buf;
962 bufptr_pos = PL_parser->bufptr - buf;
963 oldbufptr_pos = PL_parser->oldbufptr - buf;
964 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
965 linestart_pos = PL_parser->linestart - buf;
966 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
967 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
968 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
969 PL_parser->lex_shared->re_eval_start - buf : 0;
971 buf = sv_grow(linestr, len);
973 PL_parser->bufend = buf + bufend_pos;
974 PL_parser->bufptr = buf + bufptr_pos;
975 PL_parser->oldbufptr = buf + oldbufptr_pos;
976 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
977 PL_parser->linestart = buf + linestart_pos;
978 if (PL_parser->last_uni)
979 PL_parser->last_uni = buf + last_uni_pos;
980 if (PL_parser->last_lop)
981 PL_parser->last_lop = buf + last_lop_pos;
982 if (current && PL_parser->lex_shared->re_eval_start)
983 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
988 =for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
990 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
991 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
992 reallocating the buffer if necessary. This means that lexing code that
993 runs later will see the characters as if they had appeared in the input.
994 It is not recommended to do this as part of normal parsing, and most
995 uses of this facility run the risk of the inserted characters being
996 interpreted in an unintended manner.
998 The string to be inserted is represented by C<len> octets starting
999 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1000 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
1001 The characters are recoded for the lexer buffer, according to how the
1002 buffer is currently being interpreted (L</lex_bufutf8>). If a string
1003 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
1004 function is more convenient.
1010 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
1014 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1015 if (flags & ~(LEX_STUFF_UTF8))
1016 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1018 if (flags & LEX_STUFF_UTF8) {
1021 STRLEN highhalf = 0; /* Count of variants */
1022 const char *p, *e = pv+len;
1023 for (p = pv; p != e; p++) {
1024 if (! UTF8_IS_INVARIANT(*p)) {
1030 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1031 bufptr = PL_parser->bufptr;
1032 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1033 SvCUR_set(PL_parser->linestr,
1034 SvCUR(PL_parser->linestr) + len+highhalf);
1035 PL_parser->bufend += len+highhalf;
1036 for (p = pv; p != e; p++) {
1038 if (! UTF8_IS_INVARIANT(c)) {
1039 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1040 *bufptr++ = UTF8_TWO_BYTE_LO(c);
1042 *bufptr++ = (char)c;
1047 if (flags & LEX_STUFF_UTF8) {
1048 STRLEN highhalf = 0;
1049 const char *p, *e = pv+len;
1050 for (p = pv; p != e; p++) {
1052 if (UTF8_IS_ABOVE_LATIN1(c)) {
1053 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1054 "non-Latin-1 character into Latin-1 input");
1055 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1058 } else if (! UTF8_IS_INVARIANT(c)) {
1059 _force_out_malformed_utf8_message((U8 *) p, (U8 *) e,
1061 1 /* 1 means die */ );
1062 NOT_REACHED; /* NOTREACHED */
1067 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1068 bufptr = PL_parser->bufptr;
1069 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1070 SvCUR_set(PL_parser->linestr,
1071 SvCUR(PL_parser->linestr) + len-highhalf);
1072 PL_parser->bufend += len-highhalf;
1075 if (UTF8_IS_INVARIANT(*p)) {
1081 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1087 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1088 bufptr = PL_parser->bufptr;
1089 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1090 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1091 PL_parser->bufend += len;
1092 Copy(pv, bufptr, len, char);
1098 =for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1100 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1101 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1102 reallocating the buffer if necessary. This means that lexing code that
1103 runs later will see the characters as if they had appeared in the input.
1104 It is not recommended to do this as part of normal parsing, and most
1105 uses of this facility run the risk of the inserted characters being
1106 interpreted in an unintended manner.
1108 The string to be inserted is represented by octets starting at C<pv>
1109 and continuing to the first nul. These octets are interpreted as either
1110 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1111 in C<flags>. The characters are recoded for the lexer buffer, according
1112 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1113 If it is not convenient to nul-terminate a string to be inserted, the
1114 L</lex_stuff_pvn> function is more appropriate.
1120 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1122 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1123 lex_stuff_pvn(pv, strlen(pv), flags);
1127 =for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1129 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1130 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1131 reallocating the buffer if necessary. This means that lexing code that
1132 runs later will see the characters as if they had appeared in the input.
1133 It is not recommended to do this as part of normal parsing, and most
1134 uses of this facility run the risk of the inserted characters being
1135 interpreted in an unintended manner.
1137 The string to be inserted is the string value of C<sv>. The characters
1138 are recoded for the lexer buffer, according to how the buffer is currently
1139 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1140 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1141 need to construct a scalar.
1147 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1151 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1153 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1155 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1159 =for apidoc Amx|void|lex_unstuff|char *ptr
1161 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1162 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1163 This hides the discarded text from any lexing code that runs later,
1164 as if the text had never appeared.
1166 This is not the normal way to consume lexed text. For that, use
1173 Perl_lex_unstuff(pTHX_ char *ptr)
1177 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1178 buf = PL_parser->bufptr;
1180 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1183 bufend = PL_parser->bufend;
1185 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1186 unstuff_len = ptr - buf;
1187 Move(ptr, buf, bufend+1-ptr, char);
1188 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1189 PL_parser->bufend = bufend - unstuff_len;
1193 =for apidoc Amx|void|lex_read_to|char *ptr
1195 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1196 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1197 performing the correct bookkeeping whenever a newline character is passed.
1198 This is the normal way to consume lexed text.
1200 Interpretation of the buffer's octets can be abstracted out by
1201 using the slightly higher-level functions L</lex_peek_unichar> and
1202 L</lex_read_unichar>.
1208 Perl_lex_read_to(pTHX_ char *ptr)
1211 PERL_ARGS_ASSERT_LEX_READ_TO;
1212 s = PL_parser->bufptr;
1213 if (ptr < s || ptr > PL_parser->bufend)
1214 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1215 for (; s != ptr; s++)
1217 COPLINE_INC_WITH_HERELINES;
1218 PL_parser->linestart = s+1;
1220 PL_parser->bufptr = ptr;
1224 =for apidoc Amx|void|lex_discard_to|char *ptr
1226 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1227 up to C<ptr>. The remaining content of the buffer will be moved, and
1228 all pointers into the buffer updated appropriately. C<ptr> must not
1229 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1230 it is not permitted to discard text that has yet to be lexed.
1232 Normally it is not necessarily to do this directly, because it suffices to
1233 use the implicit discarding behaviour of L</lex_next_chunk> and things
1234 based on it. However, if a token stretches across multiple lines,
1235 and the lexing code has kept multiple lines of text in the buffer for
1236 that purpose, then after completion of the token it would be wise to
1237 explicitly discard the now-unneeded earlier lines, to avoid future
1238 multi-line tokens growing the buffer without bound.
1244 Perl_lex_discard_to(pTHX_ char *ptr)
1248 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1249 buf = SvPVX(PL_parser->linestr);
1251 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1254 if (ptr > PL_parser->bufptr)
1255 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1256 discard_len = ptr - buf;
1257 if (PL_parser->oldbufptr < ptr)
1258 PL_parser->oldbufptr = ptr;
1259 if (PL_parser->oldoldbufptr < ptr)
1260 PL_parser->oldoldbufptr = ptr;
1261 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1262 PL_parser->last_uni = NULL;
1263 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1264 PL_parser->last_lop = NULL;
1265 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1266 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1267 PL_parser->bufend -= discard_len;
1268 PL_parser->bufptr -= discard_len;
1269 PL_parser->oldbufptr -= discard_len;
1270 PL_parser->oldoldbufptr -= discard_len;
1271 if (PL_parser->last_uni)
1272 PL_parser->last_uni -= discard_len;
1273 if (PL_parser->last_lop)
1274 PL_parser->last_lop -= discard_len;
1278 =for apidoc Amx|bool|lex_next_chunk|U32 flags
1280 Reads in the next chunk of text to be lexed, appending it to
1281 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1282 looked to the end of the current chunk and wants to know more. It is
1283 usual, but not necessary, for lexing to have consumed the entirety of
1284 the current chunk at this time.
1286 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1287 chunk (i.e., the current chunk has been entirely consumed), normally the
1288 current chunk will be discarded at the same time that the new chunk is
1289 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1290 will not be discarded. If the current chunk has not been entirely
1291 consumed, then it will not be discarded regardless of the flag.
1293 Returns true if some new text was added to the buffer, or false if the
1294 buffer has reached the end of the input text.
1299 #define LEX_FAKE_EOF 0x80000000
1300 #define LEX_NO_TERM 0x40000000 /* here-doc */
1303 Perl_lex_next_chunk(pTHX_ U32 flags)
1307 STRLEN old_bufend_pos, new_bufend_pos;
1308 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1309 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1310 bool got_some_for_debugger = 0;
1312 const U8* first_bad_char_loc;
1314 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1315 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1316 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1318 linestr = PL_parser->linestr;
1319 buf = SvPVX(linestr);
1320 if (!(flags & LEX_KEEP_PREVIOUS)
1321 && PL_parser->bufptr == PL_parser->bufend)
1323 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1325 if (PL_parser->last_uni != PL_parser->bufend)
1326 PL_parser->last_uni = NULL;
1327 if (PL_parser->last_lop != PL_parser->bufend)
1328 PL_parser->last_lop = NULL;
1329 last_uni_pos = last_lop_pos = 0;
1333 old_bufend_pos = PL_parser->bufend - buf;
1334 bufptr_pos = PL_parser->bufptr - buf;
1335 oldbufptr_pos = PL_parser->oldbufptr - buf;
1336 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1337 linestart_pos = PL_parser->linestart - buf;
1338 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1339 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1341 if (flags & LEX_FAKE_EOF) {
1343 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1345 } else if (filter_gets(linestr, old_bufend_pos)) {
1347 got_some_for_debugger = 1;
1348 } else if (flags & LEX_NO_TERM) {
1351 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1354 /* End of real input. Close filehandle (unless it was STDIN),
1355 * then add implicit termination.
1357 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1358 PerlIO_clearerr(PL_parser->rsfp);
1359 else if (PL_parser->rsfp)
1360 (void)PerlIO_close(PL_parser->rsfp);
1361 PL_parser->rsfp = NULL;
1362 PL_parser->in_pod = PL_parser->filtered = 0;
1363 if (!PL_in_eval && PL_minus_p) {
1365 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1366 PL_minus_n = PL_minus_p = 0;
1367 } else if (!PL_in_eval && PL_minus_n) {
1368 sv_catpvs(linestr, /*{*/";}");
1371 sv_catpvs(linestr, ";");
1374 buf = SvPVX(linestr);
1375 new_bufend_pos = SvCUR(linestr);
1376 PL_parser->bufend = buf + new_bufend_pos;
1377 PL_parser->bufptr = buf + bufptr_pos;
1379 if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr,
1380 PL_parser->bufend - PL_parser->bufptr,
1381 &first_bad_char_loc))
1383 _force_out_malformed_utf8_message(first_bad_char_loc,
1384 (U8 *) PL_parser->bufend,
1386 1 /* 1 means die */ );
1387 NOT_REACHED; /* NOTREACHED */
1390 PL_parser->oldbufptr = buf + oldbufptr_pos;
1391 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1392 PL_parser->linestart = buf + linestart_pos;
1393 if (PL_parser->last_uni)
1394 PL_parser->last_uni = buf + last_uni_pos;
1395 if (PL_parser->last_lop)
1396 PL_parser->last_lop = buf + last_lop_pos;
1397 if (PL_parser->preambling != NOLINE) {
1398 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1399 PL_parser->preambling = NOLINE;
1401 if ( got_some_for_debugger
1402 && PERLDB_LINE_OR_SAVESRC
1403 && PL_curstash != PL_debstash)
1405 /* debugger active and we're not compiling the debugger code,
1406 * so store the line into the debugger's array of lines
1408 update_debugger_info(NULL, buf+old_bufend_pos,
1409 new_bufend_pos-old_bufend_pos);
1415 =for apidoc Amx|I32|lex_peek_unichar|U32 flags
1417 Looks ahead one (Unicode) character in the text currently being lexed.
1418 Returns the codepoint (unsigned integer value) of the next character,
1419 or -1 if lexing has reached the end of the input text. To consume the
1420 peeked character, use L</lex_read_unichar>.
1422 If the next character is in (or extends into) the next chunk of input
1423 text, the next chunk will be read in. Normally the current chunk will be
1424 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1425 bit set, then the current chunk will not be discarded.
1427 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1428 is encountered, an exception is generated.
1434 Perl_lex_peek_unichar(pTHX_ U32 flags)
1438 if (flags & ~(LEX_KEEP_PREVIOUS))
1439 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1440 s = PL_parser->bufptr;
1441 bufend = PL_parser->bufend;
1447 if (!lex_next_chunk(flags))
1449 s = PL_parser->bufptr;
1450 bufend = PL_parser->bufend;
1453 if (UTF8_IS_INVARIANT(head))
1455 if (UTF8_IS_START(head)) {
1456 len = UTF8SKIP(&head);
1457 while ((STRLEN)(bufend-s) < len) {
1458 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1460 s = PL_parser->bufptr;
1461 bufend = PL_parser->bufend;
1464 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1465 if (retlen == (STRLEN)-1) {
1466 _force_out_malformed_utf8_message((U8 *) s,
1469 1 /* 1 means die */ );
1470 NOT_REACHED; /* NOTREACHED */
1475 if (!lex_next_chunk(flags))
1477 s = PL_parser->bufptr;
1484 =for apidoc Amx|I32|lex_read_unichar|U32 flags
1486 Reads the next (Unicode) character in the text currently being lexed.
1487 Returns the codepoint (unsigned integer value) of the character read,
1488 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1489 if lexing has reached the end of the input text. To non-destructively
1490 examine the next character, use L</lex_peek_unichar> instead.
1492 If the next character is in (or extends into) the next chunk of input
1493 text, the next chunk will be read in. Normally the current chunk will be
1494 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1495 bit set, then the current chunk will not be discarded.
1497 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1498 is encountered, an exception is generated.
1504 Perl_lex_read_unichar(pTHX_ U32 flags)
1507 if (flags & ~(LEX_KEEP_PREVIOUS))
1508 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1509 c = lex_peek_unichar(flags);
1512 COPLINE_INC_WITH_HERELINES;
1514 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1516 ++(PL_parser->bufptr);
1522 =for apidoc Amx|void|lex_read_space|U32 flags
1524 Reads optional spaces, in Perl style, in the text currently being
1525 lexed. The spaces may include ordinary whitespace characters and
1526 Perl-style comments. C<#line> directives are processed if encountered.
1527 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1528 at a non-space character (or the end of the input text).
1530 If spaces extend into the next chunk of input text, the next chunk will
1531 be read in. Normally the current chunk will be discarded at the same
1532 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1533 chunk will not be discarded.
1538 #define LEX_NO_INCLINE 0x40000000
1539 #define LEX_NO_NEXT_CHUNK 0x80000000
1542 Perl_lex_read_space(pTHX_ U32 flags)
1545 const bool can_incline = !(flags & LEX_NO_INCLINE);
1546 bool need_incline = 0;
1547 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1548 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1549 s = PL_parser->bufptr;
1550 bufend = PL_parser->bufend;
1556 } while (!(c == '\n' || (c == 0 && s == bufend)));
1557 } else if (c == '\n') {
1560 PL_parser->linestart = s;
1566 } else if (isSPACE(c)) {
1568 } else if (c == 0 && s == bufend) {
1571 if (flags & LEX_NO_NEXT_CHUNK)
1573 PL_parser->bufptr = s;
1574 l = CopLINE(PL_curcop);
1575 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1576 got_more = lex_next_chunk(flags);
1577 CopLINE_set(PL_curcop, l);
1578 s = PL_parser->bufptr;
1579 bufend = PL_parser->bufend;
1582 if (can_incline && need_incline && PL_parser->rsfp) {
1592 PL_parser->bufptr = s;
1597 =for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1599 This function performs syntax checking on a prototype, C<proto>.
1600 If C<warn> is true, any illegal characters or mismatched brackets
1601 will trigger illegalproto warnings, declaring that they were
1602 detected in the prototype for C<name>.
1604 The return value is C<true> if this is a valid prototype, and
1605 C<false> if it is not, regardless of whether C<warn> was C<true> or
1608 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1615 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1617 STRLEN len, origlen;
1619 bool bad_proto = FALSE;
1620 bool in_brackets = FALSE;
1621 bool after_slash = FALSE;
1622 char greedy_proto = ' ';
1623 bool proto_after_greedy_proto = FALSE;
1624 bool must_be_last = FALSE;
1625 bool underscore = FALSE;
1626 bool bad_proto_after_underscore = FALSE;
1628 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1633 p = SvPV(proto, len);
1635 for (; len--; p++) {
1638 proto_after_greedy_proto = TRUE;
1640 if (!strchr(";@%", *p))
1641 bad_proto_after_underscore = TRUE;
1644 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1651 in_brackets = FALSE;
1652 else if ((*p == '@' || *p == '%')
1656 must_be_last = TRUE;
1665 after_slash = FALSE;
1670 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1673 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1674 origlen, UNI_DISPLAY_ISPRINT)
1675 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1677 if (proto_after_greedy_proto)
1678 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1679 "Prototype after '%c' for %" SVf " : %s",
1680 greedy_proto, SVfARG(name), p);
1682 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1683 "Missing ']' in prototype for %" SVf " : %s",
1686 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1687 "Illegal character in prototype for %" SVf " : %s",
1689 if (bad_proto_after_underscore)
1690 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1691 "Illegal character after '_' in prototype for %" SVf " : %s",
1695 return (! (proto_after_greedy_proto || bad_proto) );
1700 * This subroutine has nothing to do with tilting, whether at windmills
1701 * or pinball tables. Its name is short for "increment line". It
1702 * increments the current line number in CopLINE(PL_curcop) and checks
1703 * to see whether the line starts with a comment of the form
1704 * # line 500 "foo.pm"
1705 * If so, it sets the current line number and file to the values in the comment.
1709 S_incline(pTHX_ const char *s)
1717 PERL_ARGS_ASSERT_INCLINE;
1719 COPLINE_INC_WITH_HERELINES;
1720 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1721 && s+1 == PL_bufend && *s == ';') {
1722 /* fake newline in string eval */
1723 CopLINE_dec(PL_curcop);
1728 while (SPACE_OR_TAB(*s))
1730 if (strEQs(s, "line"))
1734 if (SPACE_OR_TAB(*s))
1738 while (SPACE_OR_TAB(*s))
1746 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1748 while (SPACE_OR_TAB(*s))
1750 if (*s == '"' && (t = strchr(s+1, '"'))) {
1756 while (*t && !isSPACE(*t))
1760 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1762 if (*e != '\n' && *e != '\0')
1763 return; /* false alarm */
1765 if (!grok_atoUV(n, &uv, &e))
1767 line_num = ((line_t)uv) - 1;
1770 const STRLEN len = t - s;
1772 if (!PL_rsfp && !PL_parser->filtered) {
1773 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1774 * to *{"::_<newfilename"} */
1775 /* However, the long form of evals is only turned on by the
1776 debugger - usually they're "(eval %lu)" */
1777 GV * const cfgv = CopFILEGV(PL_curcop);
1780 STRLEN tmplen2 = len;
1784 if (tmplen2 + 2 <= sizeof smallbuf)
1787 Newx(tmpbuf2, tmplen2 + 2, char);
1792 memcpy(tmpbuf2 + 2, s, tmplen2);
1795 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1797 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1798 /* adjust ${"::_<newfilename"} to store the new file name */
1799 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1800 /* The line number may differ. If that is the case,
1801 alias the saved lines that are in the array.
1802 Otherwise alias the whole array. */
1803 if (CopLINE(PL_curcop) == line_num) {
1804 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1805 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1807 else if (GvAV(cfgv)) {
1808 AV * const av = GvAV(cfgv);
1809 const I32 start = CopLINE(PL_curcop)+1;
1810 I32 items = AvFILLp(av) - start;
1812 AV * const av2 = GvAVn(gv2);
1813 SV **svp = AvARRAY(av) + start;
1814 I32 l = (I32)line_num+1;
1816 av_store(av2, l++, SvREFCNT_inc(*svp++));
1821 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1824 CopFILE_free(PL_curcop);
1825 CopFILE_setn(PL_curcop, s, len);
1827 CopLINE_set(PL_curcop, line_num);
1831 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1833 AV *av = CopFILEAVx(PL_curcop);
1836 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1838 sv = *av_fetch(av, 0, 1);
1839 SvUPGRADE(sv, SVt_PVMG);
1841 if (!SvPOK(sv)) SvPVCLEAR(sv);
1843 sv_catsv(sv, orig_sv);
1845 sv_catpvn(sv, buf, len);
1850 if (PL_parser->preambling == NOLINE)
1851 av_store(av, CopLINE(PL_curcop), sv);
1857 * Called to gobble the appropriate amount and type of whitespace.
1858 * Skips comments as well.
1859 * Returns the next character after the whitespace that is skipped.
1862 * Same thing, but look ahead without incrementing line numbers or
1863 * adjusting PL_linestart.
1866 #define skipspace(s) skipspace_flags(s, 0)
1867 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1870 S_skipspace_flags(pTHX_ char *s, U32 flags)
1872 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1873 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1874 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1877 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1879 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1880 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1881 LEX_NO_NEXT_CHUNK : 0));
1883 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1884 if (PL_linestart > PL_bufptr)
1885 PL_bufptr = PL_linestart;
1893 * Check the unary operators to ensure there's no ambiguity in how they're
1894 * used. An ambiguous piece of code would be:
1896 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1897 * the +5 is its argument.
1906 if (PL_oldoldbufptr != PL_last_uni)
1908 while (isSPACE(*PL_last_uni))
1911 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
1912 s += UTF ? UTF8SKIP(s) : 1;
1913 if ((t = strchr(s, '(')) && t < PL_bufptr)
1916 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1917 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
1918 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1922 * LOP : macro to build a list operator. Its behaviour has been replaced
1923 * with a subroutine, S_lop() for which LOP is just another name.
1926 #define LOP(f,x) return lop(f,x,s)
1930 * Build a list operator (or something that might be one). The rules:
1931 * - if we have a next token, then it's a list operator (no parens) for
1932 * which the next token has already been parsed; e.g.,
1935 * - if the next thing is an opening paren, then it's a function
1936 * - else it's a list operator
1940 S_lop(pTHX_ I32 f, U8 x, char *s)
1942 PERL_ARGS_ASSERT_LOP;
1947 PL_last_lop = PL_oldbufptr;
1948 PL_last_lop_op = (OPCODE)f;
1953 return REPORT(FUNC);
1956 return REPORT(FUNC);
1959 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1960 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
1961 return REPORT(LSTOP);
1967 * When the lexer realizes it knows the next token (for instance,
1968 * it is reordering tokens for the parser) then it can call S_force_next
1969 * to know what token to return the next time the lexer is called. Caller
1970 * will need to set PL_nextval[] and possibly PL_expect to ensure
1971 * the lexer handles the token correctly.
1975 S_force_next(pTHX_ I32 type)
1979 PerlIO_printf(Perl_debug_log, "### forced token:\n");
1980 tokereport(type, &NEXTVAL_NEXTTOKE);
1983 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
1984 PL_nexttype[PL_nexttoke] = type;
1991 * This subroutine handles postfix deref syntax after the arrow has already
1992 * been emitted. @* $* etc. are emitted as two separate token right here.
1993 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1994 * only the first, leaving yylex to find the next.
1998 S_postderef(pTHX_ int const funny, char const next)
2000 assert(funny == DOLSHARP || strchr("$@%&*", funny));
2002 PL_expect = XOPERATOR;
2003 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
2004 assert('@' == funny || '$' == funny || DOLSHARP == funny);
2005 PL_lex_state = LEX_INTERPEND;
2007 force_next(POSTJOIN);
2013 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2014 && !PL_lex_brackets)
2016 PL_expect = XOPERATOR;
2025 int yyc = PL_parser->yychar;
2026 if (yyc != YYEMPTY) {
2028 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2029 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
2030 PL_lex_allbrackets--;
2032 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2033 } else if (yyc == '('/*)*/) {
2034 PL_lex_allbrackets--;
2039 PL_parser->yychar = YYEMPTY;
2044 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2046 SV * const sv = newSVpvn_utf8(start, len,
2049 && !is_utf8_invariant_string((const U8*)start, len)
2050 && is_utf8_string((const U8*)start, len));
2056 * When the lexer knows the next thing is a word (for instance, it has
2057 * just seen -> and it knows that the next char is a word char, then
2058 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2062 * char *start : buffer position (must be within PL_linestr)
2063 * int token : PL_next* will be this type of bare word
2064 * (e.g., METHOD,BAREWORD)
2065 * int check_keyword : if true, Perl checks to make sure the word isn't
2066 * a keyword (do this if the word is a label, e.g. goto FOO)
2067 * int allow_pack : if true, : characters will also be allowed (require,
2068 * use, etc. do this)
2072 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2077 PERL_ARGS_ASSERT_FORCE_WORD;
2079 start = skipspace(start);
2081 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2082 || (allow_pack && *s == ':' && s[1] == ':') )
2084 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2085 if (check_keyword) {
2086 char *s2 = PL_tokenbuf;
2088 if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
2090 if (keyword(s2, len2, 0))
2093 if (token == METHOD) {
2098 PL_expect = XOPERATOR;
2101 NEXTVAL_NEXTTOKE.opval
2102 = newSVOP(OP_CONST,0,
2103 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2104 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2112 * Called when the lexer wants $foo *foo &foo etc, but the program
2113 * text only contains the "foo" portion. The first argument is a pointer
2114 * to the "foo", and the second argument is the type symbol to prefix.
2115 * Forces the next token to be a "BAREWORD".
2116 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2120 S_force_ident(pTHX_ const char *s, int kind)
2122 PERL_ARGS_ASSERT_FORCE_IDENT;
2125 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2126 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2127 UTF ? SVf_UTF8 : 0));
2128 NEXTVAL_NEXTTOKE.opval = o;
2129 force_next(BAREWORD);
2131 o->op_private = OPpCONST_ENTERED;
2132 /* XXX see note in pp_entereval() for why we forgo typo
2133 warnings if the symbol must be introduced in an eval.
2135 gv_fetchpvn_flags(s, len,
2136 (PL_in_eval ? GV_ADDMULTI
2137 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2138 kind == '$' ? SVt_PV :
2139 kind == '@' ? SVt_PVAV :
2140 kind == '%' ? SVt_PVHV :
2148 S_force_ident_maybe_lex(pTHX_ char pit)
2150 NEXTVAL_NEXTTOKE.ival = pit;
2155 Perl_str_to_version(pTHX_ SV *sv)
2160 const char *start = SvPV_const(sv,len);
2161 const char * const end = start + len;
2162 const bool utf = cBOOL(SvUTF8(sv));
2164 PERL_ARGS_ASSERT_STR_TO_VERSION;
2166 while (start < end) {
2170 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2175 retval += ((NV)n)/nshift;
2184 * Forces the next token to be a version number.
2185 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2186 * and if "guessing" is TRUE, then no new token is created (and the caller
2187 * must use an alternative parsing method).
2191 S_force_version(pTHX_ char *s, int guessing)
2196 PERL_ARGS_ASSERT_FORCE_VERSION;
2204 while (isDIGIT(*d) || *d == '_' || *d == '.')
2206 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2208 s = scan_num(s, &pl_yylval);
2209 version = pl_yylval.opval;
2210 ver = cSVOPx(version)->op_sv;
2211 if (SvPOK(ver) && !SvNIOK(ver)) {
2212 SvUPGRADE(ver, SVt_PVNV);
2213 SvNV_set(ver, str_to_version(ver));
2214 SvNOK_on(ver); /* hint that it is a version */
2217 else if (guessing) {
2222 /* NOTE: The parser sees the package name and the VERSION swapped */
2223 NEXTVAL_NEXTTOKE.opval = version;
2224 force_next(BAREWORD);
2230 * S_force_strict_version
2231 * Forces the next token to be a version number using strict syntax rules.
2235 S_force_strict_version(pTHX_ char *s)
2238 const char *errstr = NULL;
2240 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2242 while (isSPACE(*s)) /* leading whitespace */
2245 if (is_STRICT_VERSION(s,&errstr)) {
2247 s = (char *)scan_version(s, ver, 0);
2248 version = newSVOP(OP_CONST, 0, ver);
2250 else if ((*s != ';' && *s != '{' && *s != '}' )
2251 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2255 yyerror(errstr); /* version required */
2259 /* NOTE: The parser sees the package name and the VERSION swapped */
2260 NEXTVAL_NEXTTOKE.opval = version;
2261 force_next(BAREWORD);
2268 * Tokenize a quoted string passed in as an SV. It finds the next
2269 * chunk, up to end of string or a backslash. It may make a new
2270 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2275 S_tokeq(pTHX_ SV *sv)
2282 PERL_ARGS_ASSERT_TOKEQ;
2286 assert (!SvIsCOW(sv));
2287 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2291 /* This is relying on the SV being "well formed" with a trailing '\0' */
2292 while (s < send && !(*s == '\\' && s[1] == '\\'))
2297 if ( PL_hints & HINT_NEW_STRING ) {
2298 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2299 SVs_TEMP | SvUTF8(sv));
2303 if (s + 1 < send && (s[1] == '\\'))
2304 s++; /* all that, just for this */
2309 SvCUR_set(sv, d - SvPVX_const(sv));
2311 if ( PL_hints & HINT_NEW_STRING )
2312 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
2317 * Now come three functions related to double-quote context,
2318 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2319 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2320 * interact with PL_lex_state, and create fake ( ... ) argument lists
2321 * to handle functions and concatenation.
2325 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2330 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2332 * Pattern matching will set PL_lex_op to the pattern-matching op to
2333 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2335 * OP_CONST is easy--just make the new op and return.
2337 * Everything else becomes a FUNC.
2339 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2340 * had an OP_CONST. This just sets us up for a
2341 * call to S_sublex_push().
2345 S_sublex_start(pTHX)
2347 const I32 op_type = pl_yylval.ival;
2349 if (op_type == OP_NULL) {
2350 pl_yylval.opval = PL_lex_op;
2354 if (op_type == OP_CONST) {
2355 SV *sv = PL_lex_stuff;
2356 PL_lex_stuff = NULL;
2359 if (SvTYPE(sv) == SVt_PVIV) {
2360 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2362 const char * const p = SvPV_const(sv, len);
2363 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2367 pl_yylval.opval = newSVOP(op_type, 0, sv);
2371 PL_parser->lex_super_state = PL_lex_state;
2372 PL_parser->lex_sub_inwhat = (U16)op_type;
2373 PL_parser->lex_sub_op = PL_lex_op;
2374 PL_lex_state = LEX_INTERPPUSH;
2378 pl_yylval.opval = PL_lex_op;
2388 * Create a new scope to save the lexing state. The scope will be
2389 * ended in S_sublex_done. Returns a '(', starting the function arguments
2390 * to the uc, lc, etc. found before.
2391 * Sets PL_lex_state to LEX_INTERPCONCAT.
2398 const bool is_heredoc = PL_multi_close == '<';
2401 PL_lex_state = PL_parser->lex_super_state;
2402 SAVEI8(PL_lex_dojoin);
2403 SAVEI32(PL_lex_brackets);
2404 SAVEI32(PL_lex_allbrackets);
2405 SAVEI32(PL_lex_formbrack);
2406 SAVEI8(PL_lex_fakeeof);
2407 SAVEI32(PL_lex_casemods);
2408 SAVEI32(PL_lex_starts);
2409 SAVEI8(PL_lex_state);
2410 SAVESPTR(PL_lex_repl);
2411 SAVEVPTR(PL_lex_inpat);
2412 SAVEI16(PL_lex_inwhat);
2415 SAVECOPLINE(PL_curcop);
2416 SAVEI32(PL_multi_end);
2417 SAVEI32(PL_parser->herelines);
2418 PL_parser->herelines = 0;
2420 SAVEIV(PL_multi_close);
2421 SAVEPPTR(PL_bufptr);
2422 SAVEPPTR(PL_bufend);
2423 SAVEPPTR(PL_oldbufptr);
2424 SAVEPPTR(PL_oldoldbufptr);
2425 SAVEPPTR(PL_last_lop);
2426 SAVEPPTR(PL_last_uni);
2427 SAVEPPTR(PL_linestart);
2428 SAVESPTR(PL_linestr);
2429 SAVEGENERICPV(PL_lex_brackstack);
2430 SAVEGENERICPV(PL_lex_casestack);
2431 SAVEGENERICPV(PL_parser->lex_shared);
2432 SAVEBOOL(PL_parser->lex_re_reparsing);
2433 SAVEI32(PL_copline);
2435 /* The here-doc parser needs to be able to peek into outer lexing
2436 scopes to find the body of the here-doc. So we put PL_linestr and
2437 PL_bufptr into lex_shared, to ‘share’ those values.
2439 PL_parser->lex_shared->ls_linestr = PL_linestr;
2440 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2442 PL_linestr = PL_lex_stuff;
2443 PL_lex_repl = PL_parser->lex_sub_repl;
2444 PL_lex_stuff = NULL;
2445 PL_parser->lex_sub_repl = NULL;
2447 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2448 set for an inner quote-like operator and then an error causes scope-
2449 popping. We must not have a PL_lex_stuff value left dangling, as
2450 that breaks assumptions elsewhere. See bug #123617. */
2451 SAVEGENERICSV(PL_lex_stuff);
2452 SAVEGENERICSV(PL_parser->lex_sub_repl);
2454 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2455 = SvPVX(PL_linestr);
2456 PL_bufend += SvCUR(PL_linestr);
2457 PL_last_lop = PL_last_uni = NULL;
2458 SAVEFREESV(PL_linestr);
2459 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2461 PL_lex_dojoin = FALSE;
2462 PL_lex_brackets = PL_lex_formbrack = 0;
2463 PL_lex_allbrackets = 0;
2464 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2465 Newx(PL_lex_brackstack, 120, char);
2466 Newx(PL_lex_casestack, 12, char);
2467 PL_lex_casemods = 0;
2468 *PL_lex_casestack = '\0';
2470 PL_lex_state = LEX_INTERPCONCAT;
2472 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2473 PL_copline = NOLINE;
2475 Newxz(shared, 1, LEXSHARED);
2476 shared->ls_prev = PL_parser->lex_shared;
2477 PL_parser->lex_shared = shared;
2479 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2480 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2481 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2482 PL_lex_inpat = PL_parser->lex_sub_op;
2484 PL_lex_inpat = NULL;
2486 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2487 PL_in_eval &= ~EVAL_RE_REPARSING;
2494 * Restores lexer state after a S_sublex_push.
2500 if (!PL_lex_starts++) {
2501 SV * const sv = newSVpvs("");
2502 if (SvUTF8(PL_linestr))
2504 PL_expect = XOPERATOR;
2505 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2509 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2510 PL_lex_state = LEX_INTERPCASEMOD;
2514 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2515 assert(PL_lex_inwhat != OP_TRANSR);
2517 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2518 PL_linestr = PL_lex_repl;
2520 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2521 PL_bufend += SvCUR(PL_linestr);
2522 PL_last_lop = PL_last_uni = NULL;
2523 PL_lex_dojoin = FALSE;
2524 PL_lex_brackets = 0;
2525 PL_lex_allbrackets = 0;
2526 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2527 PL_lex_casemods = 0;
2528 *PL_lex_casestack = '\0';
2530 if (SvEVALED(PL_lex_repl)) {
2531 PL_lex_state = LEX_INTERPNORMAL;
2533 /* we don't clear PL_lex_repl here, so that we can check later
2534 whether this is an evalled subst; that means we rely on the
2535 logic to ensure sublex_done() is called again only via the
2536 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2539 PL_lex_state = LEX_INTERPCONCAT;
2542 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2543 CopLINE(PL_curcop) +=
2544 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2545 + PL_parser->herelines;
2546 PL_parser->herelines = 0;
2551 const line_t l = CopLINE(PL_curcop);
2553 if (PL_multi_close == '<')
2554 PL_parser->herelines += l - PL_multi_end;
2555 PL_bufend = SvPVX(PL_linestr);
2556 PL_bufend += SvCUR(PL_linestr);
2557 PL_expect = XOPERATOR;
2563 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2565 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2566 * interior, hence to the "}". Finds what the name resolves to, returning
2567 * an SV* containing it; NULL if no valid one found */
2569 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
2576 const U8* first_bad_char_loc;
2577 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2579 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2582 deprecate_fatal_in("5.28", "Unknown charname '' is deprecated");
2586 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2588 &first_bad_char_loc))
2590 _force_out_malformed_utf8_message(first_bad_char_loc,
2591 (U8 *) PL_parser->bufend,
2593 0 /* 0 means don't die */ );
2594 yyerror_pv(Perl_form(aTHX_
2595 "Malformed UTF-8 character immediately after '%.*s'",
2596 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
2601 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2602 /* include the <}> */
2603 e - backslash_ptr + 1);
2605 SvREFCNT_dec_NN(res);
2609 /* See if the charnames handler is the Perl core's, and if so, we can skip
2610 * the validation needed for a user-supplied one, as Perl's does its own
2612 table = GvHV(PL_hintgv); /* ^H */
2613 cvp = hv_fetchs(table, "charnames", FALSE);
2614 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2615 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2617 const char * const name = HvNAME(stash);
2618 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2619 && strEQ(name, "_charnames")) {
2624 /* Here, it isn't Perl's charname handler. We can't rely on a
2625 * user-supplied handler to validate the input name. For non-ut8 input,
2626 * look to see that the first character is legal. Then loop through the
2627 * rest checking that each is a continuation */
2629 /* This code makes the reasonable assumption that the only Latin1-range
2630 * characters that begin a character name alias are alphabetic, otherwise
2631 * would have to create a isCHARNAME_BEGIN macro */
2634 if (! isALPHAU(*s)) {
2639 if (! isCHARNAME_CONT(*s)) {
2642 if (*s == ' ' && *(s-1) == ' ') {
2649 /* Similarly for utf8. For invariants can check directly; for other
2650 * Latin1, can calculate their code point and check; otherwise use a
2652 if (UTF8_IS_INVARIANT(*s)) {
2653 if (! isALPHAU(*s)) {
2657 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2658 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2664 if (! PL_utf8_charname_begin) {
2665 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2666 PL_utf8_charname_begin = _core_swash_init("utf8",
2667 "_Perl_Charname_Begin",
2669 1, 0, NULL, &flags);
2671 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2678 if (UTF8_IS_INVARIANT(*s)) {
2679 if (! isCHARNAME_CONT(*s)) {
2682 if (*s == ' ' && *(s-1) == ' ') {
2687 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2688 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2695 if (! PL_utf8_charname_continue) {
2696 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2697 PL_utf8_charname_continue = _core_swash_init("utf8",
2698 "_Perl_Charname_Continue",
2700 1, 0, NULL, &flags);
2702 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2709 if (*(s-1) == ' ') {
2712 "charnames alias definitions may not contain trailing "
2713 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2714 (int)(s - backslash_ptr + 1), backslash_ptr,
2715 (int)(e - s + 1), s + 1
2717 UTF ? SVf_UTF8 : 0);
2721 if (SvUTF8(res)) { /* Don't accept malformed input */
2722 const U8* first_bad_char_loc;
2724 const char* const str = SvPV_const(res, len);
2725 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2726 _force_out_malformed_utf8_message(first_bad_char_loc,
2727 (U8 *) PL_parser->bufend,
2729 0 /* 0 means don't die */ );
2732 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2733 (int) (e - backslash_ptr + 1), backslash_ptr,
2734 (int) ((char *) first_bad_char_loc - str), str
2745 /* The final %.*s makes sure that should the trailing NUL be missing
2746 * that this print won't run off the end of the string */
2749 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2750 (int)(s - backslash_ptr + 1), backslash_ptr,
2751 (int)(e - s + 1), s + 1
2753 UTF ? SVf_UTF8 : 0);
2760 "charnames alias definitions may not contain a sequence of "
2761 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2762 (int)(s - backslash_ptr + 1), backslash_ptr,
2763 (int)(e - s + 1), s + 1
2765 UTF ? SVf_UTF8 : 0);
2772 Extracts the next constant part of a pattern, double-quoted string,
2773 or transliteration. This is terrifying code.
2775 For example, in parsing the double-quoted string "ab\x63$d", it would
2776 stop at the '$' and return an OP_CONST containing 'abc'.
2778 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2779 processing a pattern (PL_lex_inpat is true), a transliteration
2780 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2782 Returns a pointer to the character scanned up to. If this is
2783 advanced from the start pointer supplied (i.e. if anything was
2784 successfully parsed), will leave an OP_CONST for the substring scanned
2785 in pl_yylval. Caller must intuit reason for not parsing further
2786 by looking at the next characters herself.
2790 \N{FOO} => \N{U+hex_for_character_FOO}
2791 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2794 all other \-char, including \N and \N{ apart from \N{ABC}
2797 @ and $ where it appears to be a var, but not for $ as tail anchor
2801 In transliterations:
2802 characters are VERY literal, except for - not at the start or end
2803 of the string, which indicates a range. However some backslash sequences
2804 are recognized: \r, \n, and the like
2805 \007 \o{}, \x{}, \N{}
2806 If all elements in the transliteration are below 256,
2807 scan_const expands the range to the full set of intermediate
2808 characters. If the range is in utf8, the hyphen is replaced with
2809 a certain range mark which will be handled by pmtrans() in op.c.
2811 In double-quoted strings:
2813 all those recognized in transliterations
2814 deprecated backrefs: \1 (in substitution replacements)
2815 case and quoting: \U \Q \E
2818 scan_const does *not* construct ops to handle interpolated strings.
2819 It stops processing as soon as it finds an embedded $ or @ variable
2820 and leaves it to the caller to work out what's going on.
2822 embedded arrays (whether in pattern or not) could be:
2823 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2825 $ in double-quoted strings must be the symbol of an embedded scalar.
2827 $ in pattern could be $foo or could be tail anchor. Assumption:
2828 it's a tail anchor if $ is the last thing in the string, or if it's
2829 followed by one of "()| \r\n\t"
2831 \1 (backreferences) are turned into $1 in substitutions
2833 The structure of the code is
2834 while (there's a character to process) {
2835 handle transliteration ranges
2836 skip regexp comments /(?#comment)/ and codes /(?{code})/
2837 skip #-initiated comments in //x patterns
2838 check for embedded arrays
2839 check for embedded scalars
2841 deprecate \1 in substitution replacements
2842 handle string-changing backslashes \l \U \Q \E, etc.
2843 switch (what was escaped) {
2844 handle \- in a transliteration (becomes a literal -)
2845 if a pattern and not \N{, go treat as regular character
2846 handle \132 (octal characters)
2847 handle \x15 and \x{1234} (hex characters)
2848 handle \N{name} (named characters, also \N{3,5} in a pattern)
2849 handle \cV (control characters)
2850 handle printf-style backslashes (\f, \r, \n, etc)
2853 } (end if backslash)
2854 handle regular character
2855 } (end while character to read)
2860 S_scan_const(pTHX_ char *start)
2862 char *send = PL_bufend; /* end of the constant */
2863 SV *sv = newSV(send - start); /* sv for the constant. See note below
2865 char *s = start; /* start of the constant */
2866 char *d = SvPVX(sv); /* destination for copies */
2867 bool dorange = FALSE; /* are we in a translit range? */
2868 bool didrange = FALSE; /* did we just finish a range? */
2869 bool in_charclass = FALSE; /* within /[...]/ */
2870 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2871 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2872 UTF8? But, this can show as true
2873 when the source isn't utf8, as for
2874 example when it is entirely composed
2876 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2877 number of characters found so far
2878 that will expand (into 2 bytes)
2879 should we have to convert to
2881 SV *res; /* result from charnames */
2882 STRLEN offset_to_max; /* The offset in the output to where the range
2883 high-end character is temporarily placed */
2885 /* Does something require special handling in tr/// ? This avoids extra
2886 * work in a less likely case. As such, khw didn't feel it was worth
2887 * adding any branches to the more mainline code to handle this, which
2888 * means that this doesn't get set in some circumstances when things like
2889 * \x{100} get expanded out. As a result there needs to be extra testing
2890 * done in the tr code */
2891 bool has_above_latin1 = FALSE;
2893 /* Note on sizing: The scanned constant is placed into sv, which is
2894 * initialized by newSV() assuming one byte of output for every byte of
2895 * input. This routine expects newSV() to allocate an extra byte for a
2896 * trailing NUL, which this routine will append if it gets to the end of
2897 * the input. There may be more bytes of input than output (eg., \N{LATIN
2898 * CAPITAL LETTER A}), or more output than input if the constant ends up
2899 * recoded to utf8, but each time a construct is found that might increase
2900 * the needed size, SvGROW() is called. Its size parameter each time is
2901 * based on the best guess estimate at the time, namely the length used so
2902 * far, plus the length the current construct will occupy, plus room for
2903 * the trailing NUL, plus one byte for every input byte still unscanned */
2905 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2908 int backslash_N = 0; /* ? was the character from \N{} */
2909 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2910 platform-specific like \x65 */
2913 PERL_ARGS_ASSERT_SCAN_CONST;
2915 assert(PL_lex_inwhat != OP_TRANSR);
2916 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2917 /* If we are doing a trans and we know we want UTF8 set expectation */
2918 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2919 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2922 /* Protect sv from errors and fatal warnings. */
2923 ENTER_with_name("scan_const");
2927 || dorange /* Handle tr/// range at right edge of input */
2930 /* get transliterations out of the way (they're most literal) */
2931 if (PL_lex_inwhat == OP_TRANS) {
2933 /* But there isn't any special handling necessary unless there is a
2934 * range, so for most cases we just drop down and handle the value
2935 * as any other. There are two exceptions.
2937 * 1. A hyphen indicates that we are actually going to have a
2938 * range. In this case, skip the '-', set a flag, then drop
2939 * down to handle what should be the end range value.
2940 * 2. After we've handled that value, the next time through, that
2941 * flag is set and we fix up the range.
2943 * Ranges entirely within Latin1 are expanded out entirely, in
2944 * order to make the transliteration a simple table look-up.
2945 * Ranges that extend above Latin1 have to be done differently, so
2946 * there is no advantage to expanding them here, so they are
2947 * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte
2948 * signifies a hyphen without any possible ambiguity. On EBCDIC
2949 * machines, if the range is expressed as Unicode, the Latin1
2950 * portion is expanded out even if the range extends above
2951 * Latin1. This is because each code point in it has to be
2952 * processed here individually to get its native translation */
2956 /* Here, we don't think we're in a range. If the new character
2957 * is not a hyphen; or if it is a hyphen, but it's too close to
2958 * either edge to indicate a range, then it's a regular
2960 if (*s != '-' || s >= send - 1 || s == start) {
2962 /* A regular character. Process like any other, but first
2963 * clear any flags */
2967 non_portable_endpoint = 0;
2970 /* The tests here for being above Latin1 and similar ones
2971 * in the following 'else' suffice to find all such
2972 * occurences in the constant, except those added by a
2973 * backslash escape sequence, like \x{100}. Mostly, those
2974 * set 'has_above_latin1' as appropriate */
2975 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
2976 has_above_latin1 = TRUE;
2979 /* Drops down to generic code to process current byte */
2981 else { /* Is a '-' in the context where it means a range */
2982 if (didrange) { /* Something like y/A-C-Z// */
2983 Perl_croak(aTHX_ "Ambiguous range in transliteration"
2989 s++; /* Skip past the hyphen */
2991 /* d now points to where the end-range character will be
2992 * placed. Save it so won't have to go finding it later,
2993 * and drop down to get that character. (Actually we
2994 * instead save the offset, to handle the case where a
2995 * realloc in the meantime could change the actual
2996 * pointer). We'll finish processing the range the next
2997 * time through the loop */
2998 offset_to_max = d - SvPVX_const(sv);
3000 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3001 has_above_latin1 = TRUE;
3004 /* Drops down to generic code to process current byte */
3006 } /* End of not a range */
3008 /* Here we have parsed a range. Now must handle it. At this
3010 * 'sv' is a SV* that contains the output string we are
3011 * constructing. The final two characters in that string
3012 * are the range start and range end, in order.
3013 * 'd' points to just beyond the range end in the 'sv' string,
3014 * where we would next place something
3015 * 'offset_to_max' is the offset in 'sv' at which the character
3016 * (the range's maximum end point) before 'd' begins.
3018 char * max_ptr = SvPVX(sv) + offset_to_max;
3021 IV range_max; /* last character in range */
3023 Size_t offset_to_min = 0;
3026 bool convert_unicode;
3027 IV real_range_max = 0;
3029 /* Get the code point values of the range ends. */
3031 /* We know the utf8 is valid, because we just constructed
3032 * it ourselves in previous loop iterations */
3033 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3034 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3035 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3037 /* This compensates for not all code setting
3038 * 'has_above_latin1', so that we don't skip stuff that
3039 * should be executed */
3040 if (range_max > 255) {
3041 has_above_latin1 = TRUE;
3045 min_ptr = max_ptr - 1;
3046 range_min = * (U8*) min_ptr;
3047 range_max = * (U8*) max_ptr;
3050 /* If the range is just a single code point, like tr/a-a/.../,
3051 * that code point is already in the output, twice. We can
3052 * just back up over the second instance and avoid all the rest
3053 * of the work. But if it is a variant character, it's been
3054 * counted twice, so decrement. (This unlikely scenario is
3055 * special cased, like the one for a range of 2 code points
3056 * below, only because the main-line code below needs a range
3057 * of 3 or more to work without special casing. Might as well
3058 * get it out of the way now.) */
3059 if (UNLIKELY(range_max == range_min)) {
3061 if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3062 utf8_variant_count--;
3068 /* On EBCDIC platforms, we may have to deal with portable
3069 * ranges. These happen if at least one range endpoint is a
3070 * Unicode value (\N{...}), or if the range is a subset of
3071 * [A-Z] or [a-z], and both ends are literal characters,
3072 * like 'A', and not like \x{C1} */
3074 cBOOL(backslash_N) /* \N{} forces Unicode,
3075 hence portable range */
3076 || ( ! non_portable_endpoint
3077 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3078 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3079 if (convert_unicode) {
3081 /* Special handling is needed for these portable ranges.
3082 * They are defined to be in Unicode terms, which includes
3083 * all the Unicode code points between the end points.
3084 * Convert to Unicode to get the Unicode range. Later we
3085 * will convert each code point in the range back to
3087 range_min = NATIVE_TO_UNI(range_min);
3088 range_max = NATIVE_TO_UNI(range_max);
3092 if (range_min > range_max) {
3094 if (convert_unicode) {
3095 /* Need to convert back to native for meaningful
3096 * messages for this platform */
3097 range_min = UNI_TO_NATIVE(range_min);
3098 range_max = UNI_TO_NATIVE(range_max);
3101 /* Use the characters themselves for the error message if
3102 * ASCII printables; otherwise some visible representation
3104 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3106 "Invalid range \"%c-%c\" in transliteration operator",
3107 (char)range_min, (char)range_max);
3110 else if (convert_unicode) {
3111 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3113 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3114 UVXf "}\" in transliteration operator",
3115 range_min, range_max);
3119 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3121 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3122 " in transliteration operator",
3123 range_min, range_max);
3127 /* If the range is exactly two code points long, they are
3128 * already both in the output */
3129 if (UNLIKELY(range_min + 1 == range_max)) {
3133 /* Here the range contains at least 3 code points */
3137 /* If everything in the transliteration is below 256, we
3138 * can avoid special handling later. A translation table
3139 * for each of those bytes is created by op.c. So we
3140 * expand out all ranges to their constituent code points.
3141 * But if we've encountered something above 255, the
3142 * expanding won't help, so skip doing that. But if it's
3143 * EBCDIC, we may have to look at each character below 256
3144 * if we have to convert to/from Unicode values */
3145 if ( has_above_latin1
3147 && (range_min > 255 || ! convert_unicode)
3150 /* Move the high character one byte to the right; then
3151 * insert between it and the range begin, an illegal
3152 * byte which serves to indicate this is a range (using
3153 * a '-' would be ambiguous). */
3155 while (e-- > max_ptr) {
3158 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3162 /* Here, we're going to expand out the range. For EBCDIC
3163 * the range can extend above 255 (not so in ASCII), so
3164 * for EBCDIC, split it into the parts above and below
3167 if (range_max > 255) {
3168 real_range_max = range_max;
3174 /* Here we need to expand out the string to contain each
3175 * character in the range. Grow the output to handle this.
3176 * For non-UTF8, we need a byte for each code point in the
3177 * range, minus the three that we've already allocated for: the
3178 * hyphen, the min, and the max. For UTF-8, we need this
3179 * plus an extra byte for each code point that occupies two
3180 * bytes (is variant) when in UTF-8 (except we've already
3181 * allocated for the end points, including if they are
3182 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3183 * platforms, it's easy to calculate a precise number. To
3184 * start, we count the variants in the range, which we need
3185 * elsewhere in this function anyway. (For the case where it
3186 * isn't easy to calculate, 'extras' has been initialized to 0,
3187 * and the calculation is done in a loop further down.) */
3189 if (convert_unicode)
3192 /* This is executed unconditionally on ASCII, and for
3193 * Unicode ranges on EBCDIC. Under these conditions, all
3194 * code points above a certain value are variant; and none
3195 * under that value are. We just need to find out how much
3196 * of the range is above that value. We don't count the
3197 * end points here, as they will already have been counted
3198 * as they were parsed. */
3199 if (range_min >= UTF_CONTINUATION_MARK) {
3201 /* The whole range is made up of variants */
3202 extras = (range_max - 1) - (range_min + 1) + 1;
3204 else if (range_max >= UTF_CONTINUATION_MARK) {
3206 /* Only the higher portion of the range is variants */
3207 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3210 utf8_variant_count += extras;
3213 /* The base growth is the number of code points in the range,
3214 * not including the endpoints, which have already been sized
3215 * for (and output). We don't subtract for the hyphen, as it
3216 * has been parsed but not output, and the SvGROW below is
3217 * based only on what's been output plus what's left to parse.
3219 grow = (range_max - 1) - (range_min + 1) + 1;
3223 /* In some cases in EBCDIC, we haven't yet calculated a
3224 * precise amount needed for the UTF-8 variants. Just
3225 * assume the worst case, that everything will expand by a
3227 if (! convert_unicode) {
3233 /* Otherwise we know exactly how many variants there
3234 * are in the range. */
3239 /* Grow, but position the output to overwrite the range min end
3240 * point, because in some cases we overwrite that */
3241 SvCUR_set(sv, d - SvPVX_const(sv));
3242 offset_to_min = min_ptr - SvPVX_const(sv);
3244 /* See Note on sizing above. */
3245 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3248 + 1 /* Trailing NUL */ );
3250 /* Now, we can expand out the range. */
3252 if (convert_unicode) {
3255 /* Recall that the min and max are now in Unicode terms, so
3256 * we have to convert each character to its native
3259 for (i = range_min; i <= range_max; i++) {
3260 append_utf8_from_native_byte(
3261 LATIN1_TO_NATIVE((U8) i),
3266 for (i = range_min; i <= range_max; i++) {
3267 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3273 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3277 /* Here, no conversions are necessary, which means that the
3278 * first character in the range is already in 'd' and
3279 * valid, so we can skip overwriting it */
3282 for (i = range_min + 1; i <= range_max; i++) {
3283 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3288 assert(range_min + 1 <= range_max);
3289 for (i = range_min + 1; i < range_max; i++) {
3291 /* In this case on EBCDIC, we haven't calculated
3292 * the variants. Do it here, as we go along */
3293 if (! UVCHR_IS_INVARIANT(i)) {
3294 utf8_variant_count++;
3300 /* The range_max is done outside the loop so as to
3301 * avoid having to special case not incrementing
3302 * 'utf8_variant_count' on EBCDIC (it's already been
3303 * counted when originally parsed) */
3304 *d++ = (char) range_max;
3309 /* If the original range extended above 255, add in that
3311 if (real_range_max) {
3312 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3313 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3314 if (real_range_max > 0x100) {
3315 if (real_range_max > 0x101) {
3316 *d++ = (char) ILLEGAL_UTF8_BYTE;
3318 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3324 /* mark the range as done, and continue */
3328 non_portable_endpoint = 0;
3332 } /* End of is a range */
3333 } /* End of transliteration. Joins main code after these else's */
3334 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3337 while (s1 >= start && *s1-- == '\\')
3340 in_charclass = TRUE;
3342 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3345 while (s1 >= start && *s1-- == '\\')
3348 in_charclass = FALSE;
3350 /* skip for regexp comments /(?#comment)/, except for the last
3351 * char, which will be done separately. Stop on (?{..}) and
3353 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3355 while (s+1 < send && *s != ')')
3358 else if (!PL_lex_casemods
3359 && ( s[2] == '{' /* This should match regcomp.c */
3360 || (s[2] == '?' && s[3] == '{')))
3365 /* likewise skip #-initiated comments in //x patterns */
3369 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3371 while (s < send && *s != '\n')
3374 /* no further processing of single-quoted regex */
3375 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3376 goto default_action;
3378 /* check for embedded arrays
3379 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3381 else if (*s == '@' && s[1]) {
3383 ? isIDFIRST_utf8_safe(s+1, send)
3384 : isWORDCHAR_A(s[1]))
3388 if (strchr(":'{$", s[1]))
3390 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3391 break; /* in regexp, neither @+ nor @- are interpolated */
3393 /* check for embedded scalars. only stop if we're sure it's a
3395 else if (*s == '$') {
3396 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3398 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
3400 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3401 "Possible unintended interpolation of $\\ in regex");
3403 break; /* in regexp, $ might be tail anchor */
3407 /* End of else if chain - OP_TRANS rejoin rest */
3409 if (UNLIKELY(s >= send)) {
3415 if (*s == '\\' && s+1 < send) {
3416 char* e; /* Can be used for ending '}', etc. */
3420 /* warn on \1 - \9 in substitution replacements, but note that \11
3421 * is an octal; and \19 is \1 followed by '9' */
3422 if (PL_lex_inwhat == OP_SUBST
3428 /* diag_listed_as: \%d better written as $%d */
3429 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3434 /* string-change backslash escapes */
3435 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
3439 /* In a pattern, process \N, but skip any other backslash escapes.
3440 * This is because we don't want to translate an escape sequence
3441 * into a meta symbol and have the regex compiler use the meta
3442 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3443 * in spite of this, we do have to process \N here while the proper
3444 * charnames handler is in scope. See bugs #56444 and #62056.
3446 * There is a complication because \N in a pattern may also stand
3447 * for 'match a non-nl', and not mean a charname, in which case its
3448 * processing should be deferred to the regex compiler. To be a
3449 * charname it must be followed immediately by a '{', and not look
3450 * like \N followed by a curly quantifier, i.e., not something like
3451 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3453 else if (PL_lex_inpat
3456 || regcurly(s + 1)))
3459 goto default_action;
3465 if ((isALPHANUMERIC(*s)))
3466 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3467 "Unrecognized escape \\%c passed through",
3469 /* default action is to copy the quoted character */
3470 goto default_action;
3473 /* eg. \132 indicates the octal constant 0132 */
3474 case '0': case '1': case '2': case '3':
3475 case '4': case '5': case '6': case '7':
3477 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
3479 uv = grok_oct(s, &len, &flags, NULL);
3481 if (len < 3 && s < send && isDIGIT(*s)
3482 && ckWARN(WARN_MISC))
3484 Perl_warner(aTHX_ packWARN(WARN_MISC),
3485 "%s", form_short_octal_warning(s, len));
3488 goto NUM_ESCAPE_INSERT;
3490 /* eg. \o{24} indicates the octal constant \024 */
3495 bool valid = grok_bslash_o(&s, &uv, &error,
3496 TRUE, /* Output warning */
3497 FALSE, /* Not strict */
3498 TRUE, /* Output warnings for
3503 uv = 0; /* drop through to ensure range ends are set */
3505 goto NUM_ESCAPE_INSERT;
3508 /* eg. \x24 indicates the hex constant 0x24 */
3513 bool valid = grok_bslash_x(&s, &uv, &error,
3514 TRUE, /* Output warning */
3515 FALSE, /* Not strict */
3516 TRUE, /* Output warnings for
3521 uv = 0; /* drop through to ensure range ends are set */
3526 /* Insert oct or hex escaped character. */
3528 /* Here uv is the ordinal of the next character being added */
3529 if (UVCHR_IS_INVARIANT(uv)) {
3533 if (!has_utf8 && uv > 255) {
3535 /* Here, 'uv' won't fit unless we convert to UTF-8.
3536 * If we've only seen invariants so far, all we have to
3537 * do is turn on the flag */
3538 if (utf8_variant_count == 0) {
3542 SvCUR_set(sv, d - SvPVX_const(sv));
3546 sv_utf8_upgrade_flags_grow(
3548 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3550 /* Since we're having to grow here,
3551 * make sure we have enough room for
3552 * this escape and a NUL, so the
3553 * code immediately below won't have
3554 * to actually grow again */
3556 + (STRLEN)(send - s) + 1);
3557 d = SvPVX(sv) + SvCUR(sv);
3560 has_above_latin1 = TRUE;
3566 utf8_variant_count++;
3569 /* Usually, there will already be enough room in 'sv'
3570 * since such escapes are likely longer than any UTF-8
3571 * sequence they can end up as. This isn't the case on
3572 * EBCDIC where \x{40000000} contains 12 bytes, and the
3573 * UTF-8 for it contains 14. And, we have to allow for
3574 * a trailing NUL. It probably can't happen on ASCII
3575 * platforms, but be safe. See Note on sizing above. */
3576 const STRLEN needed = d - SvPVX(sv)
3580 if (UNLIKELY(needed > SvLEN(sv))) {
3581 SvCUR_set(sv, d - SvPVX_const(sv));
3582 d = SvCUR(sv) + SvGROW(sv, needed);
3585 d = (char*)uvchr_to_utf8((U8*)d, uv);
3586 if (PL_lex_inwhat == OP_TRANS
3587 && PL_parser->lex_sub_op)
3589 PL_parser->lex_sub_op->op_private |=
3590 (PL_lex_repl ? OPpTRANS_FROM_UTF
3596 non_portable_endpoint++;
3601 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3602 * named character, like \N{LATIN SMALL LETTER A}, or a named
3603 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3604 * GRAVE} (except y/// can't handle the latter, croaking). For
3605 * convenience all three forms are referred to as "named
3606 * characters" below.
3608 * For patterns, \N also can mean to match a non-newline. Code
3609 * before this 'switch' statement should already have handled
3610 * this situation, and hence this code only has to deal with
3611 * the named character cases.
3613 * For non-patterns, the named characters are converted to
3614 * their string equivalents. In patterns, named characters are
3615 * not converted to their ultimate forms for the same reasons
3616 * that other escapes aren't. Instead, they are converted to
3617 * the \N{U+...} form to get the value from the charnames that
3618 * is in effect right now, while preserving the fact that it
3619 * was a named character, so that the regex compiler knows
3622 * The structure of this section of code (besides checking for
3623 * errors and upgrading to utf8) is:
3624 * If the named character is of the form \N{U+...}, pass it
3625 * through if a pattern; otherwise convert the code point
3627 * Otherwise must be some \N{NAME}: convert to
3628 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3630 * Transliteration is an exception. The conversion to utf8 is
3631 * only done if the code point requires it to be representable.
3633 * Here, 's' points to the 'N'; the test below is guaranteed to
3634 * succeed if we are being called on a pattern, as we already
3635 * know from a test above that the next character is a '{'. A
3636 * non-pattern \N must mean 'named character', which requires
3640 yyerror("Missing braces on \\N{}");
3645 /* If there is no matching '}', it is an error. */
3646 if (! (e = strchr(s, '}'))) {
3647 if (! PL_lex_inpat) {
3648 yyerror("Missing right brace on \\N{}");
3650 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3655 /* Here it looks like a named character */
3657 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3658 s += 2; /* Skip to next char after the 'U+' */
3661 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3662 /* Check the syntax. */
3665 if (!isXDIGIT(*s)) {
3668 "Invalid hexadecimal number in \\N{U+...}"
3676 else if ((*s == '.' || *s == '_')
3682 /* Pass everything through unchanged.
3683 * +1 is for the '}' */
3684 Copy(orig_s, d, e - orig_s + 1, char);
3685 d += e - orig_s + 1;
3687 else { /* Not a pattern: convert the hex to string */
3688 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3689 | PERL_SCAN_SILENT_ILLDIGIT
3690 | PERL_SCAN_DISALLOW_PREFIX;
3692 uv = grok_hex(s, &len, &flags, NULL);
3693 if (len == 0 || (len != (STRLEN)(e - s)))
3696 /* For non-tr///, if the destination is not in utf8,
3697 * unconditionally recode it to be so. This is
3698 * because \N{} implies Unicode semantics, and scalars
3699 * have to be in utf8 to guarantee those semantics.
3700 * tr/// doesn't care about Unicode rules, so no need
3701 * there to upgrade to UTF-8 for small enough code
3703 if (! has_utf8 && ( uv > 0xFF
3704 || PL_lex_inwhat != OP_TRANS))
3706 /* See Note on sizing above. */
3707 const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
3709 SvCUR_set(sv, d - SvPVX_const(sv));
3713 if (utf8_variant_count == 0) {
3715 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3718 sv_utf8_upgrade_flags_grow(
3720 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3722 d = SvPVX(sv) + SvCUR(sv);
3726 has_above_latin1 = TRUE;
3729 /* Add the (Unicode) code point to the output. */
3730 if (! has_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3731 *d++ = (char) LATIN1_TO_NATIVE(uv);
3734 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3738 else /* Here is \N{NAME} but not \N{U+...}. */
3739 if ((res = get_and_check_backslash_N_name(s, e)))
3742 const char *str = SvPV_const(res, len);
3745 if (! len) { /* The name resolved to an empty string */
3746 Copy("\\N{}", d, 4, char);
3750 /* In order to not lose information for the regex
3751 * compiler, pass the result in the specially made
3752 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3753 * the code points in hex of each character
3754 * returned by charnames */
3756 const char *str_end = str + len;
3757 const STRLEN off = d - SvPVX_const(sv);
3759 if (! SvUTF8(res)) {
3760 /* For the non-UTF-8 case, we can determine the
3761 * exact length needed without having to parse
3762 * through the string. Each character takes up
3763 * 2 hex digits plus either a trailing dot or
3765 const char initial_text[] = "\\N{U+";
3766 const STRLEN initial_len = sizeof(initial_text)
3768 d = off + SvGROW(sv, off
3771 /* +1 for trailing NUL */
3774 + (STRLEN)(send - e));
3775 Copy(initial_text, d, initial_len, char);
3777 while (str < str_end) {
3780 my_snprintf(hex_string,
3784 /* The regex compiler is
3785 * expecting Unicode, not
3787 NATIVE_TO_LATIN1(*str));
3788 PERL_MY_SNPRINTF_POST_GUARD(len,
3789 sizeof(hex_string));
3790 Copy(hex_string, d, 3, char);
3794 d--; /* Below, we will overwrite the final
3795 dot with a right brace */
3798 STRLEN char_length; /* cur char's byte length */
3800 /* and the number of bytes after this is
3801 * translated into hex digits */
3802 STRLEN output_length;
3804 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3805 * for max('U+', '.'); and 1 for NUL */
3806 char hex_string[2 * UTF8_MAXBYTES + 5];
3808 /* Get the first character of the result. */
3809 U32 uv = utf8n_to_uvchr((U8 *) str,
3813 /* Convert first code point to Unicode hex,
3814 * including the boiler plate before it. */
3816 my_snprintf(hex_string, sizeof(hex_string),
3818 (unsigned int) NATIVE_TO_UNI(uv));
3820 /* Make sure there is enough space to hold it */
3821 d = off + SvGROW(sv, off
3823 + (STRLEN)(send - e)
3824 + 2); /* '}' + NUL */
3826 Copy(hex_string, d, output_length, char);
3829 /* For each subsequent character, append dot and
3830 * its Unicode code point in hex */
3831 while ((str += char_length) < str_end) {
3832 const STRLEN off = d - SvPVX_const(sv);
3833 U32 uv = utf8n_to_uvchr((U8 *) str,
3838 my_snprintf(hex_string,
3841 (unsigned int) NATIVE_TO_UNI(uv));
3843 d = off + SvGROW(sv, off
3845 + (STRLEN)(send - e)
3846 + 2); /* '}' + NUL */
3847 Copy(hex_string, d, output_length, char);
3852 *d++ = '}'; /* Done. Add the trailing brace */
3855 else { /* Here, not in a pattern. Convert the name to a
3858 if (PL_lex_inwhat == OP_TRANS) {
3859 str = SvPV_const(res, len);
3860 if (len > ((SvUTF8(res))
3864 yyerror(Perl_form(aTHX_
3865 "%.*s must not be a named sequence"
3866 " in transliteration operator",
3867 /* +1 to include the "}" */
3868 (int) (e + 1 - start), start));
3869 goto end_backslash_N;
3872 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
3873 has_above_latin1 = TRUE;
3877 else if (! SvUTF8(res)) {
3878 /* Make sure \N{} return is UTF-8. This is because
3879 * \N{} implies Unicode semantics, and scalars have
3880 * to be in utf8 to guarantee those semantics; but
3881 * not needed in tr/// */
3882 sv_utf8_upgrade_flags(res, 0);
3883 str = SvPV_const(res, len);
3886 /* Upgrade destination to be utf8 if this new
3888 if (! has_utf8 && SvUTF8(res)) {
3889 /* See Note on sizing above. */
3890 const STRLEN extra = len + (send - s) + 1;
3892 SvCUR_set(sv, d - SvPVX_const(sv));
3896 if (utf8_variant_count == 0) {
3898 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
3901 sv_utf8_upgrade_flags_grow(sv,
3902 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3904 d = SvPVX(sv) + SvCUR(sv);
3907 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3909 /* See Note on sizing above. (NOTE: SvCUR() is not
3910 * set correctly here). */
3911 const STRLEN extra = len + (send - e) + 1;
3912 const STRLEN off = d - SvPVX_const(sv);
3913 d = off + SvGROW(sv, off + extra);
3915 Copy(str, d, len, char);
3921 } /* End \N{NAME} */
3925 backslash_N++; /* \N{} is defined to be Unicode */
3927 s = e + 1; /* Point to just after the '}' */
3930 /* \c is a control character */
3934 *d++ = grok_bslash_c(*s++, 1);
3937 yyerror("Missing control char name in \\c");
3940 non_portable_endpoint++;
3944 /* printf-style backslashes, formfeeds, newlines, etc */
3970 } /* end if (backslash) */
3973 /* Just copy the input to the output, though we may have to convert
3976 * If the input has the same representation in UTF-8 as not, it will be
3977 * a single byte, and we don't care about UTF8ness; just copy the byte */
3978 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
3981 else if (! this_utf8 && ! has_utf8) {
3982 /* If neither source nor output is UTF-8, is also a single byte,
3983 * just copy it; but this byte counts should we later have to
3984 * convert to UTF-8 */
3986 utf8_variant_count++;
3988 else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */
3989 const STRLEN len = UTF8SKIP(s);
3991 /* We expect the source to have already been checked for
3993 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
3995 Copy(s, d, len, U8);
3999 else { /* UTF8ness matters and doesn't match, need to convert */
4001 const UV nextuv = (this_utf8)
4002 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
4004 STRLEN need = UVCHR_SKIP(nextuv);
4007 SvCUR_set(sv, d - SvPVX_const(sv));
4011 /* See Note on sizing above. */
4012 need += (STRLEN)(send - s) + 1;
4014 if (utf8_variant_count == 0) {
4016 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
4019 sv_utf8_upgrade_flags_grow(sv,
4020 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4022 d = SvPVX(sv) + SvCUR(sv);
4025 } else if (need > len) {
4026 /* encoded value larger than old, may need extra space (NOTE:
4027 * SvCUR() is not set correctly here). See Note on sizing
4029 const STRLEN extra = need + (send - s) + 1;
4030 const STRLEN off = d - SvPVX_const(sv);
4031 d = off + SvGROW(sv, off + extra);
4035 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
4037 } /* while loop to process each character */
4039 /* terminate the string and set up the sv */
4041 SvCUR_set(sv, d - SvPVX_const(sv));
4042 if (SvCUR(sv) >= SvLEN(sv))
4043 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %" UVuf
4044 " >= %" UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
4049 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
4050 PL_parser->lex_sub_op->op_private |=
4051 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
4055 /* shrink the sv if we allocated more than we used */
4056 if (SvCUR(sv) + 5 < SvLEN(sv)) {
4057 SvPV_shrink_to_cur(sv);
4060 /* return the substring (via pl_yylval) only if we parsed anything */
4063 for (; s2 < s; s2++) {
4065 COPLINE_INC_WITH_HERELINES;
4067 SvREFCNT_inc_simple_void_NN(sv);
4068 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
4069 && ! PL_parser->lex_re_reparsing)
4071 const char *const key = PL_lex_inpat ? "qr" : "q";
4072 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
4076 if (PL_lex_inwhat == OP_TRANS) {
4079 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
4082 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
4090 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
4093 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
4095 LEAVE_with_name("scan_const");
4100 * Returns TRUE if there's more to the expression (e.g., a subscript),
4103 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
4105 * ->[ and ->{ return TRUE
4106 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
4107 * { and [ outside a pattern are always subscripts, so return TRUE
4108 * if we're outside a pattern and it's not { or [, then return FALSE
4109 * if we're in a pattern and the first char is a {
4110 * {4,5} (any digits around the comma) returns FALSE
4111 * if we're in a pattern and the first char is a [
4113 * [SOMETHING] has a funky algorithm to decide whether it's a
4114 * character class or not. It has to deal with things like
4115 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4116 * anything else returns TRUE
4119 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4122 S_intuit_more(pTHX_ char *s)
4124 PERL_ARGS_ASSERT_INTUIT_MORE;
4126 if (PL_lex_brackets)
4128 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4130 if (*s == '-' && s[1] == '>'
4131 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4132 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4133 ||(s[2] == '@' && strchr("*[{",s[3])) ))
4135 if (*s != '{' && *s != '[')
4140 /* In a pattern, so maybe we have {n,m}. */
4148 /* On the other hand, maybe we have a character class */
4151 if (*s == ']' || *s == '^')
4154 /* this is terrifying, and it works */
4157 const char * const send = strchr(s,']');
4158 unsigned char un_char, last_un_char;
4159 char tmpbuf[sizeof PL_tokenbuf * 4];
4161 if (!send) /* has to be an expression */
4163 weight = 2; /* let's weigh the evidence */
4167 else if (isDIGIT(*s)) {
4169 if (isDIGIT(s[1]) && s[2] == ']')
4175 Zero(seen,256,char);
4177 for (; s < send; s++) {
4178 last_un_char = un_char;
4179 un_char = (unsigned char)*s;
4184 weight -= seen[un_char] * 10;
4185 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4187 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4188 len = (int)strlen(tmpbuf);
4189 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4190 UTF ? SVf_UTF8 : 0, SVt_PV))
4197 && strchr("[#!%*<>()-=",s[1]))
4199 if (/*{*/ strchr("])} =",s[2]))
4208 if (strchr("wds]",s[1]))
4210 else if (seen[(U8)'\''] || seen[(U8)'"'])
4212 else if (strchr("rnftbxcav",s[1]))
4214 else if (isDIGIT(s[1])) {
4216 while (s[1] && isDIGIT(s[1]))
4226 if (strchr("aA01! ",last_un_char))
4228 if (strchr("zZ79~",s[1]))
4230 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4231 weight -= 5; /* cope with negative subscript */
4234 if (!isWORDCHAR(last_un_char)
4235 && !(last_un_char == '$' || last_un_char == '@'
4236 || last_un_char == '&')
4237 && isALPHA(*s) && s[1] && isALPHA(s[1])) {