3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'It all comes from here, the stench and the peril.' --Frodo
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
18 * This file is the lexer for Perl. It's closely linked to the
21 * The main routine is yylex(), which returns the next token.
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
28 =for apidoc AmU|yy_parser *|PL_parser
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
39 #define PERL_IN_TOKE_C
41 #include "dquote_inline.h"
43 #define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46 #define pl_yylval (PL_parser->yylval)
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96 static const char* const ident_too_long = "Identifier too long";
98 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
100 #define XENUMMASK 0x3f
101 #define XFAKEEOF 0x40
102 #define XFAKEBRACK 0x80
104 #ifdef USE_UTF8_SCRIPTS
105 # define UTF cBOOL(!IN_BYTES)
107 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
110 /* The maximum number of characters preceding the unrecognized one to display */
111 #define UNRECOGNIZED_PRECEDE_COUNT 10
113 /* In variables named $^X, these are the legal values for X.
114 * 1999-02-27 mjd-perl-patch@plover.com */
115 #define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
117 #define SPACE_OR_TAB(c) isBLANK_A(c)
119 #define HEXFP_PEEK(s) \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
124 /* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
126 * can get by with a single comparison (if the compiler is smart enough).
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
132 /* #define LEX_NOTPARSING 11 is done in perl.h. */
134 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
135 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
140 /* at end of code, eg "$x" followed by: */
141 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146 #define LEX_INTERPCONST 2 /* NOT USED */
147 #define LEX_FORMLINE 1 /* expecting a format line */
151 static const char* const lex_state_names[] = {
166 #include "keywords.h"
168 /* CLINE is a macro that ensures PL_copline has a sane value */
170 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
173 * Convenience functions to return different tokens and prime the
174 * lexer for the next token. They all take an argument.
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
187 * FUN0OP : zero-argument function, with its op created in this file
188 * FUN1 : not used, except for not, which isn't a UNIOP
189 * BOop : bitwise or or xor
191 * BCop : bitwise complement
192 * SHop : shift operator
193 * PWop : power operator
194 * PMop : pattern-matching operator
195 * Aop : addition-level operator
196 * AopNOASSIGN : addition-level operator that is never part of .=
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
199 * Rop : relational operator <= != gt
201 * Also see LOP and lop() below.
204 #ifdef DEBUGGING /* Serve -DT. */
205 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
207 # define REPORT(retval) (retval)
210 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
212 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
213 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
217 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
218 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
222 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
224 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
225 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
226 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
228 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
232 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
233 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
234 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
235 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
236 #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237 #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
239 /* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
244 #define UNI3(f,x,have_x) { \
245 pl_yylval.ival = f; \
246 if (have_x) PL_expect = x; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
251 return REPORT( (int)FUNC1 ); \
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 #define UNI(f) UNI3(f,XTERM,1)
256 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
257 #define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
262 #define UNIBRACK(f) UNI3(f,0,0)
264 /* grandfather return to old style */
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
275 #define COPLINE_INC_WITH_HERELINES \
277 CopLINE_inc(PL_curcop); \
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
282 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284 #define COPLINE_SET_FROM_MULTI_END \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
288 PL_parser->herelines = 0; \
294 /* how to interpret the pl_yylval associated with the token */
298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
303 static struct debug_tokens {
305 enum token_type type;
307 } const debug_tokens[] =
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
382 { 0, TOKENTYPE_NONE, NULL }
385 /* dump the returned token in rv, plus any optional arg in pl_yylval */
388 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
390 PERL_ARGS_ASSERT_TOKEREPORT;
393 const char *name = NULL;
394 enum token_type type = TOKENTYPE_NONE;
395 const struct debug_tokens *p;
396 SV* const report = newSVpvs("<== ");
398 for (p = debug_tokens; p->token; p++) {
399 if (p->token == (int)rv) {
406 Perl_sv_catpv(aTHX_ report, name);
407 else if (isGRAPH(rv))
409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
411 sv_catpvs(report, " (pending identifier)");
414 sv_catpvs(report, "EOF");
416 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
421 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
425 PL_op_name[lvalp->ival]);
428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
430 case TOKENTYPE_OPVAL:
432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
435 Perl_sv_catpvf(aTHX_ report, " %s",
436 SvPEEK(cSVOPx_sv(lvalp->opval)));
441 sv_catpvs(report, "(opval=null)");
444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
450 /* print the buffer with suitable escapes */
453 S_printbuf(pTHX_ const char *const fmt, const char *const s)
455 SV* const tmp = newSVpvs("");
457 PERL_ARGS_ASSERT_PRINTBUF;
459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
468 S_deprecate_commaless_var_list(pTHX) {
470 deprecate("comma-less variable list");
471 return REPORT(','); /* grandfather non-comma-format format */
477 * This subroutine looks for an '=' next to the operator that has just been
478 * parsed and turns it into an ASSIGNOP if it finds one.
482 S_ao(pTHX_ int toketype)
484 if (*PL_bufptr == '=') {
486 if (toketype == ANDAND)
487 pl_yylval.ival = OP_ANDASSIGN;
488 else if (toketype == OROR)
489 pl_yylval.ival = OP_ORASSIGN;
490 else if (toketype == DORDOR)
491 pl_yylval.ival = OP_DORASSIGN;
494 return REPORT(toketype);
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
509 * PL_bufptr is expected to point to the start of the thing that was found,
510 * and s after the next token or partial token.
514 S_no_op(pTHX_ const char *const what, char *s)
516 char * const oldbp = PL_bufptr;
517 const bool is_first = (PL_oldbufptr == PL_linestart);
519 PERL_ARGS_ASSERT_NO_OP;
525 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
526 if (ckWARN_d(WARN_SYNTAX)) {
528 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
529 "\t(Missing semicolon on previous line?)\n");
530 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
532 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
533 t += UTF ? UTF8SKIP(t) : 1)
535 if (t < PL_bufptr && isSPACE(*t))
536 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
537 "\t(Do you need to predeclare %"UTF8f"?)\n",
538 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
542 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
543 "\t(Missing operator before %"UTF8f"?)\n",
544 UTF8fARG(UTF, s - oldbp, oldbp));
552 * Complain about missing quote/regexp/heredoc terminator.
553 * If it's called with NULL then it cauterizes the line buffer.
554 * If we're in a delimited string and the delimiter is a control
555 * character, it's reformatted into a two-char sequence like ^C.
560 S_missingterm(pTHX_ char *s)
562 char tmpbuf[UTF8_MAXBYTES + 1];
567 char * const nl = strrchr(s,'\n');
572 else if (PL_multi_close < 32) {
574 tmpbuf[1] = (char)toCTRL(PL_multi_close);
579 if (LIKELY(PL_multi_close < 256)) {
580 *tmpbuf = (char)PL_multi_close;
585 *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
589 q = strchr(s,'"') ? '\'' : '"';
590 sv = sv_2mortal(newSVpv(s,0));
593 Perl_croak(aTHX_ "Can't find string terminator %c%"SVf
594 "%c anywhere before EOF",q,SVfARG(sv),q);
600 * Check whether the named feature is enabled.
603 Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
605 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
607 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
609 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
611 if (namelen > MAX_FEATURE_LEN)
613 memcpy(&he_name[8], name, namelen);
615 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
616 REFCOUNTED_HE_EXISTS));
620 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
621 * utf16-to-utf8-reversed.
624 #ifdef PERL_CR_FILTER
628 const char *s = SvPVX_const(sv);
629 const char * const e = s + SvCUR(sv);
631 PERL_ARGS_ASSERT_STRIP_RETURN;
633 /* outer loop optimized to do nothing if there are no CR-LFs */
635 if (*s++ == '\r' && *s == '\n') {
636 /* hit a CR-LF, need to copy the rest */
640 if (*s == '\r' && s[1] == '\n')
651 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
653 const I32 count = FILTER_READ(idx+1, sv, maxlen);
654 if (count > 0 && !maxlen)
661 =for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
663 Creates and initialises a new lexer/parser state object, supplying
664 a context in which to lex and parse from a new source of Perl code.
665 A pointer to the new state object is placed in L</PL_parser>. An entry
666 is made on the save stack so that upon unwinding the new state object
667 will be destroyed and the former value of L</PL_parser> will be restored.
668 Nothing else need be done to clean up the parsing context.
670 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
671 non-null, provides a string (in SV form) containing code to be parsed.
672 A copy of the string is made, so subsequent modification of C<line>
673 does not affect parsing. C<rsfp>, if non-null, provides an input stream
674 from which code will be read to be parsed. If both are non-null, the
675 code in C<line> comes first and must consist of complete lines of input,
676 and C<rsfp> supplies the remainder of the source.
678 The C<flags> parameter is reserved for future use. Currently it is only
679 used by perl internally, so extensions should always pass zero.
684 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
685 can share filters with the current parser.
686 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
687 caller, hence isn't owned by the parser, so shouldn't be closed on parser
688 destruction. This is used to handle the case of defaulting to reading the
689 script from the standard input because no filename was given on the command
690 line (without getting confused by situation where STDIN has been closed, so
691 the script handle is opened on fd 0) */
694 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
696 const char *s = NULL;
697 yy_parser *parser, *oparser;
698 if (flags && flags & ~LEX_START_FLAGS)
699 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
701 /* create and initialise a parser */
703 Newxz(parser, 1, yy_parser);
704 parser->old_parser = oparser = PL_parser;
707 parser->stack = NULL;
709 parser->stack_size = 0;
711 /* on scope exit, free this parser and restore any outer one */
713 parser->saved_curcop = PL_curcop;
715 /* initialise lexer state */
717 parser->nexttoke = 0;
718 parser->error_count = oparser ? oparser->error_count : 0;
719 parser->copline = parser->preambling = NOLINE;
720 parser->lex_state = LEX_NORMAL;
721 parser->expect = XSTATE;
723 parser->rsfp_filters =
724 !(flags & LEX_START_SAME_FILTER) || !oparser
726 : MUTABLE_AV(SvREFCNT_inc(
727 oparser->rsfp_filters
728 ? oparser->rsfp_filters
729 : (oparser->rsfp_filters = newAV())
732 Newx(parser->lex_brackstack, 120, char);
733 Newx(parser->lex_casestack, 12, char);
734 *parser->lex_casestack = '\0';
735 Newxz(parser->lex_shared, 1, LEXSHARED);
739 s = SvPV_const(line, len);
740 parser->linestr = flags & LEX_START_COPIED
741 ? SvREFCNT_inc_simple_NN(line)
742 : newSVpvn_flags(s, len, SvUTF8(line));
744 sv_catpvs(parser->linestr, "\n;");
746 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
748 parser->oldoldbufptr =
751 parser->linestart = SvPVX(parser->linestr);
752 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
753 parser->last_lop = parser->last_uni = NULL;
755 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
756 |LEX_DONT_CLOSE_RSFP));
757 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
758 |LEX_DONT_CLOSE_RSFP));
760 parser->in_pod = parser->filtered = 0;
764 /* delete a parser object */
767 Perl_parser_free(pTHX_ const yy_parser *parser)
769 PERL_ARGS_ASSERT_PARSER_FREE;
771 PL_curcop = parser->saved_curcop;
772 SvREFCNT_dec(parser->linestr);
774 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
775 PerlIO_clearerr(parser->rsfp);
776 else if (parser->rsfp && (!parser->old_parser
777 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
778 PerlIO_close(parser->rsfp);
779 SvREFCNT_dec(parser->rsfp_filters);
780 SvREFCNT_dec(parser->lex_stuff);
781 SvREFCNT_dec(parser->lex_sub_repl);
783 Safefree(parser->lex_brackstack);
784 Safefree(parser->lex_casestack);
785 Safefree(parser->lex_shared);
786 PL_parser = parser->old_parser;
791 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
793 I32 nexttoke = parser->nexttoke;
794 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
796 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
797 && parser->nextval[nexttoke].opval
798 && parser->nextval[nexttoke].opval->op_slabbed
799 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
800 op_free(parser->nextval[nexttoke].opval);
801 parser->nextval[nexttoke].opval = NULL;
808 =for apidoc AmxU|SV *|PL_parser-E<gt>linestr
810 Buffer scalar containing the chunk currently under consideration of the
811 text currently being lexed. This is always a plain string scalar (for
812 which C<SvPOK> is true). It is not intended to be used as a scalar by
813 normal scalar means; instead refer to the buffer directly by the pointer
814 variables described below.
816 The lexer maintains various C<char*> pointers to things in the
817 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
818 reallocated, all of these pointers must be updated. Don't attempt to
819 do this manually, but rather use L</lex_grow_linestr> if you need to
820 reallocate the buffer.
822 The content of the text chunk in the buffer is commonly exactly one
823 complete line of input, up to and including a newline terminator,
824 but there are situations where it is otherwise. The octets of the
825 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
826 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
827 flag on this scalar, which may disagree with it.
829 For direct examination of the buffer, the variable
830 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
831 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
832 of these pointers is usually preferable to examination of the scalar
833 through normal scalar means.
835 =for apidoc AmxU|char *|PL_parser-E<gt>bufend
837 Direct pointer to the end of the chunk of text currently being lexed, the
838 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
839 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
840 always located at the end of the buffer, and does not count as part of
841 the buffer's contents.
843 =for apidoc AmxU|char *|PL_parser-E<gt>bufptr
845 Points to the current position of lexing inside the lexer buffer.
846 Characters around this point may be freely examined, within
847 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
848 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
849 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
851 Lexing code (whether in the Perl core or not) moves this pointer past
852 the characters that it consumes. It is also expected to perform some
853 bookkeeping whenever a newline character is consumed. This movement
854 can be more conveniently performed by the function L</lex_read_to>,
855 which handles newlines appropriately.
857 Interpretation of the buffer's octets can be abstracted out by
858 using the slightly higher-level functions L</lex_peek_unichar> and
859 L</lex_read_unichar>.
861 =for apidoc AmxU|char *|PL_parser-E<gt>linestart
863 Points to the start of the current line inside the lexer buffer.
864 This is useful for indicating at which column an error occurred, and
865 not much else. This must be updated by any lexing code that consumes
866 a newline; the function L</lex_read_to> handles this detail.
872 =for apidoc Amx|bool|lex_bufutf8
874 Indicates whether the octets in the lexer buffer
875 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
876 of Unicode characters. If not, they should be interpreted as Latin-1
877 characters. This is analogous to the C<SvUTF8> flag for scalars.
879 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
880 contains valid UTF-8. Lexing code must be robust in the face of invalid
883 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
884 is significant, but not the whole story regarding the input character
885 encoding. Normally, when a file is being read, the scalar contains octets
886 and its C<SvUTF8> flag is off, but the octets should be interpreted as
887 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
888 however, the scalar may have the C<SvUTF8> flag on, and in this case its
889 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
890 is in effect. This logic may change in the future; use this function
891 instead of implementing the logic yourself.
897 Perl_lex_bufutf8(pTHX)
903 =for apidoc Amx|char *|lex_grow_linestr|STRLEN len
905 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
906 at least C<len> octets (including terminating C<NUL>). Returns a
907 pointer to the reallocated buffer. This is necessary before making
908 any direct modification of the buffer that would increase its length.
909 L</lex_stuff_pvn> provides a more convenient way to insert text into
912 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
913 this function updates all of the lexer's variables that point directly
920 Perl_lex_grow_linestr(pTHX_ STRLEN len)
924 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
925 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
926 linestr = PL_parser->linestr;
927 buf = SvPVX(linestr);
928 if (len <= SvLEN(linestr))
930 bufend_pos = PL_parser->bufend - buf;
931 bufptr_pos = PL_parser->bufptr - buf;
932 oldbufptr_pos = PL_parser->oldbufptr - buf;
933 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
934 linestart_pos = PL_parser->linestart - buf;
935 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
936 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
937 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
938 PL_parser->lex_shared->re_eval_start - buf : 0;
940 buf = sv_grow(linestr, len);
942 PL_parser->bufend = buf + bufend_pos;
943 PL_parser->bufptr = buf + bufptr_pos;
944 PL_parser->oldbufptr = buf + oldbufptr_pos;
945 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
946 PL_parser->linestart = buf + linestart_pos;
947 if (PL_parser->last_uni)
948 PL_parser->last_uni = buf + last_uni_pos;
949 if (PL_parser->last_lop)
950 PL_parser->last_lop = buf + last_lop_pos;
951 if (PL_parser->lex_shared->re_eval_start)
952 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
957 =for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
959 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
960 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
961 reallocating the buffer if necessary. This means that lexing code that
962 runs later will see the characters as if they had appeared in the input.
963 It is not recommended to do this as part of normal parsing, and most
964 uses of this facility run the risk of the inserted characters being
965 interpreted in an unintended manner.
967 The string to be inserted is represented by C<len> octets starting
968 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
969 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
970 The characters are recoded for the lexer buffer, according to how the
971 buffer is currently being interpreted (L</lex_bufutf8>). If a string
972 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
973 function is more convenient.
979 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
983 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
984 if (flags & ~(LEX_STUFF_UTF8))
985 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
987 if (flags & LEX_STUFF_UTF8) {
990 STRLEN highhalf = 0; /* Count of variants */
991 const char *p, *e = pv+len;
992 for (p = pv; p != e; p++) {
993 if (! UTF8_IS_INVARIANT(*p)) {
999 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1000 bufptr = PL_parser->bufptr;
1001 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1002 SvCUR_set(PL_parser->linestr,
1003 SvCUR(PL_parser->linestr) + len+highhalf);
1004 PL_parser->bufend += len+highhalf;
1005 for (p = pv; p != e; p++) {
1007 if (! UTF8_IS_INVARIANT(c)) {
1008 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1009 *bufptr++ = UTF8_TWO_BYTE_LO(c);
1011 *bufptr++ = (char)c;
1016 if (flags & LEX_STUFF_UTF8) {
1017 STRLEN highhalf = 0;
1018 const char *p, *e = pv+len;
1019 for (p = pv; p != e; p++) {
1021 if (UTF8_IS_ABOVE_LATIN1(c)) {
1022 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1023 "non-Latin-1 character into Latin-1 input");
1024 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1027 } else if (! UTF8_IS_INVARIANT(c)) {
1028 /* malformed UTF-8 */
1030 SAVESPTR(PL_warnhook);
1031 PL_warnhook = PERL_WARNHOOK_FATAL;
1032 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
1038 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1039 bufptr = PL_parser->bufptr;
1040 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1041 SvCUR_set(PL_parser->linestr,
1042 SvCUR(PL_parser->linestr) + len-highhalf);
1043 PL_parser->bufend += len-highhalf;
1046 if (UTF8_IS_INVARIANT(*p)) {
1052 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1058 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1059 bufptr = PL_parser->bufptr;
1060 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1061 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1062 PL_parser->bufend += len;
1063 Copy(pv, bufptr, len, char);
1069 =for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1071 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1072 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1073 reallocating the buffer if necessary. This means that lexing code that
1074 runs later will see the characters as if they had appeared in the input.
1075 It is not recommended to do this as part of normal parsing, and most
1076 uses of this facility run the risk of the inserted characters being
1077 interpreted in an unintended manner.
1079 The string to be inserted is represented by octets starting at C<pv>
1080 and continuing to the first nul. These octets are interpreted as either
1081 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1082 in C<flags>. The characters are recoded for the lexer buffer, according
1083 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1084 If it is not convenient to nul-terminate a string to be inserted, the
1085 L</lex_stuff_pvn> function is more appropriate.
1091 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1093 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1094 lex_stuff_pvn(pv, strlen(pv), flags);
1098 =for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1100 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1101 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1102 reallocating the buffer if necessary. This means that lexing code that
1103 runs later will see the characters as if they had appeared in the input.
1104 It is not recommended to do this as part of normal parsing, and most
1105 uses of this facility run the risk of the inserted characters being
1106 interpreted in an unintended manner.
1108 The string to be inserted is the string value of C<sv>. The characters
1109 are recoded for the lexer buffer, according to how the buffer is currently
1110 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1111 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1112 need to construct a scalar.
1118 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1122 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1124 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1126 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1130 =for apidoc Amx|void|lex_unstuff|char *ptr
1132 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1133 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1134 This hides the discarded text from any lexing code that runs later,
1135 as if the text had never appeared.
1137 This is not the normal way to consume lexed text. For that, use
1144 Perl_lex_unstuff(pTHX_ char *ptr)
1148 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1149 buf = PL_parser->bufptr;
1151 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1154 bufend = PL_parser->bufend;
1156 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1157 unstuff_len = ptr - buf;
1158 Move(ptr, buf, bufend+1-ptr, char);
1159 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1160 PL_parser->bufend = bufend - unstuff_len;
1164 =for apidoc Amx|void|lex_read_to|char *ptr
1166 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1167 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1168 performing the correct bookkeeping whenever a newline character is passed.
1169 This is the normal way to consume lexed text.
1171 Interpretation of the buffer's octets can be abstracted out by
1172 using the slightly higher-level functions L</lex_peek_unichar> and
1173 L</lex_read_unichar>.
1179 Perl_lex_read_to(pTHX_ char *ptr)
1182 PERL_ARGS_ASSERT_LEX_READ_TO;
1183 s = PL_parser->bufptr;
1184 if (ptr < s || ptr > PL_parser->bufend)
1185 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1186 for (; s != ptr; s++)
1188 COPLINE_INC_WITH_HERELINES;
1189 PL_parser->linestart = s+1;
1191 PL_parser->bufptr = ptr;
1195 =for apidoc Amx|void|lex_discard_to|char *ptr
1197 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1198 up to C<ptr>. The remaining content of the buffer will be moved, and
1199 all pointers into the buffer updated appropriately. C<ptr> must not
1200 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1201 it is not permitted to discard text that has yet to be lexed.
1203 Normally it is not necessarily to do this directly, because it suffices to
1204 use the implicit discarding behaviour of L</lex_next_chunk> and things
1205 based on it. However, if a token stretches across multiple lines,
1206 and the lexing code has kept multiple lines of text in the buffer for
1207 that purpose, then after completion of the token it would be wise to
1208 explicitly discard the now-unneeded earlier lines, to avoid future
1209 multi-line tokens growing the buffer without bound.
1215 Perl_lex_discard_to(pTHX_ char *ptr)
1219 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1220 buf = SvPVX(PL_parser->linestr);
1222 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1225 if (ptr > PL_parser->bufptr)
1226 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1227 discard_len = ptr - buf;
1228 if (PL_parser->oldbufptr < ptr)
1229 PL_parser->oldbufptr = ptr;
1230 if (PL_parser->oldoldbufptr < ptr)
1231 PL_parser->oldoldbufptr = ptr;
1232 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1233 PL_parser->last_uni = NULL;
1234 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1235 PL_parser->last_lop = NULL;
1236 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1237 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1238 PL_parser->bufend -= discard_len;
1239 PL_parser->bufptr -= discard_len;
1240 PL_parser->oldbufptr -= discard_len;
1241 PL_parser->oldoldbufptr -= discard_len;
1242 if (PL_parser->last_uni)
1243 PL_parser->last_uni -= discard_len;
1244 if (PL_parser->last_lop)
1245 PL_parser->last_lop -= discard_len;
1249 =for apidoc Amx|bool|lex_next_chunk|U32 flags
1251 Reads in the next chunk of text to be lexed, appending it to
1252 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1253 looked to the end of the current chunk and wants to know more. It is
1254 usual, but not necessary, for lexing to have consumed the entirety of
1255 the current chunk at this time.
1257 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1258 chunk (i.e., the current chunk has been entirely consumed), normally the
1259 current chunk will be discarded at the same time that the new chunk is
1260 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1261 will not be discarded. If the current chunk has not been entirely
1262 consumed, then it will not be discarded regardless of the flag.
1264 Returns true if some new text was added to the buffer, or false if the
1265 buffer has reached the end of the input text.
1270 #define LEX_FAKE_EOF 0x80000000
1271 #define LEX_NO_TERM 0x40000000 /* here-doc */
1274 Perl_lex_next_chunk(pTHX_ U32 flags)
1278 STRLEN old_bufend_pos, new_bufend_pos;
1279 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1280 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1281 bool got_some_for_debugger = 0;
1283 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1284 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1285 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1287 linestr = PL_parser->linestr;
1288 buf = SvPVX(linestr);
1289 if (!(flags & LEX_KEEP_PREVIOUS)
1290 && PL_parser->bufptr == PL_parser->bufend)
1292 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1294 if (PL_parser->last_uni != PL_parser->bufend)
1295 PL_parser->last_uni = NULL;
1296 if (PL_parser->last_lop != PL_parser->bufend)
1297 PL_parser->last_lop = NULL;
1298 last_uni_pos = last_lop_pos = 0;
1302 old_bufend_pos = PL_parser->bufend - buf;
1303 bufptr_pos = PL_parser->bufptr - buf;
1304 oldbufptr_pos = PL_parser->oldbufptr - buf;
1305 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1306 linestart_pos = PL_parser->linestart - buf;
1307 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1308 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1310 if (flags & LEX_FAKE_EOF) {
1312 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1314 } else if (filter_gets(linestr, old_bufend_pos)) {
1316 got_some_for_debugger = 1;
1317 } else if (flags & LEX_NO_TERM) {
1320 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1323 /* End of real input. Close filehandle (unless it was STDIN),
1324 * then add implicit termination.
1326 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1327 PerlIO_clearerr(PL_parser->rsfp);
1328 else if (PL_parser->rsfp)
1329 (void)PerlIO_close(PL_parser->rsfp);
1330 PL_parser->rsfp = NULL;
1331 PL_parser->in_pod = PL_parser->filtered = 0;
1332 if (!PL_in_eval && PL_minus_p) {
1334 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1335 PL_minus_n = PL_minus_p = 0;
1336 } else if (!PL_in_eval && PL_minus_n) {
1337 sv_catpvs(linestr, /*{*/";}");
1340 sv_catpvs(linestr, ";");
1343 buf = SvPVX(linestr);
1344 new_bufend_pos = SvCUR(linestr);
1345 PL_parser->bufend = buf + new_bufend_pos;
1346 PL_parser->bufptr = buf + bufptr_pos;
1347 PL_parser->oldbufptr = buf + oldbufptr_pos;
1348 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1349 PL_parser->linestart = buf + linestart_pos;
1350 if (PL_parser->last_uni)
1351 PL_parser->last_uni = buf + last_uni_pos;
1352 if (PL_parser->last_lop)
1353 PL_parser->last_lop = buf + last_lop_pos;
1354 if (PL_parser->preambling != NOLINE) {
1355 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1356 PL_parser->preambling = NOLINE;
1358 if ( got_some_for_debugger
1359 && PERLDB_LINE_OR_SAVESRC
1360 && PL_curstash != PL_debstash)
1362 /* debugger active and we're not compiling the debugger code,
1363 * so store the line into the debugger's array of lines
1365 update_debugger_info(NULL, buf+old_bufend_pos,
1366 new_bufend_pos-old_bufend_pos);
1372 =for apidoc Amx|I32|lex_peek_unichar|U32 flags
1374 Looks ahead one (Unicode) character in the text currently being lexed.
1375 Returns the codepoint (unsigned integer value) of the next character,
1376 or -1 if lexing has reached the end of the input text. To consume the
1377 peeked character, use L</lex_read_unichar>.
1379 If the next character is in (or extends into) the next chunk of input
1380 text, the next chunk will be read in. Normally the current chunk will be
1381 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1382 bit set, then the current chunk will not be discarded.
1384 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1385 is encountered, an exception is generated.
1391 Perl_lex_peek_unichar(pTHX_ U32 flags)
1395 if (flags & ~(LEX_KEEP_PREVIOUS))
1396 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1397 s = PL_parser->bufptr;
1398 bufend = PL_parser->bufend;
1404 if (!lex_next_chunk(flags))
1406 s = PL_parser->bufptr;
1407 bufend = PL_parser->bufend;
1410 if (UTF8_IS_INVARIANT(head))
1412 if (UTF8_IS_START(head)) {
1413 len = UTF8SKIP(&head);
1414 while ((STRLEN)(bufend-s) < len) {
1415 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1417 s = PL_parser->bufptr;
1418 bufend = PL_parser->bufend;
1421 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1422 if (retlen == (STRLEN)-1) {
1423 /* malformed UTF-8 */
1425 SAVESPTR(PL_warnhook);
1426 PL_warnhook = PERL_WARNHOOK_FATAL;
1427 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
1433 if (!lex_next_chunk(flags))
1435 s = PL_parser->bufptr;
1442 =for apidoc Amx|I32|lex_read_unichar|U32 flags
1444 Reads the next (Unicode) character in the text currently being lexed.
1445 Returns the codepoint (unsigned integer value) of the character read,
1446 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1447 if lexing has reached the end of the input text. To non-destructively
1448 examine the next character, use L</lex_peek_unichar> instead.
1450 If the next character is in (or extends into) the next chunk of input
1451 text, the next chunk will be read in. Normally the current chunk will be
1452 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1453 bit set, then the current chunk will not be discarded.
1455 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1456 is encountered, an exception is generated.
1462 Perl_lex_read_unichar(pTHX_ U32 flags)
1465 if (flags & ~(LEX_KEEP_PREVIOUS))
1466 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1467 c = lex_peek_unichar(flags);
1470 COPLINE_INC_WITH_HERELINES;
1472 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1474 ++(PL_parser->bufptr);
1480 =for apidoc Amx|void|lex_read_space|U32 flags
1482 Reads optional spaces, in Perl style, in the text currently being
1483 lexed. The spaces may include ordinary whitespace characters and
1484 Perl-style comments. C<#line> directives are processed if encountered.
1485 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1486 at a non-space character (or the end of the input text).
1488 If spaces extend into the next chunk of input text, the next chunk will
1489 be read in. Normally the current chunk will be discarded at the same
1490 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1491 chunk will not be discarded.
1496 #define LEX_NO_INCLINE 0x40000000
1497 #define LEX_NO_NEXT_CHUNK 0x80000000
1500 Perl_lex_read_space(pTHX_ U32 flags)
1503 const bool can_incline = !(flags & LEX_NO_INCLINE);
1504 bool need_incline = 0;
1505 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1506 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1507 s = PL_parser->bufptr;
1508 bufend = PL_parser->bufend;
1514 } while (!(c == '\n' || (c == 0 && s == bufend)));
1515 } else if (c == '\n') {
1518 PL_parser->linestart = s;
1524 } else if (isSPACE(c)) {
1526 } else if (c == 0 && s == bufend) {
1529 if (flags & LEX_NO_NEXT_CHUNK)
1531 PL_parser->bufptr = s;
1532 l = CopLINE(PL_curcop);
1533 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1534 got_more = lex_next_chunk(flags);
1535 CopLINE_set(PL_curcop, l);
1536 s = PL_parser->bufptr;
1537 bufend = PL_parser->bufend;
1540 if (can_incline && need_incline && PL_parser->rsfp) {
1550 PL_parser->bufptr = s;
1555 =for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1557 This function performs syntax checking on a prototype, C<proto>.
1558 If C<warn> is true, any illegal characters or mismatched brackets
1559 will trigger illegalproto warnings, declaring that they were
1560 detected in the prototype for C<name>.
1562 The return value is C<true> if this is a valid prototype, and
1563 C<false> if it is not, regardless of whether C<warn> was C<true> or
1566 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1573 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1575 STRLEN len, origlen;
1577 bool bad_proto = FALSE;
1578 bool in_brackets = FALSE;
1579 bool after_slash = FALSE;
1580 char greedy_proto = ' ';
1581 bool proto_after_greedy_proto = FALSE;
1582 bool must_be_last = FALSE;
1583 bool underscore = FALSE;
1584 bool bad_proto_after_underscore = FALSE;
1586 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1591 p = SvPV(proto, len);
1593 for (; len--; p++) {
1596 proto_after_greedy_proto = TRUE;
1598 if (!strchr(";@%", *p))
1599 bad_proto_after_underscore = TRUE;
1602 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1609 in_brackets = FALSE;
1610 else if ((*p == '@' || *p == '%')
1614 must_be_last = TRUE;
1623 after_slash = FALSE;
1628 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1631 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1632 origlen, UNI_DISPLAY_ISPRINT)
1633 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1635 if (proto_after_greedy_proto)
1636 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1637 "Prototype after '%c' for %"SVf" : %s",
1638 greedy_proto, SVfARG(name), p);
1640 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1641 "Missing ']' in prototype for %"SVf" : %s",
1644 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1645 "Illegal character in prototype for %"SVf" : %s",
1647 if (bad_proto_after_underscore)
1648 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1649 "Illegal character after '_' in prototype for %"SVf" : %s",
1653 return (! (proto_after_greedy_proto || bad_proto) );
1658 * This subroutine has nothing to do with tilting, whether at windmills
1659 * or pinball tables. Its name is short for "increment line". It
1660 * increments the current line number in CopLINE(PL_curcop) and checks
1661 * to see whether the line starts with a comment of the form
1662 * # line 500 "foo.pm"
1663 * If so, it sets the current line number and file to the values in the comment.
1667 S_incline(pTHX_ const char *s)
1675 PERL_ARGS_ASSERT_INCLINE;
1677 COPLINE_INC_WITH_HERELINES;
1678 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1679 && s+1 == PL_bufend && *s == ';') {
1680 /* fake newline in string eval */
1681 CopLINE_dec(PL_curcop);
1686 while (SPACE_OR_TAB(*s))
1688 if (strEQs(s, "line"))
1692 if (SPACE_OR_TAB(*s))
1696 while (SPACE_OR_TAB(*s))
1704 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1706 while (SPACE_OR_TAB(*s))
1708 if (*s == '"' && (t = strchr(s+1, '"'))) {
1714 while (*t && !isSPACE(*t))
1718 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1720 if (*e != '\n' && *e != '\0')
1721 return; /* false alarm */
1723 if (!grok_atoUV(n, &uv, &e))
1725 line_num = ((line_t)uv) - 1;
1728 const STRLEN len = t - s;
1730 if (!PL_rsfp && !PL_parser->filtered) {
1731 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1732 * to *{"::_<newfilename"} */
1733 /* However, the long form of evals is only turned on by the
1734 debugger - usually they're "(eval %lu)" */
1735 GV * const cfgv = CopFILEGV(PL_curcop);
1738 STRLEN tmplen2 = len;
1742 if (tmplen2 + 2 <= sizeof smallbuf)
1745 Newx(tmpbuf2, tmplen2 + 2, char);
1750 memcpy(tmpbuf2 + 2, s, tmplen2);
1753 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1755 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1756 /* adjust ${"::_<newfilename"} to store the new file name */
1757 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1758 /* The line number may differ. If that is the case,
1759 alias the saved lines that are in the array.
1760 Otherwise alias the whole array. */
1761 if (CopLINE(PL_curcop) == line_num) {
1762 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1763 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1765 else if (GvAV(cfgv)) {
1766 AV * const av = GvAV(cfgv);
1767 const I32 start = CopLINE(PL_curcop)+1;
1768 I32 items = AvFILLp(av) - start;
1770 AV * const av2 = GvAVn(gv2);
1771 SV **svp = AvARRAY(av) + start;
1772 I32 l = (I32)line_num+1;
1774 av_store(av2, l++, SvREFCNT_inc(*svp++));
1779 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1782 CopFILE_free(PL_curcop);
1783 CopFILE_setn(PL_curcop, s, len);
1785 CopLINE_set(PL_curcop, line_num);
1789 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
1791 AV *av = CopFILEAVx(PL_curcop);
1794 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1796 sv = *av_fetch(av, 0, 1);
1797 SvUPGRADE(sv, SVt_PVMG);
1799 if (!SvPOK(sv)) SvPVCLEAR(sv);
1801 sv_catsv(sv, orig_sv);
1803 sv_catpvn(sv, buf, len);
1808 if (PL_parser->preambling == NOLINE)
1809 av_store(av, CopLINE(PL_curcop), sv);
1815 * Called to gobble the appropriate amount and type of whitespace.
1816 * Skips comments as well.
1817 * Returns the next character after the whitespace that is skipped.
1820 * Same thing, but look ahead without incrementing line numbers or
1821 * adjusting PL_linestart.
1824 #define skipspace(s) skipspace_flags(s, 0)
1825 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
1828 S_skipspace_flags(pTHX_ char *s, U32 flags)
1830 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
1831 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
1832 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
1835 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1837 lex_read_space(flags | LEX_KEEP_PREVIOUS |
1838 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
1839 LEX_NO_NEXT_CHUNK : 0));
1841 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1842 if (PL_linestart > PL_bufptr)
1843 PL_bufptr = PL_linestart;
1851 * Check the unary operators to ensure there's no ambiguity in how they're
1852 * used. An ambiguous piece of code would be:
1854 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1855 * the +5 is its argument.
1864 if (PL_oldoldbufptr != PL_last_uni)
1866 while (isSPACE(*PL_last_uni))
1869 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
1870 s += UTF ? UTF8SKIP(s) : 1;
1871 if ((t = strchr(s, '(')) && t < PL_bufptr)
1874 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1875 "Warning: Use of \"%"UTF8f"\" without parentheses is ambiguous",
1876 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
1880 * LOP : macro to build a list operator. Its behaviour has been replaced
1881 * with a subroutine, S_lop() for which LOP is just another name.
1884 #define LOP(f,x) return lop(f,x,s)
1888 * Build a list operator (or something that might be one). The rules:
1889 * - if we have a next token, then it's a list operator (no parens) for
1890 * which the next token has already been parsed; e.g.,
1893 * - if the next thing is an opening paren, then it's a function
1894 * - else it's a list operator
1898 S_lop(pTHX_ I32 f, U8 x, char *s)
1900 PERL_ARGS_ASSERT_LOP;
1905 PL_last_lop = PL_oldbufptr;
1906 PL_last_lop_op = (OPCODE)f;
1911 return REPORT(FUNC);
1914 return REPORT(FUNC);
1917 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1918 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
1919 return REPORT(LSTOP);
1925 * When the lexer realizes it knows the next token (for instance,
1926 * it is reordering tokens for the parser) then it can call S_force_next
1927 * to know what token to return the next time the lexer is called. Caller
1928 * will need to set PL_nextval[] and possibly PL_expect to ensure
1929 * the lexer handles the token correctly.
1933 S_force_next(pTHX_ I32 type)
1937 PerlIO_printf(Perl_debug_log, "### forced token:\n");
1938 tokereport(type, &NEXTVAL_NEXTTOKE);
1941 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
1942 PL_nexttype[PL_nexttoke] = type;
1949 * This subroutine handles postfix deref syntax after the arrow has already
1950 * been emitted. @* $* etc. are emitted as two separate token right here.
1951 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1952 * only the first, leaving yylex to find the next.
1956 S_postderef(pTHX_ int const funny, char const next)
1958 assert(funny == DOLSHARP || strchr("$@%&*", funny));
1960 PL_expect = XOPERATOR;
1961 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
1962 assert('@' == funny || '$' == funny || DOLSHARP == funny);
1963 PL_lex_state = LEX_INTERPEND;
1965 force_next(POSTJOIN);
1971 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1972 && !PL_lex_brackets)
1974 PL_expect = XOPERATOR;
1983 int yyc = PL_parser->yychar;
1984 if (yyc != YYEMPTY) {
1986 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1987 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
1988 PL_lex_allbrackets--;
1990 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1991 } else if (yyc == '('/*)*/) {
1992 PL_lex_allbrackets--;
1997 PL_parser->yychar = YYEMPTY;
2002 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2004 SV * const sv = newSVpvn_utf8(start, len,
2007 && !is_utf8_invariant_string((const U8*)start, len)
2008 && is_utf8_string((const U8*)start, len));
2014 * When the lexer knows the next thing is a word (for instance, it has
2015 * just seen -> and it knows that the next char is a word char, then
2016 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2020 * char *start : buffer position (must be within PL_linestr)
2021 * int token : PL_next* will be this type of bare word
2022 * (e.g., METHOD,BAREWORD)
2023 * int check_keyword : if true, Perl checks to make sure the word isn't
2024 * a keyword (do this if the word is a label, e.g. goto FOO)
2025 * int allow_pack : if true, : characters will also be allowed (require,
2026 * use, etc. do this)
2030 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2035 PERL_ARGS_ASSERT_FORCE_WORD;
2037 start = skipspace(start);
2039 if (isIDFIRST_lazy_if(s,UTF)
2040 || (allow_pack && *s == ':' && s[1] == ':') )
2042 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
2043 if (check_keyword) {
2044 char *s2 = PL_tokenbuf;
2046 if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
2048 if (keyword(s2, len2, 0))
2051 if (token == METHOD) {
2056 PL_expect = XOPERATOR;
2059 NEXTVAL_NEXTTOKE.opval
2060 = newSVOP(OP_CONST,0,
2061 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2062 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2070 * Called when the lexer wants $foo *foo &foo etc, but the program
2071 * text only contains the "foo" portion. The first argument is a pointer
2072 * to the "foo", and the second argument is the type symbol to prefix.
2073 * Forces the next token to be a "BAREWORD".
2074 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2078 S_force_ident(pTHX_ const char *s, int kind)
2080 PERL_ARGS_ASSERT_FORCE_IDENT;
2083 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2084 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2085 UTF ? SVf_UTF8 : 0));
2086 NEXTVAL_NEXTTOKE.opval = o;
2087 force_next(BAREWORD);
2089 o->op_private = OPpCONST_ENTERED;
2090 /* XXX see note in pp_entereval() for why we forgo typo
2091 warnings if the symbol must be introduced in an eval.
2093 gv_fetchpvn_flags(s, len,
2094 (PL_in_eval ? GV_ADDMULTI
2095 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2096 kind == '$' ? SVt_PV :
2097 kind == '@' ? SVt_PVAV :
2098 kind == '%' ? SVt_PVHV :
2106 S_force_ident_maybe_lex(pTHX_ char pit)
2108 NEXTVAL_NEXTTOKE.ival = pit;
2113 Perl_str_to_version(pTHX_ SV *sv)
2118 const char *start = SvPV_const(sv,len);
2119 const char * const end = start + len;
2120 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
2122 PERL_ARGS_ASSERT_STR_TO_VERSION;
2124 while (start < end) {
2128 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2133 retval += ((NV)n)/nshift;
2142 * Forces the next token to be a version number.
2143 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2144 * and if "guessing" is TRUE, then no new token is created (and the caller
2145 * must use an alternative parsing method).
2149 S_force_version(pTHX_ char *s, int guessing)
2154 PERL_ARGS_ASSERT_FORCE_VERSION;
2162 while (isDIGIT(*d) || *d == '_' || *d == '.')
2164 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2166 s = scan_num(s, &pl_yylval);
2167 version = pl_yylval.opval;
2168 ver = cSVOPx(version)->op_sv;
2169 if (SvPOK(ver) && !SvNIOK(ver)) {
2170 SvUPGRADE(ver, SVt_PVNV);
2171 SvNV_set(ver, str_to_version(ver));
2172 SvNOK_on(ver); /* hint that it is a version */
2175 else if (guessing) {
2180 /* NOTE: The parser sees the package name and the VERSION swapped */
2181 NEXTVAL_NEXTTOKE.opval = version;
2182 force_next(BAREWORD);
2188 * S_force_strict_version
2189 * Forces the next token to be a version number using strict syntax rules.
2193 S_force_strict_version(pTHX_ char *s)
2196 const char *errstr = NULL;
2198 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2200 while (isSPACE(*s)) /* leading whitespace */
2203 if (is_STRICT_VERSION(s,&errstr)) {
2205 s = (char *)scan_version(s, ver, 0);
2206 version = newSVOP(OP_CONST, 0, ver);
2208 else if ((*s != ';' && *s != '{' && *s != '}' )
2209 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
2213 yyerror(errstr); /* version required */
2217 /* NOTE: The parser sees the package name and the VERSION swapped */
2218 NEXTVAL_NEXTTOKE.opval = version;
2219 force_next(BAREWORD);
2226 * Tokenize a quoted string passed in as an SV. It finds the next
2227 * chunk, up to end of string or a backslash. It may make a new
2228 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2233 S_tokeq(pTHX_ SV *sv)
2240 PERL_ARGS_ASSERT_TOKEQ;
2244 assert (!SvIsCOW(sv));
2245 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2249 /* This is relying on the SV being "well formed" with a trailing '\0' */
2250 while (s < send && !(*s == '\\' && s[1] == '\\'))
2255 if ( PL_hints & HINT_NEW_STRING ) {
2256 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2257 SVs_TEMP | SvUTF8(sv));
2261 if (s + 1 < send && (s[1] == '\\'))
2262 s++; /* all that, just for this */
2267 SvCUR_set(sv, d - SvPVX_const(sv));
2269 if ( PL_hints & HINT_NEW_STRING )
2270 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
2275 * Now come three functions related to double-quote context,
2276 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2277 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2278 * interact with PL_lex_state, and create fake ( ... ) argument lists
2279 * to handle functions and concatenation.
2283 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2288 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2290 * Pattern matching will set PL_lex_op to the pattern-matching op to
2291 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2293 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2295 * Everything else becomes a FUNC.
2297 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2298 * had an OP_CONST or OP_READLINE). This just sets us up for a
2299 * call to S_sublex_push().
2303 S_sublex_start(pTHX)
2305 const I32 op_type = pl_yylval.ival;
2307 if (op_type == OP_NULL) {
2308 pl_yylval.opval = PL_lex_op;
2312 if (op_type == OP_CONST) {
2313 SV *sv = PL_lex_stuff;
2314 PL_lex_stuff = NULL;
2317 if (SvTYPE(sv) == SVt_PVIV) {
2318 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2320 const char * const p = SvPV_const(sv, len);
2321 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2325 pl_yylval.opval = newSVOP(op_type, 0, sv);
2329 PL_parser->lex_super_state = PL_lex_state;
2330 PL_parser->lex_sub_inwhat = (U16)op_type;
2331 PL_parser->lex_sub_op = PL_lex_op;
2332 PL_lex_state = LEX_INTERPPUSH;
2336 pl_yylval.opval = PL_lex_op;
2346 * Create a new scope to save the lexing state. The scope will be
2347 * ended in S_sublex_done. Returns a '(', starting the function arguments
2348 * to the uc, lc, etc. found before.
2349 * Sets PL_lex_state to LEX_INTERPCONCAT.
2356 const bool is_heredoc = PL_multi_close == '<';
2359 PL_lex_state = PL_parser->lex_super_state;
2360 SAVEI8(PL_lex_dojoin);
2361 SAVEI32(PL_lex_brackets);
2362 SAVEI32(PL_lex_allbrackets);
2363 SAVEI32(PL_lex_formbrack);
2364 SAVEI8(PL_lex_fakeeof);
2365 SAVEI32(PL_lex_casemods);
2366 SAVEI32(PL_lex_starts);
2367 SAVEI8(PL_lex_state);
2368 SAVESPTR(PL_lex_repl);
2369 SAVEVPTR(PL_lex_inpat);
2370 SAVEI16(PL_lex_inwhat);
2373 SAVECOPLINE(PL_curcop);
2374 SAVEI32(PL_multi_end);
2375 SAVEI32(PL_parser->herelines);
2376 PL_parser->herelines = 0;
2378 SAVEIV(PL_multi_close);
2379 SAVEPPTR(PL_bufptr);
2380 SAVEPPTR(PL_bufend);
2381 SAVEPPTR(PL_oldbufptr);
2382 SAVEPPTR(PL_oldoldbufptr);
2383 SAVEPPTR(PL_last_lop);
2384 SAVEPPTR(PL_last_uni);
2385 SAVEPPTR(PL_linestart);
2386 SAVESPTR(PL_linestr);
2387 SAVEGENERICPV(PL_lex_brackstack);
2388 SAVEGENERICPV(PL_lex_casestack);
2389 SAVEGENERICPV(PL_parser->lex_shared);
2390 SAVEBOOL(PL_parser->lex_re_reparsing);
2391 SAVEI32(PL_copline);
2393 /* The here-doc parser needs to be able to peek into outer lexing
2394 scopes to find the body of the here-doc. So we put PL_linestr and
2395 PL_bufptr into lex_shared, to ‘share’ those values.
2397 PL_parser->lex_shared->ls_linestr = PL_linestr;
2398 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2400 PL_linestr = PL_lex_stuff;
2401 PL_lex_repl = PL_parser->lex_sub_repl;
2402 PL_lex_stuff = NULL;
2403 PL_parser->lex_sub_repl = NULL;
2405 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2406 set for an inner quote-like operator and then an error causes scope-
2407 popping. We must not have a PL_lex_stuff value left dangling, as
2408 that breaks assumptions elsewhere. See bug #123617. */
2409 SAVEGENERICSV(PL_lex_stuff);
2410 SAVEGENERICSV(PL_parser->lex_sub_repl);
2412 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2413 = SvPVX(PL_linestr);
2414 PL_bufend += SvCUR(PL_linestr);
2415 PL_last_lop = PL_last_uni = NULL;
2416 SAVEFREESV(PL_linestr);
2417 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2419 PL_lex_dojoin = FALSE;
2420 PL_lex_brackets = PL_lex_formbrack = 0;
2421 PL_lex_allbrackets = 0;
2422 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2423 Newx(PL_lex_brackstack, 120, char);
2424 Newx(PL_lex_casestack, 12, char);
2425 PL_lex_casemods = 0;
2426 *PL_lex_casestack = '\0';
2428 PL_lex_state = LEX_INTERPCONCAT;
2430 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2431 PL_copline = NOLINE;
2433 Newxz(shared, 1, LEXSHARED);
2434 shared->ls_prev = PL_parser->lex_shared;
2435 PL_parser->lex_shared = shared;
2437 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2438 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2439 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2440 PL_lex_inpat = PL_parser->lex_sub_op;
2442 PL_lex_inpat = NULL;
2444 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2445 PL_in_eval &= ~EVAL_RE_REPARSING;
2452 * Restores lexer state after a S_sublex_push.
2458 if (!PL_lex_starts++) {
2459 SV * const sv = newSVpvs("");
2460 if (SvUTF8(PL_linestr))
2462 PL_expect = XOPERATOR;
2463 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2467 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2468 PL_lex_state = LEX_INTERPCASEMOD;
2472 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2473 assert(PL_lex_inwhat != OP_TRANSR);
2475 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2476 PL_linestr = PL_lex_repl;
2478 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2479 PL_bufend += SvCUR(PL_linestr);
2480 PL_last_lop = PL_last_uni = NULL;
2481 PL_lex_dojoin = FALSE;
2482 PL_lex_brackets = 0;
2483 PL_lex_allbrackets = 0;
2484 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2485 PL_lex_casemods = 0;
2486 *PL_lex_casestack = '\0';
2488 if (SvEVALED(PL_lex_repl)) {
2489 PL_lex_state = LEX_INTERPNORMAL;
2491 /* we don't clear PL_lex_repl here, so that we can check later
2492 whether this is an evalled subst; that means we rely on the
2493 logic to ensure sublex_done() is called again only via the
2494 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2497 PL_lex_state = LEX_INTERPCONCAT;
2500 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2501 CopLINE(PL_curcop) +=
2502 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2503 + PL_parser->herelines;
2504 PL_parser->herelines = 0;
2509 const line_t l = CopLINE(PL_curcop);
2511 if (PL_multi_close == '<')
2512 PL_parser->herelines += l - PL_multi_end;
2513 PL_bufend = SvPVX(PL_linestr);
2514 PL_bufend += SvCUR(PL_linestr);
2515 PL_expect = XOPERATOR;
2520 PERL_STATIC_INLINE SV*
2521 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2523 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2524 * interior, hence to the "}". Finds what the name resolves to, returning
2525 * an SV* containing it; NULL if no valid one found */
2527 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
2534 const U8* first_bad_char_loc;
2535 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2537 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2540 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
2541 "Unknown charname '' is deprecated");
2545 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2547 &first_bad_char_loc))
2549 /* If warnings are on, this will print a more detailed analysis of what
2550 * is wrong than the error message below */
2551 utf8n_to_uvchr(first_bad_char_loc,
2552 e - ((char *) first_bad_char_loc),
2555 /* We deliberately don't try to print the malformed character, which
2556 * might not print very well; it also may be just the first of many
2557 * malformations, so don't print what comes after it */
2558 yyerror_pv(Perl_form(aTHX_
2559 "Malformed UTF-8 character immediately after '%.*s'",
2560 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
2565 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2566 /* include the <}> */
2567 e - backslash_ptr + 1);
2569 SvREFCNT_dec_NN(res);
2573 /* See if the charnames handler is the Perl core's, and if so, we can skip
2574 * the validation needed for a user-supplied one, as Perl's does its own
2576 table = GvHV(PL_hintgv); /* ^H */
2577 cvp = hv_fetchs(table, "charnames", FALSE);
2578 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2579 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2581 const char * const name = HvNAME(stash);
2582 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2583 && strEQ(name, "_charnames")) {
2588 /* Here, it isn't Perl's charname handler. We can't rely on a
2589 * user-supplied handler to validate the input name. For non-ut8 input,
2590 * look to see that the first character is legal. Then loop through the
2591 * rest checking that each is a continuation */
2593 /* This code makes the reasonable assumption that the only Latin1-range
2594 * characters that begin a character name alias are alphabetic, otherwise
2595 * would have to create a isCHARNAME_BEGIN macro */
2598 if (! isALPHAU(*s)) {
2603 if (! isCHARNAME_CONT(*s)) {
2606 if (*s == ' ' && *(s-1) == ' ') {
2613 /* Similarly for utf8. For invariants can check directly; for other
2614 * Latin1, can calculate their code point and check; otherwise use a
2616 if (UTF8_IS_INVARIANT(*s)) {
2617 if (! isALPHAU(*s)) {
2621 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2622 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2628 if (! PL_utf8_charname_begin) {
2629 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2630 PL_utf8_charname_begin = _core_swash_init("utf8",
2631 "_Perl_Charname_Begin",
2633 1, 0, NULL, &flags);
2635 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2642 if (UTF8_IS_INVARIANT(*s)) {
2643 if (! isCHARNAME_CONT(*s)) {
2646 if (*s == ' ' && *(s-1) == ' ') {
2651 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2652 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2659 if (! PL_utf8_charname_continue) {
2660 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2661 PL_utf8_charname_continue = _core_swash_init("utf8",
2662 "_Perl_Charname_Continue",
2664 1, 0, NULL, &flags);
2666 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2673 if (*(s-1) == ' ') {
2676 "charnames alias definitions may not contain trailing "
2677 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2678 (int)(s - backslash_ptr + 1), backslash_ptr,
2679 (int)(e - s + 1), s + 1
2681 UTF ? SVf_UTF8 : 0);
2685 if (SvUTF8(res)) { /* Don't accept malformed input */
2686 const U8* first_bad_char_loc;
2688 const char* const str = SvPV_const(res, len);
2689 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2690 /* If warnings are on, this will print a more detailed analysis of
2691 * what is wrong than the error message below */
2692 utf8n_to_uvchr(first_bad_char_loc,
2693 (char *) first_bad_char_loc - str,
2696 /* We deliberately don't try to print the malformed character,
2697 * which might not print very well; it also may be just the first
2698 * of many malformations, so don't print what comes after it */
2701 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2702 (int) (e - backslash_ptr + 1), backslash_ptr,
2703 (int) ((char *) first_bad_char_loc - str), str
2714 /* The final %.*s makes sure that should the trailing NUL be missing
2715 * that this print won't run off the end of the string */
2718 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2719 (int)(s - backslash_ptr + 1), backslash_ptr,
2720 (int)(e - s + 1), s + 1
2722 UTF ? SVf_UTF8 : 0);
2729 "charnames alias definitions may not contain a sequence of "
2730 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2731 (int)(s - backslash_ptr + 1), backslash_ptr,
2732 (int)(e - s + 1), s + 1
2734 UTF ? SVf_UTF8 : 0);
2741 Extracts the next constant part of a pattern, double-quoted string,
2742 or transliteration. This is terrifying code.
2744 For example, in parsing the double-quoted string "ab\x63$d", it would
2745 stop at the '$' and return an OP_CONST containing 'abc'.
2747 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
2748 processing a pattern (PL_lex_inpat is true), a transliteration
2749 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
2751 Returns a pointer to the character scanned up to. If this is
2752 advanced from the start pointer supplied (i.e. if anything was
2753 successfully parsed), will leave an OP_CONST for the substring scanned
2754 in pl_yylval. Caller must intuit reason for not parsing further
2755 by looking at the next characters herself.
2759 \N{FOO} => \N{U+hex_for_character_FOO}
2760 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
2763 all other \-char, including \N and \N{ apart from \N{ABC}
2766 @ and $ where it appears to be a var, but not for $ as tail anchor
2770 In transliterations:
2771 characters are VERY literal, except for - not at the start or end
2772 of the string, which indicates a range. If the range is in bytes,
2773 scan_const expands the range to the full set of intermediate
2774 characters. If the range is in utf8, the hyphen is replaced with
2775 a certain range mark which will be handled by pmtrans() in op.c.
2777 In double-quoted strings:
2779 double-quoted style: \r and \n
2780 constants: \x31, etc.
2781 deprecated backrefs: \1 (in substitution replacements)
2782 case and quoting: \U \Q \E
2785 scan_const does *not* construct ops to handle interpolated strings.
2786 It stops processing as soon as it finds an embedded $ or @ variable
2787 and leaves it to the caller to work out what's going on.
2789 embedded arrays (whether in pattern or not) could be:
2790 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2792 $ in double-quoted strings must be the symbol of an embedded scalar.
2794 $ in pattern could be $foo or could be tail anchor. Assumption:
2795 it's a tail anchor if $ is the last thing in the string, or if it's
2796 followed by one of "()| \r\n\t"
2798 \1 (backreferences) are turned into $1 in substitutions
2800 The structure of the code is
2801 while (there's a character to process) {
2802 handle transliteration ranges
2803 skip regexp comments /(?#comment)/ and codes /(?{code})/
2804 skip #-initiated comments in //x patterns
2805 check for embedded arrays
2806 check for embedded scalars
2808 deprecate \1 in substitution replacements
2809 handle string-changing backslashes \l \U \Q \E, etc.
2810 switch (what was escaped) {
2811 handle \- in a transliteration (becomes a literal -)
2812 if a pattern and not \N{, go treat as regular character
2813 handle \132 (octal characters)
2814 handle \x15 and \x{1234} (hex characters)
2815 handle \N{name} (named characters, also \N{3,5} in a pattern)
2816 handle \cV (control characters)
2817 handle printf-style backslashes (\f, \r, \n, etc)
2820 } (end if backslash)
2821 handle regular character
2822 } (end while character to read)
2827 S_scan_const(pTHX_ char *start)
2829 char *send = PL_bufend; /* end of the constant */
2830 SV *sv = newSV(send - start); /* sv for the constant. See note below
2832 char *s = start; /* start of the constant */
2833 char *d = SvPVX(sv); /* destination for copies */
2834 bool dorange = FALSE; /* are we in a translit range? */
2835 bool didrange = FALSE; /* did we just finish a range? */
2836 bool in_charclass = FALSE; /* within /[...]/ */
2837 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2838 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2839 UTF8? But, this can show as true
2840 when the source isn't utf8, as for
2841 example when it is entirely composed
2843 SV *res; /* result from charnames */
2844 STRLEN offset_to_max; /* The offset in the output to where the range
2845 high-end character is temporarily placed */
2847 /* Note on sizing: The scanned constant is placed into sv, which is
2848 * initialized by newSV() assuming one byte of output for every byte of
2849 * input. This routine expects newSV() to allocate an extra byte for a
2850 * trailing NUL, which this routine will append if it gets to the end of
2851 * the input. There may be more bytes of input than output (eg., \N{LATIN
2852 * CAPITAL LETTER A}), or more output than input if the constant ends up
2853 * recoded to utf8, but each time a construct is found that might increase
2854 * the needed size, SvGROW() is called. Its size parameter each time is
2855 * based on the best guess estimate at the time, namely the length used so
2856 * far, plus the length the current construct will occupy, plus room for
2857 * the trailing NUL, plus one byte for every input byte still unscanned */
2859 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2862 int backslash_N = 0; /* ? was the character from \N{} */
2863 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2864 platform-specific like \x65 */
2867 PERL_ARGS_ASSERT_SCAN_CONST;
2869 assert(PL_lex_inwhat != OP_TRANSR);
2870 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2871 /* If we are doing a trans and we know we want UTF8 set expectation */
2872 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2873 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2876 /* Protect sv from errors and fatal warnings. */
2877 ENTER_with_name("scan_const");
2881 || dorange /* Handle tr/// range at right edge of input */
2884 /* get transliterations out of the way (they're most literal) */
2885 if (PL_lex_inwhat == OP_TRANS) {
2887 /* But there isn't any special handling necessary unless there is a
2888 * range, so for most cases we just drop down and handle the value
2889 * as any other. There are two exceptions.
2891 * 1. A minus sign indicates that we are actually going to have
2892 * a range. In this case, skip the '-', set a flag, then drop
2893 * down to handle what should be the end range value.
2894 * 2. After we've handled that value, the next time through, that
2895 * flag is set and we fix up the range.
2897 * Ranges entirely within Latin1 are expanded out entirely, in
2898 * order to avoid the significant overhead of making a swash.
2899 * Ranges that extend above Latin1 have to have a swash, so there
2900 * is no advantage to abbreviating them here, so they are stored
2901 * here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte signifies
2902 * a hyphen without any possible ambiguity. On EBCDIC machines, if
2903 * the range is expressed as Unicode, the Latin1 portion is
2904 * expanded out even if the entire range extends above Latin1.
2905 * This is because each code point in it has to be processed here
2906 * individually to get its native translation */
2910 /* Here, we don't think we're in a range. If we've processed
2911 * at least one character, then see if this next one is a '-',
2912 * indicating the previous one was the start of a range. But
2913 * don't bother if we're too close to the end for the minus to
2915 if (*s != '-' || s >= send - 1 || s == start) {
2917 /* A regular character. Process like any other, but first
2918 * clear any flags */
2922 non_portable_endpoint = 0;
2925 /* Drops down to generic code to process current byte */
2928 if (didrange) { /* Something like y/A-C-Z// */
2929 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
2934 s++; /* Skip past the minus */
2936 /* d now points to where the end-range character will be
2937 * placed. Save it so won't have to go finding it later,
2938 * and drop down to get that character. (Actually we
2939 * instead save the offset, to handle the case where a
2940 * realloc in the meantime could change the actual
2941 * pointer). We'll finish processing the range the next
2942 * time through the loop */
2943 offset_to_max = d - SvPVX_const(sv);
2945 } /* End of not a range */
2947 /* Here we have parsed a range. Now must handle it. At this
2949 * 'sv' is a SV* that contains the output string we are
2950 * constructing. The final two characters in that string
2951 * are the range start and range end, in order.
2952 * 'd' points to just beyond the range end in the 'sv' string,
2953 * where we would next place something
2954 * 'offset_to_max' is the offset in 'sv' at which the character
2955 * before 'd' begins.
2957 const char * max_ptr = SvPVX_const(sv) + offset_to_max;
2958 const char * min_ptr;
2960 IV range_max; /* last character in range */
2964 bool convert_unicode;
2965 IV real_range_max = 0;
2968 /* Get the range-ends code point values. */
2970 /* We know the utf8 is valid, because we just constructed
2971 * it ourselves in previous loop iterations */
2972 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
2973 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
2974 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
2977 min_ptr = max_ptr - 1;
2978 range_min = * (U8*) min_ptr;
2979 range_max = * (U8*) max_ptr;
2983 /* On EBCDIC platforms, we may have to deal with portable
2984 * ranges. These happen if at least one range endpoint is a
2985 * Unicode value (\N{...}), or if the range is a subset of
2986 * [A-Z] or [a-z], and both ends are literal characters,
2987 * like 'A', and not like \x{C1} */
2988 if ((convert_unicode
2989 = cBOOL(backslash_N) /* \N{} forces Unicode, hence
2991 || ( ! non_portable_endpoint
2992 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
2993 || (isUPPER_A(range_min) && isUPPER_A(range_max))))
2996 /* Special handling is needed for these portable ranges.
2997 * They are defined to all be in Unicode terms, which
2998 * include all Unicode code points between the end points.
2999 * Convert to Unicode to get the Unicode range. Later we
3000 * will convert each code point in the range back to
3002 range_min = NATIVE_TO_UNI(range_min);
3003 range_max = NATIVE_TO_UNI(range_max);
3007 if (range_min > range_max) {
3009 if (convert_unicode) {
3010 /* Need to convert back to native for meaningful
3011 * messages for this platform */
3012 range_min = UNI_TO_NATIVE(range_min);
3013 range_max = UNI_TO_NATIVE(range_max);
3017 /* Use the characters themselves for the error message if
3018 * ASCII printables; otherwise some visible representation
3020 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3022 "Invalid range \"%c-%c\" in transliteration operator",
3023 (char)range_min, (char)range_max);
3026 else if (convert_unicode) {
3027 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3029 "Invalid range \"\\N{U+%04"UVXf"}-\\N{U+%04"UVXf"}\""
3030 " in transliteration operator",
3031 range_min, range_max);
3035 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3037 "Invalid range \"\\x{%04"UVXf"}-\\x{%04"UVXf"}\""
3038 " in transliteration operator",
3039 range_min, range_max);
3045 /* We try to avoid creating a swash. If the upper end of
3046 * this range is below 256, this range won't force a swash;
3047 * otherwise it does force a swash, and as long as we have
3048 * to have one, we might as well not expand things out.
3049 * But if it's EBCDIC, we may have to look at each
3050 * character below 256 if we have to convert to/from
3054 && (range_min > 255 || ! convert_unicode)
3057 /* Move the high character one byte to the right; then
3058 * insert between it and the range begin, an illegal
3059 * byte which serves to indicate this is a range (using
3060 * a '-' could be ambiguous). */
3062 while (e-- > max_ptr) {
3065 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3069 /* Here, we're going to expand out the range. For EBCDIC
3070 * the range can extend above 255 (not so in ASCII), so
3071 * for EBCDIC, split it into the parts above and below
3074 if (range_max > 255) {
3075 real_range_max = range_max;
3081 /* Here we need to expand out the string to contain each
3082 * character in the range. Grow the output to handle this */
3084 save_offset = min_ptr - SvPVX_const(sv);
3086 /* The base growth is the number of code points in the range */
3087 grow = range_max - range_min + 1;
3090 /* But if the output is UTF-8, some of those characters may
3091 * need two bytes (since the maximum range value here is
3092 * 255, the max bytes per character is two). On ASCII
3093 * platforms, it's not much trouble to get an accurate
3094 * count of what's needed. But on EBCDIC, the ones that
3095 * need 2 bytes are scattered around, so just use a worst
3096 * case value instead of calculating for that platform. */
3100 /* Only those above 127 require 2 bytes. This may be
3101 * everything in the range, or not */
3102 if (range_min > 127) {
3105 else if (range_max > 127) {
3106 grow += range_max - 127;
3111 /* Subtract 3 for the bytes that were already accounted for
3112 * (min, max, and the hyphen) */
3113 d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3);
3116 /* Here, we expand out the range. */
3117 if (convert_unicode) {
3120 /* Recall that the min and max are now in Unicode terms, so
3121 * we have to convert each character to its native
3124 for (i = range_min; i <= range_max; i++) {
3125 append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
3130 for (i = range_min; i <= range_max; i++) {
3131 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3137 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3141 /* Here, no conversions are necessary, which means that the
3142 * first character in the range is already in 'd' and
3143 * valid, so we can skip overwriting it */
3146 for (i = range_min + 1; i <= range_max; i++) {
3147 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3152 for (i = range_min + 1; i <= range_max; i++) {
3159 /* If the original range extended above 255, add in that portion. */
3160 if (real_range_max) {
3161 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3162 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3163 if (real_range_max > 0x101)
3164 *d++ = (char) ILLEGAL_UTF8_BYTE;
3165 if (real_range_max > 0x100)
3166 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3171 /* mark the range as done, and continue */
3175 non_portable_endpoint = 0;
3179 } /* End of is a range */
3180 } /* End of transliteration. Joins main code after these else's */
3181 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3184 while (s1 >= start && *s1-- == '\\')
3187 in_charclass = TRUE;
3190 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3193 while (s1 >= start && *s1-- == '\\')
3196 in_charclass = FALSE;
3199 /* skip for regexp comments /(?#comment)/, except for the last
3200 * char, which will be done separately.
3201 * Stop on (?{..}) and friends */
3203 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
3205 while (s+1 < send && *s != ')')
3208 else if (!PL_lex_casemods
3209 && ( s[2] == '{' /* This should match regcomp.c */
3210 || (s[2] == '?' && s[3] == '{')))
3216 /* likewise skip #-initiated comments in //x patterns */
3220 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3222 while (s+1 < send && *s != '\n')
3226 /* no further processing of single-quoted regex */
3227 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3228 goto default_action;
3230 /* check for embedded arrays
3231 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3233 else if (*s == '@' && s[1]) {
3234 if (UTF ? isIDFIRST_utf8((U8*)s+1) : isWORDCHAR_A(s[1]))
3236 if (strchr(":'{$", s[1]))
3238 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3239 break; /* in regexp, neither @+ nor @- are interpolated */
3242 /* check for embedded scalars. only stop if we're sure it's a
3245 else if (*s == '$') {
3246 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3248 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
3250 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3251 "Possible unintended interpolation of $\\ in regex");
3253 break; /* in regexp, $ might be tail anchor */
3257 /* End of else if chain - OP_TRANS rejoin rest */
3260 if (*s == '\\' && s+1 < send) {
3261 char* e; /* Can be used for ending '}', etc. */
3265 /* warn on \1 - \9 in substitution replacements, but note that \11
3266 * is an octal; and \19 is \1 followed by '9' */
3267 if (PL_lex_inwhat == OP_SUBST
3273 /* diag_listed_as: \%d better written as $%d */
3274 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3279 /* string-change backslash escapes */
3280 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
3284 /* In a pattern, process \N, but skip any other backslash escapes.
3285 * This is because we don't want to translate an escape sequence
3286 * into a meta symbol and have the regex compiler use the meta
3287 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3288 * in spite of this, we do have to process \N here while the proper
3289 * charnames handler is in scope. See bugs #56444 and #62056.
3291 * There is a complication because \N in a pattern may also stand
3292 * for 'match a non-nl', and not mean a charname, in which case its
3293 * processing should be deferred to the regex compiler. To be a
3294 * charname it must be followed immediately by a '{', and not look
3295 * like \N followed by a curly quantifier, i.e., not something like
3296 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3298 else if (PL_lex_inpat
3301 || regcurly(s + 1)))
3304 goto default_action;
3310 if ((isALPHANUMERIC(*s)))
3311 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3312 "Unrecognized escape \\%c passed through",
3314 /* default action is to copy the quoted character */
3315 goto default_action;
3318 /* eg. \132 indicates the octal constant 0132 */
3319 case '0': case '1': case '2': case '3':
3320 case '4': case '5': case '6': case '7':
3322 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
3324 uv = grok_oct(s, &len, &flags, NULL);
3326 if (len < 3 && s < send && isDIGIT(*s)
3327 && ckWARN(WARN_MISC))
3329 Perl_warner(aTHX_ packWARN(WARN_MISC),
3330 "%s", form_short_octal_warning(s, len));
3333 goto NUM_ESCAPE_INSERT;
3335 /* eg. \o{24} indicates the octal constant \024 */
3340 bool valid = grok_bslash_o(&s, &uv, &error,
3341 TRUE, /* Output warning */
3342 FALSE, /* Not strict */
3343 TRUE, /* Output warnings for
3350 goto NUM_ESCAPE_INSERT;
3353 /* eg. \x24 indicates the hex constant 0x24 */
3358 bool valid = grok_bslash_x(&s, &uv, &error,
3359 TRUE, /* Output warning */
3360 FALSE, /* Not strict */
3361 TRUE, /* Output warnings for
3371 /* Insert oct or hex escaped character. */
3373 /* Here uv is the ordinal of the next character being added */
3374 if (UVCHR_IS_INVARIANT(uv)) {
3378 if (!has_utf8 && uv > 255) {
3379 /* Might need to recode whatever we have accumulated so
3380 * far if it contains any chars variant in utf8 or
3383 SvCUR_set(sv, d - SvPVX_const(sv));
3386 /* See Note on sizing above. */
3387 sv_utf8_upgrade_flags_grow(
3389 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
3390 /* Above-latin1 in string
3391 * implies no encoding */
3392 |SV_UTF8_NO_ENCODING,
3393 UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
3394 d = SvPVX(sv) + SvCUR(sv);
3399 /* Usually, there will already be enough room in 'sv'
3400 * since such escapes are likely longer than any UTF-8
3401 * sequence they can end up as. This isn't the case on
3402 * EBCDIC where \x{40000000} contains 12 bytes, and the
3403 * UTF-8 for it contains 14. And, we have to allow for
3404 * a trailing NUL. It probably can't happen on ASCII
3405 * platforms, but be safe */
3406 const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv)
3408 if (UNLIKELY(needed > SvLEN(sv))) {
3409 SvCUR_set(sv, d - SvPVX_const(sv));
3410 d = sv_grow(sv, needed) + SvCUR(sv);
3413 d = (char*)uvchr_to_utf8((U8*)d, uv);
3414 if (PL_lex_inwhat == OP_TRANS
3415 && PL_parser->lex_sub_op)
3417 PL_parser->lex_sub_op->op_private |=
3418 (PL_lex_repl ? OPpTRANS_FROM_UTF
3427 non_portable_endpoint++;
3432 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3433 * named character, like \N{LATIN SMALL LETTER A}, or a named
3434 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3435 * GRAVE} (except y/// can't handle the latter, croaking). For
3436 * convenience all three forms are referred to as "named
3437 * characters" below.
3439 * For patterns, \N also can mean to match a non-newline. Code
3440 * before this 'switch' statement should already have handled
3441 * this situation, and hence this code only has to deal with
3442 * the named character cases.
3444 * For non-patterns, the named characters are converted to
3445 * their string equivalents. In patterns, named characters are
3446 * not converted to their ultimate forms for the same reasons
3447 * that other escapes aren't. Instead, they are converted to
3448 * the \N{U+...} form to get the value from the charnames that
3449 * is in effect right now, while preserving the fact that it
3450 * was a named character, so that the regex compiler knows
3453 * The structure of this section of code (besides checking for
3454 * errors and upgrading to utf8) is:
3455 * If the named character is of the form \N{U+...}, pass it
3456 * through if a pattern; otherwise convert the code point
3458 * Otherwise must be some \N{NAME}: convert to
3459 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3461 * Transliteration is an exception. The conversion to utf8 is
3462 * only done if the code point requires it to be representable.
3464 * Here, 's' points to the 'N'; the test below is guaranteed to
3465 * succeed if we are being called on a pattern, as we already
3466 * know from a test above that the next character is a '{'. A
3467 * non-pattern \N must mean 'named character', which requires
3471 yyerror("Missing braces on \\N{}");
3476 /* If there is no matching '}', it is an error. */
3477 if (! (e = strchr(s, '}'))) {
3478 if (! PL_lex_inpat) {
3479 yyerror("Missing right brace on \\N{}");
3481 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3486 /* Here it looks like a named character */
3488 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3489 s += 2; /* Skip to next char after the 'U+' */
3492 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3493 /* Check the syntax. */
3496 if (!isXDIGIT(*s)) {
3499 "Invalid hexadecimal number in \\N{U+...}"
3507 else if ((*s == '.' || *s == '_')
3513 /* Pass everything through unchanged.
3514 * +1 is for the '}' */
3515 Copy(orig_s, d, e - orig_s + 1, char);
3516 d += e - orig_s + 1;
3518 else { /* Not a pattern: convert the hex to string */
3519 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3520 | PERL_SCAN_SILENT_ILLDIGIT
3521 | PERL_SCAN_DISALLOW_PREFIX;
3523 uv = grok_hex(s, &len, &flags, NULL);
3524 if (len == 0 || (len != (STRLEN)(e - s)))
3527 /* For non-tr///, if the destination is not in utf8,
3528 * unconditionally recode it to be so. This is
3529 * because \N{} implies Unicode semantics, and scalars
3530 * have to be in utf8 to guarantee those semantics.
3531 * tr/// doesn't care about Unicode rules, so no need
3532 * there to upgrade to UTF-8 for small enough code
3534 if (! has_utf8 && ( uv > 0xFF
3535 || PL_lex_inwhat != OP_TRANS))
3537 SvCUR_set(sv, d - SvPVX_const(sv));
3540 /* See Note on sizing above. */
3541 sv_utf8_upgrade_flags_grow(
3543 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3544 OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
3545 d = SvPVX(sv) + SvCUR(sv);
3549 /* Add the (Unicode) code point to the output. */
3550 if (! has_utf8 || OFFUNI_IS_INVARIANT(uv)) {
3551 *d++ = (char) LATIN1_TO_NATIVE(uv);
3554 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3558 else /* Here is \N{NAME} but not \N{U+...}. */
3559 if ((res = get_and_check_backslash_N_name(s, e)))
3562 const char *str = SvPV_const(res, len);
3565 if (! len) { /* The name resolved to an empty string */
3566 Copy("\\N{}", d, 4, char);
3570 /* In order to not lose information for the regex
3571 * compiler, pass the result in the specially made
3572 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3573 * the code points in hex of each character
3574 * returned by charnames */
3576 const char *str_end = str + len;
3577 const STRLEN off = d - SvPVX_const(sv);
3579 if (! SvUTF8(res)) {
3580 /* For the non-UTF-8 case, we can determine the
3581 * exact length needed without having to parse
3582 * through the string. Each character takes up
3583 * 2 hex digits plus either a trailing dot or
3585 const char initial_text[] = "\\N{U+";
3586 const STRLEN initial_len = sizeof(initial_text)
3588 d = off + SvGROW(sv, off
3591 /* +1 for trailing NUL */
3594 + (STRLEN)(send - e));
3595 Copy(initial_text, d, initial_len, char);
3597 while (str < str_end) {
3600 my_snprintf(hex_string,
3604 /* The regex compiler is
3605 * expecting Unicode, not
3607 NATIVE_TO_LATIN1(*str));
3608 PERL_MY_SNPRINTF_POST_GUARD(len,
3609 sizeof(hex_string));
3610 Copy(hex_string, d, 3, char);
3614 d--; /* Below, we will overwrite the final
3615 dot with a right brace */
3618 STRLEN char_length; /* cur char's byte length */
3620 /* and the number of bytes after this is
3621 * translated into hex digits */
3622 STRLEN output_length;
3624 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3625 * for max('U+', '.'); and 1 for NUL */
3626 char hex_string[2 * UTF8_MAXBYTES + 5];
3628 /* Get the first character of the result. */
3629 U32 uv = utf8n_to_uvchr((U8 *) str,
3633 /* Convert first code point to Unicode hex,
3634 * including the boiler plate before it. */
3636 my_snprintf(hex_string, sizeof(hex_string),
3638 (unsigned int) NATIVE_TO_UNI(uv));
3640 /* Make sure there is enough space to hold it */
3641 d = off + SvGROW(sv, off
3643 + (STRLEN)(send - e)
3644 + 2); /* '}' + NUL */
3646 Copy(hex_string, d, output_length, char);
3649 /* For each subsequent character, append dot and
3650 * its Unicode code point in hex */
3651 while ((str += char_length) < str_end) {
3652 const STRLEN off = d - SvPVX_const(sv);
3653 U32 uv = utf8n_to_uvchr((U8 *) str,
3658 my_snprintf(hex_string,
3661 (unsigned int) NATIVE_TO_UNI(uv));
3663 d = off + SvGROW(sv, off
3665 + (STRLEN)(send - e)
3666 + 2); /* '}' + NUL */
3667 Copy(hex_string, d, output_length, char);
3672 *d++ = '}'; /* Done. Add the trailing brace */
3675 else { /* Here, not in a pattern. Convert the name to a
3678 if (PL_lex_inwhat == OP_TRANS) {
3679 str = SvPV_const(res, len);
3680 if (len > ((SvUTF8(res))
3684 yyerror(Perl_form(aTHX_
3685 "%.*s must not be a named sequence"
3686 " in transliteration operator",
3687 /* +1 to include the "}" */
3688 (int) (e + 1 - start), start));
3689 goto end_backslash_N;
3692 else if (! SvUTF8(res)) {
3693 /* Make sure \N{} return is UTF-8. This is because
3694 * \N{} implies Unicode semantics, and scalars have
3695 * to be in utf8 to guarantee those semantics; but
3696 * not needed in tr/// */
3697 sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
3698 str = SvPV_const(res, len);
3701 /* Upgrade destination to be utf8 if this new
3703 if (! has_utf8 && SvUTF8(res)) {
3704 SvCUR_set(sv, d - SvPVX_const(sv));
3707 /* See Note on sizing above. */
3708 sv_utf8_upgrade_flags_grow(sv,
3709 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3710 len + (STRLEN)(send - s) + 1);
3711 d = SvPVX(sv) + SvCUR(sv);
3713 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3715 /* See Note on sizing above. (NOTE: SvCUR() is not
3716 * set correctly here). */
3717 const STRLEN off = d - SvPVX_const(sv);
3718 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3720 Copy(str, d, len, char);
3726 } /* End \N{NAME} */
3730 backslash_N++; /* \N{} is defined to be Unicode */
3732 s = e + 1; /* Point to just after the '}' */
3735 /* \c is a control character */
3739 *d++ = grok_bslash_c(*s++, 1);
3742 yyerror("Missing control char name in \\c");
3745 non_portable_endpoint++;
3749 /* printf-style backslashes, formfeeds, newlines, etc */
3775 } /* end if (backslash) */
3778 /* If we started with encoded form, or already know we want it,
3779 then encode the next character */
3780 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
3783 /* One might think that it is wasted effort in the case of the
3784 * source being utf8 (this_utf8 == TRUE) to take the next character
3785 * in the source, convert it to an unsigned value, and then convert
3786 * it back again. But the source has not been validated here. The
3787 * routine that does the conversion checks for errors like
3790 const UV nextuv = (this_utf8)
3791 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3793 const STRLEN need = UVCHR_SKIP(nextuv);
3795 SvCUR_set(sv, d - SvPVX_const(sv));
3798 /* See Note on sizing above. */
3799 sv_utf8_upgrade_flags_grow(sv,
3800 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3801 need + (STRLEN)(send - s) + 1);
3802 d = SvPVX(sv) + SvCUR(sv);
3804 } else if (need > len) {
3805 /* encoded value larger than old, may need extra space (NOTE:
3806 * SvCUR() is not set correctly here). See Note on sizing
3808 const STRLEN off = d - SvPVX_const(sv);
3809 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
3813 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
3818 } /* while loop to process each character */
3820 /* terminate the string and set up the sv */
3822 SvCUR_set(sv, d - SvPVX_const(sv));
3823 if (SvCUR(sv) >= SvLEN(sv))
3824 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3825 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
3830 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
3831 PL_parser->lex_sub_op->op_private |=
3832 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3836 /* shrink the sv if we allocated more than we used */
3837 if (SvCUR(sv) + 5 < SvLEN(sv)) {
3838 SvPV_shrink_to_cur(sv);
3841 /* return the substring (via pl_yylval) only if we parsed anything */
3844 for (; s2 < s; s2++) {
3846 COPLINE_INC_WITH_HERELINES;
3848 SvREFCNT_inc_simple_void_NN(sv);
3849 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3850 && ! PL_parser->lex_re_reparsing)
3852 const char *const key = PL_lex_inpat ? "qr" : "q";
3853 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3857 if (PL_lex_inwhat == OP_TRANS) {
3860 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3863 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3871 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3874 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
3876 LEAVE_with_name("scan_const");
3881 * Returns TRUE if there's more to the expression (e.g., a subscript),
3884 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3886 * ->[ and ->{ return TRUE
3887 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
3888 * { and [ outside a pattern are always subscripts, so return TRUE
3889 * if we're outside a pattern and it's not { or [, then return FALSE
3890 * if we're in a pattern and the first char is a {
3891 * {4,5} (any digits around the comma) returns FALSE
3892 * if we're in a pattern and the first char is a [
3894 * [SOMETHING] has a funky algorithm to decide whether it's a
3895 * character class or not. It has to deal with things like
3896 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3897 * anything else returns TRUE
3900 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
3903 S_intuit_more(pTHX_ char *s)
3905 PERL_ARGS_ASSERT_INTUIT_MORE;
3907 if (PL_lex_brackets)
3909 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3911 if (*s == '-' && s[1] == '>'
3912 && FEATURE_POSTDEREF_QQ_IS_ENABLED
3913 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
3914 ||(s[2] == '@' && strchr("*[{",s[3])) ))
3916 if (*s != '{' && *s != '[')
3921 /* In a pattern, so maybe we have {n,m}. */
3929 /* On the other hand, maybe we have a character class */
3932 if (*s == ']' || *s == '^')
3935 /* this is terrifying, and it works */
3938 const char * const send = strchr(s,']');
3939 unsigned char un_char, last_un_char;
3940 char tmpbuf[sizeof PL_tokenbuf * 4];
3942 if (!send) /* has to be an expression */
3944 weight = 2; /* let's weigh the evidence */
3948 else if (isDIGIT(*s)) {
3950 if (isDIGIT(s[1]) && s[2] == ']')
3956 Zero(seen,256,char);
3958 for (; s < send; s++) {
3959 last_un_char = un_char;
3960 un_char = (unsigned char)*s;
3965 weight -= seen[un_char] * 10;
3966 if (isWORDCHAR_lazy_if(s+1,UTF)) {
3968 char *tmp = PL_bufend;
3969 PL_bufend = (char*)send;
3970 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
3972 len = (int)strlen(tmpbuf);
3973 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3974 UTF ? SVf_UTF8 : 0, SVt_PV))
3981 && strchr("[#!%*<>()-=",s[1]))
3983 if (/*{*/ strchr("])} =",s[2]))
3992 if (strchr("wds]",s[1]))
3994 else if (seen[(U8)'\''] || seen[(U8)'"'])
3996 else if (strchr("rnftbxcav",s[1]))
3998 else if (isDIGIT(s[1])) {
4000 while (s[1] && isDIGIT(s[1]))
4010 if (strchr("aA01! ",last_un_char))
4012 if (strchr("zZ79~",s[1]))
4014 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4015 weight -= 5; /* cope with negative subscript */
4018 if (!isWORDCHAR(last_un_char)
4019 && !(last_un_char == '$' || last_un_char == '@'
4020 || last_un_char == '&')
4021 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4025 if (keyword(d, s - d, 0))
4028 if (un_char == last_un_char + 1)
4030 weight -= seen[un_char];
4035 if (weight >= 0) /* probably a character class */
4045 * Does all the checking to disambiguate
4047 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4048 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
4050 * First argument is the stuff after the first token, e.g. "bar".
4052 * Not a method if foo is a filehandle.
4053 * Not a method if foo is a subroutine prototyped to take a filehandle.
4054 * Not a method if it's really "Foo $bar"
4055 * Method if it's "foo $bar"
4056 * Not a method if it's really "print foo $bar"
4057 * Method if it's really "foo package::" (interpreted as package->foo)
4058 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4059 * Not a method if bar is a filehandle or package, but is quoted with
4064 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4066 char *s = start + (*start == '$');
4067 char tmpbuf[sizeof PL_tokenbuf];
4070 /* Mustn't actually add anything to a symbol table.
4071 But also don't want to "initialise" any placeholder
4072 constants that might already be there into full
4073 blown PVGVs with attached PVCV. */
4075 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4077 PERL_ARGS_ASSERT_INTUIT_METHOD;
4079 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4081 if (cv && SvPOK(cv)) {
4082 const char *proto = CvPROTO(cv);
4084 while (*proto && (isSPACE(*proto) || *proto == ';'))
4091 if (*start == '$') {
4092 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4093 || isUPPER(*PL_tokenbuf))
4098 return *s == '(' ? FUNCMETH : METHOD;
4101 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
4102 /* start is the beginning of the possible filehandle/object,
4103 * and s is the end of it
4104 * tmpbuf is a copy of it (but with single quotes as double colons)
4107 if (!keyword(tmpbuf, len, 0)) {
4108 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4113 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4114 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4116 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4117 && (!isGV(indirgv) || GvCVu(indirgv)))
4119 /* filehandle or package name makes it a method */
4120 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4122 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4123 return 0; /* no assumptions -- "=>" quotes bareword */
4125 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4126 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4127 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4129 force_next(BAREWORD);
4131 return *s == '(' ? FUNCMETH : METHOD;
4137 /* Encoded script support. filter_add() effectively inserts a
4138 * 'pre-processing' function into the current source input stream.
4139 * Note that the filter function only applies to the current source file
4140 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4142 * The datasv parameter (which may be NULL) can be used to pass
4143 * private data to this instance of the filter. The filter function
4144 * can recover the SV using the FILTER_DATA macro and use it to
4145 * store private buffers and state information.
4147 * The supplied datasv parameter is upgraded to a PVIO type
4148 * and the IoDIRP/IoANY field is used to store the function pointer,
4149 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4150 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4151 * private use must be set using malloc'd pointers.
4155 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4163 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4164 Perl_croak(aTHX_ "Source filters apply only to byte streams");
4166 if (!PL_rsfp_filters)
4167 PL_rsfp_filters = newAV();
4170 SvUPGRADE(datasv, SVt_PVIO);
4171 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
4172 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
4173 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
4174 FPTR2DPTR(void *, IoANY(datasv)),
4175 SvPV_nolen(datasv)));
4176 av_unshift(PL_rsfp_filters, 1);
4177 av_store(PL_rsfp_filters, 0, datasv) ;
4179 !PL_parser->filtered
4180 && PL_parser->lex_flags & LEX_EVALBYTES
4181 && PL_bufptr < PL_bufend
4183 const char *s = PL_bufptr;
4184 while (s < PL_bufend) {
4186 SV *linestr = PL_parser->linestr;
4187 char *buf = SvPVX(linestr);
4188 STRLEN const bufptr_pos = PL_parser->bufptr - buf;
4189 STRLEN const oldbufptr_pos = PL_parser->oldbufptr - buf;
4190 STRLEN const oldoldbufptr_pos=PL_parser->oldoldbufptr-buf;
4191 STRLEN const linestart_pos = PL_parser->linestart - buf;
4192 STRLEN const last_uni_pos =
4193 PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
4194 STRLEN const last_lop_pos =
4195 PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
4196 av_push(PL_rsfp_filters, linestr);
4197 PL_parser->linestr =
4198 newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
4199 buf = SvPVX(PL_parser->linestr);
4200 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
4201 PL_parser->bufptr = buf + bufptr_pos;
4202 PL_parser->oldbufptr = buf + oldbufptr_pos;
4203 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
4204 PL_parser->linestart = buf + linestart_pos;
4205 if (PL_parser->last_uni)
4206 PL_parser->last_uni = buf + last_uni_pos;
4207 if (PL_parser->last_lop)
4208 PL_parser->last_lop = buf + last_lop_pos;
4209 SvLEN(linestr) = SvCUR(linestr);
4210 SvCUR(linestr) = s-SvPVX(linestr);
4211 PL_parser->filtered = 1;
4221 /* Delete most recently added instance of this filter function. */
4223 Perl_filter_del(pTHX_ filter_t funcp)
4227 PERL_ARGS_ASSERT_FILTER_DEL;
4230 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
4231 FPTR2DPTR(void*, funcp)));
4233 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
4235 /* if filter is on top of stack (usual case) just pop it off */
4236 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
4237 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
4238 sv_free(av_pop(PL_rsfp_filters));
4242 /* we need to search for the correct entry and clear it */
4243 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
4247 /* Invoke the idxth filter function for the current rsfp. */
4248 /* maxlen 0 = read one text line */
4250 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
4254 /* This API is bad. It should have been using unsigned int for maxlen.
4255 Not sure if we want to change the API, but if not we should sanity
4256 check the value here. */
4257 unsigned int correct_length = maxlen < 0 ? PERL_INT_MAX : maxlen;
4259 PERL_ARGS_ASSERT_FILTER_READ;
4261 if (!PL_parser || !PL_rsfp_filters)
4263 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
4264 /* Provide a default input filter to make life easy. */
4265 /* Note that we append to the line. This is handy. */
4266 DEBUG_P(PerlIO_printf(Perl_debug_log,
4267 "filter_read %d: from rsfp\n", idx));
4268 if (correct_length) {
4271 const int old_len = SvCUR(buf_sv);
4273 /* ensure buf_sv is large enough */
4274 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
4275 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
4276 correct_length)) <= 0) {
4277 if (PerlIO_error(PL_rsfp))
4278 return -1; /* error */
4280 return 0 ; /* end of file */
4282 SvCUR_set(buf_sv, old_len + len) ;
4283 SvPVX(buf_sv)[old_len + len] = '\0';
4286 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
4287 if (PerlIO_error(PL_rsfp))
4288 return -1; /* error */
4290 return 0 ; /* end of file */
4293 return SvCUR(buf_sv);
4295 /* Skip this filter slot if filter has been deleted */
4296 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
4297 DEBUG_P(PerlIO_printf(Perl_debug_log,
4298 "filter_read %d: skipped (filter deleted)\n",
4300 return FILTER_READ(idx+1, buf_sv, correct_length); /* recurse */
4302 if (SvTYPE(datasv) != SVt_PVIO) {
4303 if (correct_length) {
4305 const STRLEN remainder = SvLEN(datasv) - SvCUR(datasv);
4306 if (!remainder) return 0; /* eof */
4307 if (correct_length > remainder) correct_length = remainder;
4308 sv_catpvn(buf_sv, SvEND(datasv), correct_length);
4309 SvCUR_set(datasv, SvCUR(datasv) + correct_length);
4312 const char *s = SvEND(datasv);
4313 const char *send = SvPVX(datasv) + SvLEN(datasv);