This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Split code to load _charnames.pm into own fnc
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
78342678 28=for apidoc AmnU|yy_parser *|PL_parser
f0e67a1d
Z
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f1bcae08 41#include "invlist_inline.h"
378cc40b 42
164e423c
KW
43#define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
eb0d8d16 45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
60267e1d 97static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
8903cb82 98
9ded7720 99# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 100
a7aaec61
Z
101#define XENUMMASK 0x3f
102#define XFAKEEOF 0x40
103#define XFAKEBRACK 0x80
9059aa12 104
39e02b42 105#ifdef USE_UTF8_SCRIPTS
b3041197 106# define UTF cBOOL(!IN_BYTES)
2b9d42f0 107#else
b3041197 108# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 109#endif
a0ed51b3 110
b1fc3636
CJ
111/* The maximum number of characters preceding the unrecognized one to display */
112#define UNRECOGNIZED_PRECEDE_COUNT 10
113
61f0cdd9 114/* In variables named $^X, these are the legal values for X.
2b92dfce 115 * 1999-02-27 mjd-perl-patch@plover.com */
4aada8b9 116#define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
2b92dfce 117
14bd96d0 118#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 119
9ff909cf
JH
120#define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
124
ffb4593c
NT
125/* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
79072805 127 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
79072805
LW
131 */
132
fb73857a
PP
133/* #define LEX_NOTPARSING 11 is done in perl.h. */
134
b6007c36
DM
135#define LEX_NORMAL 10 /* normal code (ie not within "...") */
136#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139#define LEX_INTERPSTART 6 /* expecting the start of a $var */
140
141 /* at end of code, eg "$x" followed by: */
142#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144
145#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147#define LEX_INTERPCONST 2 /* NOT USED */
148#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 149
79072805 150
bbf60fe6 151#ifdef DEBUGGING
27da23d5 152static const char* const lex_state_names[] = {
bbf60fe6
DM
153 "KNOWNEXT",
154 "FORMLINE",
155 "INTERPCONST",
156 "INTERPCONCAT",
157 "INTERPENDMAYBE",
158 "INTERPEND",
159 "INTERPSTART",
160 "INTERPPUSH",
161 "INTERPCASEMOD",
162 "INTERPNORMAL",
163 "NORMAL"
164};
165#endif
166
79072805 167#include "keywords.h"
fe14fcc3 168
ffb4593c
NT
169/* CLINE is a macro that ensures PL_copline has a sane value */
170
57843af0 171#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 172
ffb4593c
NT
173/*
174 * Convenience functions to return different tokens and prime the
9cbb5ea2 175 * lexer for the next token. They all take an argument.
ffb4593c
NT
176 *
177 * TOKEN : generic token (used for '(', DOLSHARP, etc)
178 * OPERATOR : generic operator
179 * AOPERATOR : assignment operator
180 * PREBLOCK : beginning the block after an if, while, foreach, ...
181 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
182 * PREREF : *EXPR where EXPR is not a simple identifier
183 * TERM : expression term
89f35911 184 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
185 * LOOPX : loop exiting command (goto, last, dump, etc)
186 * FTST : file test operator
187 * FUN0 : zero-argument function
7eb971ee 188 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 189 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
190 * BOop : bitwise or or xor
191 * BAop : bitwise and
8823cb89 192 * BCop : bitwise complement
ffb4593c
NT
193 * SHop : shift operator
194 * PWop : power operator
9cbb5ea2 195 * PMop : pattern-matching operator
ffb4593c 196 * Aop : addition-level operator
e4916dd1 197 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
198 * Mop : multiplication-level operator
199 * Eop : equality-testing operator
e5edeb50 200 * Rop : relational operator <= != gt
ffb4593c
NT
201 *
202 * Also see LOP and lop() below.
203 */
204
998054bd 205#ifdef DEBUGGING /* Serve -DT. */
704d4215 206# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 207#else
bbf60fe6 208# define REPORT(retval) (retval)
998054bd
SC
209#endif
210
bbf60fe6
DM
211#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
212#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 213#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
214#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
215#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
216#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
217#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 218#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 219#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 220 pl_yylval.ival=f, \
a49203fd 221 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 222 REPORT((int)LOOPEX))
6154021b
RGS
223#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
224#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 225#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 226#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
227#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
228#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
229#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 REPORT('~')
b1764551
FC
231#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
232#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 233#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 234#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 235#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 236#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
6154021b
RGS
237#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
238#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 239
a687059c
LW
240/* This bit of chicanery makes a unary function followed by
241 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
242 * The UNIDOR macro is for unary functions that can be followed by the //
243 * operator (such as C<shift // 0>).
a687059c 244 */
d68ce4ac 245#define UNI3(f,x,have_x) { \
6154021b 246 pl_yylval.ival = f; \
d68ce4ac 247 if (have_x) PL_expect = x; \
376fcdbf
AL
248 PL_bufptr = s; \
249 PL_last_uni = PL_oldbufptr; \
0af40c75 250 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
251 if (*s == '(') \
252 return REPORT( (int)FUNC1 ); \
294a536f 253 s = skipspace(s); \
376fcdbf
AL
254 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 }
d68ce4ac
FC
256#define UNI(f) UNI3(f,XTERM,1)
257#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
258#define UNIPROTO(f,optional) { \
259 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
260 OPERATOR(f); \
261 }
a687059c 262
d68ce4ac 263#define UNIBRACK(f) UNI3(f,0,0)
79072805 264
9f68db38 265/* grandfather return to old style */
78cdf107
Z
266#define OLDLOP(f) \
267 do { \
268 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
269 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
270 pl_yylval.ival = (f); \
271 PL_expect = XTERM; \
272 PL_bufptr = s; \
273 return (int)LSTOP; \
274 } while(0)
79072805 275
83944c01
FC
276#define COPLINE_INC_WITH_HERELINES \
277 STMT_START { \
278 CopLINE_inc(PL_curcop); \
851b527a
FC
279 if (PL_parser->herelines) \
280 CopLINE(PL_curcop) += PL_parser->herelines, \
281 PL_parser->herelines = 0; \
83944c01 282 } STMT_END
ffdb8b16
FC
283/* Called after scan_str to update CopLINE(PL_curcop), but only when there
284 * is no sublex_push to follow. */
285#define COPLINE_SET_FROM_MULTI_END \
286 STMT_START { \
287 CopLINE_set(PL_curcop, PL_multi_end); \
288 if (PL_multi_end != PL_multi_start) \
851b527a 289 PL_parser->herelines = 0; \
ffdb8b16 290 } STMT_END
83944c01
FC
291
292
539c839f
AC
293/* A file-local structure for passing around information about subroutines and
294 * related definable words */
295struct code {
296 SV *sv;
297 CV *cv;
298 GV *gv, **gvp;
299 OP *rv2cv_op;
300 PADOFFSET off;
301 bool lex;
302};
303
304static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
305
539c839f 306
8fa7f367
JH
307#ifdef DEBUGGING
308
6154021b 309/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
310enum token_type {
311 TOKENTYPE_NONE,
312 TOKENTYPE_IVAL,
6154021b 313 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 314 TOKENTYPE_PVAL,
aeaef349 315 TOKENTYPE_OPVAL
bbf60fe6
DM
316};
317
6d4a66ac
NC
318static struct debug_tokens {
319 const int token;
320 enum token_type type;
321 const char *name;
322} const debug_tokens[] =
9041c2e3 323{
bbf60fe6
DM
324 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
325 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
326 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
327 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 328 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
329 { ARROW, TOKENTYPE_NONE, "ARROW" },
330 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
331 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
332 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
333 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
334 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 335 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
336 { DO, TOKENTYPE_NONE, "DO" },
337 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
338 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
339 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
340 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
341 { ELSE, TOKENTYPE_NONE, "ELSE" },
342 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
343 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
344 { FOR, TOKENTYPE_IVAL, "FOR" },
345 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
346 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
347 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
348 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
349 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 350 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
351 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
352 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
353 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 354 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
355 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
356 { IF, TOKENTYPE_IVAL, "IF" },
01719201 357 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
358 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
359 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
360 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
361 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
362 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
363 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
364 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
365 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
366 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
367 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
368 { OROP, TOKENTYPE_IVAL, "OROP" },
369 { OROR, TOKENTYPE_NONE, "OROR" },
370 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
371 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
372 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 373 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 374 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
375 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
376 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
377 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
378 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
379 { PREINC, TOKENTYPE_NONE, "PREINC" },
380 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 381 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
382 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
383 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 384 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 385 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 386 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6 387 { SUB, TOKENTYPE_NONE, "SUB" },
69afcc21
TC
388 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
389 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
bbf60fe6
DM
390 { THING, TOKENTYPE_OPVAL, "THING" },
391 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
392 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
393 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
394 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
395 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
396 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 397 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 398 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 399 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 400 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 401 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
402};
403
6154021b 404/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 405
bbf60fe6 406STATIC int
704d4215 407S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 408{
7918f24d
NC
409 PERL_ARGS_ASSERT_TOKEREPORT;
410
bbf60fe6 411 if (DEBUG_T_TEST) {
bd61b366 412 const char *name = NULL;
bbf60fe6 413 enum token_type type = TOKENTYPE_NONE;
f54cb97a 414 const struct debug_tokens *p;
396482e1 415 SV* const report = newSVpvs("<== ");
bbf60fe6 416
f54cb97a 417 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
418 if (p->token == (int)rv) {
419 name = p->name;
420 type = p->type;
421 break;
422 }
423 }
424 if (name)
54667de8 425 Perl_sv_catpv(aTHX_ report, name);
239f83d5 426 else if (isGRAPH(rv))
4ebc7986 427 {
bbf60fe6 428 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
429 if ((char)rv == 'p')
430 sv_catpvs(report, " (pending identifier)");
431 }
bbf60fe6 432 else if (!rv)
396482e1 433 sv_catpvs(report, "EOF");
bbf60fe6 434 else
147e3846 435 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
436 switch (type) {
437 case TOKENTYPE_NONE:
bbf60fe6
DM
438 break;
439 case TOKENTYPE_IVAL:
147e3846 440 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
441 break;
442 case TOKENTYPE_OPNUM:
443 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 444 PL_op_name[lvalp->ival]);
bbf60fe6
DM
445 break;
446 case TOKENTYPE_PVAL:
704d4215 447 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
448 break;
449 case TOKENTYPE_OPVAL:
704d4215 450 if (lvalp->opval) {
401441c0 451 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
452 PL_op_name[lvalp->opval->op_type]);
453 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 454 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 455 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
456 }
457
458 }
401441c0 459 else
396482e1 460 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
461 break;
462 }
b6007c36 463 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
464 };
465 return (int)rv;
998054bd
SC
466}
467
b6007c36
DM
468
469/* print the buffer with suitable escapes */
470
471STATIC void
15f169a1 472S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 473{
396482e1 474 SV* const tmp = newSVpvs("");
7918f24d
NC
475
476 PERL_ARGS_ASSERT_PRINTBUF;
477
7347ee54 478 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 479 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 480 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
481 SvREFCNT_dec(tmp);
482}
483
8fa7f367
JH
484#endif
485
ffb4593c
NT
486/*
487 * S_ao
488 *
f393a21a
FC
489 * This subroutine looks for an '=' next to the operator that has just been
490 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
491 */
492
76e3520e 493STATIC int
cea2e8a9 494S_ao(pTHX_ int toketype)
a0d0e21e 495{
3280af22
NIS
496 if (*PL_bufptr == '=') {
497 PL_bufptr++;
a0d0e21e 498 if (toketype == ANDAND)
6154021b 499 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 500 else if (toketype == OROR)
6154021b 501 pl_yylval.ival = OP_ORASSIGN;
c963b151 502 else if (toketype == DORDOR)
6154021b 503 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
504 toketype = ASSIGNOP;
505 }
b1764551 506 return REPORT(toketype);
a0d0e21e
LW
507}
508
ffb4593c
NT
509/*
510 * S_no_op
511 * When Perl expects an operator and finds something else, no_op
512 * prints the warning. It always prints "<something> found where
513 * operator expected. It prints "Missing semicolon on previous line?"
514 * if the surprise occurs at the start of the line. "do you need to
515 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
516 * where the compiler doesn't know if foo is a method call or a function.
517 * It prints "Missing operator before end of line" if there's nothing
518 * after the missing operator, or "... before <...>" if there is something
519 * after the missing operator.
488bc579
FC
520 *
521 * PL_bufptr is expected to point to the start of the thing that was found,
522 * and s after the next token or partial token.
ffb4593c
NT
523 */
524
76e3520e 525STATIC void
15f169a1 526S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 527{
9d4ba2ae
AL
528 char * const oldbp = PL_bufptr;
529 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 530
7918f24d
NC
531 PERL_ARGS_ASSERT_NO_OP;
532
1189a94a
GS
533 if (!s)
534 s = oldbp;
07c798fb 535 else
1189a94a 536 PL_bufptr = s;
734ab321 537 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
538 if (ckWARN_d(WARN_SYNTAX)) {
539 if (is_first)
540 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
541 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
542 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
543 PL_bufend,
544 UTF))
545 {
f54cb97a 546 const char *t;
fac0f7a3
KW
547 for (t = PL_oldoldbufptr;
548 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
549 t += UTF ? UTF8SKIP(t) : 1)
550 {
c35e046a 551 NOOP;
fac0f7a3 552 }
56da5a46
RGS
553 if (t < PL_bufptr && isSPACE(*t))
554 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 555 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 556 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
557 }
558 else {
559 assert(s >= oldbp);
560 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 561 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 562 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 563 }
07c798fb 564 }
3280af22 565 PL_bufptr = oldbp;
8990e307
LW
566}
567
ffb4593c
NT
568/*
569 * S_missingterm
570 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 571 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
572 * If we're in a delimited string and the delimiter is a control
573 * character, it's reformatted into a two-char sequence like ^C.
574 * This is fatal.
575 */
576
76e3520e 577STATIC void
1b8d3e0e 578S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 579{
e487ff5e 580 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 581 char q;
cb650135
FC
582 bool uni = FALSE;
583 SV *sv;
8990e307 584 if (s) {
f88b23d7 585 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
586 if (nl) {
587 *nl = '\0';
588 len = nl - s;
589 }
7f7f7d08 590 uni = UTF;
8990e307 591 }
cb650135 592 else if (PL_multi_close < 32) {
8990e307 593 *tmpbuf = '^';
585ec06d 594 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
595 tmpbuf[2] = '\0';
596 s = tmpbuf;
1b8d3e0e 597 len = 2;
8990e307
LW
598 }
599 else {
cb650135
FC
600 if (LIKELY(PL_multi_close < 256)) {
601 *tmpbuf = (char)PL_multi_close;
602 tmpbuf[1] = '\0';
1b8d3e0e 603 len = 1;
cb650135
FC
604 }
605 else {
1b8d3e0e
LM
606 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
607 *end = '\0';
608 len = end - tmpbuf;
cb650135 609 uni = TRUE;
cb650135 610 }
8990e307
LW
611 s = tmpbuf;
612 }
9f8d27b7 613 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 614 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
615 if (uni)
616 SvUTF8_on(sv);
1b8d3e0e
LM
617 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
618 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 619}
79072805 620
dd0ac2b9
FC
621#include "feature.h"
622
0d863452 623/*
9cbb5ea2
GS
624 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
625 * utf16-to-utf8-reversed.
ffb4593c
NT
626 */
627
c39cd008
GS
628#ifdef PERL_CR_FILTER
629static void
630strip_return(SV *sv)
631{
eb578fdb
KW
632 const char *s = SvPVX_const(sv);
633 const char * const e = s + SvCUR(sv);
7918f24d
NC
634
635 PERL_ARGS_ASSERT_STRIP_RETURN;
636
c39cd008
GS
637 /* outer loop optimized to do nothing if there are no CR-LFs */
638 while (s < e) {
639 if (*s++ == '\r' && *s == '\n') {
640 /* hit a CR-LF, need to copy the rest */
eb578fdb 641 char *d = s - 1;
c39cd008
GS
642 *d++ = *s++;
643 while (s < e) {
644 if (*s == '\r' && s[1] == '\n')
645 s++;
646 *d++ = *s++;
647 }
648 SvCUR(sv) -= s - d;
649 return;
650 }
651 }
652}
a868473f 653
76e3520e 654STATIC I32
c39cd008 655S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 656{
f54cb97a 657 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
658 if (count > 0 && !maxlen)
659 strip_return(sv);
660 return count;
a868473f
NIS
661}
662#endif
663
ffb4593c 664/*
44170c9a 665=for apidoc lex_start
8eaa0acf
Z
666
667Creates and initialises a new lexer/parser state object, supplying
668a context in which to lex and parse from a new source of Perl code.
669A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 670is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
671will be destroyed and the former value of L</PL_parser> will be restored.
672Nothing else need be done to clean up the parsing context.
673
2d7f6611 674The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 675non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
676A copy of the string is made, so subsequent modification of C<line>
677does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 678from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
679code in C<line> comes first and must consist of complete lines of input,
680and C<rsfp> supplies the remainder of the source.
8eaa0acf 681
2d7f6611 682The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 683used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
684
685=cut
686*/
ffb4593c 687
27fcb6ee 688/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
689 can share filters with the current parser.
690 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
691 caller, hence isn't owned by the parser, so shouldn't be closed on parser
692 destruction. This is used to handle the case of defaulting to reading the
693 script from the standard input because no filename was given on the command
694 line (without getting confused by situation where STDIN has been closed, so
695 the script handle is opened on fd 0) */
27fcb6ee 696
a0d0e21e 697void
8eaa0acf 698Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 699{
6ef55633 700 const char *s = NULL;
5486870f 701 yy_parser *parser, *oparser;
90b58c70 702
60d63348 703 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 704 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
705
706 /* create and initialise a parser */
707
199e78b7 708 Newxz(parser, 1, yy_parser);
5486870f 709 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
710 PL_parser = parser;
711
28ac2b49 712 parser->stack = NULL;
df13534a 713 parser->stack_max1 = NULL;
28ac2b49 714 parser->ps = NULL;
acdf0a21 715
e3abe207
DM
716 /* on scope exit, free this parser and restore any outer one */
717 SAVEPARSER(parser);
7c4baf47 718 parser->saved_curcop = PL_curcop;
e3abe207 719
acdf0a21 720 /* initialise lexer state */
8990e307 721
fb205e7a 722 parser->nexttoke = 0;
ca4cfd28 723 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 724 parser->copline = parser->preambling = NOLINE;
5afb0a62 725 parser->lex_state = LEX_NORMAL;
c2598295 726 parser->expect = XSTATE;
2f9285f8 727 parser->rsfp = rsfp;
b7b52646 728 parser->recheck_utf8_validity = TRUE;
27fcb6ee
FC
729 parser->rsfp_filters =
730 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
731 ? NULL
732 : MUTABLE_AV(SvREFCNT_inc(
733 oparser->rsfp_filters
734 ? oparser->rsfp_filters
735 : (oparser->rsfp_filters = newAV())
736 ));
2f9285f8 737
199e78b7
DM
738 Newx(parser->lex_brackstack, 120, char);
739 Newx(parser->lex_casestack, 12, char);
740 *parser->lex_casestack = '\0';
d794b522 741 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 742
10efb74f 743 if (line) {
0528fd32 744 STRLEN len;
a6909bd2
KW
745 const U8* first_bad_char_loc;
746
10efb74f 747 s = SvPV_const(line, len);
90b58c70 748
0aab20f2
KW
749 if ( SvUTF8(line)
750 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
751 SvCUR(line),
752 &first_bad_char_loc)))
90b58c70
KW
753 {
754 _force_out_malformed_utf8_message(first_bad_char_loc,
755 (U8 *) s + SvCUR(line),
756 0,
757 1 /* 1 means die */ );
758 NOT_REACHED; /* NOTREACHED */
759 }
760
0abcdfa4
FC
761 parser->linestr = flags & LEX_START_COPIED
762 ? SvREFCNT_inc_simple_NN(line)
763 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
764 if (!rsfp)
765 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 766 } else {
bf1b738b 767 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 768 }
218304f9 769
f06b5848
DM
770 parser->oldoldbufptr =
771 parser->oldbufptr =
772 parser->bufptr =
773 parser->linestart = SvPVX(parser->linestr);
774 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
775 parser->last_lop = parser->last_uni = NULL;
b54f893d 776
6d59e610 777 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
778 |LEX_DONT_CLOSE_RSFP));
779 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
780 |LEX_DONT_CLOSE_RSFP));
737c24fc 781
60d63348 782 parser->in_pod = parser->filtered = 0;
79072805 783}
a687059c 784
e3abe207
DM
785
786/* delete a parser object */
787
788void
789Perl_parser_free(pTHX_ const yy_parser *parser)
790{
7918f24d
NC
791 PERL_ARGS_ASSERT_PARSER_FREE;
792
7c4baf47 793 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
794 SvREFCNT_dec(parser->linestr);
795
87606032 796 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 797 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
798 else if (parser->rsfp && (!parser->old_parser
799 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 800 PerlIO_close(parser->rsfp);
5486870f 801 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 802 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 803 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
804
805 Safefree(parser->lex_brackstack);
806 Safefree(parser->lex_casestack);
807 Safefree(parser->lex_shared);
808 PL_parser = parser->old_parser;
809 Safefree(parser);
810}
811
812void
813Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
814{
3ac7ff8f 815 I32 nexttoke = parser->nexttoke;
3ac7ff8f 816 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 817 while (nexttoke--) {
3ac7ff8f
FC
818 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
819 && parser->nextval[nexttoke].opval
820 && parser->nextval[nexttoke].opval->op_slabbed
821 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 822 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
823 parser->nextval[nexttoke].opval = NULL;
824 }
3ce3dcd9 825 }
e3abe207
DM
826}
827
828
ffb4593c 829/*
78342678 830=for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
f0e67a1d
Z
831
832Buffer scalar containing the chunk currently under consideration of the
833text currently being lexed. This is always a plain string scalar (for
834which C<SvPOK> is true). It is not intended to be used as a scalar by
835normal scalar means; instead refer to the buffer directly by the pointer
836variables described below.
837
838The lexer maintains various C<char*> pointers to things in the
839C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
840reallocated, all of these pointers must be updated. Don't attempt to
841do this manually, but rather use L</lex_grow_linestr> if you need to
842reallocate the buffer.
843
844The content of the text chunk in the buffer is commonly exactly one
845complete line of input, up to and including a newline terminator,
846but there are situations where it is otherwise. The octets of the
847buffer may be intended to be interpreted as either UTF-8 or Latin-1.
848The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
849flag on this scalar, which may disagree with it.
850
851For direct examination of the buffer, the variable
852L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
853lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
854of these pointers is usually preferable to examination of the scalar
855through normal scalar means.
856
78342678 857=for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
f0e67a1d
Z
858
859Direct pointer to the end of the chunk of text currently being lexed, the
860end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 861+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
862always located at the end of the buffer, and does not count as part of
863the buffer's contents.
864
78342678 865=for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
f0e67a1d
Z
866
867Points to the current position of lexing inside the lexer buffer.
868Characters around this point may be freely examined, within
869the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
870L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
871interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
872
873Lexing code (whether in the Perl core or not) moves this pointer past
874the characters that it consumes. It is also expected to perform some
875bookkeeping whenever a newline character is consumed. This movement
876can be more conveniently performed by the function L</lex_read_to>,
877which handles newlines appropriately.
878
879Interpretation of the buffer's octets can be abstracted out by
880using the slightly higher-level functions L</lex_peek_unichar> and
881L</lex_read_unichar>.
882
78342678 883=for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
f0e67a1d
Z
884
885Points to the start of the current line inside the lexer buffer.
886This is useful for indicating at which column an error occurred, and
887not much else. This must be updated by any lexing code that consumes
888a newline; the function L</lex_read_to> handles this detail.
889
890=cut
891*/
892
893/*
44170c9a 894=for apidoc lex_bufutf8
f0e67a1d
Z
895
896Indicates whether the octets in the lexer buffer
897(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
898of Unicode characters. If not, they should be interpreted as Latin-1
899characters. This is analogous to the C<SvUTF8> flag for scalars.
900
901In UTF-8 mode, it is not guaranteed that the lexer buffer actually
902contains valid UTF-8. Lexing code must be robust in the face of invalid
903encoding.
904
905The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
906is significant, but not the whole story regarding the input character
907encoding. Normally, when a file is being read, the scalar contains octets
908and its C<SvUTF8> flag is off, but the octets should be interpreted as
909UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
910however, the scalar may have the C<SvUTF8> flag on, and in this case its
911octets should be interpreted as UTF-8 unless the C<use bytes> pragma
912is in effect. This logic may change in the future; use this function
913instead of implementing the logic yourself.
914
915=cut
916*/
917
918bool
919Perl_lex_bufutf8(pTHX)
920{
921 return UTF;
922}
923
924/*
44170c9a 925=for apidoc lex_grow_linestr
f0e67a1d
Z
926
927Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 928at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
929pointer to the reallocated buffer. This is necessary before making
930any direct modification of the buffer that would increase its length.
931L</lex_stuff_pvn> provides a more convenient way to insert text into
932the buffer.
933
934Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
935this function updates all of the lexer's variables that point directly
936into the buffer.
937
938=cut
939*/
940
941char *
942Perl_lex_grow_linestr(pTHX_ STRLEN len)
943{
944 SV *linestr;
945 char *buf;
946 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 947 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
948 bool current;
949
f0e67a1d
Z
950 linestr = PL_parser->linestr;
951 buf = SvPVX(linestr);
952 if (len <= SvLEN(linestr))
953 return buf;
98d5e3ef
DM
954
955 /* Is the lex_shared linestr SV the same as the current linestr SV?
956 * Only in this case does re_eval_start need adjusting, since it
957 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
958 current = ( !PL_parser->lex_shared->ls_linestr
959 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 960
f0e67a1d
Z
961 bufend_pos = PL_parser->bufend - buf;
962 bufptr_pos = PL_parser->bufptr - buf;
963 oldbufptr_pos = PL_parser->oldbufptr - buf;
964 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
965 linestart_pos = PL_parser->linestart - buf;
966 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
967 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 968 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 969 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 970
f0e67a1d 971 buf = sv_grow(linestr, len);
c7641931 972
f0e67a1d
Z
973 PL_parser->bufend = buf + bufend_pos;
974 PL_parser->bufptr = buf + bufptr_pos;
975 PL_parser->oldbufptr = buf + oldbufptr_pos;
976 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
977 PL_parser->linestart = buf + linestart_pos;
978 if (PL_parser->last_uni)
979 PL_parser->last_uni = buf + last_uni_pos;
980 if (PL_parser->last_lop)
981 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 982 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 983 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
984 return buf;
985}
986
987/*
44170c9a 988=for apidoc lex_stuff_pvn
f0e67a1d
Z
989
990Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
991immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
992reallocating the buffer if necessary. This means that lexing code that
993runs later will see the characters as if they had appeared in the input.
994It is not recommended to do this as part of normal parsing, and most
995uses of this facility run the risk of the inserted characters being
996interpreted in an unintended manner.
997
2d7f6611
KW
998The string to be inserted is represented by C<len> octets starting
999at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1000according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1001The characters are recoded for the lexer buffer, according to how the
1002buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1003to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1004function is more convenient.
1005
5af38e47
KW
1006=for apidoc Amnh||LEX_STUFF_UTF8
1007
f0e67a1d
Z
1008=cut
1009*/
1010
1011void
83aa740e 1012Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 1013{
749123ff 1014 dVAR;
f0e67a1d
Z
1015 char *bufptr;
1016 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1017 if (flags & ~(LEX_STUFF_UTF8))
1018 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1019 if (UTF) {
1020 if (flags & LEX_STUFF_UTF8) {
1021 goto plain_copy;
1022 } else {
01ccd497
KW
1023 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1024 (U8 *) pv + len);
1025 const char *p, *e = pv+len;;
f0e67a1d
Z
1026 if (!highhalf)
1027 goto plain_copy;
1028 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1029 bufptr = PL_parser->bufptr;
1030 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1031 SvCUR_set(PL_parser->linestr,
1032 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1033 PL_parser->bufend += len+highhalf;
1034 for (p = pv; p != e; p++) {
27d0a47c 1035 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1036 }
1037 }
1038 } else {
1039 if (flags & LEX_STUFF_UTF8) {
1040 STRLEN highhalf = 0;
83aa740e 1041 const char *p, *e = pv+len;
f0e67a1d
Z
1042 for (p = pv; p != e; p++) {
1043 U8 c = (U8)*p;
54d004e8 1044 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1045 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1046 "non-Latin-1 character into Latin-1 input");
54d004e8 1047 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1048 p++;
1049 highhalf++;
f88c6466 1050 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1051 }
1052 if (!highhalf)
1053 goto plain_copy;
1054 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1055 bufptr = PL_parser->bufptr;
1056 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1057 SvCUR_set(PL_parser->linestr,
1058 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1059 PL_parser->bufend += len-highhalf;
54d004e8
KW
1060 p = pv;
1061 while (p < e) {
1062 if (UTF8_IS_INVARIANT(*p)) {
1063 *bufptr++ = *p;
1064 p++;
f0e67a1d 1065 }
54d004e8
KW
1066 else {
1067 assert(p < e -1 );
a62b247b 1068 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1069 p += 2;
1070 }
f0e67a1d
Z
1071 }
1072 } else {
54d004e8 1073 plain_copy:
f0e67a1d
Z
1074 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1075 bufptr = PL_parser->bufptr;
1076 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1077 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1078 PL_parser->bufend += len;
1079 Copy(pv, bufptr, len, char);
1080 }
1081 }
1082}
1083
1084/*
44170c9a 1085=for apidoc lex_stuff_pv
9dcc53ea
Z
1086
1087Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1088immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1089reallocating the buffer if necessary. This means that lexing code that
1090runs later will see the characters as if they had appeared in the input.
1091It is not recommended to do this as part of normal parsing, and most
1092uses of this facility run the risk of the inserted characters being
1093interpreted in an unintended manner.
1094
2d7f6611 1095The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1096and continuing to the first nul. These octets are interpreted as either
1097UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1098in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1099to how the buffer is currently being interpreted (L</lex_bufutf8>).
1100If it is not convenient to nul-terminate a string to be inserted, the
1101L</lex_stuff_pvn> function is more appropriate.
1102
1103=cut
1104*/
1105
1106void
1107Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1108{
1109 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1110 lex_stuff_pvn(pv, strlen(pv), flags);
1111}
1112
1113/*
44170c9a 1114=for apidoc lex_stuff_sv
f0e67a1d
Z
1115
1116Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1117immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1118reallocating the buffer if necessary. This means that lexing code that
1119runs later will see the characters as if they had appeared in the input.
1120It is not recommended to do this as part of normal parsing, and most
1121uses of this facility run the risk of the inserted characters being
1122interpreted in an unintended manner.
1123
2d7f6611 1124The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1125are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1126being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1127not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1128need to construct a scalar.
1129
1130=cut
1131*/
1132
1133void
1134Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1135{
1136 char *pv;
1137 STRLEN len;
1138 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1139 if (flags)
1140 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1141 pv = SvPV(sv, len);
1142 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1143}
1144
1145/*
44170c9a 1146=for apidoc lex_unstuff
f0e67a1d
Z
1147
1148Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1149C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1150This hides the discarded text from any lexing code that runs later,
1151as if the text had never appeared.
1152
1153This is not the normal way to consume lexed text. For that, use
1154L</lex_read_to>.
1155
1156=cut
1157*/
1158
1159void
1160Perl_lex_unstuff(pTHX_ char *ptr)
1161{
1162 char *buf, *bufend;
1163 STRLEN unstuff_len;
1164 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1165 buf = PL_parser->bufptr;
1166 if (ptr < buf)
1167 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1168 if (ptr == buf)
1169 return;
1170 bufend = PL_parser->bufend;
1171 if (ptr > bufend)
1172 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1173 unstuff_len = ptr - buf;
1174 Move(ptr, buf, bufend+1-ptr, char);
1175 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1176 PL_parser->bufend = bufend - unstuff_len;
1177}
1178
1179/*
44170c9a 1180=for apidoc lex_read_to
f0e67a1d
Z
1181
1182Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1183to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1184performing the correct bookkeeping whenever a newline character is passed.
1185This is the normal way to consume lexed text.
1186
1187Interpretation of the buffer's octets can be abstracted out by
1188using the slightly higher-level functions L</lex_peek_unichar> and
1189L</lex_read_unichar>.
1190
1191=cut
1192*/
1193
1194void
1195Perl_lex_read_to(pTHX_ char *ptr)
1196{
1197 char *s;
1198 PERL_ARGS_ASSERT_LEX_READ_TO;
1199 s = PL_parser->bufptr;
1200 if (ptr < s || ptr > PL_parser->bufend)
1201 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1202 for (; s != ptr; s++)
1203 if (*s == '\n') {
83944c01 1204 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1205 PL_parser->linestart = s+1;
1206 }
1207 PL_parser->bufptr = ptr;
1208}
1209
1210/*
44170c9a 1211=for apidoc lex_discard_to
f0e67a1d
Z
1212
1213Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1214up to C<ptr>. The remaining content of the buffer will be moved, and
1215all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1216be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1217it is not permitted to discard text that has yet to be lexed.
1218
1219Normally it is not necessarily to do this directly, because it suffices to
1220use the implicit discarding behaviour of L</lex_next_chunk> and things
1221based on it. However, if a token stretches across multiple lines,
1f317c95 1222and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1223that purpose, then after completion of the token it would be wise to
1224explicitly discard the now-unneeded earlier lines, to avoid future
1225multi-line tokens growing the buffer without bound.
1226
1227=cut
1228*/
1229
1230void
1231Perl_lex_discard_to(pTHX_ char *ptr)
1232{
1233 char *buf;
1234 STRLEN discard_len;
1235 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1236 buf = SvPVX(PL_parser->linestr);
1237 if (ptr < buf)
1238 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1239 if (ptr == buf)
1240 return;
1241 if (ptr > PL_parser->bufptr)
1242 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1243 discard_len = ptr - buf;
1244 if (PL_parser->oldbufptr < ptr)
1245 PL_parser->oldbufptr = ptr;
1246 if (PL_parser->oldoldbufptr < ptr)
1247 PL_parser->oldoldbufptr = ptr;
1248 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1249 PL_parser->last_uni = NULL;
1250 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1251 PL_parser->last_lop = NULL;
1252 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1253 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1254 PL_parser->bufend -= discard_len;
1255 PL_parser->bufptr -= discard_len;
1256 PL_parser->oldbufptr -= discard_len;
1257 PL_parser->oldoldbufptr -= discard_len;
1258 if (PL_parser->last_uni)
1259 PL_parser->last_uni -= discard_len;
1260 if (PL_parser->last_lop)
1261 PL_parser->last_lop -= discard_len;
1262}
1263
efa571ab
KW
1264void
1265Perl_notify_parser_that_changed_to_utf8(pTHX)
1266{
1267 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1268 * off to on. At compile time, this has the effect of entering a 'use
1269 * utf8' section. This means that any input was not previously checked for
1270 * UTF-8 (because it was off), but now we do need to check it, or our
1271 * assumptions about the input being sane could be wrong, and we could
1272 * segfault. This routine just sets a flag so that the next time we look
1273 * at the input we do the well-formed UTF-8 check. If we aren't in the
1274 * proper phase, there may not be a parser object, but if there is, setting
1275 * the flag is harmless */
1276
1277 if (PL_parser) {
1278 PL_parser->recheck_utf8_validity = TRUE;
1279 }
1280}
1281
f0e67a1d 1282/*
44170c9a 1283=for apidoc lex_next_chunk
f0e67a1d
Z
1284
1285Reads in the next chunk of text to be lexed, appending it to
1286L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1287looked to the end of the current chunk and wants to know more. It is
1288usual, but not necessary, for lexing to have consumed the entirety of
1289the current chunk at this time.
1290
1291If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1292chunk (i.e., the current chunk has been entirely consumed), normally the
1293current chunk will be discarded at the same time that the new chunk is
c5608a1f 1294read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1295will not be discarded. If the current chunk has not been entirely
1296consumed, then it will not be discarded regardless of the flag.
1297
1298Returns true if some new text was added to the buffer, or false if the
1299buffer has reached the end of the input text.
1300
5af38e47
KW
1301=for apidoc Amnh||LEX_KEEP_PREVIOUS
1302
f0e67a1d
Z
1303=cut
1304*/
1305
1306#define LEX_FAKE_EOF 0x80000000
e47d32dc 1307#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1308
1309bool
1310Perl_lex_next_chunk(pTHX_ U32 flags)
1311{
1312 SV *linestr;
1313 char *buf;
1314 STRLEN old_bufend_pos, new_bufend_pos;
1315 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1316 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1317 bool got_some_for_debugger = 0;
f0e67a1d 1318 bool got_some;
6cdc5cd8 1319
112d1284 1320 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1321 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1322 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1323 return FALSE;
f0e67a1d
Z
1324 linestr = PL_parser->linestr;
1325 buf = SvPVX(linestr);
407f8cf2
KW
1326 if (!(flags & LEX_KEEP_PREVIOUS)
1327 && PL_parser->bufptr == PL_parser->bufend)
1328 {
f0e67a1d
Z
1329 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1330 linestart_pos = 0;
1331 if (PL_parser->last_uni != PL_parser->bufend)
1332 PL_parser->last_uni = NULL;
1333 if (PL_parser->last_lop != PL_parser->bufend)
1334 PL_parser->last_lop = NULL;
1335 last_uni_pos = last_lop_pos = 0;
1336 *buf = 0;
2324bdb9 1337 SvCUR_set(linestr, 0);
f0e67a1d
Z
1338 } else {
1339 old_bufend_pos = PL_parser->bufend - buf;
1340 bufptr_pos = PL_parser->bufptr - buf;
1341 oldbufptr_pos = PL_parser->oldbufptr - buf;
1342 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1343 linestart_pos = PL_parser->linestart - buf;
1344 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1345 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1346 }
1347 if (flags & LEX_FAKE_EOF) {
1348 goto eof;
60d63348 1349 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1350 got_some = 0;
1351 } else if (filter_gets(linestr, old_bufend_pos)) {
1352 got_some = 1;
17cc9359 1353 got_some_for_debugger = 1;
112d1284
FC
1354 } else if (flags & LEX_NO_TERM) {
1355 got_some = 0;
f0e67a1d 1356 } else {
580561a3 1357 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1358 SvPVCLEAR(linestr);
f0e67a1d
Z
1359 eof:
1360 /* End of real input. Close filehandle (unless it was STDIN),
1361 * then add implicit termination.
1362 */
87606032 1363 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1364 PerlIO_clearerr(PL_parser->rsfp);
1365 else if (PL_parser->rsfp)
1366 (void)PerlIO_close(PL_parser->rsfp);
1367 PL_parser->rsfp = NULL;
60d63348 1368 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1369 if (!PL_in_eval && PL_minus_p) {
1370 sv_catpvs(linestr,
1371 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1372 PL_minus_n = PL_minus_p = 0;
1373 } else if (!PL_in_eval && PL_minus_n) {
1374 sv_catpvs(linestr, /*{*/";}");
1375 PL_minus_n = 0;
1376 } else
1377 sv_catpvs(linestr, ";");
1378 got_some = 1;
1379 }
1380 buf = SvPVX(linestr);
1381 new_bufend_pos = SvCUR(linestr);
1382 PL_parser->bufend = buf + new_bufend_pos;
1383 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1384
07337b95
KW
1385 if (UTF) {
1386 const U8* first_bad_char_loc;
1387 if (UNLIKELY(! is_utf8_string_loc(
1388 (U8 *) PL_parser->bufptr,
1389 PL_parser->bufend - PL_parser->bufptr,
1390 &first_bad_char_loc)))
1391 {
1392 _force_out_malformed_utf8_message(first_bad_char_loc,
1393 (U8 *) PL_parser->bufend,
1394 0,
1395 1 /* 1 means die */ );
1396 NOT_REACHED; /* NOTREACHED */
1397 }
6cdc5cd8
KW
1398 }
1399
f0e67a1d
Z
1400 PL_parser->oldbufptr = buf + oldbufptr_pos;
1401 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1402 PL_parser->linestart = buf + linestart_pos;
1403 if (PL_parser->last_uni)
1404 PL_parser->last_uni = buf + last_uni_pos;
1405 if (PL_parser->last_lop)
1406 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1407 if (PL_parser->preambling != NOLINE) {
1408 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1409 PL_parser->preambling = NOLINE;
1410 }
407f8cf2
KW
1411 if ( got_some_for_debugger
1412 && PERLDB_LINE_OR_SAVESRC
1413 && PL_curstash != PL_debstash)
1414 {
f0e67a1d
Z
1415 /* debugger active and we're not compiling the debugger code,
1416 * so store the line into the debugger's array of lines
1417 */
1418 update_debugger_info(NULL, buf+old_bufend_pos,
1419 new_bufend_pos-old_bufend_pos);
1420 }
1421 return got_some;
1422}
1423
1424/*
44170c9a 1425=for apidoc lex_peek_unichar
f0e67a1d
Z
1426
1427Looks ahead one (Unicode) character in the text currently being lexed.
1428Returns the codepoint (unsigned integer value) of the next character,
1429or -1 if lexing has reached the end of the input text. To consume the
1430peeked character, use L</lex_read_unichar>.
1431
1432If the next character is in (or extends into) the next chunk of input
1433text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1434discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1435bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1436
1437If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1438is encountered, an exception is generated.
1439
1440=cut
1441*/
1442
1443I32
1444Perl_lex_peek_unichar(pTHX_ U32 flags)
1445{
749123ff 1446 dVAR;
f0e67a1d
Z
1447 char *s, *bufend;
1448 if (flags & ~(LEX_KEEP_PREVIOUS))
1449 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1450 s = PL_parser->bufptr;
1451 bufend = PL_parser->bufend;
1452 if (UTF) {
1453 U8 head;
1454 I32 unichar;
1455 STRLEN len, retlen;
1456 if (s == bufend) {
1457 if (!lex_next_chunk(flags))
1458 return -1;
1459 s = PL_parser->bufptr;
1460 bufend = PL_parser->bufend;
1461 }
1462 head = (U8)*s;
54d004e8 1463 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1464 return head;
54d004e8
KW
1465 if (UTF8_IS_START(head)) {
1466 len = UTF8SKIP(&head);
f0e67a1d
Z
1467 while ((STRLEN)(bufend-s) < len) {
1468 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1469 break;
1470 s = PL_parser->bufptr;
1471 bufend = PL_parser->bufend;
1472 }
1473 }
c80e42f3 1474 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1475 if (retlen == (STRLEN)-1) {
75219bac
KW
1476 _force_out_malformed_utf8_message((U8 *) s,
1477 (U8 *) bufend,
1478 0,
1479 1 /* 1 means die */ );
1480 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1481 }
1482 return unichar;
1483 } else {
1484 if (s == bufend) {
1485 if (!lex_next_chunk(flags))
1486 return -1;
1487 s = PL_parser->bufptr;
1488 }
1489 return (U8)*s;
1490 }
1491}
1492
1493/*
44170c9a 1494=for apidoc lex_read_unichar
f0e67a1d
Z
1495
1496Reads the next (Unicode) character in the text currently being lexed.
1497Returns the codepoint (unsigned integer value) of the character read,
1498and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1499if lexing has reached the end of the input text. To non-destructively
1500examine the next character, use L</lex_peek_unichar> instead.
1501
1502If the next character is in (or extends into) the next chunk of input
1503text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1504discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1505bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1506
1507If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1508is encountered, an exception is generated.
1509
1510=cut
1511*/
1512
1513I32
1514Perl_lex_read_unichar(pTHX_ U32 flags)
1515{
1516 I32 c;
1517 if (flags & ~(LEX_KEEP_PREVIOUS))
1518 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1519 c = lex_peek_unichar(flags);
1520 if (c != -1) {
1521 if (c == '\n')
83944c01 1522 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1523 if (UTF)
1524 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1525 else
1526 ++(PL_parser->bufptr);
f0e67a1d
Z
1527 }
1528 return c;
1529}
1530
1531/*
44170c9a 1532=for apidoc lex_read_space
f0e67a1d
Z
1533
1534Reads optional spaces, in Perl style, in the text currently being
1535lexed. The spaces may include ordinary whitespace characters and
1536Perl-style comments. C<#line> directives are processed if encountered.
1537L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1538at a non-space character (or the end of the input text).
1539
1540If spaces extend into the next chunk of input text, the next chunk will
1541be read in. Normally the current chunk will be discarded at the same
c5608a1f 1542time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1543chunk will not be discarded.
1544
1545=cut
1546*/
1547
21791330 1548#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1549#define LEX_NO_NEXT_CHUNK 0x80000000
1550
f0e67a1d
Z
1551void
1552Perl_lex_read_space(pTHX_ U32 flags)
1553{
1554 char *s, *bufend;
21791330 1555 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1556 bool need_incline = 0;
21791330 1557 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1558 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1559 s = PL_parser->bufptr;
1560 bufend = PL_parser->bufend;
1561 while (1) {
1562 char c = *s;
1563 if (c == '#') {
1564 do {
1565 c = *++s;
1566 } while (!(c == '\n' || (c == 0 && s == bufend)));
1567 } else if (c == '\n') {
1568 s++;
21791330
FC
1569 if (can_incline) {
1570 PL_parser->linestart = s;
1571 if (s == bufend)
1572 need_incline = 1;
1573 else
c6875f94 1574 incline(s, bufend);
21791330 1575 }
f0e67a1d
Z
1576 } else if (isSPACE(c)) {
1577 s++;
1578 } else if (c == 0 && s == bufend) {
1579 bool got_more;
65c68e17 1580 line_t l;
f0998909
Z
1581 if (flags & LEX_NO_NEXT_CHUNK)
1582 break;
f0e67a1d 1583 PL_parser->bufptr = s;
65c68e17 1584 l = CopLINE(PL_curcop);
851b527a 1585 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1586 got_more = lex_next_chunk(flags);
65c68e17 1587 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1588 s = PL_parser->bufptr;
1589 bufend = PL_parser->bufend;
1590 if (!got_more)
1591 break;
21791330 1592 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1593 incline(s, bufend);
f0e67a1d
Z
1594 need_incline = 0;
1595 }
3c47da3c
FC
1596 } else if (!c) {
1597 s++;
f0e67a1d
Z
1598 } else {
1599 break;
1600 }
1601 }
f0e67a1d
Z
1602 PL_parser->bufptr = s;
1603}
1604
1605/*
fe788d6b 1606
44170c9a 1607=for apidoc validate_proto
fe788d6b
PM
1608
1609This function performs syntax checking on a prototype, C<proto>.
1610If C<warn> is true, any illegal characters or mismatched brackets
1611will trigger illegalproto warnings, declaring that they were
1612detected in the prototype for C<name>.
1613
1614The return value is C<true> if this is a valid prototype, and
1615C<false> if it is not, regardless of whether C<warn> was C<true> or
1616C<false>.
1617
1618Note that C<NULL> is a valid C<proto> and will always return C<true>.
1619
1620=cut
1621
1622 */
1623
1624bool
5783dc51 1625Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1626{
1627 STRLEN len, origlen;
11327fa1 1628 char *p;
fe788d6b
PM
1629 bool bad_proto = FALSE;
1630 bool in_brackets = FALSE;
1631 bool after_slash = FALSE;
1632 char greedy_proto = ' ';
1633 bool proto_after_greedy_proto = FALSE;
1634 bool must_be_last = FALSE;
1635 bool underscore = FALSE;
f791a21a 1636 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1637
1638 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1639
1640 if (!proto)
1641 return TRUE;
1642
11327fa1 1643 p = SvPV(proto, len);
fe788d6b
PM
1644 origlen = len;
1645 for (; len--; p++) {
1646 if (!isSPACE(*p)) {
1647 if (must_be_last)
1648 proto_after_greedy_proto = TRUE;
f791a21a 1649 if (underscore) {
4aada8b9 1650 if (!memCHRs(";@%", *p))
f791a21a
PM
1651 bad_proto_after_underscore = TRUE;
1652 underscore = FALSE;
1653 }
4aada8b9 1654 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
fe788d6b
PM
1655 bad_proto = TRUE;
1656 }
1657 else {
fe788d6b
PM
1658 if (*p == '[')
1659 in_brackets = TRUE;
1660 else if (*p == ']')
1661 in_brackets = FALSE;
407f8cf2
KW
1662 else if ((*p == '@' || *p == '%')
1663 && !after_slash
1664 && !in_brackets )
1665 {
fe788d6b
PM
1666 must_be_last = TRUE;
1667 greedy_proto = *p;
1668 }
1669 else if (*p == '_')
f791a21a 1670 underscore = TRUE;
fe788d6b
PM
1671 }
1672 if (*p == '\\')
1673 after_slash = TRUE;
1674 else
1675 after_slash = FALSE;
1676 }
1677 }
1678
1679 if (warn) {
b54d603d 1680 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1681 p -= origlen;
b54d603d
PM
1682 p = SvUTF8(proto)
1683 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1684 origlen, UNI_DISPLAY_ISPRINT)
1685 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1686
cbf83791
FC
1687 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1688 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1689 sv_catpvs(name2, "::");
1690 sv_catsv(name2, (SV *)name);
1691 name = name2;
1692 }
1693
fe788d6b
PM
1694 if (proto_after_greedy_proto)
1695 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1696 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1697 greedy_proto, SVfARG(name), p);
50278ed0
PM
1698 if (in_brackets)
1699 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1700 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1701 SVfARG(name), p);
b54d603d 1702 if (bad_proto)
fe788d6b 1703 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1704 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1705 SVfARG(name), p);
1706 if (bad_proto_after_underscore)
1707 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1708 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1709 SVfARG(name), p);
fe788d6b
PM
1710 }
1711
1712 return (! (proto_after_greedy_proto || bad_proto) );
1713}
1714
1715/*
ffb4593c
NT
1716 * S_incline
1717 * This subroutine has nothing to do with tilting, whether at windmills
1718 * or pinball tables. Its name is short for "increment line". It
57843af0 1719 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1720 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1721 * # line 500 "foo.pm"
1722 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1723 */
1724
76e3520e 1725STATIC void
c6875f94 1726S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1727{
d9095cec
NC
1728 const char *t;
1729 const char *n;
1730 const char *e;
8818d409 1731 line_t line_num;
22ff3130 1732 UV uv;
463ee0b2 1733
7918f24d
NC
1734 PERL_ARGS_ASSERT_INCLINE;
1735
d77eff5d
KW
1736 assert(end >= s);
1737
83944c01 1738 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1739 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1740 && s+1 == PL_bufend && *s == ';') {
1741 /* fake newline in string eval */
1742 CopLINE_dec(PL_curcop);
1743 return;
1744 }
463ee0b2
LW
1745 if (*s++ != '#')
1746 return;
d4c19fe8
AL
1747 while (SPACE_OR_TAB(*s))
1748 s++;
d77eff5d
KW
1749 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1750 s += sizeof("line") - 1;
73659bf1
GS
1751 else
1752 return;
084592ab 1753 if (SPACE_OR_TAB(*s))
73659bf1 1754 s++;
4e553d73 1755 else
73659bf1 1756 return;
d4c19fe8
AL
1757 while (SPACE_OR_TAB(*s))
1758 s++;
463ee0b2
LW
1759 if (!isDIGIT(*s))
1760 return;
d4c19fe8 1761
463ee0b2
LW
1762 n = s;
1763 while (isDIGIT(*s))
1764 s++;
07714eb4 1765 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1766 return;
bf4acbe4 1767 while (SPACE_OR_TAB(*s))
463ee0b2 1768 s++;
85bb8b90 1769 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1770 s++;
73659bf1
GS
1771 e = t + 1;
1772 }
463ee0b2 1773 else {
c35e046a 1774 t = s;
1bb1a3d6 1775 while (*t && !isSPACE(*t))
c35e046a 1776 t++;
73659bf1 1777 e = t;
463ee0b2 1778 }
bf4acbe4 1779 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1780 e++;
1781 if (*e != '\n' && *e != '\0')
1782 return; /* false alarm */
1783
22ff3130
HS
1784 if (!grok_atoUV(n, &uv, &e))
1785 return;
1786 line_num = ((line_t)uv) - 1;
8818d409 1787
f4dd75d9 1788 if (t - s > 0) {
d9095cec 1789 const STRLEN len = t - s;
3df32bda 1790
d36ee5be 1791 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1792 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1793 * to *{"::_<newfilename"} */
44867030
NC
1794 /* However, the long form of evals is only turned on by the
1795 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1796 GV * const cfgv = CopFILEGV(PL_curcop);
1797 if (cfgv) {
38bd7ad8
FC
1798 char smallbuf[128];
1799 STRLEN tmplen2 = len;
44867030 1800 char *tmpbuf2;
449dd039 1801 GV *gv2;
44867030
NC
1802
1803 if (tmplen2 + 2 <= sizeof smallbuf)
1804 tmpbuf2 = smallbuf;
1805 else
1806 Newx(tmpbuf2, tmplen2 + 2, char);
1807
38bd7ad8
FC
1808 tmpbuf2[0] = '_';
1809 tmpbuf2[1] = '<';
44867030
NC
1810
1811 memcpy(tmpbuf2 + 2, s, tmplen2);
1812 tmplen2 += 2;
1813
8a5ee598 1814 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1815 if (!isGV(gv2)) {
8a5ee598 1816 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1817 /* adjust ${"::_<newfilename"} to store the new file name */
1818 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1819 /* The line number may differ. If that is the case,
1820 alias the saved lines that are in the array.
1821 Otherwise alias the whole array. */
1822 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1823 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1824 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1825 }
38bd7ad8
FC
1826 else if (GvAV(cfgv)) {
1827 AV * const av = GvAV(cfgv);
515c395b
TC
1828 const line_t start = CopLINE(PL_curcop)+1;
1829 SSize_t items = AvFILLp(av) - start;
8818d409
FC
1830 if (items > 0) {
1831 AV * const av2 = GvAVn(gv2);
1832 SV **svp = AvARRAY(av) + start;
515c395b
TC
1833 Size_t l = line_num+1;
1834 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1835 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
8818d409
FC
1836 }
1837 }
e5527e4b 1838 }
44867030
NC
1839
1840 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1841 }
e66cf94c 1842 }
05ec9bb3 1843 CopFILE_free(PL_curcop);
449dd039 1844 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1845 }
8818d409 1846 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1847}
1848
80a702cd 1849STATIC void
15f169a1 1850S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1851{
1852 AV *av = CopFILEAVx(PL_curcop);
1853 if (av) {
7f1c3e8c
FC
1854 SV * sv;
1855 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1856 else {
1857 sv = *av_fetch(av, 0, 1);
1858 SvUPGRADE(sv, SVt_PVMG);
1859 }
847cc851 1860 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1861 if (orig_sv)
7f1c3e8c 1862 sv_catsv(sv, orig_sv);
5fa550fb 1863 else
7f1c3e8c
FC
1864 sv_catpvn(sv, buf, len);
1865 if (!SvIOK(sv)) {
1866 (void)SvIOK_on(sv);
1867 SvIV_set(sv, 0);
1868 }
1869 if (PL_parser->preambling == NOLINE)
1870 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1871 }
1872}
1873
ffb4593c 1874/*
8c6b0c7d 1875 * skipspace
ffb4593c
NT
1876 * Called to gobble the appropriate amount and type of whitespace.
1877 * Skips comments as well.
71fff7cb 1878 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1879 *
1880 * peekspace
1881 * Same thing, but look ahead without incrementing line numbers or
1882 * adjusting PL_linestart.
ffb4593c
NT
1883 */
1884
3218e223 1885#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1886#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1887
aabfeadc
KW
1888char *
1889Perl_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1890{
21791330 1891 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1892 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1893 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1894 s++;
f0e67a1d
Z
1895 } else {
1896 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1897 PL_bufptr = s;
21791330 1898 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1899 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1900 LEX_NO_NEXT_CHUNK : 0));
3280af22 1901 s = PL_bufptr;
f0e67a1d
Z
1902 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1903 if (PL_linestart > PL_bufptr)
1904 PL_bufptr = PL_linestart;
1905 return s;
463ee0b2 1906 }
5db06880 1907 return s;
a687059c 1908}
378cc40b 1909
ffb4593c
NT
1910/*
1911 * S_check_uni
1912 * Check the unary operators to ensure there's no ambiguity in how they're
1913 * used. An ambiguous piece of code would be:
1914 * rand + 5
1915 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1916 * the +5 is its argument.
1917 */
1918
76e3520e 1919STATIC void
cea2e8a9 1920S_check_uni(pTHX)
ba106d47 1921{
d4c19fe8 1922 const char *s;
2f3197b3 1923
3280af22 1924 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1925 return;
3280af22
NIS
1926 while (isSPACE(*PL_last_uni))
1927 PL_last_uni++;
c35e046a 1928 s = PL_last_uni;
fac0f7a3 1929 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1930 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1931 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1932 return;
6136c704 1933
9b387841 1934 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1935 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1936 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1937}
1938
ffb4593c
NT
1939/*
1940 * LOP : macro to build a list operator. Its behaviour has been replaced
1941 * with a subroutine, S_lop() for which LOP is just another name.
1942 */
1943
a0d0e21e
LW
1944#define LOP(f,x) return lop(f,x,s)
1945
ffb4593c
NT
1946/*
1947 * S_lop
1948 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1949 * - if we have a next token, then it's a list operator (no parens) for
1950 * which the next token has already been parsed; e.g.,
1951 * sort foo @args
1952 * sort foo (@args)
ffb4593c
NT
1953 * - if the next thing is an opening paren, then it's a function
1954 * - else it's a list operator
1955 */
1956
76e3520e 1957STATIC I32
11288bb3 1958S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1959{
7918f24d
NC
1960 PERL_ARGS_ASSERT_LOP;
1961
6154021b 1962 pl_yylval.ival = f;
35c8bce7 1963 CLINE;
3280af22
NIS
1964 PL_bufptr = s;
1965 PL_last_lop = PL_oldbufptr;
eb160463 1966 PL_last_lop_op = (OPCODE)f;
3280af22 1967 if (PL_nexttoke)
78cdf107 1968 goto lstop;
19f1898a 1969 PL_expect = x;
79072805 1970 if (*s == '(')
bbf60fe6 1971 return REPORT(FUNC);
294a536f 1972 s = skipspace(s);
79072805 1973 if (*s == '(')
bbf60fe6 1974 return REPORT(FUNC);
78cdf107
Z
1975 else {
1976 lstop:
1977 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1978 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1979 return REPORT(LSTOP);
78cdf107 1980 }
79072805
LW
1981}
1982
ffb4593c
NT
1983/*
1984 * S_force_next
9cbb5ea2 1985 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1986 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1987 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1988 * will need to set PL_nextval[] and possibly PL_expect to ensure
1989 * the lexer handles the token correctly.
ffb4593c
NT
1990 */
1991
4e553d73 1992STATIC void
cea2e8a9 1993S_force_next(pTHX_ I32 type)
79072805 1994{
704d4215
GG
1995#ifdef DEBUGGING
1996 if (DEBUG_T_TEST) {
1997 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1998 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1999 }
2000#endif
1f7c3e7c 2001 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2002 PL_nexttype[PL_nexttoke] = type;
2003 PL_nexttoke++;
79072805
LW
2004}
2005
89f35911
FC
2006/*
2007 * S_postderef
2008 *
2009 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2010 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2011 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2012 * only the first, leaving yylex to find the next.
89f35911
FC
2013 */
2014
2015static int
ff25e5db 2016S_postderef(pTHX_ int const funny, char const next)
89f35911 2017{
4aada8b9 2018 assert(funny == DOLSHARP || memCHRs("$@%&*", funny));
89f35911
FC
2019 if (next == '*') {
2020 PL_expect = XOPERATOR;
cc624add 2021 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 2022 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 2023 PL_lex_state = LEX_INTERPEND;
c3492809
FC
2024 if ('@' == funny)
2025 force_next(POSTJOIN);
cc624add 2026 }
89f35911
FC
2027 force_next(next);
2028 PL_bufptr+=2;
2029 }
2030 else {
760ca746
FC
2031 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2032 && !PL_lex_brackets)
2033 PL_lex_dojoin = 2;
89f35911
FC
2034 PL_expect = XOPERATOR;
2035 PL_bufptr++;
2036 }
2037 return funny;
2038}
2039
28ac2b49
Z
2040void
2041Perl_yyunlex(pTHX)
2042{
a7aaec61
Z
2043 int yyc = PL_parser->yychar;
2044 if (yyc != YYEMPTY) {
2045 if (yyc) {
a7aaec61
Z
2046 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2047 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2048 PL_lex_allbrackets--;
a7aaec61 2049 PL_lex_brackets--;
78cdf107
Z
2050 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2051 } else if (yyc == '('/*)*/) {
2052 PL_lex_allbrackets--;
2053 yyc |= (2<<24);
a7aaec61
Z
2054 }
2055 force_next(yyc);
2056 }
28ac2b49
Z
2057 PL_parser->yychar = YYEMPTY;
2058 }
2059}
2060
d0a148a6 2061STATIC SV *
15f169a1 2062S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2063{
740cce10 2064 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2065 ! IN_BYTES
2066 && UTF
53463649 2067 && len != 0
6d492fac 2068 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2069 return sv;
2070}
2071
ffb4593c
NT
2072/*
2073 * S_force_word
2074 * When the lexer knows the next thing is a word (for instance, it has
2075 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2076 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2077 * lookahead.
ffb4593c
NT
2078 *
2079 * Arguments:
b1b65b59 2080 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2081 * int token : PL_next* will be this type of bare word
2082 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2083 * int check_keyword : if true, Perl checks to make sure the word isn't
2084 * a keyword (do this if the word is a label, e.g. goto FOO)
2085 * int allow_pack : if true, : characters will also be allowed (require,
2086 * use, etc. do this)
ffb4593c
NT
2087 */
2088
76e3520e 2089STATIC char *
345b3785 2090S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2091{
eb578fdb 2092 char *s;
463ee0b2 2093 STRLEN len;
4e553d73 2094
7918f24d
NC
2095 PERL_ARGS_ASSERT_FORCE_WORD;
2096
294a536f 2097 start = skipspace(start);
463ee0b2 2098 s = start;
fac0f7a3 2099 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2100 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2101 {
3280af22 2102 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2103 if (check_keyword) {
2104 char *s2 = PL_tokenbuf;
487e470d 2105 STRLEN len2 = len;
de627158
KW
2106 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2107 s2 += sizeof("CORE::") - 1;
2108 len2 -= sizeof("CORE::") - 1;
2109 }
487e470d 2110 if (keyword(s2, len2, 0))
463ee0b2 2111 return start;
01b5ef50 2112 }
463ee0b2 2113 if (token == METHOD) {
294a536f 2114 s = skipspace(s);
463ee0b2 2115 if (*s == '(')
3280af22 2116 PL_expect = XTERM;
463ee0b2 2117 else {
3280af22 2118 PL_expect = XOPERATOR;
463ee0b2 2119 }
79072805 2120 }
9ded7720 2121 NEXTVAL_NEXTTOKE.opval
275103cd 2122 = newSVOP(OP_CONST,0,
d0a148a6 2123 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2124 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2125 force_next(token);
2126 }
2127 return s;
2128}
2129
ffb4593c
NT
2130/*
2131 * S_force_ident
9cbb5ea2 2132 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2133 * text only contains the "foo" portion. The first argument is a pointer
2134 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2135 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2136 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2137 */
2138
76e3520e 2139STATIC void
5aaab254 2140S_force_ident(pTHX_ const char *s, int kind)
79072805 2141{
7918f24d
NC
2142 PERL_ARGS_ASSERT_FORCE_IDENT;
2143
c9b48522
DD
2144 if (s[0]) {
2145 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2146 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2147 UTF ? SVf_UTF8 : 0));
9ded7720 2148 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2149 force_next(BAREWORD);
748a9306 2150 if (kind) {
11343788 2151 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2152 /* XXX see note in pp_entereval() for why we forgo typo
2153 warnings if the symbol must be introduced in an eval.
2154 GSAR 96-10-12 */
90e5519e 2155 gv_fetchpvn_flags(s, len,
4bff32c5 2156 (PL_in_eval ? GV_ADDMULTI
728847b1 2157 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2158 kind == '$' ? SVt_PV :
2159 kind == '@' ? SVt_PVAV :
2160 kind == '%' ? SVt_PVHV :
a0d0e21e 2161 SVt_PVGV
90e5519e 2162 );
748a9306 2163 }
79072805
LW
2164 }
2165}
2166
3f33d153
FC
2167static void
2168S_force_ident_maybe_lex(pTHX_ char pit)
2169{
3f33d153
FC
2170 NEXTVAL_NEXTTOKE.ival = pit;
2171 force_next('p');
2172}
2173
1571675a
GS
2174NV
2175Perl_str_to_version(pTHX_ SV *sv)
2176{
2177 NV retval = 0.0;
2178 NV nshift = 1.0;
2179 STRLEN len;
cfd0369c 2180 const char *start = SvPV_const(sv,len);
9d4ba2ae 2181 const char * const end = start + len;
8298454c 2182 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2183
2184 PERL_ARGS_ASSERT_STR_TO_VERSION;
2185
1571675a 2186 while (start < end) {
ba210ebe 2187 STRLEN skip;
1571675a
GS
2188 UV n;
2189 if (utf)
9041c2e3 2190 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2191 else {
2192 n = *(U8*)start;
2193 skip = 1;
2194 }
2195 retval += ((NV)n)/nshift;
2196 start += skip;
2197 nshift *= 1000;
2198 }
2199 return retval;
2200}
2201
4e553d73 2202/*
ffb4593c
NT
2203 * S_force_version
2204 * Forces the next token to be a version number.
e759cc13
RGS
2205 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2206 * and if "guessing" is TRUE, then no new token is created (and the caller
2207 * must use an alternative parsing method).
ffb4593c
NT
2208 */
2209
76e3520e 2210STATIC char *
e759cc13 2211S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2212{
5f66b61c 2213 OP *version = NULL;
44dcb63b 2214 char *d;
89bfa8cd 2215
7918f24d
NC
2216 PERL_ARGS_ASSERT_FORCE_VERSION;
2217
294a536f 2218 s = skipspace(s);
89bfa8cd 2219
44dcb63b 2220 d = s;
dd629d5b 2221 if (*d == 'v')
44dcb63b 2222 d++;
44dcb63b 2223 if (isDIGIT(*d)) {
e759cc13
RGS
2224 while (isDIGIT(*d) || *d == '_' || *d == '.')
2225 d++;
4e4da3ac 2226 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2227 SV *ver;
6154021b
RGS
2228 s = scan_num(s, &pl_yylval);
2229 version = pl_yylval.opval;
dd629d5b
GS
2230 ver = cSVOPx(version)->op_sv;
2231 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2232 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2233 SvNV_set(ver, str_to_version(ver));
1571675a 2234 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2235 }
89bfa8cd 2236 }
5db06880 2237 else if (guessing) {
e759cc13 2238 return s;
5db06880 2239 }
89bfa8cd
PP
2240 }
2241
2242 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2243 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2244 force_next(BAREWORD);
89bfa8cd 2245
e759cc13 2246 return s;
89bfa8cd
PP
2247}
2248
ffb4593c 2249/*
91152fc1
DG
2250 * S_force_strict_version
2251 * Forces the next token to be a version number using strict syntax rules.
2252 */
2253
2254STATIC char *
2255S_force_strict_version(pTHX_ char *s)
2256{
91152fc1 2257 OP *version = NULL;
91152fc1
DG
2258 const char *errstr = NULL;
2259
2260 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2261
2262 while (isSPACE(*s)) /* leading whitespace */
2263 s++;
2264
2265 if (is_STRICT_VERSION(s,&errstr)) {
2266 SV *ver = newSV(0);
2267 s = (char *)scan_version(s, ver, 0);
2268 version = newSVOP(OP_CONST, 0, ver);
2269 }
407f8cf2
KW
2270 else if ((*s != ';' && *s != '{' && *s != '}' )
2271 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2272 {
91152fc1
DG
2273 PL_bufptr = s;
2274 if (errstr)
2275 yyerror(errstr); /* version required */
2276 return s;
2277 }
2278
91152fc1 2279 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2280 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2281 force_next(BAREWORD);
91152fc1
DG
2282
2283 return s;
2284}
2285
2286/*
ffb4593c 2287 * S_tokeq
ef3ff34d
KW
2288 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2289 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2290 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2291 */
2292
76e3520e 2293STATIC SV *
cea2e8a9 2294S_tokeq(pTHX_ SV *sv)
79072805 2295{
eb578fdb
KW
2296 char *s;
2297 char *send;
2298 char *d;
b3ac6de7 2299 SV *pv = sv;
79072805 2300
7918f24d
NC
2301 PERL_ARGS_ASSERT_TOKEQ;
2302
279b35ad
FC
2303 assert (SvPOK(sv));
2304 assert (SvLEN(sv));
2305 assert (!SvIsCOW(sv));
307ed071 2306 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2307 goto finish;
279b35ad
FC
2308 s = SvPVX(sv);
2309 send = SvEND(sv);
dcb21ed6
NC
2310 /* This is relying on the SV being "well formed" with a trailing '\0' */
2311 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2312 s++;
2313 if (s == send)
b3ac6de7 2314 goto finish;
79072805 2315 d = s;
be4731d2 2316 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2317 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2318 SVs_TEMP | SvUTF8(sv));
be4731d2 2319 }
79072805
LW
2320 while (s < send) {
2321 if (*s == '\\') {
a0d0e21e 2322 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2323 s++; /* all that, just for this */
2324 }
2325 *d++ = *s++;
2326 }
2327 *d = '\0';
95a20fc0 2328 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2329 finish:
3280af22 2330 if ( PL_hints & HINT_NEW_STRING )
164e423c 2331 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
79072805
LW
2332 return sv;
2333}
2334
ffb4593c
NT
2335/*
2336 * Now come three functions related to double-quote context,
2337 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2338 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2339 * interact with PL_lex_state, and create fake ( ... ) argument lists
2340 * to handle functions and concatenation.
ecd24171
DM
2341 * For example,
2342 * "foo\lbar"
2343 * is tokenised as
2344 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2345 */
2346
2347/*
2348 * S_sublex_start
6154021b 2349 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2350 *
2351 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2352 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2353 *
16d1d8bd 2354 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2355 *
2356 * Everything else becomes a FUNC.
2357 *
16d1d8bd
LM
2358 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2359 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2360 * call to S_sublex_push().
2361 */
2362
76e3520e 2363STATIC I32
cea2e8a9 2364S_sublex_start(pTHX)
79072805 2365{
eb578fdb 2366 const I32 op_type = pl_yylval.ival;
79072805
LW
2367
2368 if (op_type == OP_NULL) {
6154021b 2369 pl_yylval.opval = PL_lex_op;
5f66b61c 2370 PL_lex_op = NULL;
79072805
LW
2371 return THING;
2372 }
466112bb 2373 if (op_type == OP_CONST) {
67c71cbb
FC
2374 SV *sv = PL_lex_stuff;
2375 PL_lex_stuff = NULL;
2376 sv = tokeq(sv);
b3ac6de7
IZ
2377
2378 if (SvTYPE(sv) == SVt_PVIV) {
2379 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2380 STRLEN len;
96a5add6 2381 const char * const p = SvPV_const(sv, len);
740cce10 2382 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2383 SvREFCNT_dec(sv);
2384 sv = nsv;
4e553d73 2385 }
275103cd 2386 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2387 return THING;
2388 }
2389
7ef70b3d
FC
2390 PL_parser->lex_super_state = PL_lex_state;
2391 PL_parser->lex_sub_inwhat = (U16)op_type;
2392 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2393 PL_parser->sub_no_recover = FALSE;
2394 PL_parser->sub_error_count = PL_error_count;
3280af22 2395 PL_lex_state = LEX_INTERPPUSH;
55497cff 2396
3280af22
NIS
2397 PL_expect = XTERM;
2398 if (PL_lex_op) {
6154021b 2399 pl_yylval.opval = PL_lex_op;
5f66b61c 2400 PL_lex_op = NULL;
55497cff
PP
2401 return PMFUNC;
2402 }
2403 else
2404 return FUNC;
2405}
2406
ffb4593c
NT
2407/*
2408 * S_sublex_push
2409 * Create a new scope to save the lexing state. The scope will be
2410 * ended in S_sublex_done. Returns a '(', starting the function arguments
2411 * to the uc, lc, etc. found before.
2412 * Sets PL_lex_state to LEX_INTERPCONCAT.
2413 */
2414
76e3520e 2415STATIC I32
cea2e8a9 2416S_sublex_push(pTHX)
55497cff 2417{
78a635de 2418 LEXSHARED *shared;
801d32ac 2419 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2420 ENTER;
55497cff 2421
7ef70b3d 2422 PL_lex_state = PL_parser->lex_super_state;
cc624add 2423 SAVEI8(PL_lex_dojoin);
3280af22 2424 SAVEI32(PL_lex_brackets);
78cdf107 2425 SAVEI32(PL_lex_allbrackets);
b27dce25 2426 SAVEI32(PL_lex_formbrack);
78cdf107 2427 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2428 SAVEI32(PL_lex_casemods);
2429 SAVEI32(PL_lex_starts);
651b5b28 2430 SAVEI8(PL_lex_state);
7cc34111 2431 SAVESPTR(PL_lex_repl);
7766f137 2432 SAVEVPTR(PL_lex_inpat);
98246f1e 2433 SAVEI16(PL_lex_inwhat);
ffdb8b16 2434 if (is_heredoc)
b42366d4 2435 {
ffdb8b16 2436 SAVECOPLINE(PL_curcop);
b42366d4 2437 SAVEI32(PL_multi_end);
851b527a
FC
2438 SAVEI32(PL_parser->herelines);
2439 PL_parser->herelines = 0;
b42366d4 2440 }
2ca4363d 2441 SAVEIV(PL_multi_close);
3280af22 2442 SAVEPPTR(PL_bufptr);
8452ff4b 2443 SAVEPPTR(PL_bufend);
3280af22
NIS
2444 SAVEPPTR(PL_oldbufptr);
2445 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2446 SAVEPPTR(PL_last_lop);
2447 SAVEPPTR(PL_last_uni);
3280af22
NIS
2448 SAVEPPTR(PL_linestart);
2449 SAVESPTR(PL_linestr);
8edd5f42
RGS
2450 SAVEGENERICPV(PL_lex_brackstack);
2451 SAVEGENERICPV(PL_lex_casestack);
78a635de 2452 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2453 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2454 SAVEI32(PL_copline);
3280af22 2455
99bd9d90 2456 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2457 scopes to find the body of the here-doc. So we put PL_linestr and
2458 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2459 */
60f40a38
FC
2460 PL_parser->lex_shared->ls_linestr = PL_linestr;
2461 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2462
3280af22 2463 PL_linestr = PL_lex_stuff;
7ef70b3d 2464 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2465 PL_lex_stuff = NULL;
7ef70b3d 2466 PL_parser->lex_sub_repl = NULL;
3280af22 2467
eabab8bc
FC
2468 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2469 set for an inner quote-like operator and then an error causes scope-
2470 popping. We must not have a PL_lex_stuff value left dangling, as
2471 that breaks assumptions elsewhere. See bug #123617. */
2472 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2473 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2474
9cbb5ea2
GS
2475 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2476 = SvPVX(PL_linestr);
3280af22 2477 PL_bufend += SvCUR(PL_linestr);
bd61b366 2478 PL_last_lop = PL_last_uni = NULL;
3280af22 2479 SAVEFREESV(PL_linestr);
4dc843bc 2480 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2481
2482 PL_lex_dojoin = FALSE;
b27dce25 2483 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2484 PL_lex_allbrackets = 0;
2485 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2486 Newx(PL_lex_brackstack, 120, char);
2487 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2488 PL_lex_casemods = 0;
2489 *PL_lex_casestack = '\0';
2490 PL_lex_starts = 0;
2491 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2492 if (is_heredoc)
6ddcf93b 2493 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2494 PL_copline = NOLINE;
caae0700 2495
78a635de
FC
2496 Newxz(shared, 1, LEXSHARED);
2497 shared->ls_prev = PL_parser->lex_shared;
2498 PL_parser->lex_shared = shared;
3280af22 2499
7ef70b3d 2500 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2501 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2502 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2503 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2504 else
5f66b61c 2505 PL_lex_inpat = NULL;
79072805 2506
3a54fd60
DM
2507 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2508 PL_in_eval &= ~EVAL_RE_REPARSING;
2509
69afcc21 2510 return SUBLEXSTART;
79072805
LW
2511}
2512
ffb4593c
NT
2513/*
2514 * S_sublex_done
2515 * Restores lexer state after a S_sublex_push.
2516 */
2517
76e3520e 2518STATIC I32
cea2e8a9 2519S_sublex_done(pTHX)
79072805 2520{
3280af22 2521 if (!PL_lex_starts++) {
396482e1 2522 SV * const sv = newSVpvs("");
9aa983d2
JH
2523 if (SvUTF8(PL_linestr))
2524 SvUTF8_on(sv);
3280af22 2525 PL_expect = XOPERATOR;
275103cd 2526 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2527 return THING;
2528 }
2529
3280af22
NIS
2530 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2531 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2532 return yylex();
79072805
LW
2533 }
2534
ffb4593c 2535 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2536 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2537 if (PL_lex_repl) {
2538 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2539 PL_linestr = PL_lex_repl;
2540 PL_lex_inpat = 0;
2541 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2542 PL_bufend += SvCUR(PL_linestr);
bd61b366 2543 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2544 PL_lex_dojoin = FALSE;
2545 PL_lex_brackets = 0;
78cdf107
Z
2546 PL_lex_allbrackets = 0;
2547 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2548 PL_lex_casemods = 0;
2549 *PL_lex_casestack = '\0';
2550 PL_lex_starts = 0;
25da4f38 2551 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2552 PL_lex_state = LEX_INTERPNORMAL;
2553 PL_lex_starts++;
e9fa98b2
HS
2554 /* we don't clear PL_lex_repl here, so that we can check later
2555 whether this is an evalled subst; that means we rely on the
2556 logic to ensure sublex_done() is called again only via the
2557 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2558 }
e9fa98b2 2559 else {
3280af22 2560 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2561 PL_lex_repl = NULL;
e9fa98b2 2562 }
ffdb8b16
FC
2563 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2564 CopLINE(PL_curcop) +=
9420b268 2565 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2566 + PL_parser->herelines;
2567 PL_parser->herelines = 0;
ffdb8b16 2568 }
9b6b7be8 2569 return '/';
ffed7fef
LW
2570 }
2571 else {
b42366d4 2572 const line_t l = CopLINE(PL_curcop);
f46d017c 2573 LEAVE;
bb4e4c38 2574 if (PL_parser->sub_error_count != PL_error_count) {
bb4e4c38 2575 if (PL_parser->sub_no_recover) {
ad1ecdf7 2576 yyquit();
bb4e4c38
TC
2577 NOT_REACHED;
2578 }
2579 }
b42366d4 2580 if (PL_multi_close == '<')
851b527a 2581 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2582 PL_bufend = SvPVX(PL_linestr);
2583 PL_bufend += SvCUR(PL_linestr);
2584 PL_expect = XOPERATOR;
69afcc21 2585 return SUBLEXEND;
ffed7fef
LW
2586 }
2587}
2588
4e8ee35f
KW
2589HV *
2590Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2591 const STRLEN context_len, const char ** error_msg)
2592{
2593 /* Load the official _charnames module if not already there. The
2594 * parameters are just to give info for any error messages generated:
2595 * char_name a name to look up which is the reason for loading this
2596 * context 'char_name' in the context in the input in which it appears
2597 * context_len how many bytes 'context' occupies
2598 * error_msg *error_msg will be set to any error
2599 *
2600 * Returns the ^H table if success; otherwise NULL */
2601
2602 unsigned int i;
2603 HV * table;
2604 SV **cvp;
2605 SV * res;
2606
2607 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2608
2609 /* This loop is executed 1 1/2 times. On the first time through, if it
2610 * isn't already loaded, try loading it, and iterate just once to see if it
2611 * worked. */
2612 for (i = 0; i < 2; i++) {
2613 table = GvHV(PL_hintgv); /* ^H */
2614
2615 if ( table
2616 && (PL_hints & HINT_LOCALIZE_HH)
2617 && (cvp = hv_fetchs(table, "charnames", FALSE))
2618 && SvOK(*cvp))
2619 {
2620 return table; /* Quit if already loaded */
2621 }
2622
2623 if (i == 0) {
2624 Perl_load_module(aTHX_
2625 0,
2626 newSVpvs("_charnames"),
2627
2628 /* version parameter; no need to specify it, as if we get too early
2629 * a version, will fail anyway, not being able to find 'charnames'
2630 * */
2631 NULL,
2632 newSVpvs(":full"),
2633 newSVpvs(":short"),
2634 NULL);
2635 }
2636 }
2637
2638 /* Here, it failed; new_constant will give appropriate error messages */
2639 *error_msg = NULL;
2640 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2641 context, context_len, error_msg);
2642 SvREFCNT_dec(res);
2643
2644 return NULL;
2645}
2646
7a7d14f3 2647STATIC SV*
2c43c309
KW
2648S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2649{
2650 /* This justs wraps get_and_check_backslash_N_name() to output any error
2651 * message it returns. */
2652
2653 const char * error_msg = NULL;
25c7fb78 2654 SV * result;
2c43c309
KW
2655
2656 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2657
25c7fb78
KW
2658 /* charnames doesn't work well if there have been errors found */
2659 if (PL_error_count > 0) {
2660 return NULL;
2661 }
2662
2663 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2664
2c43c309
KW
2665 if (error_msg) {
2666 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2667 }
2668
2669 return result;
2670}
2671
25c7fb78
KW
2672SV*
2673Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2674 const char* const e,
2675 const bool is_utf8,
2676 const char ** error_msg)
6f613c73 2677{
140b12ad
KW
2678 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2679 * interior, hence to the "}". Finds what the name resolves to, returning
25c7fb78
KW
2680 * an SV* containing it; NULL if no valid one found.
2681 *
2682 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2683 * doesn't have to be. */
6f613c73 2684
7303cc1f 2685 SV* char_name;
25c7fb78 2686 SV* res;
0c415a79
KW
2687 HV * table;
2688 SV **cvp;
2689 SV *cv;
2690 SV *rv;
2691 HV *stash;
8b0cce63
KW
2692
2693 /* Points to the beginning of the \N{... so that any messages include the
2694 * context of what's failing*/
2695 const char* context = s - 3;
2696 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2697
25c7fb78 2698 dVAR;
0c415a79 2699
6f613c73
KW
2700 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2701
25c7fb78
KW
2702 assert(e >= s);
2703 assert(s > (char *) 3);
2704
7303cc1f 2705 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
25c7fb78 2706
7303cc1f
KW
2707 if (!SvCUR(char_name)) {
2708 SvREFCNT_dec_NN(char_name);
be332ba0 2709 /* diag_listed_as: Unknown charname '%s' */
2c43c309 2710 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
be332ba0 2711 return NULL;
d8d26cac 2712 }
b7e6151c 2713
7303cc1f 2714 /* Autoload the charnames module */
7303cc1f 2715
4e8ee35f
KW
2716 table = load_charnames(char_name, context, context_len, error_msg);
2717 if (table == NULL) {
2718 return NULL;
7303cc1f
KW
2719 }
2720
2721 *error_msg = NULL;
8b0cce63
KW
2722 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2723 context, context_len, error_msg);
7303cc1f 2724 if (*error_msg) {
4e8ee35f 2725 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
7303cc1f
KW
2726
2727 SvREFCNT_dec(res);
6f613c73
KW
2728 return NULL;
2729 }
2730
0c415a79
KW
2731 /* See if the charnames handler is the Perl core's, and if so, we can skip
2732 * the validation needed for a user-supplied one, as Perl's does its own
2733 * validation. */
0c415a79 2734 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2735 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2736 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2737 {
2738 const char * const name = HvNAME(stash);
b59bf0b2 2739 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2740 return res;
2741 }
2742 }
2743
bde9e88d
KW
2744 /* Here, it isn't Perl's charname handler. We can't rely on a
2745 * user-supplied handler to validate the input name. For non-ut8 input,
2746 * look to see that the first character is legal. Then loop through the
2747 * rest checking that each is a continuation */
6f613c73 2748
36897d64
KW
2749 /* This code makes the reasonable assumption that the only Latin1-range
2750 * characters that begin a character name alias are alphabetic, otherwise
2751 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2752
25c7fb78 2753 if (! is_utf8) {
bde9e88d 2754 if (! isALPHAU(*s)) {
b6ba1137
KW
2755 goto bad_charname;
2756 }
bde9e88d
KW
2757 s++;
2758 while (s < e) {
2759 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2760 goto bad_charname;
2761 }
2d8eb851
KW
2762 if (*s == ' ' && *(s-1) == ' ') {
2763 goto multi_spaces;
bd299e29 2764 }
bde9e88d 2765 s++;
b6ba1137
KW
2766 }
2767 }
2768 else {
bde9e88d 2769 /* Similarly for utf8. For invariants can check directly; for other
58aa6738
KW
2770 * Latin1, can calculate their code point and check; otherwise use an
2771 * inversion list */
bde9e88d
KW
2772 if (UTF8_IS_INVARIANT(*s)) {
2773 if (! isALPHAU(*s)) {
140b12ad
KW
2774 goto bad_charname;
2775 }
bde9e88d
KW
2776 s++;
2777 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2778 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2779 goto bad_charname;
6f613c73 2780 }
bde9e88d 2781 s += 2;
6f613c73 2782 }
bde9e88d 2783 else {
f1bcae08
KW
2784 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2785 utf8_to_uvchr_buf((U8 *) s,
2786 (U8 *) e,
2787 NULL)))
2788 {
bde9e88d
KW
2789 goto bad_charname;
2790 }
2791 s += UTF8SKIP(s);
2792 }
2793
2794 while (s < e) {
2795 if (UTF8_IS_INVARIANT(*s)) {
2796 if (! isCHARNAME_CONT(*s)) {
2797 goto bad_charname;
2798 }
2d8eb851
KW
2799 if (*s == ' ' && *(s-1) == ' ') {
2800 goto multi_spaces;
bd299e29 2801 }
bde9e88d
KW
2802 s++;
2803 }
2804 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2805 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2806 {
2807 goto bad_charname;
2808 }
2809 s += 2;
2810 }
2811 else {
f1bcae08
KW
2812 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2813 utf8_to_uvchr_buf((U8 *) s,
2814 (U8 *) e,
2815 NULL)))
2816 {
bde9e88d
KW
2817 goto bad_charname;
2818 }
2819 s += UTF8SKIP(s);
6f613c73
KW
2820 }
2821 }
2d8eb851
KW
2822 }
2823 if (*(s-1) == ' ') {
8d9d0498
FC
2824 /* diag_listed_as: charnames alias definitions may not contain
2825 trailing white-space; marked by <-- HERE in %s
2826 */
2c43c309 2827 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2828 "charnames alias definitions may not contain trailing "
2829 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2830 (int)(s - context + 1), context,
2c43c309 2831 (int)(e - s + 1), s + 1);
2d8eb851 2832 return NULL;
6f613c73
KW
2833 }
2834
870fe793 2835 if (SvUTF8(res)) { /* Don't accept malformed charname value */
bde9e88d
KW
2836 const U8* first_bad_char_loc;
2837 STRLEN len;
2838 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2839 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2840 &first_bad_char_loc)))
2841 {
75219bac
KW
2842 _force_out_malformed_utf8_message(first_bad_char_loc,
2843 (U8 *) PL_parser->bufend,
2844 0,
2845 0 /* 0 means don't die */ );
8d9d0498
FC
2846 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2847 immediately after '%s' */
2c43c309 2848 *error_msg = Perl_form(aTHX_
bde9e88d 2849 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
8b0cce63 2850 (int) context_len, context,
2c43c309 2851 (int) ((char *) first_bad_char_loc - str), str);
bde9e88d
KW
2852 return NULL;
2853 }
2854 }
140b12ad 2855
bde9e88d 2856 return res;
140b12ad 2857
bde9e88d 2858 bad_charname: {
bde9e88d
KW
2859
2860 /* The final %.*s makes sure that should the trailing NUL be missing
2861 * that this print won't run off the end of the string */
8d9d0498
FC
2862 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2863 in \N{%s} */
2c43c309 2864 *error_msg = Perl_form(aTHX_
bde9e88d 2865 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2866 (int)(s - context + 1), context,
2c43c309 2867 (int)(e - s + 1), s + 1);
bde9e88d
KW
2868 return NULL;
2869 }
2d8eb851
KW
2870
2871 multi_spaces:
8d9d0498
FC
2872 /* diag_listed_as: charnames alias definitions may not contain a
2873 sequence of multiple spaces; marked by <-- HERE
2874 in %s */
2c43c309 2875 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2876 "charnames alias definitions may not contain a sequence of "
2877 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2878 (int)(s - context + 1), context,
2c43c309 2879 (int)(e - s + 1), s + 1);
2d8eb851 2880 return NULL;
6f613c73
KW
2881}
2882
02aa26ce
NT
2883/*
2884 scan_const
2885
9da1dd8f
DM
2886 Extracts the next constant part of a pattern, double-quoted string,
2887 or transliteration. This is terrifying code.
2888
2889 For example, in parsing the double-quoted string "ab\x63$d", it would
2890 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2891
94def140 2892 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2893 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2894 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2895
94def140
ST
2896 Returns a pointer to the character scanned up to. If this is
2897 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2898 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2899 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2900 by looking at the next characters herself.
2901
02aa26ce 2902 In patterns:
9da1dd8f 2903 expand:
537124e4
KW
2904 \N{FOO} => \N{U+hex_for_character_FOO}
2905 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2906
2907 pass through:
2908 all other \-char, including \N and \N{ apart from \N{ABC}
2909
2910 stops on:
2911 @ and $ where it appears to be a var, but not for $ as tail anchor
2912 \l \L \u \U \Q \E
2913 (?{ or (??{
2914
02aa26ce
NT
2915 In transliterations:
2916 characters are VERY literal, except for - not at the start or end
66b09263
KW
2917 of the string, which indicates a range. However some backslash sequences
2918 are recognized: \r, \n, and the like
2919 \007 \o{}, \x{}, \N{}
2920 If all elements in the transliteration are below 256,
94def140
ST
2921 scan_const expands the range to the full set of intermediate
2922 characters. If the range is in utf8, the hyphen is replaced with
2923 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2924
2925 In double-quoted strings:
2926 backslashes:
66b09263 2927 all those recognized in transliterations
94def140 2928 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2929 case and quoting: \U \Q \E
2930 stops on @ and $
2931
2932 scan_const does *not* construct ops to handle interpolated strings.
2933 It stops processing as soon as it finds an embedded $ or @ variable
2934 and leaves it to the caller to work out what's going on.
2935
94def140
ST
2936 embedded arrays (whether in pattern or not) could be:
2937 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2938
2939 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2940
2941 $ in pattern could be $foo or could be tail anchor. Assumption:
2942 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2943 followed by one of "()| \r\n\t"
02aa26ce 2944
9da1dd8f 2945 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2946
2947 The structure of the code is
2948 while (there's a character to process) {
94def140
ST
2949 handle transliteration ranges
2950 skip regexp comments /(?#comment)/ and codes /(?{code})/
2951 skip #-initiated comments in //x patterns
2952 check for embedded arrays
02aa26ce
NT
2953 check for embedded scalars
2954 if (backslash) {
94def140 2955 deprecate \1 in substitution replacements
02aa26ce
NT
2956 handle string-changing backslashes \l \U \Q \E, etc.
2957 switch (what was escaped) {
94def140 2958 handle \- in a transliteration (becomes a literal -)
ff3f963a 2959 if a pattern and not \N{, go treat as regular character
94def140
ST
2960 handle \132 (octal characters)
2961 handle \x15 and \x{1234} (hex characters)
ff3f963a 2962 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2963 handle \cV (control characters)
2964 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2965 } (end switch)
77a135fe 2966 continue
02aa26ce 2967 } (end if backslash)
77a135fe 2968 handle regular character
02aa26ce 2969 } (end while character to read)
02cd137d 2970
02aa26ce
NT
2971*/
2972
76e3520e 2973STATIC char *
cea2e8a9 2974S_scan_const(pTHX_ char *start)
79072805 2975{
eb578fdb 2976 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2977 SV *sv = newSV(send - start); /* sv for the constant. See note below
2978 on sizing. */
eb578fdb
KW
2979 char *s = start; /* start of the constant */
2980 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2981 bool dorange = FALSE; /* are we in a translit range? */
2982 bool didrange = FALSE; /* did we just finish a range? */
2983 bool in_charclass = FALSE; /* within /[...]/ */
ed2893cb 2984 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
dc023dbb
KW
2985 UTF8? But, this can show as true
2986 when the source isn't utf8, as for
2987 example when it is entirely composed
2988 of hex constants */
f34acfec 2989 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
af9be36c
KW
2990 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2991 number of characters found so far
2992 that will expand (into 2 bytes)
2993 should we have to convert to
2994 UTF-8) */
6f613c73 2995 SV *res; /* result from charnames */
c89db733
JH
2996 STRLEN offset_to_max = 0; /* The offset in the output to where the range
2997 high-end character is temporarily placed */
77a135fe 2998
fe2ba0a2
KW
2999 /* Does something require special handling in tr/// ? This avoids extra
3000 * work in a less likely case. As such, khw didn't feel it was worth
3001 * adding any branches to the more mainline code to handle this, which
3002 * means that this doesn't get set in some circumstances when things like
3003 * \x{100} get expanded out. As a result there needs to be extra testing
3004 * done in the tr code */
3005 bool has_above_latin1 = FALSE;
3006
77a135fe
KW
3007 /* Note on sizing: The scanned constant is placed into sv, which is
3008 * initialized by newSV() assuming one byte of output for every byte of
3009 * input. This routine expects newSV() to allocate an extra byte for a
3010 * trailing NUL, which this routine will append if it gets to the end of
3011 * the input. There may be more bytes of input than output (eg., \N{LATIN
3012 * CAPITAL LETTER A}), or more output than input if the constant ends up
3013 * recoded to utf8, but each time a construct is found that might increase
3014 * the needed size, SvGROW() is called. Its size parameter each time is
3015 * based on the best guess estimate at the time, namely the length used so
3016 * far, plus the length the current construct will occupy, plus room for
caae0700 3017 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 3018
c3320c2a
KW
3019 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3020 before set */
4c3a8340 3021#ifdef EBCDIC
f4240379
KW
3022 int backslash_N = 0; /* ? was the character from \N{} */
3023 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3024 platform-specific like \x65 */
4c3a8340 3025#endif
012bcf8d 3026
7918f24d
NC
3027 PERL_ARGS_ASSERT_SCAN_CONST;
3028
bb16bae8 3029 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0 3030
b899e89d
FC
3031 /* Protect sv from errors and fatal warnings. */
3032 ENTER_with_name("scan_const");
3033 SAVEFREESV(sv);
2b9d42f0 3034
1759517a
KW
3035 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3036 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3037 * valid */
3038 assert(*send == '\0');
3039
f4240379
KW
3040 while (s < send
3041 || dorange /* Handle tr/// range at right edge of input */
3042 ) {
ff3f963a 3043
02aa26ce 3044 /* get transliterations out of the way (they're most literal) */
3280af22 3045 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3046
f4240379
KW
3047 /* But there isn't any special handling necessary unless there is a
3048 * range, so for most cases we just drop down and handle the value
3049 * as any other. There are two exceptions.
3050 *
02cd137d
KW
3051 * 1. A hyphen indicates that we are actually going to have a
3052 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
3053 * down to handle what should be the end range value.
3054 * 2. After we've handled that value, the next time through, that
3055 * flag is set and we fix up the range.
3056 *
3057 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
3058 * order to make the transliteration a simple table look-up.
3059 * Ranges that extend above Latin1 have to be done differently, so
3060 * there is no advantage to expanding them here, so they are
dc8faf6b
KW
3061 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3062 * a byte that can't occur in legal UTF-8, and hence can signify a
3063 * hyphen without any possible ambiguity. On EBCDIC machines, if
3064 * the range is expressed as Unicode, the Latin1 portion is
3065 * expanded out even if the range extends above Latin1. This is
3066 * because each code point in it has to be processed here
3067 * individually to get its native translation */
f4240379
KW
3068
3069 if (! dorange) {
3070
02cd137d
KW
3071 /* Here, we don't think we're in a range. If the new character
3072 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
3073 * either edge to indicate a range, or if we haven't output any
3074 * characters yet then it's a regular character. */
04863ba1
KW
3075 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3076 {
f4240379
KW
3077
3078 /* A regular character. Process like any other, but first
3079 * clear any flags */
3080 didrange = FALSE;
3081 dorange = FALSE;
e294cc5d 3082#ifdef EBCDIC
f4240379
KW
3083 non_portable_endpoint = 0;
3084 backslash_N = 0;
e294cc5d 3085#endif
02cd137d
KW
3086 /* The tests here for being above Latin1 and similar ones
3087 * in the following 'else' suffice to find all such
3088 * occurences in the constant, except those added by a
fe2ba0a2 3089 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3090 * set 'has_above_latin1' as appropriate */
ed2893cb 3091 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3092 has_above_latin1 = TRUE;
3093 }
3094
f4240379
KW
3095 /* Drops down to generic code to process current byte */
3096 }
02cd137d 3097 else { /* Is a '-' in the context where it means a range */
f4240379 3098 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3099 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3100 " operator");
f4240379 3101 }
e294cc5d 3102
f4240379 3103 dorange = TRUE;
2b9d42f0 3104
02cd137d 3105 s++; /* Skip past the hyphen */
f4240379
KW
3106
3107 /* d now points to where the end-range character will be
d7f7b0e3
KW
3108 * placed. Drop down to get that character. We'll finish
3109 * processing the range the next time through the loop */
188d22cf 3110
ed2893cb 3111 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3112 has_above_latin1 = TRUE;
3113 }
02cd137d
KW
3114
3115 /* Drops down to generic code to process current byte */
f4240379
KW
3116 }
3117 } /* End of not a range */
3118 else {
3119 /* Here we have parsed a range. Now must handle it. At this
3120 * point:
3121 * 'sv' is a SV* that contains the output string we are
3122 * constructing. The final two characters in that string
3123 * are the range start and range end, in order.
3124 * 'd' points to just beyond the range end in the 'sv' string,
3125 * where we would next place something
f4240379 3126 */
0c311b7c 3127 char * max_ptr;
5ca37d54 3128 char * min_ptr;
f4240379
KW
3129 IV range_min;
3130 IV range_max; /* last character in range */
f4240379 3131 STRLEN grow;
5ca37d54
KW
3132 Size_t offset_to_min = 0;
3133 Size_t extras = 0;
11327fa1 3134#ifdef EBCDIC
f4240379
KW
3135 bool convert_unicode;
3136 IV real_range_max = 0;
e294cc5d 3137#endif
02cd137d 3138 /* Get the code point values of the range ends. */
0c311b7c
KW
3139 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3140 offset_to_max = max_ptr - SvPVX_const(sv);
ed2893cb 3141 if (d_is_utf8) {
f4240379
KW
3142 /* We know the utf8 is valid, because we just constructed
3143 * it ourselves in previous loop iterations */
3144 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3145 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3146 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3147
3148 /* This compensates for not all code setting
3149 * 'has_above_latin1', so that we don't skip stuff that
3150 * should be executed */
3151 if (range_max > 255) {
3152 has_above_latin1 = TRUE;
3153 }
e294cc5d 3154 }
f4240379
KW
3155 else {
3156 min_ptr = max_ptr - 1;
3157 range_min = * (U8*) min_ptr;
3158 range_max = * (U8*) max_ptr;
3159 }
3160
8efef67c
KW
3161 /* If the range is just a single code point, like tr/a-a/.../,
3162 * that code point is already in the output, twice. We can
3163 * just back up over the second instance and avoid all the rest
3164 * of the work. But if it is a variant character, it's been
218304f9
KW
3165 * counted twice, so decrement. (This unlikely scenario is
3166 * special cased, like the one for a range of 2 code points
3167 * below, only because the main-line code below needs a range
3168 * of 3 or more to work without special casing. Might as well
3169 * get it out of the way now.) */
8efef67c
KW
3170 if (UNLIKELY(range_max == range_min)) {
3171 d = max_ptr;
ed2893cb 3172 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
8efef67c
KW
3173 utf8_variant_count--;
3174 }
3175 goto range_done;
3176 }
3177
e294cc5d 3178#ifdef EBCDIC
f4240379
KW
3179 /* On EBCDIC platforms, we may have to deal with portable
3180 * ranges. These happen if at least one range endpoint is a
3181 * Unicode value (\N{...}), or if the range is a subset of
3182 * [A-Z] or [a-z], and both ends are literal characters,
3183 * like 'A', and not like \x{C1} */
87dd6ea7 3184 convert_unicode =
02cd137d
KW
3185 cBOOL(backslash_N) /* \N{} forces Unicode,
3186 hence portable range */
3187 || ( ! non_portable_endpoint
3188 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3189 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3190 if (convert_unicode) {
f4240379
KW
3191
3192 /* Special handling is needed for these portable ranges.
02cd137d
KW
3193 * They are defined to be in Unicode terms, which includes
3194 * all the Unicode code points between the end points.
f4240379
KW
3195 * Convert to Unicode to get the Unicode range. Later we
3196 * will convert each code point in the range back to
3197 * native. */
3198 range_min = NATIVE_TO_UNI(range_min);
3199 range_max = NATIVE_TO_UNI(range_max);
3200 }
e294cc5d 3201#endif
8ada0baa 3202
f4240379 3203 if (range_min > range_max) {
11327fa1 3204#ifdef EBCDIC
f4240379
KW
3205 if (convert_unicode) {
3206 /* Need to convert back to native for meaningful
3207 * messages for this platform */
3208 range_min = UNI_TO_NATIVE(range_min);
3209 range_max = UNI_TO_NATIVE(range_max);
3210 }
11327fa1 3211#endif
f4240379
KW
3212 /* Use the characters themselves for the error message if
3213 * ASCII printables; otherwise some visible representation
3214 * of them */
3215 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3216 Perl_croak(aTHX_
3217 "Invalid range \"%c-%c\" in transliteration operator",
3218 (char)range_min, (char)range_max);
3219 }
11327fa1 3220#ifdef EBCDIC
f4240379 3221 else if (convert_unicode) {
02cd137d 3222 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3223 Perl_croak(aTHX_
02cd137d
KW
3224 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3225 UVXf "}\" in transliteration operator",
3226 range_min, range_max);
f4240379 3227 }
11327fa1 3228#endif
f4240379 3229 else {
02cd137d 3230 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3231 Perl_croak(aTHX_
02cd137d
KW
3232 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3233 " in transliteration operator",
3234 range_min, range_max);
f4240379 3235 }
c2e66d9e
GS
3236 }
3237
aca41667
KW
3238 /* If the range is exactly two code points long, they are
3239 * already both in the output */
3240 if (UNLIKELY(range_min + 1 == range_max)) {
3241 goto range_done;
3242 }
3243
3244 /* Here the range contains at least 3 code points */
3245
ed2893cb 3246 if (d_is_utf8) {
f4240379 3247
188d22cf
KW
3248 /* If everything in the transliteration is below 256, we
3249 * can avoid special handling later. A translation table
02cd137d
KW
3250 * for each of those bytes is created by op.c. So we
3251 * expand out all ranges to their constituent code points.
3252 * But if we've encountered something above 255, the
3253 * expanding won't help, so skip doing that. But if it's
3254 * EBCDIC, we may have to look at each character below 256
3255 * if we have to convert to/from Unicode values */
188d22cf 3256 if ( has_above_latin1
c7f1f016 3257#ifdef EBCDIC
f4240379 3258 && (range_min > 255 || ! convert_unicode)
8ada0baa 3259#endif
f4240379 3260 ) {
3fdfceb3
KW
3261 const STRLEN off = d - SvPVX(sv);
3262 const STRLEN extra = 1 + (send - s) + 1;
3263 char *e;
3264
f4240379
KW
3265 /* Move the high character one byte to the right; then
3266 * insert between it and the range begin, an illegal
3267 * byte which serves to indicate this is a range (using
02cd137d 3268 * a '-' would be ambiguous). */
3fdfceb3
KW
3269
3270 if (off + extra > SvLEN(sv)) {
3271 d = off + SvGROW(sv, off + extra);
3272 max_ptr = d - off + offset_to_max;
3273 }
3274
3275 e = d++;
f4240379
KW
3276 while (e-- > max_ptr) {
3277 *(e + 1) = *e;
e294cc5d 3278 }
dc8faf6b 3279 *(e + 1) = (char) RANGE_INDICATOR;
f4240379
KW
3280 goto range_done;
3281 }
3282
3283 /* Here, we're going to expand out the range. For EBCDIC
3284 * the range can extend above 255 (not so in ASCII), so
3285 * for EBCDIC, split it into the parts above and below
3286 * 255/256 */
e294cc5d 3287#ifdef EBCDIC
f4240379
KW
3288 if (range_max > 255) {
3289 real_range_max = range_max;
3290 range_max = 255;
3291 }
e294cc5d 3292#endif
f4240379 3293 }
02aa26ce 3294
f4240379 3295 /* Here we need to expand out the string to contain each
5ca37d54
KW
3296 * character in the range. Grow the output to handle this.
3297 * For non-UTF8, we need a byte for each code point in the
3298 * range, minus the three that we've already allocated for: the
3299 * hyphen, the min, and the max. For UTF-8, we need this
3300 * plus an extra byte for each code point that occupies two
3301 * bytes (is variant) when in UTF-8 (except we've already
3302 * allocated for the end points, including if they are
3303 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3304 * platforms, it's easy to calculate a precise number. To
3305 * start, we count the variants in the range, which we need
3306 * elsewhere in this function anyway. (For the case where it
3307 * isn't easy to calculate, 'extras' has been initialized to 0,
3308 * and the calculation is done in a loop further down.) */
3309#ifdef EBCDIC
3310 if (convert_unicode)
3311#endif
3312 {
3313 /* This is executed unconditionally on ASCII, and for
3314 * Unicode ranges on EBCDIC. Under these conditions, all
3315 * code points above a certain value are variant; and none
3316 * under that value are. We just need to find out how much
3317 * of the range is above that value. We don't count the
3318 * end points here, as they will already have been counted
3319 * as they were parsed. */
3320 if (range_min >= UTF_CONTINUATION_MARK) {
3321
3322 /* The whole range is made up of variants */
3323 extras = (range_max - 1) - (range_min + 1) + 1;
3324 }
3325 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3326
5ca37d54
KW
3327 /* Only the higher portion of the range is variants */
3328 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3329 }
f4240379 3330
5ca37d54
KW
3331 utf8_variant_count += extras;
3332 }
3333
3334 /* The base growth is the number of code points in the range,
3335 * not including the endpoints, which have already been sized
3336 * for (and output). We don't subtract for the hyphen, as it
3337 * has been parsed but not output, and the SvGROW below is
3338 * based only on what's been output plus what's left to parse.
3339 * */
3340 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3341
ed2893cb 3342 if (d_is_utf8) {
4c3a8340 3343#ifdef EBCDIC
5ca37d54
KW
3344 /* In some cases in EBCDIC, we haven't yet calculated a
3345 * precise amount needed for the UTF-8 variants. Just
3346 * assume the worst case, that everything will expand by a
3347 * byte */
3348 if (! convert_unicode) {
f4240379
KW
3349 grow *= 2;
3350 }
5ca37d54 3351 else
4c3a8340 3352#endif
5ca37d54
KW
3353 {
3354 /* Otherwise we know exactly how many variants there
3355 * are in the range. */
3356 grow += extras;
3357 }
f4240379
KW
3358 }
3359
5ca37d54
KW
3360 /* Grow, but position the output to overwrite the range min end
3361 * point, because in some cases we overwrite that */
3362 SvCUR_set(sv, d - SvPVX_const(sv));
3363 offset_to_min = min_ptr - SvPVX_const(sv);
3364
3365 /* See Note on sizing above. */
3366 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3367 + (send - s)
3368 + grow
3369 + 1 /* Trailing NUL */ );
f4240379 3370
5ca37d54 3371 /* Now, we can expand out the range. */
11327fa1 3372#ifdef EBCDIC
f4240379 3373 if (convert_unicode) {
5ca37d54 3374 SSize_t i;
02aa26ce 3375
f4240379
KW
3376 /* Recall that the min and max are now in Unicode terms, so
3377 * we have to convert each character to its native
3378 * equivalent */
ed2893cb 3379 if (d_is_utf8) {
f4240379 3380 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3381 append_utf8_from_native_byte(
3382 LATIN1_TO_NATIVE((U8) i),
3383 (U8 **) &d);
f4240379
KW
3384 }
3385 }
3386 else {
3387 for (i = range_min; i <= range_max; i++) {
81324705 3388 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3389 }
3390 }
01ec43d0 3391 }
11327fa1
AL
3392 else
3393#endif
3394 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3395 {
f4240379
KW
3396 /* Here, no conversions are necessary, which means that the
3397 * first character in the range is already in 'd' and
3398 * valid, so we can skip overwriting it */
ed2893cb 3399 if (d_is_utf8) {
19742f39 3400 SSize_t i;
f4240379
KW
3401 d += UTF8SKIP(d);
3402 for (i = range_min + 1; i <= range_max; i++) {
81324705 3403 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3404 }
3405 }
3406 else {
19742f39 3407 SSize_t i;
f4240379 3408 d++;
5ca37d54
KW
3409 assert(range_min + 1 <= range_max);
3410 for (i = range_min + 1; i < range_max; i++) {
3411#ifdef EBCDIC
3412 /* In this case on EBCDIC, we haven't calculated
3413 * the variants. Do it here, as we go along */
3414 if (! UVCHR_IS_INVARIANT(i)) {
3415 utf8_variant_count++;
3416 }
3417#endif
f4240379
KW
3418 *d++ = (char)i;
3419 }
5ca37d54
KW
3420
3421 /* The range_max is done outside the loop so as to
3422 * avoid having to special case not incrementing
3423 * 'utf8_variant_count' on EBCDIC (it's already been
3424 * counted when originally parsed) */
3425 *d++ = (char) range_max;
f4240379 3426 }
a0ed51b3 3427 }
02aa26ce 3428
11327fa1 3429#ifdef EBCDIC
02cd137d
KW
3430 /* If the original range extended above 255, add in that
3431 * portion. */
f4240379
KW
3432 if (real_range_max) {
3433 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3434 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3435 if (real_range_max > 0x100) {
3436 if (real_range_max > 0x101) {
dc8faf6b 3437 *d++ = (char) RANGE_INDICATOR;
043af6f6 3438 }
f4240379 3439 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3440 }
f4240379 3441 }
11327fa1 3442#endif
02aa26ce 3443
f4240379
KW
3444 range_done:
3445 /* mark the range as done, and continue */
3446 didrange = TRUE;
3447 dorange = FALSE;
3448#ifdef EBCDIC
3449 non_portable_endpoint = 0;
3450 backslash_N = 0;
3451#endif
3452 continue;
3453 } /* End of is a range */
3454 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3455 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3456 char *s1 = s-1;
3457 int esc = 0;
3458 while (s1 >= start && *s1-- == '\\')
3459 esc = !esc;
3460 if (!esc)
3461 in_charclass = TRUE;
3462 }
1e02a175 3463 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3464 char *s1 = s-1;
3465 int esc = 0;
3466 while (s1 >= start && *s1-- == '\\')
3467 esc = !esc;
3468 if (!esc)
3469 in_charclass = FALSE;
3470 }
02cd137d
KW
3471 /* skip for regexp comments /(?#comment)/, except for the last
3472 * char, which will be done separately. Stop on (?{..}) and
3473 * friends */
c30fc27b 3474 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3475 if (s[2] == '#') {
f339d50e
KW
3476 if (s_is_utf8) {
3477 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3478
3479 while (s + len < send && *s != ')') {
3480 Copy(s, d, len, U8);
3481 d += len;
3482 s += len;
3483 len = UTF8_SAFE_SKIP(s, send);
3484 }
3485 }
3486 else while (s+1 < send && *s != ')') {
3487 *d++ = *s++;
3488 }
155aba94 3489 }
407f8cf2
KW
3490 else if (!PL_lex_casemods
3491 && ( s[2] == '{' /* This should match regcomp.c */
3492 || (s[2] == '?' && s[3] == '{')))
155aba94 3493 {
9da1dd8f 3494 break;
cc6b7395 3495 }
748a9306 3496 }
02cd137d 3497 /* likewise skip #-initiated comments in //x patterns */
407f8cf2
KW
3498 else if (*s == '#'
3499 && PL_lex_inpat
3500 && !in_charclass
3501 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3502 {
8faf4f30 3503 while (s < send && *s != '\n')
5ff03569 3504 *d++ = *s++;
748a9306 3505 }
02cd137d 3506 /* no further processing of single-quoted regex */
9da1dd8f
DM
3507 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3508 goto default_action;
3509
02cd137d
KW
3510 /* check for embedded arrays
3511 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3512 */
1749ea0d 3513 else if (*s == '@' && s[1]) {
fac0f7a3
KW
3514 if (UTF
3515 ? isIDFIRST_utf8_safe(s+1, send)
3516 : isWORDCHAR_A(s[1]))
3517 {
1749ea0d 3518 break;
fac0f7a3 3519 }
4aada8b9 3520 if (memCHRs(":'{$", s[1]))
1749ea0d
ST
3521 break;
3522 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3523 break; /* in regexp, neither @+ nor @- are interpolated */
3524 }
02cd137d
KW
3525 /* check for embedded scalars. only stop if we're sure it's a
3526 * variable. */
79072805 3527 else if (*s == '$') {
3280af22 3528 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3529 break;
4aada8b9 3530 if (s + 1 < send && !memCHRs("()| \r\n\t", s[1])) {
a2a5de95
NC
3531 if (s[1] == '\\') {
3532 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3533 "Possible unintended interpolation of $\\ in regex");
77772344