This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Various updates and fixes to some of the SysV IPC ops and their tests
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
78342678 28=for apidoc AmnU|yy_parser *|PL_parser
f0e67a1d
Z
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f1bcae08 41#include "invlist_inline.h"
378cc40b 42
164e423c
KW
43#define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
eb0d8d16 45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
60267e1d 97static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
8903cb82 98
9ded7720 99# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 100
a7aaec61
Z
101#define XENUMMASK 0x3f
102#define XFAKEEOF 0x40
103#define XFAKEBRACK 0x80
9059aa12 104
39e02b42 105#ifdef USE_UTF8_SCRIPTS
b3041197 106# define UTF cBOOL(!IN_BYTES)
2b9d42f0 107#else
b3041197 108# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 109#endif
a0ed51b3 110
b1fc3636
CJ
111/* The maximum number of characters preceding the unrecognized one to display */
112#define UNRECOGNIZED_PRECEDE_COUNT 10
113
61f0cdd9 114/* In variables named $^X, these are the legal values for X.
2b92dfce 115 * 1999-02-27 mjd-perl-patch@plover.com */
4aada8b9 116#define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
2b92dfce 117
14bd96d0 118#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 119
9ff909cf
JH
120#define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
124
ffb4593c
NT
125/* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
79072805 127 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
79072805
LW
131 */
132
fb73857a
PP
133/* #define LEX_NOTPARSING 11 is done in perl.h. */
134
b6007c36
DM
135#define LEX_NORMAL 10 /* normal code (ie not within "...") */
136#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139#define LEX_INTERPSTART 6 /* expecting the start of a $var */
140
141 /* at end of code, eg "$x" followed by: */
142#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144
145#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147#define LEX_INTERPCONST 2 /* NOT USED */
148#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 149
56d9fe2f
TC
150/* returned to yyl_try() to request it to retry the parse loop, expected to only
151 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
152 can also return it.
153
154 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
155 other token values are 258 or higher (see perly.h), so -1 should be
156 a safe value here.
157*/
158#define YYL_RETRY (-1)
79072805 159
bbf60fe6 160#ifdef DEBUGGING
27da23d5 161static const char* const lex_state_names[] = {
bbf60fe6
DM
162 "KNOWNEXT",
163 "FORMLINE",
164 "INTERPCONST",
165 "INTERPCONCAT",
166 "INTERPENDMAYBE",
167 "INTERPEND",
168 "INTERPSTART",
169 "INTERPPUSH",
170 "INTERPCASEMOD",
171 "INTERPNORMAL",
172 "NORMAL"
173};
174#endif
175
79072805 176#include "keywords.h"
fe14fcc3 177
ffb4593c
NT
178/* CLINE is a macro that ensures PL_copline has a sane value */
179
57843af0 180#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 181
ffb4593c
NT
182/*
183 * Convenience functions to return different tokens and prime the
9cbb5ea2 184 * lexer for the next token. They all take an argument.
ffb4593c
NT
185 *
186 * TOKEN : generic token (used for '(', DOLSHARP, etc)
187 * OPERATOR : generic operator
188 * AOPERATOR : assignment operator
189 * PREBLOCK : beginning the block after an if, while, foreach, ...
190 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
191 * PREREF : *EXPR where EXPR is not a simple identifier
192 * TERM : expression term
89f35911 193 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
194 * LOOPX : loop exiting command (goto, last, dump, etc)
195 * FTST : file test operator
196 * FUN0 : zero-argument function
7eb971ee 197 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 198 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
199 * BOop : bitwise or or xor
200 * BAop : bitwise and
8823cb89 201 * BCop : bitwise complement
ffb4593c
NT
202 * SHop : shift operator
203 * PWop : power operator
9cbb5ea2 204 * PMop : pattern-matching operator
ffb4593c 205 * Aop : addition-level operator
e4916dd1 206 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c 207 * Mop : multiplication-level operator
02b85d3d
Z
208 * ChEop : chaining equality-testing operator
209 * NCEop : non-chaining comparison operator at equality precedence
210 * ChRop : chaining relational operator <= != gt
211 * NCRop : non-chaining relational operator isa
ffb4593c
NT
212 *
213 * Also see LOP and lop() below.
214 */
215
998054bd 216#ifdef DEBUGGING /* Serve -DT. */
704d4215 217# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 218#else
bbf60fe6 219# define REPORT(retval) (retval)
998054bd
SC
220#endif
221
bbf60fe6
DM
222#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
223#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 224#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
225#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
226#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
227#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
228#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 229#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 230#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 231 pl_yylval.ival=f, \
a49203fd 232 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 233 REPORT((int)LOOPEX))
6154021b
RGS
234#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
235#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 236#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 237#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
238#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
239#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
240#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
241 REPORT('~')
b1764551
FC
242#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
243#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 244#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 245#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 246#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 247#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
02b85d3d
Z
248#define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
249#define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
250#define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
251#define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
2f3197b3 252
a687059c
LW
253/* This bit of chicanery makes a unary function followed by
254 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
255 * The UNIDOR macro is for unary functions that can be followed by the //
256 * operator (such as C<shift // 0>).
a687059c 257 */
d68ce4ac 258#define UNI3(f,x,have_x) { \
6154021b 259 pl_yylval.ival = f; \
d68ce4ac 260 if (have_x) PL_expect = x; \
376fcdbf
AL
261 PL_bufptr = s; \
262 PL_last_uni = PL_oldbufptr; \
0af40c75 263 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
264 if (*s == '(') \
265 return REPORT( (int)FUNC1 ); \
294a536f 266 s = skipspace(s); \
376fcdbf
AL
267 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
268 }
d68ce4ac
FC
269#define UNI(f) UNI3(f,XTERM,1)
270#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
271#define UNIPROTO(f,optional) { \
272 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
273 OPERATOR(f); \
274 }
a687059c 275
d68ce4ac 276#define UNIBRACK(f) UNI3(f,0,0)
79072805 277
9f68db38 278/* grandfather return to old style */
78cdf107
Z
279#define OLDLOP(f) \
280 do { \
281 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
282 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
283 pl_yylval.ival = (f); \
284 PL_expect = XTERM; \
285 PL_bufptr = s; \
286 return (int)LSTOP; \
287 } while(0)
79072805 288
83944c01
FC
289#define COPLINE_INC_WITH_HERELINES \
290 STMT_START { \
291 CopLINE_inc(PL_curcop); \
851b527a
FC
292 if (PL_parser->herelines) \
293 CopLINE(PL_curcop) += PL_parser->herelines, \
294 PL_parser->herelines = 0; \
83944c01 295 } STMT_END
ffdb8b16
FC
296/* Called after scan_str to update CopLINE(PL_curcop), but only when there
297 * is no sublex_push to follow. */
298#define COPLINE_SET_FROM_MULTI_END \
299 STMT_START { \
300 CopLINE_set(PL_curcop, PL_multi_end); \
301 if (PL_multi_end != PL_multi_start) \
851b527a 302 PL_parser->herelines = 0; \
ffdb8b16 303 } STMT_END
83944c01
FC
304
305
539c839f
AC
306/* A file-local structure for passing around information about subroutines and
307 * related definable words */
308struct code {
309 SV *sv;
310 CV *cv;
311 GV *gv, **gvp;
312 OP *rv2cv_op;
313 PADOFFSET off;
314 bool lex;
315};
316
317static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
318
8fa7f367
JH
319#ifdef DEBUGGING
320
6154021b 321/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
322enum token_type {
323 TOKENTYPE_NONE,
324 TOKENTYPE_IVAL,
6154021b 325 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 326 TOKENTYPE_PVAL,
aeaef349 327 TOKENTYPE_OPVAL
bbf60fe6
DM
328};
329
6d4a66ac
NC
330static struct debug_tokens {
331 const int token;
332 enum token_type type;
333 const char *name;
334} const debug_tokens[] =
9041c2e3 335{
bbf60fe6
DM
336 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
337 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
338 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
339 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 340 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
341 { ARROW, TOKENTYPE_NONE, "ARROW" },
342 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
343 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
344 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
02b85d3d
Z
345 { CHEQOP, TOKENTYPE_OPNUM, "CHEQOP" },
346 { CHRELOP, TOKENTYPE_OPNUM, "CHRELOP" },
bbf60fe6
DM
347 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
348 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 349 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
350 { DO, TOKENTYPE_NONE, "DO" },
351 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
352 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
353 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
354 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
355 { ELSE, TOKENTYPE_NONE, "ELSE" },
356 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
bbf60fe6
DM
357 { FOR, TOKENTYPE_IVAL, "FOR" },
358 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
359 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
360 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
361 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
362 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 363 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
364 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
365 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
366 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 367 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
368 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
369 { IF, TOKENTYPE_IVAL, "IF" },
01719201 370 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
371 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
372 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
373 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
374 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
375 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
376 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
377 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
378 { MY, TOKENTYPE_IVAL, "MY" },
02b85d3d
Z
379 { NCEQOP, TOKENTYPE_OPNUM, "NCEQOP" },
380 { NCRELOP, TOKENTYPE_OPNUM, "NCRELOP" },
bbf60fe6
DM
381 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
382 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
383 { OROP, TOKENTYPE_IVAL, "OROP" },
384 { OROR, TOKENTYPE_NONE, "OROR" },
385 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
386 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
387 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 388 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 389 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
390 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
391 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
392 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
393 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
394 { PREINC, TOKENTYPE_NONE, "PREINC" },
395 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 396 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6 397 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
f3f204dc 398 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 399 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 400 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6 401 { SUB, TOKENTYPE_NONE, "SUB" },
69afcc21
TC
402 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
403 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
bbf60fe6
DM
404 { THING, TOKENTYPE_OPVAL, "THING" },
405 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
406 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
407 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
408 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
409 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
410 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 411 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 412 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 413 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 414 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 415 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
416};
417
6154021b 418/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 419
bbf60fe6 420STATIC int
704d4215 421S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 422{
7918f24d
NC
423 PERL_ARGS_ASSERT_TOKEREPORT;
424
bbf60fe6 425 if (DEBUG_T_TEST) {
bd61b366 426 const char *name = NULL;
bbf60fe6 427 enum token_type type = TOKENTYPE_NONE;
f54cb97a 428 const struct debug_tokens *p;
396482e1 429 SV* const report = newSVpvs("<== ");
bbf60fe6 430
f54cb97a 431 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
432 if (p->token == (int)rv) {
433 name = p->name;
434 type = p->type;
435 break;
436 }
437 }
438 if (name)
54667de8 439 Perl_sv_catpv(aTHX_ report, name);
239f83d5 440 else if (isGRAPH(rv))
4ebc7986 441 {
bbf60fe6 442 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
443 if ((char)rv == 'p')
444 sv_catpvs(report, " (pending identifier)");
445 }
bbf60fe6 446 else if (!rv)
396482e1 447 sv_catpvs(report, "EOF");
bbf60fe6 448 else
147e3846 449 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
450 switch (type) {
451 case TOKENTYPE_NONE:
bbf60fe6
DM
452 break;
453 case TOKENTYPE_IVAL:
147e3846 454 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
455 break;
456 case TOKENTYPE_OPNUM:
457 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 458 PL_op_name[lvalp->ival]);
bbf60fe6
DM
459 break;
460 case TOKENTYPE_PVAL:
704d4215 461 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
462 break;
463 case TOKENTYPE_OPVAL:
704d4215 464 if (lvalp->opval) {
401441c0 465 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
466 PL_op_name[lvalp->opval->op_type]);
467 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 468 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 469 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
470 }
471
472 }
401441c0 473 else
396482e1 474 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
475 break;
476 }
b6007c36 477 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
478 };
479 return (int)rv;
998054bd
SC
480}
481
b6007c36
DM
482
483/* print the buffer with suitable escapes */
484
485STATIC void
15f169a1 486S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 487{
396482e1 488 SV* const tmp = newSVpvs("");
7918f24d
NC
489
490 PERL_ARGS_ASSERT_PRINTBUF;
491
7347ee54 492 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 493 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 494 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
495 SvREFCNT_dec(tmp);
496}
497
8fa7f367
JH
498#endif
499
ffb4593c
NT
500/*
501 * S_ao
502 *
f393a21a
FC
503 * This subroutine looks for an '=' next to the operator that has just been
504 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
505 */
506
76e3520e 507STATIC int
cea2e8a9 508S_ao(pTHX_ int toketype)
a0d0e21e 509{
3280af22
NIS
510 if (*PL_bufptr == '=') {
511 PL_bufptr++;
a0d0e21e 512 if (toketype == ANDAND)
6154021b 513 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 514 else if (toketype == OROR)
6154021b 515 pl_yylval.ival = OP_ORASSIGN;
c963b151 516 else if (toketype == DORDOR)
6154021b 517 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
518 toketype = ASSIGNOP;
519 }
b1764551 520 return REPORT(toketype);
a0d0e21e
LW
521}
522
ffb4593c
NT
523/*
524 * S_no_op
525 * When Perl expects an operator and finds something else, no_op
526 * prints the warning. It always prints "<something> found where
527 * operator expected. It prints "Missing semicolon on previous line?"
528 * if the surprise occurs at the start of the line. "do you need to
529 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
530 * where the compiler doesn't know if foo is a method call or a function.
531 * It prints "Missing operator before end of line" if there's nothing
532 * after the missing operator, or "... before <...>" if there is something
533 * after the missing operator.
488bc579
FC
534 *
535 * PL_bufptr is expected to point to the start of the thing that was found,
536 * and s after the next token or partial token.
ffb4593c
NT
537 */
538
76e3520e 539STATIC void
15f169a1 540S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 541{
9d4ba2ae
AL
542 char * const oldbp = PL_bufptr;
543 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 544
7918f24d
NC
545 PERL_ARGS_ASSERT_NO_OP;
546
1189a94a
GS
547 if (!s)
548 s = oldbp;
07c798fb 549 else
1189a94a 550 PL_bufptr = s;
734ab321 551 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
552 if (ckWARN_d(WARN_SYNTAX)) {
553 if (is_first)
554 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
555 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
556 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
557 PL_bufend,
558 UTF))
559 {
f54cb97a 560 const char *t;
fac0f7a3
KW
561 for (t = PL_oldoldbufptr;
562 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
563 t += UTF ? UTF8SKIP(t) : 1)
564 {
c35e046a 565 NOOP;
fac0f7a3 566 }
56da5a46
RGS
567 if (t < PL_bufptr && isSPACE(*t))
568 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 569 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 570 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
571 }
572 else {
573 assert(s >= oldbp);
574 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 575 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 576 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 577 }
07c798fb 578 }
3280af22 579 PL_bufptr = oldbp;
8990e307
LW
580}
581
ffb4593c
NT
582/*
583 * S_missingterm
584 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 585 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
586 * If we're in a delimited string and the delimiter is a control
587 * character, it's reformatted into a two-char sequence like ^C.
588 * This is fatal.
589 */
590
76e3520e 591STATIC void
1b8d3e0e 592S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 593{
e487ff5e 594 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 595 char q;
cb650135
FC
596 bool uni = FALSE;
597 SV *sv;
8990e307 598 if (s) {
f88b23d7 599 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
600 if (nl) {
601 *nl = '\0';
602 len = nl - s;
603 }
7f7f7d08 604 uni = UTF;
8990e307 605 }
cb650135 606 else if (PL_multi_close < 32) {
8990e307 607 *tmpbuf = '^';
585ec06d 608 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
609 tmpbuf[2] = '\0';
610 s = tmpbuf;
1b8d3e0e 611 len = 2;
8990e307
LW
612 }
613 else {
cb650135
FC
614 if (LIKELY(PL_multi_close < 256)) {
615 *tmpbuf = (char)PL_multi_close;
616 tmpbuf[1] = '\0';
1b8d3e0e 617 len = 1;
cb650135
FC
618 }
619 else {
1b8d3e0e
LM
620 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
621 *end = '\0';
622 len = end - tmpbuf;
cb650135 623 uni = TRUE;
cb650135 624 }
8990e307
LW
625 s = tmpbuf;
626 }
9f8d27b7 627 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 628 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
629 if (uni)
630 SvUTF8_on(sv);
1b8d3e0e
LM
631 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
632 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 633}
79072805 634
dd0ac2b9
FC
635#include "feature.h"
636
0d863452 637/*
9cbb5ea2
GS
638 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
639 * utf16-to-utf8-reversed.
ffb4593c
NT
640 */
641
c39cd008
GS
642#ifdef PERL_CR_FILTER
643static void
644strip_return(SV *sv)
645{
eb578fdb
KW
646 const char *s = SvPVX_const(sv);
647 const char * const e = s + SvCUR(sv);
7918f24d
NC
648
649 PERL_ARGS_ASSERT_STRIP_RETURN;
650
c39cd008
GS
651 /* outer loop optimized to do nothing if there are no CR-LFs */
652 while (s < e) {
653 if (*s++ == '\r' && *s == '\n') {
654 /* hit a CR-LF, need to copy the rest */
eb578fdb 655 char *d = s - 1;
c39cd008
GS
656 *d++ = *s++;
657 while (s < e) {
658 if (*s == '\r' && s[1] == '\n')
659 s++;
660 *d++ = *s++;
661 }
662 SvCUR(sv) -= s - d;
663 return;
664 }
665 }
666}
a868473f 667
76e3520e 668STATIC I32
c39cd008 669S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 670{
f54cb97a 671 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
672 if (count > 0 && !maxlen)
673 strip_return(sv);
674 return count;
a868473f
NIS
675}
676#endif
677
ffb4593c 678/*
44170c9a 679=for apidoc lex_start
8eaa0acf
Z
680
681Creates and initialises a new lexer/parser state object, supplying
682a context in which to lex and parse from a new source of Perl code.
683A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 684is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
685will be destroyed and the former value of L</PL_parser> will be restored.
686Nothing else need be done to clean up the parsing context.
687
2d7f6611 688The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 689non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
690A copy of the string is made, so subsequent modification of C<line>
691does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 692from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
693code in C<line> comes first and must consist of complete lines of input,
694and C<rsfp> supplies the remainder of the source.
8eaa0acf 695
2d7f6611 696The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 697used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
698
699=cut
700*/
ffb4593c 701
27fcb6ee 702/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
703 can share filters with the current parser.
704 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
705 caller, hence isn't owned by the parser, so shouldn't be closed on parser
706 destruction. This is used to handle the case of defaulting to reading the
707 script from the standard input because no filename was given on the command
708 line (without getting confused by situation where STDIN has been closed, so
709 the script handle is opened on fd 0) */
27fcb6ee 710
a0d0e21e 711void
8eaa0acf 712Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 713{
6ef55633 714 const char *s = NULL;
5486870f 715 yy_parser *parser, *oparser;
90b58c70 716
60d63348 717 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 718 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
719
720 /* create and initialise a parser */
721
199e78b7 722 Newxz(parser, 1, yy_parser);
5486870f 723 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
724 PL_parser = parser;
725
28ac2b49 726 parser->stack = NULL;
df13534a 727 parser->stack_max1 = NULL;
28ac2b49 728 parser->ps = NULL;
acdf0a21 729
e3abe207
DM
730 /* on scope exit, free this parser and restore any outer one */
731 SAVEPARSER(parser);
7c4baf47 732 parser->saved_curcop = PL_curcop;
e3abe207 733
acdf0a21 734 /* initialise lexer state */
8990e307 735
fb205e7a 736 parser->nexttoke = 0;
ca4cfd28 737 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 738 parser->copline = parser->preambling = NOLINE;
5afb0a62 739 parser->lex_state = LEX_NORMAL;
c2598295 740 parser->expect = XSTATE;
2f9285f8 741 parser->rsfp = rsfp;
b7b52646 742 parser->recheck_utf8_validity = TRUE;
27fcb6ee
FC
743 parser->rsfp_filters =
744 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
745 ? NULL
746 : MUTABLE_AV(SvREFCNT_inc(
747 oparser->rsfp_filters
748 ? oparser->rsfp_filters
749 : (oparser->rsfp_filters = newAV())
750 ));
2f9285f8 751
199e78b7
DM
752 Newx(parser->lex_brackstack, 120, char);
753 Newx(parser->lex_casestack, 12, char);
754 *parser->lex_casestack = '\0';
d794b522 755 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 756
10efb74f 757 if (line) {
0528fd32 758 STRLEN len;
a6909bd2
KW
759 const U8* first_bad_char_loc;
760
10efb74f 761 s = SvPV_const(line, len);
90b58c70 762
0aab20f2
KW
763 if ( SvUTF8(line)
764 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
765 SvCUR(line),
766 &first_bad_char_loc)))
90b58c70
KW
767 {
768 _force_out_malformed_utf8_message(first_bad_char_loc,
769 (U8 *) s + SvCUR(line),
770 0,
771 1 /* 1 means die */ );
772 NOT_REACHED; /* NOTREACHED */
773 }
774
0abcdfa4
FC
775 parser->linestr = flags & LEX_START_COPIED
776 ? SvREFCNT_inc_simple_NN(line)
777 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
778 if (!rsfp)
779 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 780 } else {
bf1b738b 781 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 782 }
218304f9 783
f06b5848
DM
784 parser->oldoldbufptr =
785 parser->oldbufptr =
786 parser->bufptr =
787 parser->linestart = SvPVX(parser->linestr);
788 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
789 parser->last_lop = parser->last_uni = NULL;
b54f893d 790
6d59e610 791 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
792 |LEX_DONT_CLOSE_RSFP));
793 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
794 |LEX_DONT_CLOSE_RSFP));
737c24fc 795
60d63348 796 parser->in_pod = parser->filtered = 0;
79072805 797}
a687059c 798
e3abe207
DM
799
800/* delete a parser object */
801
802void
803Perl_parser_free(pTHX_ const yy_parser *parser)
804{
7918f24d
NC
805 PERL_ARGS_ASSERT_PARSER_FREE;
806
7c4baf47 807 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
808 SvREFCNT_dec(parser->linestr);
809
87606032 810 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 811 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
812 else if (parser->rsfp && (!parser->old_parser
813 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 814 PerlIO_close(parser->rsfp);
5486870f 815 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 816 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 817 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
818
819 Safefree(parser->lex_brackstack);
820 Safefree(parser->lex_casestack);
821 Safefree(parser->lex_shared);
822 PL_parser = parser->old_parser;
823 Safefree(parser);
824}
825
826void
827Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
828{
3ac7ff8f 829 I32 nexttoke = parser->nexttoke;
3ac7ff8f 830 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 831 while (nexttoke--) {
3ac7ff8f
FC
832 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
833 && parser->nextval[nexttoke].opval
834 && parser->nextval[nexttoke].opval->op_slabbed
835 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 836 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
837 parser->nextval[nexttoke].opval = NULL;
838 }
3ce3dcd9 839 }
e3abe207
DM
840}
841
842
ffb4593c 843/*
78342678 844=for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
f0e67a1d
Z
845
846Buffer scalar containing the chunk currently under consideration of the
847text currently being lexed. This is always a plain string scalar (for
848which C<SvPOK> is true). It is not intended to be used as a scalar by
849normal scalar means; instead refer to the buffer directly by the pointer
850variables described below.
851
852The lexer maintains various C<char*> pointers to things in the
853C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
854reallocated, all of these pointers must be updated. Don't attempt to
855do this manually, but rather use L</lex_grow_linestr> if you need to
856reallocate the buffer.
857
858The content of the text chunk in the buffer is commonly exactly one
859complete line of input, up to and including a newline terminator,
860but there are situations where it is otherwise. The octets of the
861buffer may be intended to be interpreted as either UTF-8 or Latin-1.
862The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
863flag on this scalar, which may disagree with it.
864
865For direct examination of the buffer, the variable
866L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
867lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
868of these pointers is usually preferable to examination of the scalar
869through normal scalar means.
870
78342678 871=for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
f0e67a1d
Z
872
873Direct pointer to the end of the chunk of text currently being lexed, the
874end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 875+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
876always located at the end of the buffer, and does not count as part of
877the buffer's contents.
878
78342678 879=for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
f0e67a1d
Z
880
881Points to the current position of lexing inside the lexer buffer.
882Characters around this point may be freely examined, within
883the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
884L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
885interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
886
887Lexing code (whether in the Perl core or not) moves this pointer past
888the characters that it consumes. It is also expected to perform some
889bookkeeping whenever a newline character is consumed. This movement
890can be more conveniently performed by the function L</lex_read_to>,
891which handles newlines appropriately.
892
893Interpretation of the buffer's octets can be abstracted out by
894using the slightly higher-level functions L</lex_peek_unichar> and
895L</lex_read_unichar>.
896
78342678 897=for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
f0e67a1d
Z
898
899Points to the start of the current line inside the lexer buffer.
900This is useful for indicating at which column an error occurred, and
901not much else. This must be updated by any lexing code that consumes
902a newline; the function L</lex_read_to> handles this detail.
903
904=cut
905*/
906
907/*
44170c9a 908=for apidoc lex_bufutf8
f0e67a1d
Z
909
910Indicates whether the octets in the lexer buffer
911(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
912of Unicode characters. If not, they should be interpreted as Latin-1
913characters. This is analogous to the C<SvUTF8> flag for scalars.
914
915In UTF-8 mode, it is not guaranteed that the lexer buffer actually
916contains valid UTF-8. Lexing code must be robust in the face of invalid
917encoding.
918
919The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
920is significant, but not the whole story regarding the input character
921encoding. Normally, when a file is being read, the scalar contains octets
922and its C<SvUTF8> flag is off, but the octets should be interpreted as
923UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
924however, the scalar may have the C<SvUTF8> flag on, and in this case its
925octets should be interpreted as UTF-8 unless the C<use bytes> pragma
926is in effect. This logic may change in the future; use this function
927instead of implementing the logic yourself.
928
929=cut
930*/
931
932bool
933Perl_lex_bufutf8(pTHX)
934{
935 return UTF;
936}
937
938/*
44170c9a 939=for apidoc lex_grow_linestr
f0e67a1d
Z
940
941Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 942at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
943pointer to the reallocated buffer. This is necessary before making
944any direct modification of the buffer that would increase its length.
945L</lex_stuff_pvn> provides a more convenient way to insert text into
946the buffer.
947
948Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
949this function updates all of the lexer's variables that point directly
950into the buffer.
951
952=cut
953*/
954
955char *
956Perl_lex_grow_linestr(pTHX_ STRLEN len)
957{
958 SV *linestr;
959 char *buf;
960 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 961 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
962 bool current;
963
f0e67a1d
Z
964 linestr = PL_parser->linestr;
965 buf = SvPVX(linestr);
966 if (len <= SvLEN(linestr))
967 return buf;
98d5e3ef
DM
968
969 /* Is the lex_shared linestr SV the same as the current linestr SV?
970 * Only in this case does re_eval_start need adjusting, since it
971 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
972 current = ( !PL_parser->lex_shared->ls_linestr
973 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 974
f0e67a1d
Z
975 bufend_pos = PL_parser->bufend - buf;
976 bufptr_pos = PL_parser->bufptr - buf;
977 oldbufptr_pos = PL_parser->oldbufptr - buf;
978 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
979 linestart_pos = PL_parser->linestart - buf;
980 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
981 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 982 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 983 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 984
f0e67a1d 985 buf = sv_grow(linestr, len);
c7641931 986
f0e67a1d
Z
987 PL_parser->bufend = buf + bufend_pos;
988 PL_parser->bufptr = buf + bufptr_pos;
989 PL_parser->oldbufptr = buf + oldbufptr_pos;
990 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
991 PL_parser->linestart = buf + linestart_pos;
992 if (PL_parser->last_uni)
993 PL_parser->last_uni = buf + last_uni_pos;
994 if (PL_parser->last_lop)
995 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 996 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 997 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
998 return buf;
999}
1000
1001/*
44170c9a 1002=for apidoc lex_stuff_pvn
f0e67a1d
Z
1003
1004Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1005immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1006reallocating the buffer if necessary. This means that lexing code that
1007runs later will see the characters as if they had appeared in the input.
1008It is not recommended to do this as part of normal parsing, and most
1009uses of this facility run the risk of the inserted characters being
1010interpreted in an unintended manner.
1011
2d7f6611
KW
1012The string to be inserted is represented by C<len> octets starting
1013at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1014according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1015The characters are recoded for the lexer buffer, according to how the
1016buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1017to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1018function is more convenient.
1019
5af38e47
KW
1020=for apidoc Amnh||LEX_STUFF_UTF8
1021
f0e67a1d
Z
1022=cut
1023*/
1024
1025void
83aa740e 1026Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d
Z
1027{
1028 char *bufptr;
1029 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1030 if (flags & ~(LEX_STUFF_UTF8))
1031 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1032 if (UTF) {
1033 if (flags & LEX_STUFF_UTF8) {
1034 goto plain_copy;
1035 } else {
01ccd497
KW
1036 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1037 (U8 *) pv + len);
1038 const char *p, *e = pv+len;;
f0e67a1d
Z
1039 if (!highhalf)
1040 goto plain_copy;
1041 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1042 bufptr = PL_parser->bufptr;
1043 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1044 SvCUR_set(PL_parser->linestr,
1045 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1046 PL_parser->bufend += len+highhalf;
1047 for (p = pv; p != e; p++) {
27d0a47c 1048 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1049 }
1050 }
1051 } else {
1052 if (flags & LEX_STUFF_UTF8) {
1053 STRLEN highhalf = 0;
83aa740e 1054 const char *p, *e = pv+len;
f0e67a1d
Z
1055 for (p = pv; p != e; p++) {
1056 U8 c = (U8)*p;
54d004e8 1057 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1058 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1059 "non-Latin-1 character into Latin-1 input");
54d004e8 1060 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1061 p++;
1062 highhalf++;
f88c6466 1063 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1064 }
1065 if (!highhalf)
1066 goto plain_copy;
1067 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1068 bufptr = PL_parser->bufptr;
1069 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1070 SvCUR_set(PL_parser->linestr,
1071 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1072 PL_parser->bufend += len-highhalf;
54d004e8
KW
1073 p = pv;
1074 while (p < e) {
1075 if (UTF8_IS_INVARIANT(*p)) {
1076 *bufptr++ = *p;
1077 p++;
f0e67a1d 1078 }
54d004e8
KW
1079 else {
1080 assert(p < e -1 );
a62b247b 1081 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1082 p += 2;
1083 }
f0e67a1d
Z
1084 }
1085 } else {
54d004e8 1086 plain_copy:
f0e67a1d
Z
1087 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1088 bufptr = PL_parser->bufptr;
1089 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1090 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1091 PL_parser->bufend += len;
1092 Copy(pv, bufptr, len, char);
1093 }
1094 }
1095}
1096
1097/*
44170c9a 1098=for apidoc lex_stuff_pv
9dcc53ea
Z
1099
1100Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1101immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1102reallocating the buffer if necessary. This means that lexing code that
1103runs later will see the characters as if they had appeared in the input.
1104It is not recommended to do this as part of normal parsing, and most
1105uses of this facility run the risk of the inserted characters being
1106interpreted in an unintended manner.
1107
2d7f6611 1108The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1109and continuing to the first nul. These octets are interpreted as either
1110UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1111in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1112to how the buffer is currently being interpreted (L</lex_bufutf8>).
1113If it is not convenient to nul-terminate a string to be inserted, the
1114L</lex_stuff_pvn> function is more appropriate.
1115
1116=cut
1117*/
1118
1119void
1120Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1121{
1122 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1123 lex_stuff_pvn(pv, strlen(pv), flags);
1124}
1125
1126/*
44170c9a 1127=for apidoc lex_stuff_sv
f0e67a1d
Z
1128
1129Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1130immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1131reallocating the buffer if necessary. This means that lexing code that
1132runs later will see the characters as if they had appeared in the input.
1133It is not recommended to do this as part of normal parsing, and most
1134uses of this facility run the risk of the inserted characters being
1135interpreted in an unintended manner.
1136
2d7f6611 1137The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1138are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1139being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1140not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1141need to construct a scalar.
1142
1143=cut
1144*/
1145
1146void
1147Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1148{
1149 char *pv;
1150 STRLEN len;
1151 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1152 if (flags)
1153 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1154 pv = SvPV(sv, len);
1155 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1156}
1157
1158/*
44170c9a 1159=for apidoc lex_unstuff
f0e67a1d
Z
1160
1161Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1162C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1163This hides the discarded text from any lexing code that runs later,
1164as if the text had never appeared.
1165
1166This is not the normal way to consume lexed text. For that, use
1167L</lex_read_to>.
1168
1169=cut
1170*/
1171
1172void
1173Perl_lex_unstuff(pTHX_ char *ptr)
1174{
1175 char *buf, *bufend;
1176 STRLEN unstuff_len;
1177 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1178 buf = PL_parser->bufptr;
1179 if (ptr < buf)
1180 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1181 if (ptr == buf)
1182 return;
1183 bufend = PL_parser->bufend;
1184 if (ptr > bufend)
1185 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1186 unstuff_len = ptr - buf;
1187 Move(ptr, buf, bufend+1-ptr, char);
1188 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1189 PL_parser->bufend = bufend - unstuff_len;
1190}
1191
1192/*
44170c9a 1193=for apidoc lex_read_to
f0e67a1d
Z
1194
1195Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1196to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1197performing the correct bookkeeping whenever a newline character is passed.
1198This is the normal way to consume lexed text.
1199
1200Interpretation of the buffer's octets can be abstracted out by
1201using the slightly higher-level functions L</lex_peek_unichar> and
1202L</lex_read_unichar>.
1203
1204=cut
1205*/
1206
1207void
1208Perl_lex_read_to(pTHX_ char *ptr)
1209{
1210 char *s;
1211 PERL_ARGS_ASSERT_LEX_READ_TO;
1212 s = PL_parser->bufptr;
1213 if (ptr < s || ptr > PL_parser->bufend)
1214 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1215 for (; s != ptr; s++)
1216 if (*s == '\n') {
83944c01 1217 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1218 PL_parser->linestart = s+1;
1219 }
1220 PL_parser->bufptr = ptr;
1221}
1222
1223/*
44170c9a 1224=for apidoc lex_discard_to
f0e67a1d
Z
1225
1226Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1227up to C<ptr>. The remaining content of the buffer will be moved, and
1228all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1229be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1230it is not permitted to discard text that has yet to be lexed.
1231
1232Normally it is not necessarily to do this directly, because it suffices to
1233use the implicit discarding behaviour of L</lex_next_chunk> and things
1234based on it. However, if a token stretches across multiple lines,
1f317c95 1235and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1236that purpose, then after completion of the token it would be wise to
1237explicitly discard the now-unneeded earlier lines, to avoid future
1238multi-line tokens growing the buffer without bound.
1239
1240=cut
1241*/
1242
1243void
1244Perl_lex_discard_to(pTHX_ char *ptr)
1245{
1246 char *buf;
1247 STRLEN discard_len;
1248 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1249 buf = SvPVX(PL_parser->linestr);
1250 if (ptr < buf)
1251 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1252 if (ptr == buf)
1253 return;
1254 if (ptr > PL_parser->bufptr)
1255 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1256 discard_len = ptr - buf;
1257 if (PL_parser->oldbufptr < ptr)
1258 PL_parser->oldbufptr = ptr;
1259 if (PL_parser->oldoldbufptr < ptr)
1260 PL_parser->oldoldbufptr = ptr;
1261 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1262 PL_parser->last_uni = NULL;
1263 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1264 PL_parser->last_lop = NULL;
1265 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1266 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1267 PL_parser->bufend -= discard_len;
1268 PL_parser->bufptr -= discard_len;
1269 PL_parser->oldbufptr -= discard_len;
1270 PL_parser->oldoldbufptr -= discard_len;
1271 if (PL_parser->last_uni)
1272 PL_parser->last_uni -= discard_len;
1273 if (PL_parser->last_lop)
1274 PL_parser->last_lop -= discard_len;
1275}
1276
efa571ab
KW
1277void
1278Perl_notify_parser_that_changed_to_utf8(pTHX)
1279{
1280 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1281 * off to on. At compile time, this has the effect of entering a 'use
1282 * utf8' section. This means that any input was not previously checked for
1283 * UTF-8 (because it was off), but now we do need to check it, or our
1284 * assumptions about the input being sane could be wrong, and we could
1285 * segfault. This routine just sets a flag so that the next time we look
1286 * at the input we do the well-formed UTF-8 check. If we aren't in the
1287 * proper phase, there may not be a parser object, but if there is, setting
1288 * the flag is harmless */
1289
1290 if (PL_parser) {
1291 PL_parser->recheck_utf8_validity = TRUE;
1292 }
1293}
1294
f0e67a1d 1295/*
44170c9a 1296=for apidoc lex_next_chunk
f0e67a1d
Z
1297
1298Reads in the next chunk of text to be lexed, appending it to
1299L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1300looked to the end of the current chunk and wants to know more. It is
1301usual, but not necessary, for lexing to have consumed the entirety of
1302the current chunk at this time.
1303
1304If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1305chunk (i.e., the current chunk has been entirely consumed), normally the
1306current chunk will be discarded at the same time that the new chunk is
c5608a1f 1307read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1308will not be discarded. If the current chunk has not been entirely
1309consumed, then it will not be discarded regardless of the flag.
1310
1311Returns true if some new text was added to the buffer, or false if the
1312buffer has reached the end of the input text.
1313
5af38e47
KW
1314=for apidoc Amnh||LEX_KEEP_PREVIOUS
1315
f0e67a1d
Z
1316=cut
1317*/
1318
1319#define LEX_FAKE_EOF 0x80000000
e47d32dc 1320#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1321
1322bool
1323Perl_lex_next_chunk(pTHX_ U32 flags)
1324{
1325 SV *linestr;
1326 char *buf;
1327 STRLEN old_bufend_pos, new_bufend_pos;
1328 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1329 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1330 bool got_some_for_debugger = 0;
f0e67a1d 1331 bool got_some;
6cdc5cd8 1332
112d1284 1333 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1334 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1335 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1336 return FALSE;
f0e67a1d
Z
1337 linestr = PL_parser->linestr;
1338 buf = SvPVX(linestr);
407f8cf2
KW
1339 if (!(flags & LEX_KEEP_PREVIOUS)
1340 && PL_parser->bufptr == PL_parser->bufend)
1341 {
f0e67a1d
Z
1342 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1343 linestart_pos = 0;
1344 if (PL_parser->last_uni != PL_parser->bufend)
1345 PL_parser->last_uni = NULL;
1346 if (PL_parser->last_lop != PL_parser->bufend)
1347 PL_parser->last_lop = NULL;
1348 last_uni_pos = last_lop_pos = 0;
1349 *buf = 0;
2324bdb9 1350 SvCUR_set(linestr, 0);
f0e67a1d
Z
1351 } else {
1352 old_bufend_pos = PL_parser->bufend - buf;
1353 bufptr_pos = PL_parser->bufptr - buf;
1354 oldbufptr_pos = PL_parser->oldbufptr - buf;
1355 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1356 linestart_pos = PL_parser->linestart - buf;
1357 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1358 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1359 }
1360 if (flags & LEX_FAKE_EOF) {
1361 goto eof;
60d63348 1362 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1363 got_some = 0;
1364 } else if (filter_gets(linestr, old_bufend_pos)) {
1365 got_some = 1;
17cc9359 1366 got_some_for_debugger = 1;
112d1284
FC
1367 } else if (flags & LEX_NO_TERM) {
1368 got_some = 0;
f0e67a1d 1369 } else {
580561a3 1370 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1371 SvPVCLEAR(linestr);
f0e67a1d
Z
1372 eof:
1373 /* End of real input. Close filehandle (unless it was STDIN),
1374 * then add implicit termination.
1375 */
87606032 1376 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1377 PerlIO_clearerr(PL_parser->rsfp);
1378 else if (PL_parser->rsfp)
1379 (void)PerlIO_close(PL_parser->rsfp);
1380 PL_parser->rsfp = NULL;
60d63348 1381 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1382 if (!PL_in_eval && PL_minus_p) {
1383 sv_catpvs(linestr,
1384 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1385 PL_minus_n = PL_minus_p = 0;
1386 } else if (!PL_in_eval && PL_minus_n) {
1387 sv_catpvs(linestr, /*{*/";}");
1388 PL_minus_n = 0;
1389 } else
1390 sv_catpvs(linestr, ";");
1391 got_some = 1;
1392 }
1393 buf = SvPVX(linestr);
1394 new_bufend_pos = SvCUR(linestr);
1395 PL_parser->bufend = buf + new_bufend_pos;
1396 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1397
07337b95
KW
1398 if (UTF) {
1399 const U8* first_bad_char_loc;
1400 if (UNLIKELY(! is_utf8_string_loc(
1401 (U8 *) PL_parser->bufptr,
1402 PL_parser->bufend - PL_parser->bufptr,
1403 &first_bad_char_loc)))
1404 {
1405 _force_out_malformed_utf8_message(first_bad_char_loc,
1406 (U8 *) PL_parser->bufend,
1407 0,
1408 1 /* 1 means die */ );
1409 NOT_REACHED; /* NOTREACHED */
1410 }
6cdc5cd8
KW
1411 }
1412
f0e67a1d
Z
1413 PL_parser->oldbufptr = buf + oldbufptr_pos;
1414 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1415 PL_parser->linestart = buf + linestart_pos;
1416 if (PL_parser->last_uni)
1417 PL_parser->last_uni = buf + last_uni_pos;
1418 if (PL_parser->last_lop)
1419 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1420 if (PL_parser->preambling != NOLINE) {
1421 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1422 PL_parser->preambling = NOLINE;
1423 }
407f8cf2
KW
1424 if ( got_some_for_debugger
1425 && PERLDB_LINE_OR_SAVESRC
1426 && PL_curstash != PL_debstash)
1427 {
f0e67a1d
Z
1428 /* debugger active and we're not compiling the debugger code,
1429 * so store the line into the debugger's array of lines
1430 */
1431 update_debugger_info(NULL, buf+old_bufend_pos,
1432 new_bufend_pos-old_bufend_pos);
1433 }
1434 return got_some;
1435}
1436
1437/*
44170c9a 1438=for apidoc lex_peek_unichar
f0e67a1d
Z
1439
1440Looks ahead one (Unicode) character in the text currently being lexed.
1441Returns the codepoint (unsigned integer value) of the next character,
1442or -1 if lexing has reached the end of the input text. To consume the
1443peeked character, use L</lex_read_unichar>.
1444
1445If the next character is in (or extends into) the next chunk of input
1446text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1447discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1448bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1449
1450If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1451is encountered, an exception is generated.
1452
1453=cut
1454*/
1455
1456I32
1457Perl_lex_peek_unichar(pTHX_ U32 flags)
1458{
1459 char *s, *bufend;
1460 if (flags & ~(LEX_KEEP_PREVIOUS))
1461 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1462 s = PL_parser->bufptr;
1463 bufend = PL_parser->bufend;
1464 if (UTF) {
1465 U8 head;
1466 I32 unichar;
1467 STRLEN len, retlen;
1468 if (s == bufend) {
1469 if (!lex_next_chunk(flags))
1470 return -1;
1471 s = PL_parser->bufptr;
1472 bufend = PL_parser->bufend;
1473 }
1474 head = (U8)*s;
54d004e8 1475 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1476 return head;
54d004e8
KW
1477 if (UTF8_IS_START(head)) {
1478 len = UTF8SKIP(&head);
f0e67a1d
Z
1479 while ((STRLEN)(bufend-s) < len) {
1480 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1481 break;
1482 s = PL_parser->bufptr;
1483 bufend = PL_parser->bufend;
1484 }
1485 }
c80e42f3 1486 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1487 if (retlen == (STRLEN)-1) {
75219bac
KW
1488 _force_out_malformed_utf8_message((U8 *) s,
1489 (U8 *) bufend,
1490 0,
1491 1 /* 1 means die */ );
1492 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1493 }
1494 return unichar;
1495 } else {
1496 if (s == bufend) {
1497 if (!lex_next_chunk(flags))
1498 return -1;
1499 s = PL_parser->bufptr;
1500 }
1501 return (U8)*s;
1502 }
1503}
1504
1505/*
44170c9a 1506=for apidoc lex_read_unichar
f0e67a1d
Z
1507
1508Reads the next (Unicode) character in the text currently being lexed.
1509Returns the codepoint (unsigned integer value) of the character read,
1510and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1511if lexing has reached the end of the input text. To non-destructively
1512examine the next character, use L</lex_peek_unichar> instead.
1513
1514If the next character is in (or extends into) the next chunk of input
1515text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1516discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1517bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1518
1519If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1520is encountered, an exception is generated.
1521
1522=cut
1523*/
1524
1525I32
1526Perl_lex_read_unichar(pTHX_ U32 flags)
1527{
1528 I32 c;
1529 if (flags & ~(LEX_KEEP_PREVIOUS))
1530 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1531 c = lex_peek_unichar(flags);
1532 if (c != -1) {
1533 if (c == '\n')
83944c01 1534 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1535 if (UTF)
1536 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1537 else
1538 ++(PL_parser->bufptr);
f0e67a1d
Z
1539 }
1540 return c;
1541}
1542
1543/*
44170c9a 1544=for apidoc lex_read_space
f0e67a1d
Z
1545
1546Reads optional spaces, in Perl style, in the text currently being
1547lexed. The spaces may include ordinary whitespace characters and
1548Perl-style comments. C<#line> directives are processed if encountered.
1549L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1550at a non-space character (or the end of the input text).
1551
1552If spaces extend into the next chunk of input text, the next chunk will
1553be read in. Normally the current chunk will be discarded at the same
c5608a1f 1554time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1555chunk will not be discarded.
1556
1557=cut
1558*/
1559
21791330 1560#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1561#define LEX_NO_NEXT_CHUNK 0x80000000
1562
f0e67a1d
Z
1563void
1564Perl_lex_read_space(pTHX_ U32 flags)
1565{
1566 char *s, *bufend;
21791330 1567 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1568 bool need_incline = 0;
21791330 1569 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1570 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1571 s = PL_parser->bufptr;
1572 bufend = PL_parser->bufend;
1573 while (1) {
1574 char c = *s;
1575 if (c == '#') {
1576 do {
1577 c = *++s;
1578 } while (!(c == '\n' || (c == 0 && s == bufend)));
1579 } else if (c == '\n') {
1580 s++;
21791330
FC
1581 if (can_incline) {
1582 PL_parser->linestart = s;
1583 if (s == bufend)
1584 need_incline = 1;
1585 else
c6875f94 1586 incline(s, bufend);
21791330 1587 }
f0e67a1d
Z
1588 } else if (isSPACE(c)) {
1589 s++;
1590 } else if (c == 0 && s == bufend) {
1591 bool got_more;
65c68e17 1592 line_t l;
f0998909
Z
1593 if (flags & LEX_NO_NEXT_CHUNK)
1594 break;
f0e67a1d 1595 PL_parser->bufptr = s;
65c68e17 1596 l = CopLINE(PL_curcop);
851b527a 1597 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1598 got_more = lex_next_chunk(flags);
65c68e17 1599 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1600 s = PL_parser->bufptr;
1601 bufend = PL_parser->bufend;
1602 if (!got_more)
1603 break;
21791330 1604 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1605 incline(s, bufend);
f0e67a1d
Z
1606 need_incline = 0;
1607 }
3c47da3c
FC
1608 } else if (!c) {
1609 s++;
f0e67a1d
Z
1610 } else {
1611 break;
1612 }
1613 }
f0e67a1d
Z
1614 PL_parser->bufptr = s;
1615}
1616
1617/*
fe788d6b 1618
44170c9a 1619=for apidoc validate_proto
fe788d6b
PM
1620
1621This function performs syntax checking on a prototype, C<proto>.
1622If C<warn> is true, any illegal characters or mismatched brackets
1623will trigger illegalproto warnings, declaring that they were
1624detected in the prototype for C<name>.
1625
1626The return value is C<true> if this is a valid prototype, and
1627C<false> if it is not, regardless of whether C<warn> was C<true> or
1628C<false>.
1629
1630Note that C<NULL> is a valid C<proto> and will always return C<true>.
1631
1632=cut
1633
1634 */
1635
1636bool
5783dc51 1637Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1638{
1639 STRLEN len, origlen;
11327fa1 1640 char *p;
fe788d6b
PM
1641 bool bad_proto = FALSE;
1642 bool in_brackets = FALSE;
1643 bool after_slash = FALSE;
1644 char greedy_proto = ' ';
1645 bool proto_after_greedy_proto = FALSE;
1646 bool must_be_last = FALSE;
1647 bool underscore = FALSE;
f791a21a 1648 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1649
1650 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1651
1652 if (!proto)
1653 return TRUE;
1654
11327fa1 1655 p = SvPV(proto, len);
fe788d6b
PM
1656 origlen = len;
1657 for (; len--; p++) {
1658 if (!isSPACE(*p)) {
1659 if (must_be_last)
1660 proto_after_greedy_proto = TRUE;
f791a21a 1661 if (underscore) {
4aada8b9 1662 if (!memCHRs(";@%", *p))
f791a21a
PM
1663 bad_proto_after_underscore = TRUE;
1664 underscore = FALSE;
1665 }
4aada8b9 1666 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
fe788d6b
PM
1667 bad_proto = TRUE;
1668 }
1669 else {
fe788d6b
PM
1670 if (*p == '[')
1671 in_brackets = TRUE;
1672 else if (*p == ']')
1673 in_brackets = FALSE;
407f8cf2
KW
1674 else if ((*p == '@' || *p == '%')
1675 && !after_slash
1676 && !in_brackets )
1677 {
fe788d6b
PM
1678 must_be_last = TRUE;
1679 greedy_proto = *p;
1680 }
1681 else if (*p == '_')
f791a21a 1682 underscore = TRUE;
fe788d6b
PM
1683 }
1684 if (*p == '\\')
1685 after_slash = TRUE;
1686 else
1687 after_slash = FALSE;
1688 }
1689 }
1690
1691 if (warn) {
b54d603d 1692 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1693 p -= origlen;
b54d603d
PM
1694 p = SvUTF8(proto)
1695 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1696 origlen, UNI_DISPLAY_ISPRINT)
1697 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1698
cbf83791
FC
1699 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1700 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1701 sv_catpvs(name2, "::");
1702 sv_catsv(name2, (SV *)name);
1703 name = name2;
1704 }
1705
fe788d6b
PM
1706 if (proto_after_greedy_proto)
1707 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1708 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1709 greedy_proto, SVfARG(name), p);
50278ed0
PM
1710 if (in_brackets)
1711 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1712 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1713 SVfARG(name), p);
b54d603d 1714 if (bad_proto)
fe788d6b 1715 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1716 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1717 SVfARG(name), p);
1718 if (bad_proto_after_underscore)
1719 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1720 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1721 SVfARG(name), p);
fe788d6b
PM
1722 }
1723
1724 return (! (proto_after_greedy_proto || bad_proto) );
1725}
1726
1727/*
ffb4593c
NT
1728 * S_incline
1729 * This subroutine has nothing to do with tilting, whether at windmills
1730 * or pinball tables. Its name is short for "increment line". It
57843af0 1731 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1732 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1733 * # line 500 "foo.pm"
1734 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1735 */
1736
76e3520e 1737STATIC void
c6875f94 1738S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1739{
d9095cec
NC
1740 const char *t;
1741 const char *n;
1742 const char *e;
8818d409 1743 line_t line_num;
22ff3130 1744 UV uv;
463ee0b2 1745
7918f24d
NC
1746 PERL_ARGS_ASSERT_INCLINE;
1747
d77eff5d
KW
1748 assert(end >= s);
1749
83944c01 1750 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1751 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1752 && s+1 == PL_bufend && *s == ';') {
1753 /* fake newline in string eval */
1754 CopLINE_dec(PL_curcop);
1755 return;
1756 }
463ee0b2
LW
1757 if (*s++ != '#')
1758 return;
d4c19fe8
AL
1759 while (SPACE_OR_TAB(*s))
1760 s++;
d77eff5d
KW
1761 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1762 s += sizeof("line") - 1;
73659bf1
GS
1763 else
1764 return;
084592ab 1765 if (SPACE_OR_TAB(*s))
73659bf1 1766 s++;
4e553d73 1767 else
73659bf1 1768 return;
d4c19fe8
AL
1769 while (SPACE_OR_TAB(*s))
1770 s++;
463ee0b2
LW
1771 if (!isDIGIT(*s))
1772 return;
d4c19fe8 1773
463ee0b2
LW
1774 n = s;
1775 while (isDIGIT(*s))
1776 s++;
07714eb4 1777 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1778 return;
bf4acbe4 1779 while (SPACE_OR_TAB(*s))
463ee0b2 1780 s++;
85bb8b90 1781 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1782 s++;
73659bf1
GS
1783 e = t + 1;
1784 }
463ee0b2 1785 else {
c35e046a 1786 t = s;
1bb1a3d6 1787 while (*t && !isSPACE(*t))
c35e046a 1788 t++;
73659bf1 1789 e = t;
463ee0b2 1790 }
bf4acbe4 1791 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1792 e++;
1793 if (*e != '\n' && *e != '\0')
1794 return; /* false alarm */
1795
22ff3130
HS
1796 if (!grok_atoUV(n, &uv, &e))
1797 return;
1798 line_num = ((line_t)uv) - 1;
8818d409 1799
f4dd75d9 1800 if (t - s > 0) {
d9095cec 1801 const STRLEN len = t - s;
3df32bda 1802
d36ee5be 1803 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1804 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1805 * to *{"::_<newfilename"} */
44867030
NC
1806 /* However, the long form of evals is only turned on by the
1807 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1808 GV * const cfgv = CopFILEGV(PL_curcop);
1809 if (cfgv) {
38bd7ad8
FC
1810 char smallbuf[128];
1811 STRLEN tmplen2 = len;
44867030 1812 char *tmpbuf2;
449dd039 1813 GV *gv2;
44867030
NC
1814
1815 if (tmplen2 + 2 <= sizeof smallbuf)
1816 tmpbuf2 = smallbuf;
1817 else
1818 Newx(tmpbuf2, tmplen2 + 2, char);
1819
38bd7ad8
FC
1820 tmpbuf2[0] = '_';
1821 tmpbuf2[1] = '<';
44867030
NC
1822
1823 memcpy(tmpbuf2 + 2, s, tmplen2);
1824 tmplen2 += 2;
1825
8a5ee598 1826 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1827 if (!isGV(gv2)) {
8a5ee598 1828 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1829 /* adjust ${"::_<newfilename"} to store the new file name */
1830 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1831 /* The line number may differ. If that is the case,
1832 alias the saved lines that are in the array.
1833 Otherwise alias the whole array. */
1834 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1835 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1836 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1837 }
38bd7ad8
FC
1838 else if (GvAV(cfgv)) {
1839 AV * const av = GvAV(cfgv);
515c395b
TC
1840 const line_t start = CopLINE(PL_curcop)+1;
1841 SSize_t items = AvFILLp(av) - start;
8818d409
FC
1842 if (items > 0) {
1843 AV * const av2 = GvAVn(gv2);
1844 SV **svp = AvARRAY(av) + start;
515c395b
TC
1845 Size_t l = line_num+1;
1846 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1847 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
8818d409
FC
1848 }
1849 }
e5527e4b 1850 }
44867030
NC
1851
1852 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1853 }
e66cf94c 1854 }
05ec9bb3 1855 CopFILE_free(PL_curcop);
449dd039 1856 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1857 }
8818d409 1858 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1859}
1860
80a702cd 1861STATIC void
15f169a1 1862S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1863{
1864 AV *av = CopFILEAVx(PL_curcop);
1865 if (av) {
7f1c3e8c
FC
1866 SV * sv;
1867 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1868 else {
1869 sv = *av_fetch(av, 0, 1);
1870 SvUPGRADE(sv, SVt_PVMG);
1871 }
847cc851 1872 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1873 if (orig_sv)
7f1c3e8c 1874 sv_catsv(sv, orig_sv);
5fa550fb 1875 else
7f1c3e8c
FC
1876 sv_catpvn(sv, buf, len);
1877 if (!SvIOK(sv)) {
1878 (void)SvIOK_on(sv);
1879 SvIV_set(sv, 0);
1880 }
1881 if (PL_parser->preambling == NOLINE)
1882 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1883 }
1884}
1885
ffb4593c 1886/*
8c6b0c7d 1887 * skipspace
ffb4593c
NT
1888 * Called to gobble the appropriate amount and type of whitespace.
1889 * Skips comments as well.
71fff7cb 1890 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1891 *
1892 * peekspace
1893 * Same thing, but look ahead without incrementing line numbers or
1894 * adjusting PL_linestart.
ffb4593c
NT
1895 */
1896
3218e223 1897#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1898#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1899
aabfeadc
KW
1900char *
1901Perl_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1902{
21791330 1903 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1904 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1905 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1906 s++;
f0e67a1d
Z
1907 } else {
1908 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1909 PL_bufptr = s;
21791330 1910 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1911 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1912 LEX_NO_NEXT_CHUNK : 0));
3280af22 1913 s = PL_bufptr;
f0e67a1d
Z
1914 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1915 if (PL_linestart > PL_bufptr)
1916 PL_bufptr = PL_linestart;
1917 return s;
463ee0b2 1918 }
5db06880 1919 return s;
a687059c 1920}
378cc40b 1921
ffb4593c
NT
1922/*
1923 * S_check_uni
1924 * Check the unary operators to ensure there's no ambiguity in how they're
1925 * used. An ambiguous piece of code would be:
1926 * rand + 5
1927 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1928 * the +5 is its argument.
1929 */
1930
76e3520e 1931STATIC void
cea2e8a9 1932S_check_uni(pTHX)
ba106d47 1933{
d4c19fe8 1934 const char *s;
2f3197b3 1935
3280af22 1936 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1937 return;
3280af22
NIS
1938 while (isSPACE(*PL_last_uni))
1939 PL_last_uni++;
c35e046a 1940 s = PL_last_uni;
fac0f7a3 1941 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1942 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1943 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1944 return;
6136c704 1945
9b387841 1946 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1947 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1948 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1949}
1950
ffb4593c
NT
1951/*
1952 * LOP : macro to build a list operator. Its behaviour has been replaced
1953 * with a subroutine, S_lop() for which LOP is just another name.
1954 */
1955
a0d0e21e
LW
1956#define LOP(f,x) return lop(f,x,s)
1957
ffb4593c
NT
1958/*
1959 * S_lop
1960 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1961 * - if we have a next token, then it's a list operator (no parens) for
1962 * which the next token has already been parsed; e.g.,
1963 * sort foo @args
1964 * sort foo (@args)
ffb4593c
NT
1965 * - if the next thing is an opening paren, then it's a function
1966 * - else it's a list operator
1967 */
1968
76e3520e 1969STATIC I32
11288bb3 1970S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1971{
7918f24d
NC
1972 PERL_ARGS_ASSERT_LOP;
1973
6154021b 1974 pl_yylval.ival = f;
35c8bce7 1975 CLINE;
3280af22
NIS
1976 PL_bufptr = s;
1977 PL_last_lop = PL_oldbufptr;
eb160463 1978 PL_last_lop_op = (OPCODE)f;
3280af22 1979 if (PL_nexttoke)
78cdf107 1980 goto lstop;
19f1898a 1981 PL_expect = x;
79072805 1982 if (*s == '(')
bbf60fe6 1983 return REPORT(FUNC);
294a536f 1984 s = skipspace(s);
79072805 1985 if (*s == '(')
bbf60fe6 1986 return REPORT(FUNC);
78cdf107
Z
1987 else {
1988 lstop:
1989 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1990 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1991 return REPORT(LSTOP);
78cdf107 1992 }
79072805
LW
1993}
1994
ffb4593c
NT
1995/*
1996 * S_force_next
9cbb5ea2 1997 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1998 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1999 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
2000 * will need to set PL_nextval[] and possibly PL_expect to ensure
2001 * the lexer handles the token correctly.
ffb4593c
NT
2002 */
2003
4e553d73 2004STATIC void
cea2e8a9 2005S_force_next(pTHX_ I32 type)
79072805 2006{
704d4215
GG
2007#ifdef DEBUGGING
2008 if (DEBUG_T_TEST) {
2009 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2010 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2011 }
2012#endif
1f7c3e7c 2013 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2014 PL_nexttype[PL_nexttoke] = type;
2015 PL_nexttoke++;
79072805
LW
2016}
2017
89f35911
FC
2018/*
2019 * S_postderef
2020 *
2021 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2022 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2023 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2024 * only the first, leaving yylex to find the next.
89f35911
FC
2025 */
2026
2027static int
ff25e5db 2028S_postderef(pTHX_ int const funny, char const next)
89f35911 2029{
4aada8b9 2030 assert(funny == DOLSHARP || memCHRs("$@%&*", funny));
89f35911
FC
2031 if (next == '*') {
2032 PL_expect = XOPERATOR;
cc624add 2033 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 2034 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 2035 PL_lex_state = LEX_INTERPEND;
c3492809
FC
2036 if ('@' == funny)
2037 force_next(POSTJOIN);
cc624add 2038 }
89f35911
FC
2039 force_next(next);
2040 PL_bufptr+=2;
2041 }
2042 else {
760ca746
FC
2043 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2044 && !PL_lex_brackets)
2045 PL_lex_dojoin = 2;
89f35911
FC
2046 PL_expect = XOPERATOR;
2047 PL_bufptr++;
2048 }
2049 return funny;
2050}
2051
28ac2b49
Z
2052void
2053Perl_yyunlex(pTHX)
2054{
a7aaec61
Z
2055 int yyc = PL_parser->yychar;
2056 if (yyc != YYEMPTY) {
2057 if (yyc) {
a7aaec61
Z
2058 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2059 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2060 PL_lex_allbrackets--;
a7aaec61 2061 PL_lex_brackets--;
78cdf107
Z
2062 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2063 } else if (yyc == '('/*)*/) {
2064 PL_lex_allbrackets--;
2065 yyc |= (2<<24);
a7aaec61
Z
2066 }
2067 force_next(yyc);
2068 }
28ac2b49
Z
2069 PL_parser->yychar = YYEMPTY;
2070 }
2071}
2072
d0a148a6 2073STATIC SV *
15f169a1 2074S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2075{
740cce10 2076 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2077 ! IN_BYTES
2078 && UTF
53463649 2079 && len != 0
6d492fac 2080 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2081 return sv;
2082}
2083
ffb4593c
NT
2084/*
2085 * S_force_word
2086 * When the lexer knows the next thing is a word (for instance, it has
2087 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2088 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2089 * lookahead.
ffb4593c
NT
2090 *
2091 * Arguments:
b1b65b59 2092 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2093 * int token : PL_next* will be this type of bare word
2094 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2095 * int check_keyword : if true, Perl checks to make sure the word isn't
2096 * a keyword (do this if the word is a label, e.g. goto FOO)
2097 * int allow_pack : if true, : characters will also be allowed (require,
2098 * use, etc. do this)
ffb4593c
NT
2099 */
2100
76e3520e 2101STATIC char *
345b3785 2102S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2103{
eb578fdb 2104 char *s;
463ee0b2 2105 STRLEN len;
4e553d73 2106
7918f24d
NC
2107 PERL_ARGS_ASSERT_FORCE_WORD;
2108
294a536f 2109 start = skipspace(start);
463ee0b2 2110 s = start;
fac0f7a3 2111 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2112 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2113 {
3280af22 2114 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2115 if (check_keyword) {
2116 char *s2 = PL_tokenbuf;
487e470d 2117 STRLEN len2 = len;
de627158
KW
2118 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2119 s2 += sizeof("CORE::") - 1;
2120 len2 -= sizeof("CORE::") - 1;
2121 }
487e470d 2122 if (keyword(s2, len2, 0))
463ee0b2 2123 return start;
01b5ef50 2124 }
463ee0b2 2125 if (token == METHOD) {
294a536f 2126 s = skipspace(s);
463ee0b2 2127 if (*s == '(')
3280af22 2128 PL_expect = XTERM;
463ee0b2 2129 else {
3280af22 2130 PL_expect = XOPERATOR;
463ee0b2 2131 }
79072805 2132 }
9ded7720 2133 NEXTVAL_NEXTTOKE.opval
275103cd 2134 = newSVOP(OP_CONST,0,
d0a148a6 2135 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2136 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2137 force_next(token);
2138 }
2139 return s;
2140}
2141
ffb4593c
NT
2142/*
2143 * S_force_ident
9cbb5ea2 2144 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2145 * text only contains the "foo" portion. The first argument is a pointer
2146 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2147 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2148 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2149 */
2150
76e3520e 2151STATIC void
5aaab254 2152S_force_ident(pTHX_ const char *s, int kind)
79072805 2153{
7918f24d
NC
2154 PERL_ARGS_ASSERT_FORCE_IDENT;
2155
c9b48522
DD
2156 if (s[0]) {
2157 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2158 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2159 UTF ? SVf_UTF8 : 0));
9ded7720 2160 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2161 force_next(BAREWORD);
748a9306 2162 if (kind) {
11343788 2163 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2164 /* XXX see note in pp_entereval() for why we forgo typo
2165 warnings if the symbol must be introduced in an eval.
2166 GSAR 96-10-12 */
90e5519e 2167 gv_fetchpvn_flags(s, len,
4bff32c5 2168 (PL_in_eval ? GV_ADDMULTI
728847b1 2169 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2170 kind == '$' ? SVt_PV :
2171 kind == '@' ? SVt_PVAV :
2172 kind == '%' ? SVt_PVHV :
a0d0e21e 2173 SVt_PVGV
90e5519e 2174 );
748a9306 2175 }
79072805
LW
2176 }
2177}
2178
3f33d153
FC
2179static void
2180S_force_ident_maybe_lex(pTHX_ char pit)
2181{
3f33d153
FC
2182 NEXTVAL_NEXTTOKE.ival = pit;
2183 force_next('p');
2184}
2185
1571675a
GS
2186NV
2187Perl_str_to_version(pTHX_ SV *sv)
2188{
2189 NV retval = 0.0;
2190 NV nshift = 1.0;
2191 STRLEN len;
cfd0369c 2192 const char *start = SvPV_const(sv,len);
9d4ba2ae 2193 const char * const end = start + len;
8298454c 2194 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2195
2196 PERL_ARGS_ASSERT_STR_TO_VERSION;
2197
1571675a 2198 while (start < end) {
ba210ebe 2199 STRLEN skip;
1571675a
GS
2200 UV n;
2201 if (utf)
9041c2e3 2202 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2203 else {
2204 n = *(U8*)start;
2205 skip = 1;
2206 }
2207 retval += ((NV)n)/nshift;
2208 start += skip;
2209 nshift *= 1000;
2210 }
2211 return retval;
2212}
2213
4e553d73 2214/*
ffb4593c
NT
2215 * S_force_version
2216 * Forces the next token to be a version number.
e759cc13
RGS
2217 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2218 * and if "guessing" is TRUE, then no new token is created (and the caller
2219 * must use an alternative parsing method).
ffb4593c
NT
2220 */
2221
76e3520e 2222STATIC char *
e759cc13 2223S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2224{
5f66b61c 2225 OP *version = NULL;
44dcb63b 2226 char *d;
89bfa8cd 2227
7918f24d
NC
2228 PERL_ARGS_ASSERT_FORCE_VERSION;
2229
294a536f 2230 s = skipspace(s);
89bfa8cd 2231
44dcb63b 2232 d = s;
dd629d5b 2233 if (*d == 'v')
44dcb63b 2234 d++;
44dcb63b 2235 if (isDIGIT(*d)) {
e759cc13
RGS
2236 while (isDIGIT(*d) || *d == '_' || *d == '.')
2237 d++;
4e4da3ac 2238 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2239 SV *ver;
6154021b
RGS
2240 s = scan_num(s, &pl_yylval);
2241 version = pl_yylval.opval;
dd629d5b
GS
2242 ver = cSVOPx(version)->op_sv;
2243 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2244 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2245 SvNV_set(ver, str_to_version(ver));
1571675a 2246 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2247 }
89bfa8cd 2248 }
5db06880 2249 else if (guessing) {
e759cc13 2250 return s;
5db06880 2251 }
89bfa8cd
PP
2252 }
2253
2254 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2255 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2256 force_next(BAREWORD);
89bfa8cd 2257
e759cc13 2258 return s;
89bfa8cd
PP
2259}
2260
ffb4593c 2261/*
91152fc1
DG
2262 * S_force_strict_version
2263 * Forces the next token to be a version number using strict syntax rules.
2264 */
2265
2266STATIC char *
2267S_force_strict_version(pTHX_ char *s)
2268{
91152fc1 2269 OP *version = NULL;
91152fc1
DG
2270 const char *errstr = NULL;
2271
2272 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2273
2274 while (isSPACE(*s)) /* leading whitespace */
2275 s++;
2276
2277 if (is_STRICT_VERSION(s,&errstr)) {
2278 SV *ver = newSV(0);
2279 s = (char *)scan_version(s, ver, 0);
2280 version = newSVOP(OP_CONST, 0, ver);
2281 }
407f8cf2
KW
2282 else if ((*s != ';' && *s != '{' && *s != '}' )
2283 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2284 {
91152fc1
DG
2285 PL_bufptr = s;
2286 if (errstr)
2287 yyerror(errstr); /* version required */
2288 return s;
2289 }
2290
91152fc1 2291 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2292 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2293 force_next(BAREWORD);
91152fc1
DG
2294
2295 return s;
2296}
2297
2298/*
ffb4593c 2299 * S_tokeq
ef3ff34d
KW
2300 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2301 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2302 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2303 */
2304
76e3520e 2305STATIC SV *
cea2e8a9 2306S_tokeq(pTHX_ SV *sv)
79072805 2307{
eb578fdb
KW
2308 char *s;
2309 char *send;
2310 char *d;
b3ac6de7 2311 SV *pv = sv;
79072805 2312
7918f24d
NC
2313 PERL_ARGS_ASSERT_TOKEQ;
2314
279b35ad
FC
2315 assert (SvPOK(sv));
2316 assert (SvLEN(sv));
2317 assert (!SvIsCOW(sv));
307ed071 2318 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2319 goto finish;
279b35ad
FC
2320 s = SvPVX(sv);
2321 send = SvEND(sv);
dcb21ed6
NC
2322 /* This is relying on the SV being "well formed" with a trailing '\0' */
2323 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2324 s++;
2325 if (s == send)
b3ac6de7 2326 goto finish;
79072805 2327 d = s;
be4731d2 2328 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2329 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2330 SVs_TEMP | SvUTF8(sv));
be4731d2 2331 }
79072805
LW
2332 while (s < send) {
2333 if (*s == '\\') {
a0d0e21e 2334 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2335 s++; /* all that, just for this */
2336 }
2337 *d++ = *s++;
2338 }
2339 *d = '\0';
95a20fc0 2340 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2341 finish:
3280af22 2342 if ( PL_hints & HINT_NEW_STRING )
164e423c 2343 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
79072805
LW
2344 return sv;
2345}
2346
ffb4593c
NT
2347/*
2348 * Now come three functions related to double-quote context,
2349 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2350 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2351 * interact with PL_lex_state, and create fake ( ... ) argument lists
2352 * to handle functions and concatenation.
ecd24171
DM
2353 * For example,
2354 * "foo\lbar"
2355 * is tokenised as
2356 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2357 */
2358
2359/*
2360 * S_sublex_start
6154021b 2361 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2362 *
2363 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2364 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2365 *
16d1d8bd 2366 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2367 *
2368 * Everything else becomes a FUNC.
2369 *
16d1d8bd
LM
2370 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2371 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2372 * call to S_sublex_push().
2373 */
2374
76e3520e 2375STATIC I32
cea2e8a9 2376S_sublex_start(pTHX)
79072805 2377{
eb578fdb 2378 const I32 op_type = pl_yylval.ival;
79072805
LW
2379
2380 if (op_type == OP_NULL) {
6154021b 2381 pl_yylval.opval = PL_lex_op;
5f66b61c 2382 PL_lex_op = NULL;
79072805
LW
2383 return THING;
2384 }
466112bb 2385 if (op_type == OP_CONST) {
67c71cbb
FC
2386 SV *sv = PL_lex_stuff;
2387 PL_lex_stuff = NULL;
2388 sv = tokeq(sv);
b3ac6de7
IZ
2389
2390 if (SvTYPE(sv) == SVt_PVIV) {
2391 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2392 STRLEN len;
96a5add6 2393 const char * const p = SvPV_const(sv, len);
740cce10 2394 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2395 SvREFCNT_dec(sv);
2396 sv = nsv;
4e553d73 2397 }
275103cd 2398 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2399 return THING;
2400 }
2401
7ef70b3d
FC
2402 PL_parser->lex_super_state = PL_lex_state;
2403 PL_parser->lex_sub_inwhat = (U16)op_type;
2404 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2405 PL_parser->sub_no_recover = FALSE;
2406 PL_parser->sub_error_count = PL_error_count;
3280af22 2407 PL_lex_state = LEX_INTERPPUSH;
55497cff 2408
3280af22
NIS
2409 PL_expect = XTERM;
2410 if (PL_lex_op) {
6154021b 2411 pl_yylval.opval = PL_lex_op;
5f66b61c 2412 PL_lex_op = NULL;
55497cff
PP
2413 return PMFUNC;
2414 }
2415 else
2416 return FUNC;
2417}
2418
ffb4593c
NT
2419/*
2420 * S_sublex_push
2421 * Create a new scope to save the lexing state. The scope will be
2422 * ended in S_sublex_done. Returns a '(', starting the function arguments
2423 * to the uc, lc, etc. found before.
2424 * Sets PL_lex_state to LEX_INTERPCONCAT.
2425 */
2426
76e3520e 2427STATIC I32
cea2e8a9 2428S_sublex_push(pTHX)
55497cff 2429{
78a635de 2430 LEXSHARED *shared;
801d32ac 2431 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2432 ENTER;
55497cff 2433
7ef70b3d 2434 PL_lex_state = PL_parser->lex_super_state;
cc624add 2435 SAVEI8(PL_lex_dojoin);
3280af22 2436 SAVEI32(PL_lex_brackets);
78cdf107 2437 SAVEI32(PL_lex_allbrackets);
b27dce25 2438 SAVEI32(PL_lex_formbrack);
78cdf107 2439 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2440 SAVEI32(PL_lex_casemods);
2441 SAVEI32(PL_lex_starts);
651b5b28 2442 SAVEI8(PL_lex_state);
7cc34111 2443 SAVESPTR(PL_lex_repl);
7766f137 2444 SAVEVPTR(PL_lex_inpat);
98246f1e 2445 SAVEI16(PL_lex_inwhat);
ffdb8b16 2446 if (is_heredoc)
b42366d4 2447 {
ffdb8b16 2448 SAVECOPLINE(PL_curcop);
b42366d4 2449 SAVEI32(PL_multi_end);
851b527a
FC
2450 SAVEI32(PL_parser->herelines);
2451 PL_parser->herelines = 0;
b42366d4 2452 }
2ca4363d 2453 SAVEIV(PL_multi_close);
3280af22 2454 SAVEPPTR(PL_bufptr);
8452ff4b 2455 SAVEPPTR(PL_bufend);
3280af22
NIS
2456 SAVEPPTR(PL_oldbufptr);
2457 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2458 SAVEPPTR(PL_last_lop);
2459 SAVEPPTR(PL_last_uni);
3280af22
NIS
2460 SAVEPPTR(PL_linestart);
2461 SAVESPTR(PL_linestr);
8edd5f42
RGS
2462 SAVEGENERICPV(PL_lex_brackstack);
2463 SAVEGENERICPV(PL_lex_casestack);
78a635de 2464 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2465 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2466 SAVEI32(PL_copline);
3280af22 2467
99bd9d90 2468 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2469 scopes to find the body of the here-doc. So we put PL_linestr and
2470 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2471 */
60f40a38
FC
2472 PL_parser->lex_shared->ls_linestr = PL_linestr;
2473 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2474
3280af22 2475 PL_linestr = PL_lex_stuff;
7ef70b3d 2476 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2477 PL_lex_stuff = NULL;
7ef70b3d 2478 PL_parser->lex_sub_repl = NULL;
3280af22 2479
eabab8bc
FC
2480 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2481 set for an inner quote-like operator and then an error causes scope-
2482 popping. We must not have a PL_lex_stuff value left dangling, as
2483 that breaks assumptions elsewhere. See bug #123617. */
2484 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2485 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2486
9cbb5ea2
GS
2487 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2488 = SvPVX(PL_linestr);
3280af22 2489 PL_bufend += SvCUR(PL_linestr);
bd61b366 2490 PL_last_lop = PL_last_uni = NULL;
3280af22 2491 SAVEFREESV(PL_linestr);
4dc843bc 2492 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2493
2494 PL_lex_dojoin = FALSE;
b27dce25 2495 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2496 PL_lex_allbrackets = 0;
2497 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2498 Newx(PL_lex_brackstack, 120, char);
2499 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2500 PL_lex_casemods = 0;
2501 *PL_lex_casestack = '\0';
2502 PL_lex_starts = 0;
2503 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2504 if (is_heredoc)
6ddcf93b 2505 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2506 PL_copline = NOLINE;
caae0700 2507
78a635de
FC
2508 Newxz(shared, 1, LEXSHARED);
2509 shared->ls_prev = PL_parser->lex_shared;
2510 PL_parser->lex_shared = shared;
3280af22 2511
7ef70b3d 2512 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2513 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2514 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2515 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2516 else
5f66b61c 2517 PL_lex_inpat = NULL;
79072805 2518
3a54fd60
DM
2519 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2520 PL_in_eval &= ~EVAL_RE_REPARSING;
2521
69afcc21 2522 return SUBLEXSTART;
79072805
LW
2523}
2524
ffb4593c
NT
2525/*
2526 * S_sublex_done
2527 * Restores lexer state after a S_sublex_push.
2528 */
2529
76e3520e 2530STATIC I32
cea2e8a9 2531S_sublex_done(pTHX)
79072805 2532{
3280af22 2533 if (!PL_lex_starts++) {
396482e1 2534 SV * const sv = newSVpvs("");
9aa983d2
JH
2535 if (SvUTF8(PL_linestr))
2536 SvUTF8_on(sv);
3280af22 2537 PL_expect = XOPERATOR;
275103cd 2538 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2539 return THING;
2540 }
2541
3280af22
NIS
2542 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2543 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2544 return yylex();
79072805
LW
2545 }
2546
ffb4593c 2547 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2548 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2549 if (PL_lex_repl) {
2550 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2551 PL_linestr = PL_lex_repl;
2552 PL_lex_inpat = 0;
2553 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2554 PL_bufend += SvCUR(PL_linestr);
bd61b366 2555 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2556 PL_lex_dojoin = FALSE;
2557 PL_lex_brackets = 0;
78cdf107
Z
2558 PL_lex_allbrackets = 0;
2559 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2560 PL_lex_casemods = 0;
2561 *PL_lex_casestack = '\0';
2562 PL_lex_starts = 0;
25da4f38 2563 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2564 PL_lex_state = LEX_INTERPNORMAL;
2565 PL_lex_starts++;
e9fa98b2
HS
2566 /* we don't clear PL_lex_repl here, so that we can check later
2567 whether this is an evalled subst; that means we rely on the
2568 logic to ensure sublex_done() is called again only via the
2569 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2570 }
e9fa98b2 2571 else {
3280af22 2572 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2573 PL_lex_repl = NULL;
e9fa98b2 2574 }
ffdb8b16
FC
2575 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2576 CopLINE(PL_curcop) +=
9420b268 2577 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2578 + PL_parser->herelines;
2579 PL_parser->herelines = 0;
ffdb8b16 2580 }
9b6b7be8 2581 return '/';
ffed7fef
LW
2582 }
2583 else {
b42366d4 2584 const line_t l = CopLINE(PL_curcop);
f46d017c 2585 LEAVE;
bb4e4c38 2586 if (PL_parser->sub_error_count != PL_error_count) {
bb4e4c38 2587 if (PL_parser->sub_no_recover) {
ad1ecdf7 2588 yyquit();
bb4e4c38
TC
2589 NOT_REACHED;
2590 }
2591 }
b42366d4 2592 if (PL_multi_close == '<')
851b527a 2593 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2594 PL_bufend = SvPVX(PL_linestr);
2595 PL_bufend += SvCUR(PL_linestr);
2596 PL_expect = XOPERATOR;
69afcc21 2597 return SUBLEXEND;
ffed7fef
LW
2598 }
2599}
2600
4e8ee35f
KW
2601HV *
2602Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2603 const STRLEN context_len, const char ** error_msg)
2604{
2605 /* Load the official _charnames module if not already there. The
2606 * parameters are just to give info for any error messages generated:
2607 * char_name a name to look up which is the reason for loading this
2608 * context 'char_name' in the context in the input in which it appears
2609 * context_len how many bytes 'context' occupies
2610 * error_msg *error_msg will be set to any error
2611 *
2612 * Returns the ^H table if success; otherwise NULL */
2613
2614 unsigned int i;
2615 HV * table;
2616 SV **cvp;
2617 SV * res;
2618
2619 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2620
2621 /* This loop is executed 1 1/2 times. On the first time through, if it
2622 * isn't already loaded, try loading it, and iterate just once to see if it
2623 * worked. */
2624 for (i = 0; i < 2; i++) {
2625 table = GvHV(PL_hintgv); /* ^H */
2626
2627 if ( table
2628 && (PL_hints & HINT_LOCALIZE_HH)
2629 && (cvp = hv_fetchs(table, "charnames", FALSE))
2630 && SvOK(*cvp))
2631 {
2632 return table; /* Quit if already loaded */
2633 }
2634
2635 if (i == 0) {
2636 Perl_load_module(aTHX_
2637 0,
2638 newSVpvs("_charnames"),
2639
2640 /* version parameter; no need to specify it, as if we get too early
2641 * a version, will fail anyway, not being able to find 'charnames'
2642 * */
2643 NULL,
2644 newSVpvs(":full"),
2645 newSVpvs(":short"),
2646 NULL);
2647 }
2648 }
2649
2650 /* Here, it failed; new_constant will give appropriate error messages */
2651 *error_msg = NULL;
2652 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2653 context, context_len, error_msg);
2654 SvREFCNT_dec(res);
2655
2656 return NULL;
2657}
2658
7a7d14f3 2659STATIC SV*
2c43c309
KW
2660S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2661{
2662 /* This justs wraps get_and_check_backslash_N_name() to output any error
2663 * message it returns. */
2664
2665 const char * error_msg = NULL;
25c7fb78 2666 SV * result;
2c43c309
KW
2667
2668 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2669
25c7fb78
KW
2670 /* charnames doesn't work well if there have been errors found */
2671 if (PL_error_count > 0) {
2672 return NULL;
2673 }
2674
2675 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2676
2c43c309
KW
2677 if (error_msg) {
2678 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2679 }
2680
2681 return result;
2682}
2683
25c7fb78
KW
2684SV*
2685Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2686 const char* const e,
2687 const bool is_utf8,
2688 const char ** error_msg)
6f613c73 2689{
140b12ad
KW
2690 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2691 * interior, hence to the "}". Finds what the name resolves to, returning
25c7fb78
KW
2692 * an SV* containing it; NULL if no valid one found.
2693 *
2694 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2695 * doesn't have to be. */
6f613c73 2696
7303cc1f 2697 SV* char_name;
25c7fb78 2698 SV* res;
0c415a79
KW
2699 HV * table;
2700 SV **cvp;
2701 SV *cv;
2702 SV *rv;
2703 HV *stash;
8b0cce63
KW
2704
2705 /* Points to the beginning of the \N{... so that any messages include the
2706 * context of what's failing*/
2707 const char* context = s - 3;
2708 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2709
0c415a79 2710
6f613c73
KW
2711 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2712
25c7fb78
KW
2713 assert(e >= s);
2714 assert(s > (char *) 3);
2715
7303cc1f 2716 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
25c7fb78 2717
7303cc1f
KW
2718 if (!SvCUR(char_name)) {
2719 SvREFCNT_dec_NN(char_name);
be332ba0 2720 /* diag_listed_as: Unknown charname '%s' */
2c43c309 2721 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
be332ba0 2722 return NULL;
d8d26cac 2723 }
b7e6151c 2724
7303cc1f 2725 /* Autoload the charnames module */
7303cc1f 2726
4e8ee35f
KW
2727 table = load_charnames(char_name, context, context_len, error_msg);
2728 if (table == NULL) {
2729 return NULL;
7303cc1f
KW
2730 }
2731
2732 *error_msg = NULL;
8b0cce63
KW
2733 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2734 context, context_len, error_msg);
7303cc1f 2735 if (*error_msg) {
4e8ee35f 2736 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
7303cc1f
KW
2737
2738 SvREFCNT_dec(res);
6f613c73
KW
2739 return NULL;
2740 }
2741
0c415a79
KW
2742 /* See if the charnames handler is the Perl core's, and if so, we can skip
2743 * the validation needed for a user-supplied one, as Perl's does its own
2744 * validation. */
0c415a79 2745 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2746 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2747 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2748 {
2749 const char * const name = HvNAME(stash);
b59bf0b2 2750 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2751 return res;
2752 }
2753 }
2754
bde9e88d
KW
2755 /* Here, it isn't Perl's charname handler. We can't rely on a
2756 * user-supplied handler to validate the input name. For non-ut8 input,
2757 * look to see that the first character is legal. Then loop through the
2758 * rest checking that each is a continuation */
6f613c73 2759
36897d64
KW
2760 /* This code makes the reasonable assumption that the only Latin1-range
2761 * characters that begin a character name alias are alphabetic, otherwise
2762 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2763
25c7fb78 2764 if (! is_utf8) {
bde9e88d 2765 if (! isALPHAU(*s)) {
b6ba1137
KW
2766 goto bad_charname;
2767 }
bde9e88d
KW
2768 s++;
2769 while (s < e) {
2770 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2771 goto bad_charname;
2772 }
2d8eb851
KW
2773 if (*s == ' ' && *(s-1) == ' ') {
2774 goto multi_spaces;
bd299e29 2775 }
bde9e88d 2776 s++;
b6ba1137
KW
2777 }
2778 }
2779 else {
bde9e88d 2780 /* Similarly for utf8. For invariants can check directly; for other
58aa6738
KW
2781 * Latin1, can calculate their code point and check; otherwise use an
2782 * inversion list */
bde9e88d
KW
2783 if (UTF8_IS_INVARIANT(*s)) {
2784 if (! isALPHAU(*s)) {
140b12ad
KW
2785 goto bad_charname;
2786 }
bde9e88d
KW
2787 s++;
2788 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2789 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2790 goto bad_charname;
6f613c73 2791 }
bde9e88d 2792 s += 2;
6f613c73 2793 }
bde9e88d 2794 else {
f1bcae08
KW
2795 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2796 utf8_to_uvchr_buf((U8 *) s,
2797 (U8 *) e,
2798 NULL)))
2799 {
bde9e88d
KW
2800 goto bad_charname;
2801 }
2802 s += UTF8SKIP(s);
2803 }
2804
2805 while (s < e) {
2806 if (UTF8_IS_INVARIANT(*s)) {
2807 if (! isCHARNAME_CONT(*s)) {
2808 goto bad_charname;
2809 }
2d8eb851
KW
2810 if (*s == ' ' && *(s-1) == ' ') {
2811 goto multi_spaces;
bd299e29 2812 }
bde9e88d
KW
2813 s++;
2814 }
2815 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2816 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2817 {
2818 goto bad_charname;
2819 }
2820 s += 2;
2821 }
2822 else {
f1bcae08
KW
2823 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2824 utf8_to_uvchr_buf((U8 *) s,
2825 (U8 *) e,
2826 NULL)))
2827 {
bde9e88d
KW
2828 goto bad_charname;
2829 }
2830 s += UTF8SKIP(s);
6f613c73
KW
2831 }
2832 }
2d8eb851
KW
2833 }
2834 if (*(s-1) == ' ') {
8d9d0498
FC
2835 /* diag_listed_as: charnames alias definitions may not contain
2836 trailing white-space; marked by <-- HERE in %s
2837 */
2c43c309 2838 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2839 "charnames alias definitions may not contain trailing "
2840 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2841 (int)(s - context + 1), context,
2c43c309 2842 (int)(e - s + 1), s + 1);
2d8eb851 2843 return NULL;
6f613c73
KW
2844 }
2845
870fe793 2846 if (SvUTF8(res)) { /* Don't accept malformed charname value */
bde9e88d
KW
2847 const U8* first_bad_char_loc;
2848 STRLEN len;
2849 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2850 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2851 &first_bad_char_loc)))
2852 {
75219bac
KW
2853 _force_out_malformed_utf8_message(first_bad_char_loc,
2854 (U8 *) PL_parser->bufend,
2855 0,
2856 0 /* 0 means don't die */ );
8d9d0498
FC
2857 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2858 immediately after '%s' */
2c43c309 2859 *error_msg = Perl_form(aTHX_
bde9e88d 2860 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
8b0cce63 2861 (int) context_len, context,
2c43c309 2862 (int) ((char *) first_bad_char_loc - str), str);
bde9e88d
KW
2863 return NULL;
2864 }
2865 }
140b12ad 2866
bde9e88d 2867 return res;
140b12ad 2868
bde9e88d 2869 bad_charname: {
bde9e88d
KW
2870
2871 /* The final %.*s makes sure that should the trailing NUL be missing
2872 * that this print won't run off the end of the string */
8d9d0498
FC
2873 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2874 in \N{%s} */
2c43c309 2875 *error_msg = Perl_form(aTHX_
bde9e88d 2876 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2877 (int)(s - context + 1), context,
2c43c309 2878 (int)(e - s + 1), s + 1);
bde9e88d
KW
2879 return NULL;
2880 }
2d8eb851
KW
2881
2882 multi_spaces:
8d9d0498
FC
2883 /* diag_listed_as: charnames alias definitions may not contain a
2884 sequence of multiple spaces; marked by <-- HERE
2885 in %s */
2c43c309 2886 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2887 "charnames alias definitions may not contain a sequence of "
2888 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2889 (int)(s - context + 1), context,
2c43c309 2890 (int)(e - s + 1), s + 1);
2d8eb851 2891 return NULL;
6f613c73
KW
2892}
2893
02aa26ce
NT
2894/*
2895 scan_const
2896
9da1dd8f
DM
2897 Extracts the next constant part of a pattern, double-quoted string,
2898 or transliteration. This is terrifying code.
2899
2900 For example, in parsing the double-quoted string "ab\x63$d", it would
2901 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2902
94def140 2903 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2904 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2905 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2906
94def140
ST
2907 Returns a pointer to the character scanned up to. If this is
2908 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2909 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2910 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2911 by looking at the next characters herself.
2912
02aa26ce 2913 In patterns:
9da1dd8f 2914 expand:
537124e4
KW
2915 \N{FOO} => \N{U+hex_for_character_FOO}
2916 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2917
2918 pass through:
2919 all other \-char, including \N and \N{ apart from \N{ABC}
2920
2921 stops on:
2922 @ and $ where it appears to be a var, but not for $ as tail anchor
2923 \l \L \u \U \Q \E
2924 (?{ or (??{
2925
02aa26ce
NT
2926 In transliterations:
2927 characters are VERY literal, except for - not at the start or end
66b09263
KW
2928 of the string, which indicates a range. However some backslash sequences
2929 are recognized: \r, \n, and the like
2930 \007 \o{}, \x{}, \N{}
2931 If all elements in the transliteration are below 256,
94def140
ST
2932 scan_const expands the range to the full set of intermediate
2933 characters. If the range is in utf8, the hyphen is replaced with
2934 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2935
2936 In double-quoted strings:
2937 backslashes:
66b09263 2938 all those recognized in transliterations
94def140 2939 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2940 case and quoting: \U \Q \E
2941 stops on @ and $
2942
2943 scan_const does *not* construct ops to handle interpolated strings.
2944 It stops processing as soon as it finds an embedded $ or @ variable
2945 and leaves it to the caller to work out what's going on.
2946
94def140
ST
2947 embedded arrays (whether in pattern or not) could be:
2948 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2949
2950 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2951
2952 $ in pattern could be $foo or could be tail anchor. Assumption:
2953 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2954 followed by one of "()| \r\n\t"
02aa26ce 2955
9da1dd8f 2956 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2957
2958 The structure of the code is
2959 while (there's a character to process) {
94def140
ST
2960 handle transliteration ranges
2961 skip regexp comments /(?#comment)/ and codes /(?{code})/
2962 skip #-initiated comments in //x patterns
2963 check for embedded arrays
02aa26ce
NT
2964 check for embedded scalars
2965 if (backslash) {
94def140 2966 deprecate \1 in substitution replacements
02aa26ce
NT
2967 handle string-changing backslashes \l \U \Q \E, etc.
2968 switch (what was escaped) {
94def140 2969 handle \- in a transliteration (becomes a literal -)
ff3f963a 2970 if a pattern and not \N{, go treat as regular character
94def140
ST
2971 handle \132 (octal characters)
2972 handle \x15 and \x{1234} (hex characters)
ff3f963a 2973 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2974 handle \cV (control characters)
2975 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2976 } (end switch)
77a135fe 2977 continue
02aa26ce 2978 } (end if backslash)
77a135fe 2979 handle regular character
02aa26ce 2980 } (end while character to read)
02cd137d 2981
02aa26ce
NT
2982*/
2983
76e3520e 2984STATIC char *
cea2e8a9 2985S_scan_const(pTHX_ char *start)
79072805 2986{
eb578fdb 2987 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2988 SV *sv = newSV(send - start); /* sv for the constant. See note below
2989 on sizing. */
eb578fdb
KW
2990 char *s = start; /* start of the constant */
2991 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2992 bool dorange = FALSE; /* are we in a translit range? */
2993 bool didrange = FALSE; /* did we just finish a range? */
2994 bool in_charclass = FALSE; /* within /[...]/ */
ed2893cb 2995 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
dc023dbb
KW
2996 UTF8? But, this can show as true
2997 when the source isn't utf8, as for
2998 example when it is entirely composed
2999 of hex constants */
f34acfec 3000 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
af9be36c
KW
3001 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3002 number of characters found so far
3003 that will expand (into 2 bytes)
3004 should we have to convert to
3005 UTF-8) */
6f613c73 3006 SV *res; /* result from charnames */
c89db733
JH
3007 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3008 high-end character is temporarily placed */
77a135fe 3009
fe2ba0a2
KW
3010 /* Does something require special handling in tr/// ? This avoids extra
3011 * work in a less likely case. As such, khw didn't feel it was worth
3012 * adding any branches to the more mainline code to handle this, which
3013 * means that this doesn't get set in some circumstances when things like
3014 * \x{100} get expanded out. As a result there needs to be extra testing
3015 * done in the tr code */
3016 bool has_above_latin1 = FALSE;
3017
77a135fe
KW
3018 /* Note on sizing: The scanned constant is placed into sv, which is
3019 * initialized by newSV() assuming one byte of output for every byte of
3020 * input. This routine expects newSV() to allocate an extra byte for a
3021 * trailing NUL, which this routine will append if it gets to the end of
3022 * the input. There may be more bytes of input than output (eg., \N{LATIN
3023 * CAPITAL LETTER A}), or more output than input if the constant ends up
3024 * recoded to utf8, but each time a construct is found that might increase
3025 * the needed size, SvGROW() is called. Its size parameter each time is
3026 * based on the best guess estimate at the time, namely the length used so
3027 * far, plus the length the current construct will occupy, plus room for
caae0700 3028 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 3029
c3320c2a
KW
3030 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3031 before set */
4c3a8340 3032#ifdef EBCDIC
f4240379
KW
3033 int backslash_N = 0; /* ? was the character from \N{} */
3034 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3035 platform-specific like \x65 */
4c3a8340 3036#endif
012bcf8d 3037
7918f24d
NC
3038 PERL_ARGS_ASSERT_SCAN_CONST;
3039
bb16bae8 3040 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0 3041
b899e89d
FC
3042 /* Protect sv from errors and fatal warnings. */
3043 ENTER_with_name("scan_const");
3044 SAVEFREESV(sv);
2b9d42f0 3045
1759517a
KW
3046 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3047 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3048 * valid */
3049 assert(*send == '\0');
3050
f4240379
KW
3051 while (s < send
3052 || dorange /* Handle tr/// range at right edge of input */
3053 ) {
ff3f963a 3054
02aa26ce 3055 /* get transliterations out of the way (they're most literal) */
3280af22 3056 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3057
f4240379
KW
3058 /* But there isn't any special handling necessary unless there is a
3059 * range, so for most cases we just drop down and handle the value
3060 * as any other. There are two exceptions.
3061 *
02cd137d
KW
3062 * 1. A hyphen indicates that we are actually going to have a
3063 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
3064 * down to handle what should be the end range value.
3065 * 2. After we've handled that value, the next time through, that
3066 * flag is set and we fix up the range.
3067 *
3068 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
3069 * order to make the transliteration a simple table look-up.
3070 * Ranges that extend above Latin1 have to be done differently, so
3071 * there is no advantage to expanding them here, so they are
dc8faf6b
KW
3072 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3073 * a byte that can't occur in legal UTF-8, and hence can signify a
3074 * hyphen without any possible ambiguity. On EBCDIC machines, if
3075 * the range is expressed as Unicode, the Latin1 portion is
3076 * expanded out even if the range extends above Latin1. This is
3077 * because each code point in it has to be processed here
3078 * individually to get its native translation */
f4240379
KW
3079
3080 if (! dorange) {
3081
02cd137d
KW
3082 /* Here, we don't think we're in a range. If the new character
3083 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
3084 * either edge to indicate a range, or if we haven't output any
3085 * characters yet then it's a regular character. */
04863ba1
KW
3086 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3087 {
f4240379
KW
3088
3089 /* A regular character. Process like any other, but first
3090 * clear any flags */
3091 didrange = FALSE;
3092 dorange = FALSE;
e294cc5d 3093#ifdef EBCDIC
f4240379
KW
3094 non_portable_endpoint = 0;
3095 backslash_N = 0;
e294cc5d 3096#endif
02cd137d
KW
3097 /* The tests here for being above Latin1 and similar ones
3098 * in the following 'else' suffice to find all such
3099 * occurences in the constant, except those added by a
fe2ba0a2 3100 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3101 * set 'has_above_latin1' as appropriate */
ed2893cb 3102 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3103 has_above_latin1 = TRUE;
3104 }
3105
f4240379
KW
3106 /* Drops down to generic code to process current byte */
3107 }
02cd137d 3108 else { /* Is a '-' in the context where it means a range */
f4240379 3109 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3110 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3111 " operator");
f4240379 3112 }
e294cc5d 3113
f4240379 3114 dorange = TRUE;
2b9d42f0 3115
02cd137d 3116 s++; /* Skip past the hyphen */
f4240379
KW
3117
3118 /* d now points to where the end-range character will be
d7f7b0e3
KW
3119 * placed. Drop down to get that character. We'll finish
3120 * processing the range the next time through the loop */
188d22cf 3121
ed2893cb 3122 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3123 has_above_latin1 = TRUE;
3124 }
02cd137d
KW
3125
3126 /* Drops down to generic code to process current byte */
f4240379
KW
3127 }
3128 } /* End of not a range */
3129 else {
3130 /* Here we have parsed a range. Now must handle it. At this
3131 * point:
3132 * 'sv' is a SV* that contains the output string we are
3133 * constructing. The final two characters in that string
3134 * are the range start and range end, in order.
3135 * 'd' points to just beyond the range end in the 'sv' string,
3136 * where we would next place something
f4240379 3137 */
0c311b7c 3138 char * max_ptr;
5ca37d54 3139 char * min_ptr;
f4240379
KW
3140 IV range_min;
3141 IV range_max; /* last character in range */
f4240379 3142 STRLEN grow;
5ca37d54
KW
3143 Size_t offset_to_min = 0;
3144 Size_t extras = 0;
11327fa1 3145#ifdef EBCDIC
f4240379
KW
3146 bool convert_unicode;
3147 IV real_range_max = 0;
e294cc5d 3148#endif
02cd137d 3149 /* Get the code point values of the range ends. */
0c311b7c
KW
3150 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3151 offset_to_max = max_ptr - SvPVX_const(sv);
ed2893cb 3152 if (d_is_utf8) {
f4240379
KW
3153 /* We know the utf8 is valid, because we just constructed
3154 * it ourselves in previous loop iterations */
3155 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3156 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3157 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3158
3159 /* This compensates for not all code setting
3160 * 'has_above_latin1', so that we don't skip stuff that
3161 * should be executed */
3162 if (range_max > 255) {
3163 has_above_latin1 = TRUE;
3164 }
e294cc5d 3165 }
f4240379
KW
3166 else {
3167 min_ptr = max_ptr - 1;
3168 range_min = * (U8*) min_ptr;
3169 range_max = * (U8*) max_ptr;
3170 }
3171
8efef67c
KW
3172 /* If the range is just a single code point, like tr/a-a/.../,
3173 * that code point is already in the output, twice. We can
3174 * just back up over the second instance and avoid all the rest
3175 * of the work. But if it is a variant character, it's been
218304f9
KW
3176 * counted twice, so decrement. (This unlikely scenario is
3177 * special cased, like the one for a range of 2 code points
3178 * below, only because the main-line code below needs a range
3179 * of 3 or more to work without special casing. Might as well
3180 * get it out of the way now.) */
8efef67c
KW
3181 if (UNLIKELY(range_max == range_min)) {
3182 d = max_ptr;
ed2893cb 3183 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
8efef67c
KW
3184 utf8_variant_count--;
3185 }
3186 goto range_done;
3187 }
3188
e294cc5d 3189#ifdef EBCDIC
f4240379
KW
3190 /* On EBCDIC platforms, we may have to deal with portable
3191 * ranges. These happen if at least one range endpoint is a
3192 * Unicode value (\N{...}), or if the range is a subset of
3193 * [A-Z] or [a-z], and both ends are literal characters,
3194 * like 'A', and not like \x{C1} */
87dd6ea7 3195 convert_unicode =
02cd137d
KW
3196 cBOOL(backslash_N) /* \N{} forces Unicode,
3197 hence portable range */
3198 || ( ! non_portable_endpoint
3199 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3200 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3201 if (convert_unicode) {
f4240379
KW
3202
3203 /* Special handling is needed for these portable ranges.
02cd137d
KW
3204 * They are defined to be in Unicode terms, which includes
3205 * all the Unicode code points between the end points.
f4240379
KW
3206 * Convert to Unicode to get the Unicode range. Later we
3207 * will convert each code point in the range back to
3208 * native. */
3209 range_min = NATIVE_TO_UNI(range_min);
3210 range_max = NATIVE_TO_UNI(range_max);
3211 }
e294cc5d 3212#endif
8ada0baa 3213
f4240379 3214 if (range_min > range_max) {
11327fa1 3215#ifdef EBCDIC
f4240379
KW
3216 if (convert_unicode) {
3217 /* Need to convert back to native for meaningful
3218 * messages for this platform */
3219 range_min = UNI_TO_NATIVE(range_min);
3220 range_max = UNI_TO_NATIVE(range_max);
3221 }
11327fa1 3222#endif
f4240379
KW
3223 /* Use the characters themselves for the error message if
3224 * ASCII printables; otherwise some visible representation
3225 * of them */
3226 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3227 Perl_croak(aTHX_
3228 "Invalid range \"%c-%c\" in transliteration operator",
3229 (char)range_min, (char)range_max);
3230 }
11327fa1 3231#ifdef EBCDIC
f4240379 3232 else if (convert_unicode) {
02cd137d 3233 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3234 Perl_croak(aTHX_
02cd137d
KW
3235 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3236 UVXf "}\" in transliteration operator",
3237 range_min, range_max);
f4240379 3238 }
11327fa1 3239#endif
f4240379 3240 else {
02cd137d 3241 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3242 Perl_croak(aTHX_
02cd137d
KW
3243 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3244 " in transliteration operator",
3245 range_min, range_max);
f4240379 3246 }
c2e66d9e
GS
3247 }
3248
aca41667
KW
3249 /* If the range is exactly two code points long, they are
3250 * already both in the output */
3251 if (UNLIKELY(range_min + 1 == range_max)) {
3252 goto range_done;
3253 }
3254
3255 /* Here the range contains at least 3 code points */
3256
ed2893cb 3257 if (d_is_utf8) {
f4240379 3258
188d22cf
KW
3259 /* If everything in the transliteration is below 256, we
3260 * can avoid special handling later. A translation table
02cd137d
KW
3261 * for each of those bytes is created by op.c. So we
3262 * expand out all ranges to their constituent code points.
3263 * But if we've encountered something above 255, the
3264 * expanding won't help, so skip doing that. But if it's
3265 * EBCDIC, we may have to look at each character below 256
3266 * if we have to convert to/from Unicode values */
188d22cf 3267 if ( has_above_latin1
c7f1f016 3268#ifdef EBCDIC
f4240379 3269 && (range_min > 255 || ! convert_unicode)
8ada0baa 3270#endif
f4240379 3271 ) {
3fdfceb3
KW
3272 const STRLEN off = d - SvPVX(sv);
3273 const STRLEN extra = 1 + (send - s) + 1;
3274 char *e;
3275
f4240379
KW
3276 /* Move the high character one byte to the right; then
3277 * insert between it and the range begin, an illegal
3278 * byte which serves to indicate this is a range (using
02cd137d 3279 * a '-' would be ambiguous). */
3fdfceb3
KW
3280
3281 if (off + extra > SvLEN(sv)) {
3282 d = off + SvGROW(sv, off + extra);
3283 max_ptr = d - off + offset_to_max;
3284 }
3285
3286 e = d++;
f4240379
KW
3287 while (e-- > max_ptr) {
3288 *(e + 1) = *e;
e294cc5d 3289 }
dc8faf6b 3290 *(e + 1) = (char) RANGE_INDICATOR;
f4240379
KW
3291 goto range_done;
3292 }
3293
3294 /* Here, we're going to expand out the range. For EBCDIC
3295 * the range can extend above 255 (not so in ASCII), so
3296 * for EBCDIC, split it into the parts above and below
3297 * 255/256 */
e294cc5d 3298#ifdef EBCDIC
f4240379
KW
3299 if (range_max > 255) {
3300 real_range_max = range_max;
3301 range_max = 255;
3302 }
e294cc5d 3303#endif
f4240379 3304 }
02aa26ce 3305
f4240379 3306 /* Here we need to expand out the string to contain each
5ca37d54
KW
3307 * character in the range. Grow the output to handle this.
3308 * For non-UTF8, we need a byte for each code point in the
3309 * range, minus the three that we've already allocated for: the
3310 * hyphen, the min, and the max. For UTF-8, we need this
3311 * plus an extra byte for each code point that occupies two
3312 * bytes (is variant) when in UTF-8 (except we've already
3313 * allocated for the end points, including if they are
3314 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3315 * platforms, it's easy to calculate a precise number. To
3316 * start, we count the variants in the range, which we need
3317 * elsewhere in this function anyway. (For the case where it
3318 * isn't easy to calculate, 'extras' has been initialized to 0,
3319 * and the calculation is done in a loop further down.) */
3320#ifdef EBCDIC
3321 if (convert_unicode)
3322#endif
3323 {
3324 /* This is executed unconditionally on ASCII, and for
3325 * Unicode ranges on EBCDIC. Under these conditions, all
3326 * code points above a certain value are variant; and none
3327 * under that value are. We just need to find out how much
3328 * of the range is above that value. We don't count the
3329 * end points here, as they will already have been counted
3330 * as they were parsed. */
3331 if (range_min >= UTF_CONTINUATION_MARK) {
3332
3333 /* The whole range is made up of variants */
3334 extras = (range_max - 1) - (range_min + 1) + 1;
3335 }
3336 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3337
5ca37d54
KW
3338 /* Only the higher portion of the range is variants */
3339 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3340 }
f4240379 3341
5ca37d54
KW
3342 utf8_variant_count += extras;
3343 }
3344
3345 /* The base growth is the number of code points in the range,
3346 * not including the endpoints, which have already been sized
3347 * for (and output). We don't subtract for the hyphen, as it
3348 * has been parsed but not output, and the SvGROW below is
3349 * based only on what's been output plus what's left to parse.
3350 * */
3351 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3352
ed2893cb 3353 if (d_is_utf8) {
4c3a8340 3354#ifdef EBCDIC
5ca37d54
KW
3355 /* In some cases in EBCDIC, we haven't yet calculated a
3356 * precise amount needed for the UTF-8 variants. Just
3357 * assume the worst case, that everything will expand by a
3358 * byte */
3359 if (! convert_unicode) {
f4240379
KW
3360 grow *= 2;
3361 }
5ca37d54 3362 else
4c3a8340 3363#endif
5ca37d54
KW
3364 {
3365 /* Otherwise we know exactly how many variants there
3366 * are in the range. */
3367 grow += extras;
3368 }
f4240379
KW
3369 }
3370
5ca37d54
KW
3371 /* Grow, but position the output to overwrite the range min end
3372 * point, because in some cases we overwrite that */
3373 SvCUR_set(sv, d - SvPVX_const(sv));
3374 offset_to_min = min_ptr - SvPVX_const(sv);
3375
3376 /* See Note on sizing above. */
3377 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3378 + (send - s)
3379 + grow
3380 + 1 /* Trailing NUL */ );
f4240379 3381
5ca37d54 3382 /* Now, we can expand out the range. */
11327fa1 3383#ifdef EBCDIC
f4240379 3384 if (convert_unicode) {
5ca37d54 3385 SSize_t i;
02aa26ce 3386
f4240379
KW
3387 /* Recall that the min and max are now in Unicode terms, so
3388 * we have to convert each character to its native
3389 * equivalent */
ed2893cb 3390 if (d_is_utf8) {
f4240379 3391 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3392 append_utf8_from_native_byte(
3393 LATIN1_TO_NATIVE((U8) i),
3394 (U8 **) &d);
f4240379
KW
3395 }
3396 }
3397 else {
3398 for (i = range_min; i <= range_max; i++) {
81324705 3399 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3400 }
3401 }
01ec43d0 3402 }
11327fa1
AL
3403 else
3404#endif
3405 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3406 {
f4240379
KW
3407 /* Here, no conversions are necessary, which means that the
3408 * first character in the range is already in 'd' and
3409 * valid, so we can skip overwriting it */
ed2893cb 3410 if (d_is_utf8) {
19742f39 3411 SSize_t i;
f4240379
KW
3412 d += UTF8SKIP(d);
3413 for (i = range_min + 1; i <= range_max; i++) {
81324705 3414 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3415 }
3416 }
3417 else {
19742f39 3418 SSize_t i;
f4240379 3419 d++;
5ca37d54
KW
3420 assert(range_min + 1 <= range_max);
3421 for (i = range_min + 1; i < range_max; i++) {
3422#ifdef EBCDIC
3423 /* In this case on EBCDIC, we haven't calculated
3424 * the variants. Do it here, as we go along */
3425 if (! UVCHR_IS_INVARIANT(i)) {
3426 utf8_variant_count++;
3427 }
3428#endif
f4240379
KW
3429 *d++ = (char)i;
3430 }
5ca37d54
KW
3431
3432 /* The range_max is done outside the loop so as to
3433 * avoid having to special case not incrementing
3434 * 'utf8_variant_count' on EBCDIC (it's already been
3435 * counted when originally parsed) */
3436 *d++ = (char) range_max;
f4240379 3437 }
a0ed51b3 3438 }
02aa26ce 3439
11327fa1 3440#ifdef EBCDIC
02cd137d
KW
3441 /* If the original range extended above 255, add in that
3442 * portion. */
f4240379
KW
3443 if (real_range_max) {
3444 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3445 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3446 if (real_range_max > 0x100) {
3447 if (real_range_max > 0x101) {
dc8faf6b 3448 *d++ = (char) RANGE_INDICATOR;
043af6f6 3449 }
f4240379 3450 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3451 }
f4240379 3452 }
11327fa1 3453#endif
02aa26ce 3454
f4240379
KW
3455 range_done:
3456 /* mark the range as done, and continue */
3457 didrange = TRUE;
3458 dorange = FALSE;
3459#ifdef EBCDIC
3460 non_portable_endpoint = 0;
3461 backslash_N = 0;
3462#endif
3463 continue;
3464 } /* End of is a range */
3465 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3466 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3467 char *s1 = s-1;
3468 int esc = 0;
3469 while (s1 >= start && *s1-- == '\\')
3470 esc = !esc;
3471 if (!esc)
3472 in_charclass = TRUE;
3473 }
1e02a175 3474 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3475 char *s1 = s-1;
3476 int esc = 0;
3477 while (s1 >= start && *s1-- == '\\')
3478 esc = !esc;
3479 if (!esc)
3480 in_charclass = FALSE;
3481 }
02cd137d
KW
3482 /* skip for regexp comments /(?#comment)/, except for the last
3483 * char, which will be done separately. Stop on (?{..}) and
3484 * friends */
c30fc27b 3485 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3486 if (s[2] == '#') {
f339d50e
KW
3487 if (s_is_utf8) {
3488 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3489
3490 while (s + len < send && *s != ')') {
3491 Copy(s, d, len, U8);
3492 d += len;
3493 s += len;
3494 len = UTF8_SAFE_SKIP(s, send);
3495 }
3496 }
3497 else while (s+1 < send && *s != ')') {
3498 *d++ = *s++;
3499 }
155aba94 3500 }
407f8cf2
KW
3501 else if (!PL_lex_casemods
3502 && ( s[2] == '{' /* This should match regcomp.c */
3503 || (s[2] == '?' && s[3] == '{')))
155aba94 3504 {
9da1dd8f 3505 break;
cc6b7395 3506 }
748a9306 3507 }
02cd137d 3508 /* likewise skip #-initiated comments in //x patterns */
407f8cf2
KW
3509 else if (*s == '#'
3510 && PL_lex_inpat
3511 && !in_charclass
3512 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3513 {
8faf4f30 3514 while (s < send && *s != '\n')
5ff03569 3515 *d++ = *s++;
748a9306 3516 }
02cd137d 3517 /* no further processing of single-quoted regex */
9da1dd8f
DM
3518 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3519 goto default_action;
3520
02cd137d
KW
3521 /* check for embedded arrays
3522 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3523 */
1749ea0d 3524 else if (*s == '@' && s[1]) {
fac0f7a3
KW
3525 if (UTF
3526 ? isIDFIRST_utf8_safe(s+1, send)
3527 : isWORDCHAR_A(s[1]))
3528 {
1749ea0d 3529 break;
fac0f7a3 3530 }
4aada8b9 3531 if (memCHRs(":'{$", s[1]))
1749ea0d
ST
3532 break;
3533 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3534 break; /* in regexp, neither @+ nor @- are interpolated */
3535 }
02cd137d
KW
3536 /* check for embedded scalars. only stop if we're sure it's a
3537 * variable. */