This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Distinguish C- and perly- literals - PERLY_BRACKET_OPEN
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
78342678 28=for apidoc AmnU|yy_parser *|PL_parser
f0e67a1d
Z
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f1bcae08 41#include "invlist_inline.h"
378cc40b 42
164e423c
KW
43#define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
eb0d8d16 45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
60267e1d 97static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
8903cb82 98
9ded7720 99# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 100
a7aaec61
Z
101#define XENUMMASK 0x3f
102#define XFAKEEOF 0x40
103#define XFAKEBRACK 0x80
9059aa12 104
39e02b42 105#ifdef USE_UTF8_SCRIPTS
b3041197 106# define UTF cBOOL(!IN_BYTES)
2b9d42f0 107#else
b3041197 108# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 109#endif
a0ed51b3 110
b1fc3636
CJ
111/* The maximum number of characters preceding the unrecognized one to display */
112#define UNRECOGNIZED_PRECEDE_COUNT 10
113
61f0cdd9 114/* In variables named $^X, these are the legal values for X.
2b92dfce 115 * 1999-02-27 mjd-perl-patch@plover.com */
4aada8b9 116#define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
2b92dfce 117
14bd96d0 118#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 119
9ff909cf
JH
120#define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
124
ffb4593c
NT
125/* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
79072805 127 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
79072805
LW
131 */
132
fb73857a
PP
133/* #define LEX_NOTPARSING 11 is done in perl.h. */
134
b6007c36
DM
135#define LEX_NORMAL 10 /* normal code (ie not within "...") */
136#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139#define LEX_INTERPSTART 6 /* expecting the start of a $var */
140
141 /* at end of code, eg "$x" followed by: */
142#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144
145#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147#define LEX_INTERPCONST 2 /* NOT USED */
148#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 149
56d9fe2f
TC
150/* returned to yyl_try() to request it to retry the parse loop, expected to only
151 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
152 can also return it.
153
154 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
155 other token values are 258 or higher (see perly.h), so -1 should be
156 a safe value here.
157*/
158#define YYL_RETRY (-1)
79072805 159
bbf60fe6 160#ifdef DEBUGGING
27da23d5 161static const char* const lex_state_names[] = {
bbf60fe6
DM
162 "KNOWNEXT",
163 "FORMLINE",
164 "INTERPCONST",
165 "INTERPCONCAT",
166 "INTERPENDMAYBE",
167 "INTERPEND",
168 "INTERPSTART",
169 "INTERPPUSH",
170 "INTERPCASEMOD",
171 "INTERPNORMAL",
172 "NORMAL"
173};
174#endif
175
79072805 176#include "keywords.h"
fe14fcc3 177
ffb4593c
NT
178/* CLINE is a macro that ensures PL_copline has a sane value */
179
57843af0 180#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 181
ffb4593c
NT
182/*
183 * Convenience functions to return different tokens and prime the
9cbb5ea2 184 * lexer for the next token. They all take an argument.
ffb4593c
NT
185 *
186 * TOKEN : generic token (used for '(', DOLSHARP, etc)
187 * OPERATOR : generic operator
188 * AOPERATOR : assignment operator
189 * PREBLOCK : beginning the block after an if, while, foreach, ...
190 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
191 * PREREF : *EXPR where EXPR is not a simple identifier
192 * TERM : expression term
89f35911 193 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
194 * LOOPX : loop exiting command (goto, last, dump, etc)
195 * FTST : file test operator
196 * FUN0 : zero-argument function
7eb971ee 197 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 198 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
199 * BOop : bitwise or or xor
200 * BAop : bitwise and
8823cb89 201 * BCop : bitwise complement
ffb4593c
NT
202 * SHop : shift operator
203 * PWop : power operator
9cbb5ea2 204 * PMop : pattern-matching operator
ffb4593c 205 * Aop : addition-level operator
e4916dd1 206 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c 207 * Mop : multiplication-level operator
02b85d3d
Z
208 * ChEop : chaining equality-testing operator
209 * NCEop : non-chaining comparison operator at equality precedence
210 * ChRop : chaining relational operator <= != gt
211 * NCRop : non-chaining relational operator isa
ffb4593c
NT
212 *
213 * Also see LOP and lop() below.
214 */
215
998054bd 216#ifdef DEBUGGING /* Serve -DT. */
704d4215 217# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 218#else
bbf60fe6 219# define REPORT(retval) (retval)
998054bd
SC
220#endif
221
bbf60fe6
DM
222#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
223#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 224#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
225#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
226#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
227#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
228#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 229#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 230#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 231 pl_yylval.ival=f, \
a49203fd 232 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 233 REPORT((int)LOOPEX))
6154021b
RGS
234#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
235#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 236#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 237#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
238#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
239#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
240#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
241 REPORT('~')
b1764551
FC
242#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
243#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 244#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 245#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 246#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 247#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
02b85d3d
Z
248#define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
249#define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
250#define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
251#define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
2f3197b3 252
a687059c
LW
253/* This bit of chicanery makes a unary function followed by
254 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
255 * The UNIDOR macro is for unary functions that can be followed by the //
256 * operator (such as C<shift // 0>).
a687059c 257 */
d68ce4ac 258#define UNI3(f,x,have_x) { \
6154021b 259 pl_yylval.ival = f; \
d68ce4ac 260 if (have_x) PL_expect = x; \
376fcdbf
AL
261 PL_bufptr = s; \
262 PL_last_uni = PL_oldbufptr; \
0af40c75 263 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
264 if (*s == '(') \
265 return REPORT( (int)FUNC1 ); \
294a536f 266 s = skipspace(s); \
376fcdbf
AL
267 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
268 }
d68ce4ac
FC
269#define UNI(f) UNI3(f,XTERM,1)
270#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
271#define UNIPROTO(f,optional) { \
272 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
273 OPERATOR(f); \
274 }
a687059c 275
d68ce4ac 276#define UNIBRACK(f) UNI3(f,0,0)
79072805 277
9f68db38 278/* grandfather return to old style */
78cdf107
Z
279#define OLDLOP(f) \
280 do { \
281 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
282 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
283 pl_yylval.ival = (f); \
284 PL_expect = XTERM; \
285 PL_bufptr = s; \
286 return (int)LSTOP; \
287 } while(0)
79072805 288
83944c01
FC
289#define COPLINE_INC_WITH_HERELINES \
290 STMT_START { \
291 CopLINE_inc(PL_curcop); \
851b527a
FC
292 if (PL_parser->herelines) \
293 CopLINE(PL_curcop) += PL_parser->herelines, \
294 PL_parser->herelines = 0; \
83944c01 295 } STMT_END
ffdb8b16
FC
296/* Called after scan_str to update CopLINE(PL_curcop), but only when there
297 * is no sublex_push to follow. */
298#define COPLINE_SET_FROM_MULTI_END \
299 STMT_START { \
300 CopLINE_set(PL_curcop, PL_multi_end); \
301 if (PL_multi_end != PL_multi_start) \
851b527a 302 PL_parser->herelines = 0; \
ffdb8b16 303 } STMT_END
83944c01
FC
304
305
539c839f
AC
306/* A file-local structure for passing around information about subroutines and
307 * related definable words */
308struct code {
309 SV *sv;
310 CV *cv;
311 GV *gv, **gvp;
312 OP *rv2cv_op;
313 PADOFFSET off;
314 bool lex;
315};
316
317static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
318
8fa7f367
JH
319#ifdef DEBUGGING
320
6154021b 321/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
322enum token_type {
323 TOKENTYPE_NONE,
324 TOKENTYPE_IVAL,
6154021b 325 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 326 TOKENTYPE_PVAL,
aeaef349 327 TOKENTYPE_OPVAL
bbf60fe6
DM
328};
329
c588171e
BZ
330#define DEBUG_TOKEN(Type, Name) \
331 { Name, TOKENTYPE_##Type, #Name }
332
6d4a66ac
NC
333static struct debug_tokens {
334 const int token;
335 enum token_type type;
336 const char *name;
337} const debug_tokens[] =
9041c2e3 338{
bbf60fe6
DM
339 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
340 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
341 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
342 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 343 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
344 { ARROW, TOKENTYPE_NONE, "ARROW" },
345 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
346 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
347 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
02b85d3d
Z
348 { CHEQOP, TOKENTYPE_OPNUM, "CHEQOP" },
349 { CHRELOP, TOKENTYPE_OPNUM, "CHRELOP" },
bbf60fe6
DM
350 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
351 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 352 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
353 { DO, TOKENTYPE_NONE, "DO" },
354 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
355 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
356 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
357 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
358 { ELSE, TOKENTYPE_NONE, "ELSE" },
359 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
bbf60fe6
DM
360 { FOR, TOKENTYPE_IVAL, "FOR" },
361 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
362 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
363 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
364 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
365 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 366 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
367 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
368 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
369 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 370 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
371 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
372 { IF, TOKENTYPE_IVAL, "IF" },
01719201 373 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
374 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
375 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
376 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
377 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
378 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
379 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
380 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
381 { MY, TOKENTYPE_IVAL, "MY" },
02b85d3d
Z
382 { NCEQOP, TOKENTYPE_OPNUM, "NCEQOP" },
383 { NCRELOP, TOKENTYPE_OPNUM, "NCRELOP" },
bbf60fe6
DM
384 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
385 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
386 { OROP, TOKENTYPE_IVAL, "OROP" },
387 { OROR, TOKENTYPE_NONE, "OROR" },
388 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
d0a6a9c7 389 DEBUG_TOKEN (IVAL, PERLY_BRACE_CLOSE),
c588171e 390 DEBUG_TOKEN (IVAL, PERLY_BRACE_OPEN),
669dd22c 391 DEBUG_TOKEN (IVAL, PERLY_BRACKET_OPEN),
88e1f1a2
JV
392 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
393 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 394 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 395 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
396 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
397 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
398 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
399 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
400 { PREINC, TOKENTYPE_NONE, "PREINC" },
401 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 402 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6 403 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
f3f204dc 404 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 405 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 406 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6 407 { SUB, TOKENTYPE_NONE, "SUB" },
69afcc21
TC
408 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
409 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
bbf60fe6
DM
410 { THING, TOKENTYPE_OPVAL, "THING" },
411 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
412 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
413 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
414 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
415 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
416 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 417 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 418 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 419 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 420 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 421 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
422};
423
c588171e
BZ
424#undef DEBUG_TOKEN
425
6154021b 426/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 427
bbf60fe6 428STATIC int
704d4215 429S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 430{
7918f24d
NC
431 PERL_ARGS_ASSERT_TOKEREPORT;
432
bbf60fe6 433 if (DEBUG_T_TEST) {
bd61b366 434 const char *name = NULL;
bbf60fe6 435 enum token_type type = TOKENTYPE_NONE;
f54cb97a 436 const struct debug_tokens *p;
396482e1 437 SV* const report = newSVpvs("<== ");
bbf60fe6 438
f54cb97a 439 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
440 if (p->token == (int)rv) {
441 name = p->name;
442 type = p->type;
443 break;
444 }
445 }
446 if (name)
54667de8 447 Perl_sv_catpv(aTHX_ report, name);
239f83d5 448 else if (isGRAPH(rv))
4ebc7986 449 {
bbf60fe6 450 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
451 if ((char)rv == 'p')
452 sv_catpvs(report, " (pending identifier)");
453 }
bbf60fe6 454 else if (!rv)
396482e1 455 sv_catpvs(report, "EOF");
bbf60fe6 456 else
147e3846 457 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
458 switch (type) {
459 case TOKENTYPE_NONE:
bbf60fe6
DM
460 break;
461 case TOKENTYPE_IVAL:
147e3846 462 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
463 break;
464 case TOKENTYPE_OPNUM:
465 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 466 PL_op_name[lvalp->ival]);
bbf60fe6
DM
467 break;
468 case TOKENTYPE_PVAL:
704d4215 469 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
470 break;
471 case TOKENTYPE_OPVAL:
704d4215 472 if (lvalp->opval) {
401441c0 473 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
474 PL_op_name[lvalp->opval->op_type]);
475 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 476 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 477 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
478 }
479
480 }
401441c0 481 else
396482e1 482 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
483 break;
484 }
b6007c36 485 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
486 };
487 return (int)rv;
998054bd
SC
488}
489
b6007c36
DM
490
491/* print the buffer with suitable escapes */
492
493STATIC void
15f169a1 494S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 495{
396482e1 496 SV* const tmp = newSVpvs("");
7918f24d
NC
497
498 PERL_ARGS_ASSERT_PRINTBUF;
499
7347ee54 500 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 501 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 502 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
503 SvREFCNT_dec(tmp);
504}
505
8fa7f367
JH
506#endif
507
ffb4593c
NT
508/*
509 * S_ao
510 *
f393a21a
FC
511 * This subroutine looks for an '=' next to the operator that has just been
512 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
513 */
514
76e3520e 515STATIC int
cea2e8a9 516S_ao(pTHX_ int toketype)
a0d0e21e 517{
3280af22
NIS
518 if (*PL_bufptr == '=') {
519 PL_bufptr++;
a0d0e21e 520 if (toketype == ANDAND)
6154021b 521 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 522 else if (toketype == OROR)
6154021b 523 pl_yylval.ival = OP_ORASSIGN;
c963b151 524 else if (toketype == DORDOR)
6154021b 525 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
526 toketype = ASSIGNOP;
527 }
b1764551 528 return REPORT(toketype);
a0d0e21e
LW
529}
530
ffb4593c
NT
531/*
532 * S_no_op
533 * When Perl expects an operator and finds something else, no_op
534 * prints the warning. It always prints "<something> found where
535 * operator expected. It prints "Missing semicolon on previous line?"
536 * if the surprise occurs at the start of the line. "do you need to
537 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
538 * where the compiler doesn't know if foo is a method call or a function.
539 * It prints "Missing operator before end of line" if there's nothing
540 * after the missing operator, or "... before <...>" if there is something
541 * after the missing operator.
488bc579
FC
542 *
543 * PL_bufptr is expected to point to the start of the thing that was found,
544 * and s after the next token or partial token.
ffb4593c
NT
545 */
546
76e3520e 547STATIC void
15f169a1 548S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 549{
9d4ba2ae
AL
550 char * const oldbp = PL_bufptr;
551 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 552
7918f24d
NC
553 PERL_ARGS_ASSERT_NO_OP;
554
1189a94a
GS
555 if (!s)
556 s = oldbp;
07c798fb 557 else
1189a94a 558 PL_bufptr = s;
734ab321 559 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
560 if (ckWARN_d(WARN_SYNTAX)) {
561 if (is_first)
562 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
563 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
564 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
565 PL_bufend,
566 UTF))
567 {
f54cb97a 568 const char *t;
fac0f7a3
KW
569 for (t = PL_oldoldbufptr;
570 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
571 t += UTF ? UTF8SKIP(t) : 1)
572 {
c35e046a 573 NOOP;
fac0f7a3 574 }
56da5a46
RGS
575 if (t < PL_bufptr && isSPACE(*t))
576 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 577 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 578 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
579 }
580 else {
581 assert(s >= oldbp);
582 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 583 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 584 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 585 }
07c798fb 586 }
3280af22 587 PL_bufptr = oldbp;
8990e307
LW
588}
589
ffb4593c
NT
590/*
591 * S_missingterm
592 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 593 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
594 * If we're in a delimited string and the delimiter is a control
595 * character, it's reformatted into a two-char sequence like ^C.
596 * This is fatal.
597 */
598
76e3520e 599STATIC void
1b8d3e0e 600S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 601{
e487ff5e 602 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 603 char q;
cb650135
FC
604 bool uni = FALSE;
605 SV *sv;
8990e307 606 if (s) {
f88b23d7 607 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
608 if (nl) {
609 *nl = '\0';
610 len = nl - s;
611 }
7f7f7d08 612 uni = UTF;
8990e307 613 }
cb650135 614 else if (PL_multi_close < 32) {
8990e307 615 *tmpbuf = '^';
585ec06d 616 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
617 tmpbuf[2] = '\0';
618 s = tmpbuf;
1b8d3e0e 619 len = 2;
8990e307
LW
620 }
621 else {
cb650135
FC
622 if (LIKELY(PL_multi_close < 256)) {
623 *tmpbuf = (char)PL_multi_close;
624 tmpbuf[1] = '\0';
1b8d3e0e 625 len = 1;
cb650135
FC
626 }
627 else {
1b8d3e0e
LM
628 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
629 *end = '\0';
630 len = end - tmpbuf;
cb650135 631 uni = TRUE;
cb650135 632 }
8990e307
LW
633 s = tmpbuf;
634 }
9f8d27b7 635 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 636 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
637 if (uni)
638 SvUTF8_on(sv);
1b8d3e0e
LM
639 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
640 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 641}
79072805 642
dd0ac2b9
FC
643#include "feature.h"
644
0d863452 645/*
9cbb5ea2
GS
646 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
647 * utf16-to-utf8-reversed.
ffb4593c
NT
648 */
649
c39cd008
GS
650#ifdef PERL_CR_FILTER
651static void
652strip_return(SV *sv)
653{
eb578fdb
KW
654 const char *s = SvPVX_const(sv);
655 const char * const e = s + SvCUR(sv);
7918f24d
NC
656
657 PERL_ARGS_ASSERT_STRIP_RETURN;
658
c39cd008
GS
659 /* outer loop optimized to do nothing if there are no CR-LFs */
660 while (s < e) {
661 if (*s++ == '\r' && *s == '\n') {
662 /* hit a CR-LF, need to copy the rest */
eb578fdb 663 char *d = s - 1;
c39cd008
GS
664 *d++ = *s++;
665 while (s < e) {
666 if (*s == '\r' && s[1] == '\n')
667 s++;
668 *d++ = *s++;
669 }
670 SvCUR(sv) -= s - d;
671 return;
672 }
673 }
674}
a868473f 675
76e3520e 676STATIC I32
c39cd008 677S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 678{
f54cb97a 679 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
680 if (count > 0 && !maxlen)
681 strip_return(sv);
682 return count;
a868473f
NIS
683}
684#endif
685
ffb4593c 686/*
44170c9a 687=for apidoc lex_start
8eaa0acf
Z
688
689Creates and initialises a new lexer/parser state object, supplying
690a context in which to lex and parse from a new source of Perl code.
691A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 692is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
693will be destroyed and the former value of L</PL_parser> will be restored.
694Nothing else need be done to clean up the parsing context.
695
2d7f6611 696The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 697non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
698A copy of the string is made, so subsequent modification of C<line>
699does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 700from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
701code in C<line> comes first and must consist of complete lines of input,
702and C<rsfp> supplies the remainder of the source.
8eaa0acf 703
2d7f6611 704The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 705used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
706
707=cut
708*/
ffb4593c 709
27fcb6ee 710/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
711 can share filters with the current parser.
712 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
713 caller, hence isn't owned by the parser, so shouldn't be closed on parser
714 destruction. This is used to handle the case of defaulting to reading the
715 script from the standard input because no filename was given on the command
716 line (without getting confused by situation where STDIN has been closed, so
717 the script handle is opened on fd 0) */
27fcb6ee 718
a0d0e21e 719void
8eaa0acf 720Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 721{
6ef55633 722 const char *s = NULL;
5486870f 723 yy_parser *parser, *oparser;
90b58c70 724
60d63348 725 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 726 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
727
728 /* create and initialise a parser */
729
199e78b7 730 Newxz(parser, 1, yy_parser);
5486870f 731 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
732 PL_parser = parser;
733
28ac2b49 734 parser->stack = NULL;
df13534a 735 parser->stack_max1 = NULL;
28ac2b49 736 parser->ps = NULL;
acdf0a21 737
e3abe207
DM
738 /* on scope exit, free this parser and restore any outer one */
739 SAVEPARSER(parser);
7c4baf47 740 parser->saved_curcop = PL_curcop;
e3abe207 741
acdf0a21 742 /* initialise lexer state */
8990e307 743
fb205e7a 744 parser->nexttoke = 0;
ca4cfd28 745 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 746 parser->copline = parser->preambling = NOLINE;
5afb0a62 747 parser->lex_state = LEX_NORMAL;
c2598295 748 parser->expect = XSTATE;
2f9285f8 749 parser->rsfp = rsfp;
b7b52646 750 parser->recheck_utf8_validity = TRUE;
27fcb6ee
FC
751 parser->rsfp_filters =
752 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
753 ? NULL
754 : MUTABLE_AV(SvREFCNT_inc(
755 oparser->rsfp_filters
756 ? oparser->rsfp_filters
757 : (oparser->rsfp_filters = newAV())
758 ));
2f9285f8 759
199e78b7
DM
760 Newx(parser->lex_brackstack, 120, char);
761 Newx(parser->lex_casestack, 12, char);
762 *parser->lex_casestack = '\0';
d794b522 763 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 764
10efb74f 765 if (line) {
0528fd32 766 STRLEN len;
a6909bd2
KW
767 const U8* first_bad_char_loc;
768
10efb74f 769 s = SvPV_const(line, len);
90b58c70 770
0aab20f2
KW
771 if ( SvUTF8(line)
772 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
773 SvCUR(line),
774 &first_bad_char_loc)))
90b58c70
KW
775 {
776 _force_out_malformed_utf8_message(first_bad_char_loc,
777 (U8 *) s + SvCUR(line),
778 0,
779 1 /* 1 means die */ );
780 NOT_REACHED; /* NOTREACHED */
781 }
782
0abcdfa4
FC
783 parser->linestr = flags & LEX_START_COPIED
784 ? SvREFCNT_inc_simple_NN(line)
785 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
786 if (!rsfp)
787 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 788 } else {
bf1b738b 789 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 790 }
218304f9 791
f06b5848
DM
792 parser->oldoldbufptr =
793 parser->oldbufptr =
794 parser->bufptr =
795 parser->linestart = SvPVX(parser->linestr);
796 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
797 parser->last_lop = parser->last_uni = NULL;
b54f893d 798
6d59e610 799 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
800 |LEX_DONT_CLOSE_RSFP));
801 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
802 |LEX_DONT_CLOSE_RSFP));
737c24fc 803
60d63348 804 parser->in_pod = parser->filtered = 0;
79072805 805}
a687059c 806
e3abe207
DM
807
808/* delete a parser object */
809
810void
811Perl_parser_free(pTHX_ const yy_parser *parser)
812{
7918f24d
NC
813 PERL_ARGS_ASSERT_PARSER_FREE;
814
7c4baf47 815 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
816 SvREFCNT_dec(parser->linestr);
817
87606032 818 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 819 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
820 else if (parser->rsfp && (!parser->old_parser
821 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 822 PerlIO_close(parser->rsfp);
5486870f 823 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 824 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 825 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
826
827 Safefree(parser->lex_brackstack);
828 Safefree(parser->lex_casestack);
829 Safefree(parser->lex_shared);
830 PL_parser = parser->old_parser;
831 Safefree(parser);
832}
833
834void
835Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
836{
3ac7ff8f 837 I32 nexttoke = parser->nexttoke;
3ac7ff8f 838 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 839 while (nexttoke--) {
3ac7ff8f
FC
840 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
841 && parser->nextval[nexttoke].opval
842 && parser->nextval[nexttoke].opval->op_slabbed
843 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 844 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
845 parser->nextval[nexttoke].opval = NULL;
846 }
3ce3dcd9 847 }
e3abe207
DM
848}
849
850
ffb4593c 851/*
78342678 852=for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
f0e67a1d
Z
853
854Buffer scalar containing the chunk currently under consideration of the
855text currently being lexed. This is always a plain string scalar (for
856which C<SvPOK> is true). It is not intended to be used as a scalar by
857normal scalar means; instead refer to the buffer directly by the pointer
858variables described below.
859
860The lexer maintains various C<char*> pointers to things in the
861C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
862reallocated, all of these pointers must be updated. Don't attempt to
863do this manually, but rather use L</lex_grow_linestr> if you need to
864reallocate the buffer.
865
866The content of the text chunk in the buffer is commonly exactly one
867complete line of input, up to and including a newline terminator,
868but there are situations where it is otherwise. The octets of the
869buffer may be intended to be interpreted as either UTF-8 or Latin-1.
870The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
871flag on this scalar, which may disagree with it.
872
873For direct examination of the buffer, the variable
874L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
875lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
876of these pointers is usually preferable to examination of the scalar
877through normal scalar means.
878
78342678 879=for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
f0e67a1d
Z
880
881Direct pointer to the end of the chunk of text currently being lexed, the
882end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 883+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
884always located at the end of the buffer, and does not count as part of
885the buffer's contents.
886
78342678 887=for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
f0e67a1d
Z
888
889Points to the current position of lexing inside the lexer buffer.
890Characters around this point may be freely examined, within
891the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
892L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
893interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
894
895Lexing code (whether in the Perl core or not) moves this pointer past
896the characters that it consumes. It is also expected to perform some
897bookkeeping whenever a newline character is consumed. This movement
898can be more conveniently performed by the function L</lex_read_to>,
899which handles newlines appropriately.
900
901Interpretation of the buffer's octets can be abstracted out by
902using the slightly higher-level functions L</lex_peek_unichar> and
903L</lex_read_unichar>.
904
78342678 905=for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
f0e67a1d
Z
906
907Points to the start of the current line inside the lexer buffer.
908This is useful for indicating at which column an error occurred, and
909not much else. This must be updated by any lexing code that consumes
910a newline; the function L</lex_read_to> handles this detail.
911
912=cut
913*/
914
915/*
44170c9a 916=for apidoc lex_bufutf8
f0e67a1d
Z
917
918Indicates whether the octets in the lexer buffer
919(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
920of Unicode characters. If not, they should be interpreted as Latin-1
921characters. This is analogous to the C<SvUTF8> flag for scalars.
922
923In UTF-8 mode, it is not guaranteed that the lexer buffer actually
924contains valid UTF-8. Lexing code must be robust in the face of invalid
925encoding.
926
927The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
928is significant, but not the whole story regarding the input character
929encoding. Normally, when a file is being read, the scalar contains octets
930and its C<SvUTF8> flag is off, but the octets should be interpreted as
931UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
932however, the scalar may have the C<SvUTF8> flag on, and in this case its
933octets should be interpreted as UTF-8 unless the C<use bytes> pragma
934is in effect. This logic may change in the future; use this function
935instead of implementing the logic yourself.
936
937=cut
938*/
939
940bool
941Perl_lex_bufutf8(pTHX)
942{
943 return UTF;
944}
945
946/*
44170c9a 947=for apidoc lex_grow_linestr
f0e67a1d
Z
948
949Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 950at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
951pointer to the reallocated buffer. This is necessary before making
952any direct modification of the buffer that would increase its length.
953L</lex_stuff_pvn> provides a more convenient way to insert text into
954the buffer.
955
956Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
957this function updates all of the lexer's variables that point directly
958into the buffer.
959
960=cut
961*/
962
963char *
964Perl_lex_grow_linestr(pTHX_ STRLEN len)
965{
966 SV *linestr;
967 char *buf;
968 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 969 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
970 bool current;
971
f0e67a1d
Z
972 linestr = PL_parser->linestr;
973 buf = SvPVX(linestr);
974 if (len <= SvLEN(linestr))
975 return buf;
98d5e3ef
DM
976
977 /* Is the lex_shared linestr SV the same as the current linestr SV?
978 * Only in this case does re_eval_start need adjusting, since it
979 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
980 current = ( !PL_parser->lex_shared->ls_linestr
981 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 982
f0e67a1d
Z
983 bufend_pos = PL_parser->bufend - buf;
984 bufptr_pos = PL_parser->bufptr - buf;
985 oldbufptr_pos = PL_parser->oldbufptr - buf;
986 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
987 linestart_pos = PL_parser->linestart - buf;
988 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
989 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 990 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 991 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 992
f0e67a1d 993 buf = sv_grow(linestr, len);
c7641931 994
f0e67a1d
Z
995 PL_parser->bufend = buf + bufend_pos;
996 PL_parser->bufptr = buf + bufptr_pos;
997 PL_parser->oldbufptr = buf + oldbufptr_pos;
998 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
999 PL_parser->linestart = buf + linestart_pos;
1000 if (PL_parser->last_uni)
1001 PL_parser->last_uni = buf + last_uni_pos;
1002 if (PL_parser->last_lop)
1003 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 1004 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 1005 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
1006 return buf;
1007}
1008
1009/*
44170c9a 1010=for apidoc lex_stuff_pvn
f0e67a1d
Z
1011
1012Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1013immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1014reallocating the buffer if necessary. This means that lexing code that
1015runs later will see the characters as if they had appeared in the input.
1016It is not recommended to do this as part of normal parsing, and most
1017uses of this facility run the risk of the inserted characters being
1018interpreted in an unintended manner.
1019
2d7f6611
KW
1020The string to be inserted is represented by C<len> octets starting
1021at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1022according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1023The characters are recoded for the lexer buffer, according to how the
1024buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1025to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1026function is more convenient.
1027
5af38e47
KW
1028=for apidoc Amnh||LEX_STUFF_UTF8
1029
f0e67a1d
Z
1030=cut
1031*/
1032
1033void
83aa740e 1034Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d
Z
1035{
1036 char *bufptr;
1037 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1038 if (flags & ~(LEX_STUFF_UTF8))
1039 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1040 if (UTF) {
1041 if (flags & LEX_STUFF_UTF8) {
1042 goto plain_copy;
1043 } else {
01ccd497
KW
1044 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1045 (U8 *) pv + len);
1046 const char *p, *e = pv+len;;
f0e67a1d
Z
1047 if (!highhalf)
1048 goto plain_copy;
1049 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1050 bufptr = PL_parser->bufptr;
1051 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1052 SvCUR_set(PL_parser->linestr,
1053 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1054 PL_parser->bufend += len+highhalf;
1055 for (p = pv; p != e; p++) {
27d0a47c 1056 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1057 }
1058 }
1059 } else {
1060 if (flags & LEX_STUFF_UTF8) {
1061 STRLEN highhalf = 0;
83aa740e 1062 const char *p, *e = pv+len;
f0e67a1d
Z
1063 for (p = pv; p != e; p++) {
1064 U8 c = (U8)*p;
54d004e8 1065 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1066 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1067 "non-Latin-1 character into Latin-1 input");
54d004e8 1068 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1069 p++;
1070 highhalf++;
f88c6466 1071 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1072 }
1073 if (!highhalf)
1074 goto plain_copy;
1075 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1076 bufptr = PL_parser->bufptr;
1077 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1078 SvCUR_set(PL_parser->linestr,
1079 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1080 PL_parser->bufend += len-highhalf;
54d004e8
KW
1081 p = pv;
1082 while (p < e) {
1083 if (UTF8_IS_INVARIANT(*p)) {
1084 *bufptr++ = *p;
1085 p++;
f0e67a1d 1086 }
54d004e8
KW
1087 else {
1088 assert(p < e -1 );
a62b247b 1089 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1090 p += 2;
1091 }
f0e67a1d
Z
1092 }
1093 } else {
54d004e8 1094 plain_copy:
f0e67a1d
Z
1095 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1096 bufptr = PL_parser->bufptr;
1097 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1098 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1099 PL_parser->bufend += len;
1100 Copy(pv, bufptr, len, char);
1101 }
1102 }
1103}
1104
1105/*
44170c9a 1106=for apidoc lex_stuff_pv
9dcc53ea
Z
1107
1108Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1109immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1110reallocating the buffer if necessary. This means that lexing code that
1111runs later will see the characters as if they had appeared in the input.
1112It is not recommended to do this as part of normal parsing, and most
1113uses of this facility run the risk of the inserted characters being
1114interpreted in an unintended manner.
1115
2d7f6611 1116The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1117and continuing to the first nul. These octets are interpreted as either
1118UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1119in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1120to how the buffer is currently being interpreted (L</lex_bufutf8>).
1121If it is not convenient to nul-terminate a string to be inserted, the
1122L</lex_stuff_pvn> function is more appropriate.
1123
1124=cut
1125*/
1126
1127void
1128Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1129{
1130 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1131 lex_stuff_pvn(pv, strlen(pv), flags);
1132}
1133
1134/*
44170c9a 1135=for apidoc lex_stuff_sv
f0e67a1d
Z
1136
1137Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1138immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1139reallocating the buffer if necessary. This means that lexing code that
1140runs later will see the characters as if they had appeared in the input.
1141It is not recommended to do this as part of normal parsing, and most
1142uses of this facility run the risk of the inserted characters being
1143interpreted in an unintended manner.
1144
2d7f6611 1145The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1146are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1147being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1148not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1149need to construct a scalar.
1150
1151=cut
1152*/
1153
1154void
1155Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1156{
1157 char *pv;
1158 STRLEN len;
1159 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1160 if (flags)
1161 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1162 pv = SvPV(sv, len);
1163 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1164}
1165
1166/*
44170c9a 1167=for apidoc lex_unstuff
f0e67a1d
Z
1168
1169Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1170C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1171This hides the discarded text from any lexing code that runs later,
1172as if the text had never appeared.
1173
1174This is not the normal way to consume lexed text. For that, use
1175L</lex_read_to>.
1176
1177=cut
1178*/
1179
1180void
1181Perl_lex_unstuff(pTHX_ char *ptr)
1182{
1183 char *buf, *bufend;
1184 STRLEN unstuff_len;
1185 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1186 buf = PL_parser->bufptr;
1187 if (ptr < buf)
1188 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1189 if (ptr == buf)
1190 return;
1191 bufend = PL_parser->bufend;
1192 if (ptr > bufend)
1193 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1194 unstuff_len = ptr - buf;
1195 Move(ptr, buf, bufend+1-ptr, char);
1196 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1197 PL_parser->bufend = bufend - unstuff_len;
1198}
1199
1200/*
44170c9a 1201=for apidoc lex_read_to
f0e67a1d
Z
1202
1203Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1204to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1205performing the correct bookkeeping whenever a newline character is passed.
1206This is the normal way to consume lexed text.
1207
1208Interpretation of the buffer's octets can be abstracted out by
1209using the slightly higher-level functions L</lex_peek_unichar> and
1210L</lex_read_unichar>.
1211
1212=cut
1213*/
1214
1215void
1216Perl_lex_read_to(pTHX_ char *ptr)
1217{
1218 char *s;
1219 PERL_ARGS_ASSERT_LEX_READ_TO;
1220 s = PL_parser->bufptr;
1221 if (ptr < s || ptr > PL_parser->bufend)
1222 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1223 for (; s != ptr; s++)
1224 if (*s == '\n') {
83944c01 1225 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1226 PL_parser->linestart = s+1;
1227 }
1228 PL_parser->bufptr = ptr;
1229}
1230
1231/*
44170c9a 1232=for apidoc lex_discard_to
f0e67a1d
Z
1233
1234Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1235up to C<ptr>. The remaining content of the buffer will be moved, and
1236all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1237be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1238it is not permitted to discard text that has yet to be lexed.
1239
1240Normally it is not necessarily to do this directly, because it suffices to
1241use the implicit discarding behaviour of L</lex_next_chunk> and things
1242based on it. However, if a token stretches across multiple lines,
1f317c95 1243and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1244that purpose, then after completion of the token it would be wise to
1245explicitly discard the now-unneeded earlier lines, to avoid future
1246multi-line tokens growing the buffer without bound.
1247
1248=cut
1249*/
1250
1251void
1252Perl_lex_discard_to(pTHX_ char *ptr)
1253{
1254 char *buf;
1255 STRLEN discard_len;
1256 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1257 buf = SvPVX(PL_parser->linestr);
1258 if (ptr < buf)
1259 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1260 if (ptr == buf)
1261 return;
1262 if (ptr > PL_parser->bufptr)
1263 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1264 discard_len = ptr - buf;
1265 if (PL_parser->oldbufptr < ptr)
1266 PL_parser->oldbufptr = ptr;
1267 if (PL_parser->oldoldbufptr < ptr)
1268 PL_parser->oldoldbufptr = ptr;
1269 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1270 PL_parser->last_uni = NULL;
1271 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1272 PL_parser->last_lop = NULL;
1273 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1274 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1275 PL_parser->bufend -= discard_len;
1276 PL_parser->bufptr -= discard_len;
1277 PL_parser->oldbufptr -= discard_len;
1278 PL_parser->oldoldbufptr -= discard_len;
1279 if (PL_parser->last_uni)
1280 PL_parser->last_uni -= discard_len;
1281 if (PL_parser->last_lop)
1282 PL_parser->last_lop -= discard_len;
1283}
1284
efa571ab
KW
1285void
1286Perl_notify_parser_that_changed_to_utf8(pTHX)
1287{
1288 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1289 * off to on. At compile time, this has the effect of entering a 'use
1290 * utf8' section. This means that any input was not previously checked for
1291 * UTF-8 (because it was off), but now we do need to check it, or our
1292 * assumptions about the input being sane could be wrong, and we could
1293 * segfault. This routine just sets a flag so that the next time we look
1294 * at the input we do the well-formed UTF-8 check. If we aren't in the
1295 * proper phase, there may not be a parser object, but if there is, setting
1296 * the flag is harmless */
1297
1298 if (PL_parser) {
1299 PL_parser->recheck_utf8_validity = TRUE;
1300 }
1301}
1302
f0e67a1d 1303/*
44170c9a 1304=for apidoc lex_next_chunk
f0e67a1d
Z
1305
1306Reads in the next chunk of text to be lexed, appending it to
1307L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1308looked to the end of the current chunk and wants to know more. It is
1309usual, but not necessary, for lexing to have consumed the entirety of
1310the current chunk at this time.
1311
1312If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1313chunk (i.e., the current chunk has been entirely consumed), normally the
1314current chunk will be discarded at the same time that the new chunk is
c5608a1f 1315read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1316will not be discarded. If the current chunk has not been entirely
1317consumed, then it will not be discarded regardless of the flag.
1318
1319Returns true if some new text was added to the buffer, or false if the
1320buffer has reached the end of the input text.
1321
5af38e47
KW
1322=for apidoc Amnh||LEX_KEEP_PREVIOUS
1323
f0e67a1d
Z
1324=cut
1325*/
1326
1327#define LEX_FAKE_EOF 0x80000000
e47d32dc 1328#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1329
1330bool
1331Perl_lex_next_chunk(pTHX_ U32 flags)
1332{
1333 SV *linestr;
1334 char *buf;
1335 STRLEN old_bufend_pos, new_bufend_pos;
1336 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1337 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1338 bool got_some_for_debugger = 0;
f0e67a1d 1339 bool got_some;
6cdc5cd8 1340
112d1284 1341 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1342 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1343 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1344 return FALSE;
f0e67a1d
Z
1345 linestr = PL_parser->linestr;
1346 buf = SvPVX(linestr);
407f8cf2
KW
1347 if (!(flags & LEX_KEEP_PREVIOUS)
1348 && PL_parser->bufptr == PL_parser->bufend)
1349 {
f0e67a1d
Z
1350 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1351 linestart_pos = 0;
1352 if (PL_parser->last_uni != PL_parser->bufend)
1353 PL_parser->last_uni = NULL;
1354 if (PL_parser->last_lop != PL_parser->bufend)
1355 PL_parser->last_lop = NULL;
1356 last_uni_pos = last_lop_pos = 0;
1357 *buf = 0;
2324bdb9 1358 SvCUR_set(linestr, 0);
f0e67a1d
Z
1359 } else {
1360 old_bufend_pos = PL_parser->bufend - buf;
1361 bufptr_pos = PL_parser->bufptr - buf;
1362 oldbufptr_pos = PL_parser->oldbufptr - buf;
1363 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1364 linestart_pos = PL_parser->linestart - buf;
1365 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1366 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1367 }
1368 if (flags & LEX_FAKE_EOF) {
1369 goto eof;
60d63348 1370 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1371 got_some = 0;
1372 } else if (filter_gets(linestr, old_bufend_pos)) {
1373 got_some = 1;
17cc9359 1374 got_some_for_debugger = 1;
112d1284
FC
1375 } else if (flags & LEX_NO_TERM) {
1376 got_some = 0;
f0e67a1d 1377 } else {
580561a3 1378 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1379 SvPVCLEAR(linestr);
f0e67a1d
Z
1380 eof:
1381 /* End of real input. Close filehandle (unless it was STDIN),
1382 * then add implicit termination.
1383 */
87606032 1384 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1385 PerlIO_clearerr(PL_parser->rsfp);
1386 else if (PL_parser->rsfp)
1387 (void)PerlIO_close(PL_parser->rsfp);
1388 PL_parser->rsfp = NULL;
60d63348 1389 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1390 if (!PL_in_eval && PL_minus_p) {
1391 sv_catpvs(linestr,
1392 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1393 PL_minus_n = PL_minus_p = 0;
1394 } else if (!PL_in_eval && PL_minus_n) {
1395 sv_catpvs(linestr, /*{*/";}");
1396 PL_minus_n = 0;
1397 } else
1398 sv_catpvs(linestr, ";");
1399 got_some = 1;
1400 }
1401 buf = SvPVX(linestr);
1402 new_bufend_pos = SvCUR(linestr);
1403 PL_parser->bufend = buf + new_bufend_pos;
1404 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1405
07337b95
KW
1406 if (UTF) {
1407 const U8* first_bad_char_loc;
1408 if (UNLIKELY(! is_utf8_string_loc(
1409 (U8 *) PL_parser->bufptr,
1410 PL_parser->bufend - PL_parser->bufptr,
1411 &first_bad_char_loc)))
1412 {
1413 _force_out_malformed_utf8_message(first_bad_char_loc,
1414 (U8 *) PL_parser->bufend,
1415 0,
1416 1 /* 1 means die */ );
1417 NOT_REACHED; /* NOTREACHED */
1418 }
6cdc5cd8
KW
1419 }
1420
f0e67a1d
Z
1421 PL_parser->oldbufptr = buf + oldbufptr_pos;
1422 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1423 PL_parser->linestart = buf + linestart_pos;
1424 if (PL_parser->last_uni)
1425 PL_parser->last_uni = buf + last_uni_pos;
1426 if (PL_parser->last_lop)
1427 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1428 if (PL_parser->preambling != NOLINE) {
1429 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1430 PL_parser->preambling = NOLINE;
1431 }
407f8cf2
KW
1432 if ( got_some_for_debugger
1433 && PERLDB_LINE_OR_SAVESRC
1434 && PL_curstash != PL_debstash)
1435 {
f0e67a1d
Z
1436 /* debugger active and we're not compiling the debugger code,
1437 * so store the line into the debugger's array of lines
1438 */
1439 update_debugger_info(NULL, buf+old_bufend_pos,
1440 new_bufend_pos-old_bufend_pos);
1441 }
1442 return got_some;
1443}
1444
1445/*
44170c9a 1446=for apidoc lex_peek_unichar
f0e67a1d
Z
1447
1448Looks ahead one (Unicode) character in the text currently being lexed.
1449Returns the codepoint (unsigned integer value) of the next character,
1450or -1 if lexing has reached the end of the input text. To consume the
1451peeked character, use L</lex_read_unichar>.
1452
1453If the next character is in (or extends into) the next chunk of input
1454text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1455discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1456bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1457
1458If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1459is encountered, an exception is generated.
1460
1461=cut
1462*/
1463
1464I32
1465Perl_lex_peek_unichar(pTHX_ U32 flags)
1466{
1467 char *s, *bufend;
1468 if (flags & ~(LEX_KEEP_PREVIOUS))
1469 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1470 s = PL_parser->bufptr;
1471 bufend = PL_parser->bufend;
1472 if (UTF) {
1473 U8 head;
1474 I32 unichar;
1475 STRLEN len, retlen;
1476 if (s == bufend) {
1477 if (!lex_next_chunk(flags))
1478 return -1;
1479 s = PL_parser->bufptr;
1480 bufend = PL_parser->bufend;
1481 }
1482 head = (U8)*s;
54d004e8 1483 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1484 return head;
54d004e8
KW
1485 if (UTF8_IS_START(head)) {
1486 len = UTF8SKIP(&head);
f0e67a1d
Z
1487 while ((STRLEN)(bufend-s) < len) {
1488 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1489 break;
1490 s = PL_parser->bufptr;
1491 bufend = PL_parser->bufend;
1492 }
1493 }
c80e42f3 1494 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1495 if (retlen == (STRLEN)-1) {
75219bac
KW
1496 _force_out_malformed_utf8_message((U8 *) s,
1497 (U8 *) bufend,
1498 0,
1499 1 /* 1 means die */ );
1500 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1501 }
1502 return unichar;
1503 } else {
1504 if (s == bufend) {
1505 if (!lex_next_chunk(flags))
1506 return -1;
1507 s = PL_parser->bufptr;
1508 }
1509 return (U8)*s;
1510 }
1511}
1512
1513/*
44170c9a 1514=for apidoc lex_read_unichar
f0e67a1d
Z
1515
1516Reads the next (Unicode) character in the text currently being lexed.
1517Returns the codepoint (unsigned integer value) of the character read,
1518and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1519if lexing has reached the end of the input text. To non-destructively
1520examine the next character, use L</lex_peek_unichar> instead.
1521
1522If the next character is in (or extends into) the next chunk of input
1523text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1524discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1525bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1526
1527If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1528is encountered, an exception is generated.
1529
1530=cut
1531*/
1532
1533I32
1534Perl_lex_read_unichar(pTHX_ U32 flags)
1535{
1536 I32 c;
1537 if (flags & ~(LEX_KEEP_PREVIOUS))
1538 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1539 c = lex_peek_unichar(flags);
1540 if (c != -1) {
1541 if (c == '\n')
83944c01 1542 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1543 if (UTF)
1544 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1545 else
1546 ++(PL_parser->bufptr);
f0e67a1d
Z
1547 }
1548 return c;
1549}
1550
1551/*
44170c9a 1552=for apidoc lex_read_space
f0e67a1d
Z
1553
1554Reads optional spaces, in Perl style, in the text currently being
1555lexed. The spaces may include ordinary whitespace characters and
1556Perl-style comments. C<#line> directives are processed if encountered.
1557L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1558at a non-space character (or the end of the input text).
1559
1560If spaces extend into the next chunk of input text, the next chunk will
1561be read in. Normally the current chunk will be discarded at the same
c5608a1f 1562time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1563chunk will not be discarded.
1564
1565=cut
1566*/
1567
21791330 1568#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1569#define LEX_NO_NEXT_CHUNK 0x80000000
1570
f0e67a1d
Z
1571void
1572Perl_lex_read_space(pTHX_ U32 flags)
1573{
1574 char *s, *bufend;
21791330 1575 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1576 bool need_incline = 0;
21791330 1577 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1578 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1579 s = PL_parser->bufptr;
1580 bufend = PL_parser->bufend;
1581 while (1) {
1582 char c = *s;
1583 if (c == '#') {
1584 do {
1585 c = *++s;
1586 } while (!(c == '\n' || (c == 0 && s == bufend)));
1587 } else if (c == '\n') {
1588 s++;
21791330
FC
1589 if (can_incline) {
1590 PL_parser->linestart = s;
1591 if (s == bufend)
1592 need_incline = 1;
1593 else
c6875f94 1594 incline(s, bufend);
21791330 1595 }
f0e67a1d
Z
1596 } else if (isSPACE(c)) {
1597 s++;
1598 } else if (c == 0 && s == bufend) {
1599 bool got_more;
65c68e17 1600 line_t l;
f0998909
Z
1601 if (flags & LEX_NO_NEXT_CHUNK)
1602 break;
f0e67a1d 1603 PL_parser->bufptr = s;
65c68e17 1604 l = CopLINE(PL_curcop);
851b527a 1605 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1606 got_more = lex_next_chunk(flags);
65c68e17 1607 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1608 s = PL_parser->bufptr;
1609 bufend = PL_parser->bufend;
1610 if (!got_more)
1611 break;
21791330 1612 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1613 incline(s, bufend);
f0e67a1d
Z
1614 need_incline = 0;
1615 }
3c47da3c
FC
1616 } else if (!c) {
1617 s++;
f0e67a1d
Z
1618 } else {
1619 break;
1620 }
1621 }
f0e67a1d
Z
1622 PL_parser->bufptr = s;
1623}
1624
1625/*
fe788d6b 1626
44170c9a 1627=for apidoc validate_proto
fe788d6b
PM
1628
1629This function performs syntax checking on a prototype, C<proto>.
1630If C<warn> is true, any illegal characters or mismatched brackets
1631will trigger illegalproto warnings, declaring that they were
1632detected in the prototype for C<name>.
1633
1634The return value is C<true> if this is a valid prototype, and
1635C<false> if it is not, regardless of whether C<warn> was C<true> or
1636C<false>.
1637
1638Note that C<NULL> is a valid C<proto> and will always return C<true>.
1639
1640=cut
1641
1642 */
1643
1644bool
5783dc51 1645Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1646{
1647 STRLEN len, origlen;
11327fa1 1648 char *p;
fe788d6b
PM
1649 bool bad_proto = FALSE;
1650 bool in_brackets = FALSE;
1651 bool after_slash = FALSE;
1652 char greedy_proto = ' ';
1653 bool proto_after_greedy_proto = FALSE;
1654 bool must_be_last = FALSE;
1655 bool underscore = FALSE;
f791a21a 1656 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1657
1658 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1659
1660 if (!proto)
1661 return TRUE;
1662
11327fa1 1663 p = SvPV(proto, len);
fe788d6b
PM
1664 origlen = len;
1665 for (; len--; p++) {
1666 if (!isSPACE(*p)) {
1667 if (must_be_last)
1668 proto_after_greedy_proto = TRUE;
f791a21a 1669 if (underscore) {
4aada8b9 1670 if (!memCHRs(";@%", *p))
f791a21a
PM
1671 bad_proto_after_underscore = TRUE;
1672 underscore = FALSE;
1673 }
4aada8b9 1674 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
fe788d6b
PM
1675 bad_proto = TRUE;
1676 }
1677 else {
fe788d6b
PM
1678 if (*p == '[')
1679 in_brackets = TRUE;
1680 else if (*p == ']')
1681 in_brackets = FALSE;
407f8cf2
KW
1682 else if ((*p == '@' || *p == '%')
1683 && !after_slash
1684 && !in_brackets )
1685 {
fe788d6b
PM
1686 must_be_last = TRUE;
1687 greedy_proto = *p;
1688 }
1689 else if (*p == '_')
f791a21a 1690 underscore = TRUE;
fe788d6b
PM
1691 }
1692 if (*p == '\\')
1693 after_slash = TRUE;
1694 else
1695 after_slash = FALSE;
1696 }
1697 }
1698
1699 if (warn) {
b54d603d 1700 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1701 p -= origlen;
b54d603d
PM
1702 p = SvUTF8(proto)
1703 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1704 origlen, UNI_DISPLAY_ISPRINT)
1705 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1706
cbf83791
FC
1707 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1708 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1709 sv_catpvs(name2, "::");
1710 sv_catsv(name2, (SV *)name);
1711 name = name2;
1712 }
1713
fe788d6b
PM
1714 if (proto_after_greedy_proto)
1715 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1716 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1717 greedy_proto, SVfARG(name), p);
50278ed0
PM
1718 if (in_brackets)
1719 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1720 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1721 SVfARG(name), p);
b54d603d 1722 if (bad_proto)
fe788d6b 1723 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1724 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1725 SVfARG(name), p);
1726 if (bad_proto_after_underscore)
1727 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1728 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1729 SVfARG(name), p);
fe788d6b
PM
1730 }
1731
1732 return (! (proto_after_greedy_proto || bad_proto) );
1733}
1734
1735/*
ffb4593c
NT
1736 * S_incline
1737 * This subroutine has nothing to do with tilting, whether at windmills
1738 * or pinball tables. Its name is short for "increment line". It
57843af0 1739 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1740 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1741 * # line 500 "foo.pm"
1742 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1743 */
1744
76e3520e 1745STATIC void
c6875f94 1746S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1747{
d9095cec
NC
1748 const char *t;
1749 const char *n;
1750 const char *e;
8818d409 1751 line_t line_num;
22ff3130 1752 UV uv;
463ee0b2 1753
7918f24d
NC
1754 PERL_ARGS_ASSERT_INCLINE;
1755
d77eff5d
KW
1756 assert(end >= s);
1757
83944c01 1758 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1759 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1760 && s+1 == PL_bufend && *s == ';') {
1761 /* fake newline in string eval */
1762 CopLINE_dec(PL_curcop);
1763 return;
1764 }
463ee0b2
LW
1765 if (*s++ != '#')
1766 return;
d4c19fe8
AL
1767 while (SPACE_OR_TAB(*s))
1768 s++;
d77eff5d
KW
1769 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1770 s += sizeof("line") - 1;
73659bf1
GS
1771 else
1772 return;
084592ab 1773 if (SPACE_OR_TAB(*s))
73659bf1 1774 s++;
4e553d73 1775 else
73659bf1 1776 return;
d4c19fe8
AL
1777 while (SPACE_OR_TAB(*s))
1778 s++;
463ee0b2
LW
1779 if (!isDIGIT(*s))
1780 return;
d4c19fe8 1781
463ee0b2
LW
1782 n = s;
1783 while (isDIGIT(*s))
1784 s++;
07714eb4 1785 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1786 return;
bf4acbe4 1787 while (SPACE_OR_TAB(*s))
463ee0b2 1788 s++;
85bb8b90 1789 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1790 s++;
73659bf1
GS
1791 e = t + 1;
1792 }
463ee0b2 1793 else {
c35e046a 1794 t = s;
1bb1a3d6 1795 while (*t && !isSPACE(*t))
c35e046a 1796 t++;
73659bf1 1797 e = t;
463ee0b2 1798 }
bf4acbe4 1799 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1800 e++;
1801 if (*e != '\n' && *e != '\0')
1802 return; /* false alarm */
1803
22ff3130
HS
1804 if (!grok_atoUV(n, &uv, &e))
1805 return;
1806 line_num = ((line_t)uv) - 1;
8818d409 1807
f4dd75d9 1808 if (t - s > 0) {
d9095cec 1809 const STRLEN len = t - s;
3df32bda 1810
d36ee5be 1811 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1812 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1813 * to *{"::_<newfilename"} */
44867030
NC
1814 /* However, the long form of evals is only turned on by the
1815 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1816 GV * const cfgv = CopFILEGV(PL_curcop);
1817 if (cfgv) {
38bd7ad8
FC
1818 char smallbuf[128];
1819 STRLEN tmplen2 = len;
44867030 1820 char *tmpbuf2;
449dd039 1821 GV *gv2;
44867030
NC
1822
1823 if (tmplen2 + 2 <= sizeof smallbuf)
1824 tmpbuf2 = smallbuf;
1825 else
1826 Newx(tmpbuf2, tmplen2 + 2, char);
1827
38bd7ad8
FC
1828 tmpbuf2[0] = '_';
1829 tmpbuf2[1] = '<';
44867030
NC
1830
1831 memcpy(tmpbuf2 + 2, s, tmplen2);
1832 tmplen2 += 2;
1833
8a5ee598 1834 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1835 if (!isGV(gv2)) {
8a5ee598 1836 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1837 /* adjust ${"::_<newfilename"} to store the new file name */
1838 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1839 /* The line number may differ. If that is the case,
1840 alias the saved lines that are in the array.
1841 Otherwise alias the whole array. */
1842 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1843 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1844 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1845 }
38bd7ad8
FC
1846 else if (GvAV(cfgv)) {
1847 AV * const av = GvAV(cfgv);
515c395b
TC
1848 const line_t start = CopLINE(PL_curcop)+1;
1849 SSize_t items = AvFILLp(av) - start;
8818d409
FC
1850 if (items > 0) {
1851 AV * const av2 = GvAVn(gv2);
1852 SV **svp = AvARRAY(av) + start;
515c395b
TC
1853 Size_t l = line_num+1;
1854 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1855 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
8818d409
FC
1856 }
1857 }
e5527e4b 1858 }
44867030
NC
1859
1860 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1861 }
e66cf94c 1862 }
05ec9bb3 1863 CopFILE_free(PL_curcop);
449dd039 1864 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1865 }
8818d409 1866 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1867}
1868
80a702cd 1869STATIC void
15f169a1 1870S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1871{
1872 AV *av = CopFILEAVx(PL_curcop);
1873 if (av) {
7f1c3e8c
FC
1874 SV * sv;
1875 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1876 else {
1877 sv = *av_fetch(av, 0, 1);
1878 SvUPGRADE(sv, SVt_PVMG);
1879 }
847cc851 1880 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1881 if (orig_sv)
7f1c3e8c 1882 sv_catsv(sv, orig_sv);
5fa550fb 1883 else
7f1c3e8c
FC
1884 sv_catpvn(sv, buf, len);
1885 if (!SvIOK(sv)) {
1886 (void)SvIOK_on(sv);
1887 SvIV_set(sv, 0);
1888 }
1889 if (PL_parser->preambling == NOLINE)
1890 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1891 }
1892}
1893
ffb4593c 1894/*
8c6b0c7d 1895 * skipspace
ffb4593c
NT
1896 * Called to gobble the appropriate amount and type of whitespace.
1897 * Skips comments as well.
71fff7cb 1898 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1899 *
1900 * peekspace
1901 * Same thing, but look ahead without incrementing line numbers or
1902 * adjusting PL_linestart.
ffb4593c
NT
1903 */
1904
3218e223 1905#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1906#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1907
aabfeadc
KW
1908char *
1909Perl_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1910{
21791330 1911 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1912 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1913 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1914 s++;
f0e67a1d
Z
1915 } else {
1916 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1917 PL_bufptr = s;
21791330 1918 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1919 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1920 LEX_NO_NEXT_CHUNK : 0));
3280af22 1921 s = PL_bufptr;
f0e67a1d
Z
1922 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1923 if (PL_linestart > PL_bufptr)
1924 PL_bufptr = PL_linestart;
1925 return s;
463ee0b2 1926 }
5db06880 1927 return s;
a687059c 1928}
378cc40b 1929
ffb4593c
NT
1930/*
1931 * S_check_uni
1932 * Check the unary operators to ensure there's no ambiguity in how they're
1933 * used. An ambiguous piece of code would be:
1934 * rand + 5
1935 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1936 * the +5 is its argument.
1937 */
1938
76e3520e 1939STATIC void
cea2e8a9 1940S_check_uni(pTHX)
ba106d47 1941{
d4c19fe8 1942 const char *s;
2f3197b3 1943
3280af22 1944 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1945 return;
3280af22
NIS
1946 while (isSPACE(*PL_last_uni))
1947 PL_last_uni++;
c35e046a 1948 s = PL_last_uni;
fac0f7a3 1949 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1950 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1951 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1952 return;
6136c704 1953
9b387841 1954 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1955 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1956 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1957}
1958
ffb4593c
NT
1959/*
1960 * LOP : macro to build a list operator. Its behaviour has been replaced
1961 * with a subroutine, S_lop() for which LOP is just another name.
1962 */
1963
a0d0e21e
LW
1964#define LOP(f,x) return lop(f,x,s)
1965
ffb4593c
NT
1966/*
1967 * S_lop
1968 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1969 * - if we have a next token, then it's a list operator (no parens) for
1970 * which the next token has already been parsed; e.g.,
1971 * sort foo @args
1972 * sort foo (@args)
ffb4593c
NT
1973 * - if the next thing is an opening paren, then it's a function
1974 * - else it's a list operator
1975 */
1976
76e3520e 1977STATIC I32
11288bb3 1978S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1979{
7918f24d
NC
1980 PERL_ARGS_ASSERT_LOP;
1981
6154021b 1982 pl_yylval.ival = f;
35c8bce7 1983 CLINE;
3280af22
NIS
1984 PL_bufptr = s;
1985 PL_last_lop = PL_oldbufptr;
eb160463 1986 PL_last_lop_op = (OPCODE)f;
3280af22 1987 if (PL_nexttoke)
78cdf107 1988 goto lstop;
19f1898a 1989 PL_expect = x;
79072805 1990 if (*s == '(')
bbf60fe6 1991 return REPORT(FUNC);
294a536f 1992 s = skipspace(s);
79072805 1993 if (*s == '(')
bbf60fe6 1994 return REPORT(FUNC);
78cdf107
Z
1995 else {
1996 lstop:
1997 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1998 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1999 return REPORT(LSTOP);
78cdf107 2000 }
79072805
LW
2001}
2002
ffb4593c
NT
2003/*
2004 * S_force_next
9cbb5ea2 2005 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2006 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2007 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
2008 * will need to set PL_nextval[] and possibly PL_expect to ensure
2009 * the lexer handles the token correctly.
ffb4593c
NT
2010 */
2011
4e553d73 2012STATIC void
cea2e8a9 2013S_force_next(pTHX_ I32 type)
79072805 2014{
704d4215
GG
2015#ifdef DEBUGGING
2016 if (DEBUG_T_TEST) {
2017 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2018 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2019 }
2020#endif
1f7c3e7c 2021 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2022 PL_nexttype[PL_nexttoke] = type;
2023 PL_nexttoke++;
79072805
LW
2024}
2025
89f35911
FC
2026/*
2027 * S_postderef
2028 *
2029 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2030 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2031 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2032 * only the first, leaving yylex to find the next.
89f35911
FC
2033 */
2034
2035static int
ff25e5db 2036S_postderef(pTHX_ int const funny, char const next)
89f35911 2037{
4aada8b9 2038 assert(funny == DOLSHARP || memCHRs("$@%&*", funny));
89f35911
FC
2039 if (next == '*') {
2040 PL_expect = XOPERATOR;
cc624add 2041 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 2042 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 2043 PL_lex_state = LEX_INTERPEND;
c3492809
FC
2044 if ('@' == funny)
2045 force_next(POSTJOIN);
cc624add 2046 }
89f35911
FC
2047 force_next(next);
2048 PL_bufptr+=2;
2049 }
2050 else {
760ca746
FC
2051 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2052 && !PL_lex_brackets)
2053 PL_lex_dojoin = 2;
89f35911
FC
2054 PL_expect = XOPERATOR;
2055 PL_bufptr++;
2056 }
2057 return funny;
2058}
2059
28ac2b49
Z
2060void
2061Perl_yyunlex(pTHX)
2062{
a7aaec61
Z
2063 int yyc = PL_parser->yychar;
2064 if (yyc != YYEMPTY) {
2065 if (yyc) {
a7aaec61 2066 NEXTVAL_NEXTTOKE = PL_parser->yylval;
669dd22c 2067 if (yyc == PERLY_BRACE_OPEN || yyc == HASHBRACK || yyc == PERLY_BRACKET_OPEN) {
78cdf107 2068 PL_lex_allbrackets--;
a7aaec61 2069 PL_lex_brackets--;
78cdf107
Z
2070 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2071 } else if (yyc == '('/*)*/) {
2072 PL_lex_allbrackets--;
2073 yyc |= (2<<24);
a7aaec61
Z
2074 }
2075 force_next(yyc);
2076 }
28ac2b49
Z
2077 PL_parser->yychar = YYEMPTY;
2078 }
2079}
2080
d0a148a6 2081STATIC SV *
15f169a1 2082S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2083{
740cce10 2084 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2085 ! IN_BYTES
2086 && UTF
53463649 2087 && len != 0
6d492fac 2088 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2089 return sv;
2090}
2091
ffb4593c
NT
2092/*
2093 * S_force_word
2094 * When the lexer knows the next thing is a word (for instance, it has
2095 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2096 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2097 * lookahead.
ffb4593c
NT
2098 *
2099 * Arguments:
b1b65b59 2100 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2101 * int token : PL_next* will be this type of bare word
2102 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2103 * int check_keyword : if true, Perl checks to make sure the word isn't
2104 * a keyword (do this if the word is a label, e.g. goto FOO)
2105 * int allow_pack : if true, : characters will also be allowed (require,
2106 * use, etc. do this)
ffb4593c
NT
2107 */
2108
76e3520e 2109STATIC char *
345b3785 2110S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2111{
eb578fdb 2112 char *s;
463ee0b2 2113 STRLEN len;
4e553d73 2114
7918f24d
NC
2115 PERL_ARGS_ASSERT_FORCE_WORD;
2116
294a536f 2117 start = skipspace(start);
463ee0b2 2118 s = start;
fac0f7a3 2119 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2120 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2121 {
3280af22 2122 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2123 if (check_keyword) {
2124 char *s2 = PL_tokenbuf;
487e470d 2125 STRLEN len2 = len;
de627158
KW
2126 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2127 s2 += sizeof("CORE::") - 1;
2128 len2 -= sizeof("CORE::") - 1;
2129 }
487e470d 2130 if (keyword(s2, len2, 0))
463ee0b2 2131 return start;
01b5ef50 2132 }
463ee0b2 2133 if (token == METHOD) {
294a536f 2134 s = skipspace(s);
463ee0b2 2135 if (*s == '(')
3280af22 2136 PL_expect = XTERM;
463ee0b2 2137 else {
3280af22 2138 PL_expect = XOPERATOR;
463ee0b2 2139 }
79072805 2140 }
9ded7720 2141 NEXTVAL_NEXTTOKE.opval
275103cd 2142 = newSVOP(OP_CONST,0,
d0a148a6 2143 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2144 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2145 force_next(token);
2146 }
2147 return s;
2148}
2149
ffb4593c
NT
2150/*
2151 * S_force_ident
9cbb5ea2 2152 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2153 * text only contains the "foo" portion. The first argument is a pointer
2154 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2155 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2156 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2157 */
2158
76e3520e 2159STATIC void
5aaab254 2160S_force_ident(pTHX_ const char *s, int kind)
79072805 2161{
7918f24d
NC
2162 PERL_ARGS_ASSERT_FORCE_IDENT;
2163
c9b48522
DD
2164 if (s[0]) {
2165 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2166 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2167 UTF ? SVf_UTF8 : 0));
9ded7720 2168 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2169 force_next(BAREWORD);
748a9306 2170 if (kind) {
11343788 2171 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2172 /* XXX see note in pp_entereval() for why we forgo typo
2173 warnings if the symbol must be introduced in an eval.
2174 GSAR 96-10-12 */
90e5519e 2175 gv_fetchpvn_flags(s, len,
4bff32c5 2176 (PL_in_eval ? GV_ADDMULTI
728847b1 2177 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2178 kind == '$' ? SVt_PV :
2179 kind == '@' ? SVt_PVAV :
2180 kind == '%' ? SVt_PVHV :
a0d0e21e 2181 SVt_PVGV
90e5519e 2182 );
748a9306 2183 }
79072805
LW
2184 }
2185}
2186
3f33d153
FC
2187static void
2188S_force_ident_maybe_lex(pTHX_ char pit)
2189{
3f33d153
FC
2190 NEXTVAL_NEXTTOKE.ival = pit;
2191 force_next('p');
2192}
2193
1571675a
GS
2194NV
2195Perl_str_to_version(pTHX_ SV *sv)
2196{
2197 NV retval = 0.0;
2198 NV nshift = 1.0;
2199 STRLEN len;
cfd0369c 2200 const char *start = SvPV_const(sv,len);
9d4ba2ae 2201 const char * const end = start + len;
8298454c 2202 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2203
2204 PERL_ARGS_ASSERT_STR_TO_VERSION;
2205
1571675a 2206 while (start < end) {
ba210ebe 2207 STRLEN skip;
1571675a
GS
2208 UV n;
2209 if (utf)
9041c2e3 2210 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2211 else {
2212 n = *(U8*)start;
2213 skip = 1;
2214 }
2215 retval += ((NV)n)/nshift;
2216 start += skip;
2217 nshift *= 1000;
2218 }
2219 return retval;
2220}
2221
4e553d73 2222/*
ffb4593c
NT
2223 * S_force_version
2224 * Forces the next token to be a version number.
e759cc13
RGS
2225 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2226 * and if "guessing" is TRUE, then no new token is created (and the caller
2227 * must use an alternative parsing method).
ffb4593c
NT
2228 */
2229
76e3520e 2230STATIC char *
e759cc13 2231S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2232{
5f66b61c 2233 OP *version = NULL;
44dcb63b 2234 char *d;
89bfa8cd 2235
7918f24d
NC
2236 PERL_ARGS_ASSERT_FORCE_VERSION;
2237
294a536f 2238 s = skipspace(s);
89bfa8cd 2239
44dcb63b 2240 d = s;
dd629d5b 2241 if (*d == 'v')
44dcb63b 2242 d++;
44dcb63b 2243 if (isDIGIT(*d)) {
e759cc13
RGS
2244 while (isDIGIT(*d) || *d == '_' || *d == '.')
2245 d++;
4e4da3ac 2246 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2247 SV *ver;
6154021b
RGS
2248 s = scan_num(s, &pl_yylval);
2249 version = pl_yylval.opval;
dd629d5b
GS
2250 ver = cSVOPx(version)->op_sv;
2251 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2252 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2253 SvNV_set(ver, str_to_version(ver));
1571675a 2254 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2255 }
89bfa8cd 2256 }
5db06880 2257 else if (guessing) {
e759cc13 2258 return s;
5db06880 2259 }
89bfa8cd
PP
2260 }
2261
2262 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2263 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2264 force_next(BAREWORD);
89bfa8cd 2265
e759cc13 2266 return s;
89bfa8cd
PP
2267}
2268
ffb4593c 2269/*
91152fc1
DG
2270 * S_force_strict_version
2271 * Forces the next token to be a version number using strict syntax rules.
2272 */
2273
2274STATIC char *
2275S_force_strict_version(pTHX_ char *s)
2276{
91152fc1 2277 OP *version = NULL;
91152fc1
DG
2278 const char *errstr = NULL;
2279
2280 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2281
2282 while (isSPACE(*s)) /* leading whitespace */
2283 s++;
2284
2285 if (is_STRICT_VERSION(s,&errstr)) {
2286 SV *ver = newSV(0);
2287 s = (char *)scan_version(s, ver, 0);
2288 version = newSVOP(OP_CONST, 0, ver);
2289 }
407f8cf2
KW
2290 else if ((*s != ';' && *s != '{' && *s != '}' )
2291 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2292 {
91152fc1
DG
2293 PL_bufptr = s;
2294 if (errstr)
2295 yyerror(errstr); /* version required */
2296 return s;
2297 }
2298
91152fc1 2299 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2300 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2301 force_next(BAREWORD);
91152fc1
DG
2302
2303 return s;
2304}
2305
2306/*
ffb4593c 2307 * S_tokeq
ef3ff34d
KW
2308 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2309 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2310 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2311 */
2312
76e3520e 2313STATIC SV *
cea2e8a9 2314S_tokeq(pTHX_ SV *sv)
79072805 2315{
eb578fdb
KW
2316 char *s;
2317 char *send;
2318 char *d;
b3ac6de7 2319 SV *pv = sv;
79072805 2320
7918f24d
NC
2321 PERL_ARGS_ASSERT_TOKEQ;
2322
279b35ad
FC
2323 assert (SvPOK(sv));
2324 assert (SvLEN(sv));
2325 assert (!SvIsCOW(sv));
307ed071 2326 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2327 goto finish;
279b35ad
FC
2328 s = SvPVX(sv);
2329 send = SvEND(sv);
dcb21ed6
NC
2330 /* This is relying on the SV being "well formed" with a trailing '\0' */
2331 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2332 s++;
2333 if (s == send)
b3ac6de7 2334 goto finish;
79072805 2335 d = s;
be4731d2 2336 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2337 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2338 SVs_TEMP | SvUTF8(sv));
be4731d2 2339 }
79072805
LW
2340 while (s < send) {
2341 if (*s == '\\') {
a0d0e21e 2342 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2343 s++; /* all that, just for this */
2344 }
2345 *d++ = *s++;
2346 }
2347 *d = '\0';
95a20fc0 2348 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2349 finish:
3280af22 2350 if ( PL_hints & HINT_NEW_STRING )
164e423c 2351 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
79072805
LW
2352 return sv;
2353}
2354
ffb4593c
NT
2355/*
2356 * Now come three functions related to double-quote context,
2357 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2358 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2359 * interact with PL_lex_state, and create fake ( ... ) argument lists
2360 * to handle functions and concatenation.
ecd24171
DM
2361 * For example,
2362 * "foo\lbar"
2363 * is tokenised as
2364 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2365 */
2366
2367/*
2368 * S_sublex_start
6154021b 2369 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2370 *
2371 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2372 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2373 *
16d1d8bd 2374 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2375 *
2376 * Everything else becomes a FUNC.
2377 *
16d1d8bd
LM
2378 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2379 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2380 * call to S_sublex_push().
2381 */
2382
76e3520e 2383STATIC I32
cea2e8a9 2384S_sublex_start(pTHX)
79072805 2385{
eb578fdb 2386 const I32 op_type = pl_yylval.ival;
79072805
LW
2387
2388 if (op_type == OP_NULL) {
6154021b 2389 pl_yylval.opval = PL_lex_op;
5f66b61c 2390 PL_lex_op = NULL;
79072805
LW
2391 return THING;
2392 }
466112bb 2393 if (op_type == OP_CONST) {
67c71cbb
FC
2394 SV *sv = PL_lex_stuff;
2395 PL_lex_stuff = NULL;
2396 sv = tokeq(sv);
b3ac6de7
IZ
2397
2398 if (SvTYPE(sv) == SVt_PVIV) {
2399 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2400 STRLEN len;
96a5add6 2401 const char * const p = SvPV_const(sv, len);
740cce10 2402 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2403 SvREFCNT_dec(sv);
2404 sv = nsv;
4e553d73 2405 }
275103cd 2406 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2407 return THING;
2408 }
2409
7ef70b3d
FC
2410 PL_parser->lex_super_state = PL_lex_state;
2411 PL_parser->lex_sub_inwhat = (U16)op_type;
2412 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2413 PL_parser->sub_no_recover = FALSE;
2414 PL_parser->sub_error_count = PL_error_count;
3280af22 2415 PL_lex_state = LEX_INTERPPUSH;
55497cff 2416
3280af22
NIS
2417 PL_expect = XTERM;
2418 if (PL_lex_op) {
6154021b 2419 pl_yylval.opval = PL_lex_op;
5f66b61c 2420 PL_lex_op = NULL;
55497cff
PP
2421 return PMFUNC;
2422 }
2423 else
2424 return FUNC;
2425}
2426
ffb4593c
NT
2427/*
2428 * S_sublex_push
2429 * Create a new scope to save the lexing state. The scope will be
2430 * ended in S_sublex_done. Returns a '(', starting the function arguments
2431 * to the uc, lc, etc. found before.
2432 * Sets PL_lex_state to LEX_INTERPCONCAT.
2433 */
2434
76e3520e 2435STATIC I32
cea2e8a9 2436S_sublex_push(pTHX)
55497cff 2437{
78a635de 2438 LEXSHARED *shared;
801d32ac 2439 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2440 ENTER;
55497cff 2441
7ef70b3d 2442 PL_lex_state = PL_parser->lex_super_state;
cc624add 2443 SAVEI8(PL_lex_dojoin);
3280af22 2444 SAVEI32(PL_lex_brackets);
78cdf107 2445 SAVEI32(PL_lex_allbrackets);
b27dce25 2446 SAVEI32(PL_lex_formbrack);
78cdf107 2447 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2448 SAVEI32(PL_lex_casemods);
2449 SAVEI32(PL_lex_starts);
651b5b28 2450 SAVEI8(PL_lex_state);
7cc34111 2451 SAVESPTR(PL_lex_repl);
7766f137 2452 SAVEVPTR(PL_lex_inpat);
98246f1e 2453 SAVEI16(PL_lex_inwhat);
ffdb8b16 2454 if (is_heredoc)
b42366d4 2455 {
ffdb8b16 2456 SAVECOPLINE(PL_curcop);
b42366d4 2457 SAVEI32(PL_multi_end);
851b527a
FC
2458 SAVEI32(PL_parser->herelines);
2459 PL_parser->herelines = 0;
b42366d4 2460 }
2ca4363d 2461 SAVEIV(PL_multi_close);
3280af22 2462 SAVEPPTR(PL_bufptr);
8452ff4b 2463 SAVEPPTR(PL_bufend);
3280af22
NIS
2464 SAVEPPTR(PL_oldbufptr);
2465 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2466 SAVEPPTR(PL_last_lop);
2467 SAVEPPTR(PL_last_uni);
3280af22
NIS
2468 SAVEPPTR(PL_linestart);
2469 SAVESPTR(PL_linestr);
8edd5f42
RGS
2470 SAVEGENERICPV(PL_lex_brackstack);
2471 SAVEGENERICPV(PL_lex_casestack);
78a635de 2472 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2473 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2474 SAVEI32(PL_copline);
3280af22 2475
99bd9d90 2476 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2477 scopes to find the body of the here-doc. So we put PL_linestr and
2478 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2479 */
60f40a38
FC
2480 PL_parser->lex_shared->ls_linestr = PL_linestr;
2481 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2482
3280af22 2483 PL_linestr = PL_lex_stuff;
7ef70b3d 2484 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2485 PL_lex_stuff = NULL;
7ef70b3d 2486 PL_parser->lex_sub_repl = NULL;
3280af22 2487
eabab8bc
FC
2488 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2489 set for an inner quote-like operator and then an error causes scope-
2490 popping. We must not have a PL_lex_stuff value left dangling, as
2491 that breaks assumptions elsewhere. See bug #123617. */
2492 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2493 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2494
9cbb5ea2
GS
2495 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2496 = SvPVX(PL_linestr);
3280af22 2497 PL_bufend += SvCUR(PL_linestr);
bd61b366 2498 PL_last_lop = PL_last_uni = NULL;
3280af22 2499 SAVEFREESV(PL_linestr);
4dc843bc 2500 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2501
2502 PL_lex_dojoin = FALSE;
b27dce25 2503 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2504 PL_lex_allbrackets = 0;
2505 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2506 Newx(PL_lex_brackstack, 120, char);
2507 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2508 PL_lex_casemods = 0;
2509 *PL_lex_casestack = '\0';
2510 PL_lex_starts = 0;
2511 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2512 if (is_heredoc)
6ddcf93b 2513 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2514 PL_copline = NOLINE;
caae0700 2515
78a635de
FC
2516 Newxz(shared, 1, LEXSHARED);
2517 shared->ls_prev = PL_parser->lex_shared;
2518 PL_parser->lex_shared = shared;
3280af22 2519
7ef70b3d 2520 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2521 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2522 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2523 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2524 else
5f66b61c 2525 PL_lex_inpat = NULL;
79072805 2526
3a54fd60
DM
2527 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2528 PL_in_eval &= ~EVAL_RE_REPARSING;
2529
69afcc21 2530 return SUBLEXSTART;
79072805
LW
2531}
2532
ffb4593c
NT
2533/*
2534 * S_sublex_done
2535 * Restores lexer state after a S_sublex_push.
2536 */
2537
76e3520e 2538STATIC I32
cea2e8a9 2539S_sublex_done(pTHX)
79072805 2540{
3280af22 2541 if (!PL_lex_starts++) {
396482e1 2542 SV * const sv = newSVpvs("");
9aa983d2
JH
2543 if (SvUTF8(PL_linestr))
2544 SvUTF8_on(sv);
3280af22 2545 PL_expect = XOPERATOR;
275103cd 2546 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2547 return THING;
2548 }
2549
3280af22
NIS
2550 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2551 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2552 return yylex();
79072805
LW
2553 }
2554
ffb4593c 2555 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2556 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2557 if (PL_lex_repl) {
2558 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2559 PL_linestr = PL_lex_repl;
2560 PL_lex_inpat = 0;
2561 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2562 PL_bufend += SvCUR(PL_linestr);
bd61b366 2563 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2564 PL_lex_dojoin = FALSE;
2565 PL_lex_brackets = 0;
78cdf107
Z
2566 PL_lex_allbrackets = 0;
2567 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2568 PL_lex_casemods = 0;
2569 *PL_lex_casestack = '\0';
2570 PL_lex_starts = 0;
25da4f38 2571 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2572 PL_lex_state = LEX_INTERPNORMAL;
2573 PL_lex_starts++;
e9fa98b2
HS
2574 /* we don't clear PL_lex_repl here, so that we can check later
2575 whether this is an evalled subst; that means we rely on the
2576 logic to ensure sublex_done() is called again only via the
2577 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2578 }
e9fa98b2 2579 else {
3280af22 2580 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2581 PL_lex_repl = NULL;
e9fa98b2 2582 }
ffdb8b16
FC
2583 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2584 CopLINE(PL_curcop) +=
9420b268 2585 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2586 + PL_parser->herelines;
2587 PL_parser->herelines = 0;
ffdb8b16 2588 }
9b6b7be8 2589 return '/';
ffed7fef
LW
2590 }
2591 else {
b42366d4 2592 const line_t l = CopLINE(PL_curcop);
f46d017c 2593 LEAVE;
bb4e4c38 2594 if (PL_parser->sub_error_count != PL_error_count) {
bb4e4c38 2595 if (PL_parser->sub_no_recover) {
ad1ecdf7 2596 yyquit();
bb4e4c38
TC
2597 NOT_REACHED;
2598 }
2599 }
b42366d4 2600 if (PL_multi_close == '<')
851b527a 2601 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2602 PL_bufend = SvPVX(PL_linestr);
2603 PL_bufend += SvCUR(PL_linestr);
2604 PL_expect = XOPERATOR;
69afcc21 2605 return SUBLEXEND;
ffed7fef
LW
2606 }
2607}
2608
4e8ee35f
KW
2609HV *
2610Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2611 const STRLEN context_len, const char ** error_msg)
2612{
2613 /* Load the official _charnames module if not already there. The
2614 * parameters are just to give info for any error messages generated:
2615 * char_name a name to look up which is the reason for loading this
2616 * context 'char_name' in the context in the input in which it appears
2617 * context_len how many bytes 'context' occupies
2618 * error_msg *error_msg will be set to any error
2619 *
2620 * Returns the ^H table if success; otherwise NULL */
2621
2622 unsigned int i;
2623 HV * table;
2624 SV **cvp;
2625 SV * res;
2626
2627 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2628
2629 /* This loop is executed 1 1/2 times. On the first time through, if it
2630 * isn't already loaded, try loading it, and iterate just once to see if it
2631 * worked. */
2632 for (i = 0; i < 2; i++) {
2633 table = GvHV(PL_hintgv); /* ^H */
2634
2635 if ( table
2636 && (PL_hints & HINT_LOCALIZE_HH)
2637 && (cvp = hv_fetchs(table, "charnames", FALSE))
2638 && SvOK(*cvp))
2639 {
2640 return table; /* Quit if already loaded */
2641 }
2642
2643 if (i == 0) {
2644 Perl_load_module(aTHX_
2645 0,
2646 newSVpvs("_charnames"),
2647
2648 /* version parameter; no need to specify it, as if we get too early
2649 * a version, will fail anyway, not being able to find 'charnames'
2650 * */
2651 NULL,
2652 newSVpvs(":full"),
2653 newSVpvs(":short"),
2654 NULL);
2655 }
2656 }
2657
2658 /* Here, it failed; new_constant will give appropriate error messages */
2659 *error_msg = NULL;
2660 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2661 context, context_len, error_msg);
2662 SvREFCNT_dec(res);
2663
2664 return NULL;
2665}
2666
7a7d14f3 2667STATIC SV*
2c43c309
KW
2668S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2669{
2670 /* This justs wraps get_and_check_backslash_N_name() to output any error
2671 * message it returns. */
2672
2673 const char * error_msg = NULL;
25c7fb78 2674 SV * result;
2c43c309
KW
2675
2676 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2677
25c7fb78
KW
2678 /* charnames doesn't work well if there have been errors found */
2679 if (PL_error_count > 0) {
2680 return NULL;
2681 }
2682
2683 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2684
2c43c309
KW
2685 if (error_msg) {
2686 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2687 }
2688
2689 return result;
2690}
2691
25c7fb78
KW
2692SV*
2693Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2694 const char* const e,
2695 const bool is_utf8,
2696 const char ** error_msg)
6f613c73 2697{
140b12ad
KW
2698 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2699 * interior, hence to the "}". Finds what the name resolves to, returning
25c7fb78
KW
2700 * an SV* containing it; NULL if no valid one found.
2701 *
2702 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2703 * doesn't have to be. */
6f613c73 2704
7303cc1f 2705 SV* char_name;
25c7fb78 2706 SV* res;
0c415a79
KW
2707 HV * table;
2708 SV **cvp;
2709 SV *cv;
2710 SV *rv;
2711 HV *stash;
8b0cce63
KW
2712
2713 /* Points to the beginning of the \N{... so that any messages include the
2714 * context of what's failing*/
2715 const char* context = s - 3;
2716 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2717
0c415a79 2718
6f613c73
KW
2719 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2720
25c7fb78
KW
2721 assert(e >= s);
2722 assert(s > (char *) 3);
2723
7303cc1f 2724 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
25c7fb78 2725
7303cc1f
KW
2726 if (!SvCUR(char_name)) {
2727 SvREFCNT_dec_NN(char_name);
be332ba0 2728 /* diag_listed_as: Unknown charname '%s' */
2c43c309 2729 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
be332ba0 2730 return NULL;
d8d26cac 2731 }
b7e6151c 2732
7303cc1f 2733 /* Autoload the charnames module */
7303cc1f 2734
4e8ee35f
KW
2735 table = load_charnames(char_name, context, context_len, error_msg);
2736 if (table == NULL) {
2737 return NULL;
7303cc1f
KW
2738 }
2739
2740 *error_msg = NULL;
8b0cce63
KW
2741 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2742 context, context_len, error_msg);
7303cc1f 2743 if (*error_msg) {
4e8ee35f 2744 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
7303cc1f
KW
2745
2746 SvREFCNT_dec(res);
6f613c73
KW
2747 return NULL;
2748 }
2749
0c415a79
KW
2750 /* See if the charnames handler is the Perl core's, and if so, we can skip
2751 * the validation needed for a user-supplied one, as Perl's does its own
2752 * validation. */
0c415a79 2753 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2754 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2755 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2756 {
2757 const char * const name = HvNAME(stash);
b59bf0b2 2758 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2759 return res;
2760 }
2761 }
2762
bde9e88d
KW
2763 /* Here, it isn't Perl's charname handler. We can't rely on a
2764 * user-supplied handler to validate the input name. For non-ut8 input,
2765 * look to see that the first character is legal. Then loop through the
2766 * rest checking that each is a continuation */
6f613c73 2767
36897d64
KW
2768 /* This code makes the reasonable assumption that the only Latin1-range
2769 * characters that begin a character name alias are alphabetic, otherwise
2770 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2771
25c7fb78 2772 if (! is_utf8) {
bde9e88d 2773 if (! isALPHAU(*s)) {
b6ba1137
KW
2774 goto bad_charname;
2775 }
bde9e88d
KW
2776 s++;
2777 while (s < e) {
2778 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2779 goto bad_charname;
2780 }
2d8eb851
KW
2781 if (*s == ' ' && *(s-1) == ' ') {
2782 goto multi_spaces;
bd299e29 2783 }
bde9e88d 2784 s++;
b6ba1137
KW
2785 }
2786 }
2787 else {
bde9e88d 2788 /* Similarly for utf8. For invariants can check directly; for other
58aa6738
KW
2789 * Latin1, can calculate their code point and check; otherwise use an
2790 * inversion list */
bde9e88d
KW
2791 if (UTF8_IS_INVARIANT(*s)) {
2792 if (! isALPHAU(*s)) {
140b12ad
KW
2793 goto bad_charname;
2794 }
bde9e88d
KW
2795 s++;
2796 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2797 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2798 goto bad_charname;
6f613c73 2799 }
bde9e88d 2800 s += 2;
6f613c73 2801 }
bde9e88d 2802 else {
f1bcae08
KW
2803 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2804 utf8_to_uvchr_buf((U8 *) s,
2805 (U8 *) e,
2806 NULL)))
2807 {
bde9e88d
KW
2808 goto bad_charname;
2809 }
2810 s += UTF8SKIP(s);
2811 }
2812
2813 while (s < e) {
2814 if (UTF8_IS_INVARIANT(*s)) {
2815 if (! isCHARNAME_CONT(*s)) {
2816 goto bad_charname;
2817 }
2d8eb851
KW
2818 if (*s == ' ' && *(s-1) == ' ') {
2819 goto multi_spaces;
bd299e29 2820 }
bde9e88d
KW
2821 s++;
2822 }
2823 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2824 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2825 {
2826 goto bad_charname;
2827 }
2828 s += 2;
2829 }
2830 else {
f1bcae08
KW
2831 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2832 utf8_to_uvchr_buf((U8 *) s,
2833 (U8 *) e,
2834 NULL)))
2835 {
bde9e88d
KW
2836 goto bad_charname;
2837 }
2838 s += UTF8SKIP(s);
6f613c73
KW
2839 }
2840 }
2d8eb851
KW
2841 }
2842 if (*(s-1) == ' ') {
8d9d0498
FC
2843 /* diag_listed_as: charnames alias definitions may not contain
2844 trailing white-space; marked by <-- HERE in %s
2845 */
2c43c309 2846 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2847 "charnames alias definitions may not contain trailing "
2848 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2849 (int)(s - context + 1), context,
2c43c309 2850 (int)(e - s + 1), s + 1);
2d8eb851 2851 return NULL;
6f613c73
KW
2852 }
2853
870fe793 2854 if (SvUTF8(res)) { /* Don't accept malformed charname value */
bde9e88d
KW
2855 const U8* first_bad_char_loc;
2856 STRLEN len;
2857 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2858 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2859 &first_bad_char_loc)))
2860 {
75219bac
KW
2861 _force_out_malformed_utf8_message(first_bad_char_loc,
2862 (U8 *) PL_parser->bufend,
2863 0,
2864 0 /* 0 means don't die */ );
8d9d0498
FC
2865 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2866 immediately after '%s' */
2c43c309 2867 *error_msg = Perl_form(aTHX_
bde9e88d 2868 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
8b0cce63 2869 (int) context_len, context,
2c43c309 2870 (int) ((char *) first_bad_char_loc - str), str);
bde9e88d
KW
2871 return NULL;
2872 }
2873 }
140b12ad 2874
bde9e88d 2875 return res;
140b12ad 2876
bde9e88d 2877 bad_charname: {
bde9e88d
KW
2878
2879 /* The final %.*s makes sure that should the trailing NUL be missing
2880 * that this print won't run off the end of the string */
8d9d0498
FC
2881 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2882 in \N{%s} */
2c43c309 2883 *error_msg = Perl_form(aTHX_
bde9e88d 2884 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2885 (int)(s - context + 1), context,
2c43c309 2886 (int)(e - s + 1), s + 1);
bde9e88d
KW
2887 return NULL;
2888 }
2d8eb851
KW
2889
2890 multi_spaces:
8d9d0498
FC
2891 /* diag_listed_as: charnames alias definitions may not contain a
2892 sequence of multiple spaces; marked by <-- HERE
2893 in %s */
2c43c309 2894 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2895 "charnames alias definitions may not contain a sequence of "
2896 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2897 (int)(s - context + 1), context,
2c43c309 2898 (int)(e - s + 1), s + 1);
2d8eb851 2899 return NULL;
6f613c73
KW
2900}
2901
02aa26ce
NT
2902/*
2903 scan_const
2904
9da1dd8f
DM
2905 Extracts the next constant part of a pattern, double-quoted string,
2906 or transliteration. This is terrifying code.
2907
2908 For example, in parsing the double-quoted string "ab\x63$d", it would
2909 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2910
94def140 2911 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2912 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2913 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2914
94def140
ST
2915 Returns a pointer to the character scanned up to. If this is
2916 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2917 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2918 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2919 by looking at the next characters herself.
2920
02aa26ce 2921 In patterns:
9da1dd8f 2922 expand:
537124e4
KW
2923 \N{FOO} => \N{U+hex_for_character_FOO}
2924 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2925
2926 pass through:
2927 all other \-char, including \N and \N{ apart from \N{ABC}
2928
2929 stops on:
2930 @ and $ where it appears to be a var, but not for $ as tail anchor
2931 \l \L \u \U \Q \E
2932 (?{ or (??{
2933
02aa26ce
NT
2934 In transliterations:
2935 characters are VERY literal, except for - not at the start or end
66b09263
KW
2936 of the string, which indicates a range. However some backslash sequences
2937 are recognized: \r, \n, and the like
2938 \007 \o{}, \x{}, \N{}
2939 If all elements in the transliteration are below 256,
94def140
ST
2940 scan_const expands the range to the full set of intermediate
2941 characters. If the range is in utf8, the hyphen is replaced with
2942 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2943
2944 In double-quoted strings:
2945 backslashes:
66b09263 2946 all those recognized in transliterations
94def140 2947 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2948 case and quoting: \U \Q \E
2949 stops on @ and $
2950
2951 scan_const does *not* construct ops to handle interpolated strings.
2952 It stops processing as soon as it finds an embedded $ or @ variable
2953 and leaves it to the caller to work out what's going on.
2954
94def140
ST
2955 embedded arrays (whether in pattern or not) could be:
2956 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2957
2958 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2959
2960 $ in pattern could be $foo or could be tail anchor. Assumption:
2961 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2962 followed by one of "()| \r\n\t"
02aa26ce 2963
9da1dd8f 2964 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2965
2966 The structure of the code is
2967 while (there's a character to process) {
94def140
ST
2968 handle transliteration ranges
2969 skip regexp comments /(?#comment)/ and codes /(?{code})/
2970 skip #-initiated comments in //x patterns
2971 check for embedded arrays
02aa26ce
NT
2972 check for embedded scalars
2973 if (backslash) {
94def140 2974 deprecate \1 in substitution replacements
02aa26ce
NT
2975 handle string-changing backslashes \l \U \Q \E, etc.
2976 switch (what was escaped) {
94def140 2977 handle \- in a transliteration (becomes a literal -)
ff3f963a 2978 if a pattern and not \N{, go treat as regular character
94def140
ST
2979 handle \132 (octal characters)
2980 handle \x15 and \x{1234} (hex characters)
ff3f963a 2981 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2982 handle \cV (control characters)
2983 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2984 } (end switch)
77a135fe 2985 continue
02aa26ce 2986 } (end if backslash)
77a135fe 2987 handle regular character
02aa26ce 2988 } (end while character to read)
02cd137d 2989
02aa26ce
NT
2990*/
2991
76e3520e 2992STATIC char *
cea2e8a9 2993S_scan_const(pTHX_ char *start)
79072805 2994{
eb578fdb 2995 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2996 SV *sv = newSV(send - start); /* sv for the constant. See note below
2997 on sizing. */
eb578fdb
KW
2998 char *s = start; /* start of the constant */
2999 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
3000 bool dorange = FALSE; /* are we in a translit range? */
3001 bool didrange = FALSE; /* did we just finish a range? */
3002 bool in_charclass = FALSE; /* within /[...]/ */
ed2893cb 3003 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
dc023dbb
KW
3004 UTF8? But, this can show as true
3005 when the source isn't utf8, as for
3006 example when it is entirely composed
3007 of hex constants */
f34acfec 3008 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
af9be36c
KW
3009 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3010 number of characters found so far
3011 that will expand (into 2 bytes)
3012 should we have to convert to
3013 UTF-8) */
6f613c73 3014 SV *res; /* result from charnames */
c89db733
JH
3015 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3016 high-end character is temporarily placed */
77a135fe 3017
fe2ba0a2
KW
3018 /* Does something require special handling in tr/// ? This avoids extra
3019 * work in a less likely case. As such, khw didn't feel it was worth
3020 * adding any branches to the more mainline code to handle this, which
3021 * means that this doesn't get set in some circumstances when things like
3022 * \x{100} get expanded out. As a result there needs to be extra testing
3023 * done in the tr code */
3024 bool has_above_latin1 = FALSE;
3025
77a135fe
KW
3026 /* Note on sizing: The scanned constant is placed into sv, which is
3027 * initialized by newSV() assuming one byte of output for every byte of
3028 * input. This routine expects newSV() to allocate an extra byte for a
3029 * trailing NUL, which this routine will append if it gets to the end of
3030 * the input. There may be more bytes of input than output (eg., \N{LATIN
3031 * CAPITAL LETTER A}), or more output than input if the constant ends up
3032 * recoded to utf8, but each time a construct is found that might increase
3033 * the needed size, SvGROW() is called. Its size parameter each time is
3034 * based on the best guess estimate at the time, namely the length used so
3035 * far, plus the length the current construct will occupy, plus room for
caae0700 3036 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 3037
c3320c2a
KW
3038 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3039 before set */
4c3a8340 3040#ifdef EBCDIC
f4240379
KW
3041 int backslash_N = 0; /* ? was the character from \N{} */
3042 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3043 platform-specific like \x65 */
4c3a8340 3044#endif
012bcf8d 3045
7918f24d
NC
3046 PERL_ARGS_ASSERT_SCAN_CONST;
3047
bb16bae8 3048 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0 3049
b899e89d
FC
3050 /* Protect sv from errors and fatal warnings. */
3051 ENTER_with_name("scan_const");
3052 SAVEFREESV(sv);
2b9d42f0 3053
1759517a
KW
3054 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3055 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3056 * valid */
3057 assert(*send == '\0');
3058
f4240379
KW
3059 while (s < send
3060 || dorange /* Handle tr/// range at right edge of input */
3061 ) {
ff3f963a 3062
02aa26ce 3063 /* get transliterations out of the way (they're most literal) */
3280af22 3064 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3065
f4240379
KW
3066 /* But there isn't any special handling necessary unless there is a
3067 * range, so for most cases we just drop down and handle the value
3068 * as any other. There are two exceptions.
3069 *
02cd137d
KW
3070 * 1. A hyphen indicates that we are actually going to have a
3071 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
3072 * down to handle what should be the end range value.
3073 * 2. After we've handled that value, the next time through, that
3074 * flag is set and we fix up the range.
3075 *
3076 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
3077 * order to make the transliteration a simple table look-up.
3078 * Ranges that extend above Latin1 have to be done differently, so
3079 * there is no advantage to expanding them here, so they are
dc8faf6b
KW
3080 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3081 * a byte that can't occur in legal UTF-8, and hence can signify a
3082 * hyphen without any possible ambiguity. On EBCDIC machines, if
3083 * the range is expressed as Unicode, the Latin1 portion is
3084 * expanded out even if the range extends above Latin1. This is
3085 * because each code point in it has to be processed here
3086 * individually to get its native translation */
f4240379
KW
3087
3088 if (! dorange) {
3089
02cd137d
KW
3090 /* Here, we don't think we're in a range. If the new character
3091 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
3092 * either edge to indicate a range, or if we haven't output any
3093 * characters yet then it's a regular character. */
04863ba1
KW
3094 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3095 {
f4240379
KW
3096
3097 /* A regular character. Process like any other, but first
3098 * clear any flags */
3099 didrange = FALSE;
3100 dorange = FALSE;
e294cc5d 3101#ifdef EBCDIC
f4240379
KW
3102 non_portable_endpoint = 0;
3103 backslash_N = 0;
e294cc5d 3104#endif
02cd137d
KW
3105 /* The tests here for being above Latin1 and similar ones
3106 * in the following 'else' suffice to find all such
3107 * occurences in the constant, except those added by a
fe2ba0a2 3108 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3109 * set 'has_above_latin1' as appropriate */
ed2893cb 3110 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3111 has_above_latin1 = TRUE;
3112 }
3113
f4240379
KW
3114 /* Drops down to generic code to process current byte */
3115 }
02cd137d 3116 else { /* Is a '-' in the context where it means a range */
f4240379 3117 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3118 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3119 " operator");
f4240379 3120 }
e294cc5d 3121
f4240379 3122 dorange = TRUE;
2b9d42f0 3123
02cd137d 3124 s++; /* Skip past the hyphen */
f4240379
KW
3125
3126 /* d now points to where the end-range character will be
d7f7b0e3
KW
3127 * placed. Drop down to get that character. We'll finish
3128 * processing the range the next time through the loop */
188d22cf 3129
ed2893cb 3130 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3131 has_above_latin1 = TRUE;
3132 }
02cd137d
KW
3133
3134 /* Drops down to generic code to process current byte */
f4240379
KW
3135 }
3136 } /* End of not a range */
3137 else {
3138 /* Here we have parsed a range. Now must handle it. At this
3139 * point:
3140 * 'sv' is a SV* that contains the output string we are
3141 * constructing. The final two characters in that string
3142 * are the range start and range end, in order.
3143 * 'd' points to just beyond the range end in the 'sv' string,
3144 * where we would next place something
f4240379 3145 */
0c311b7c 3146 char * max_ptr;
5ca37d54 3147 char * min_ptr;
f4240379
KW
3148 IV range_min;
3149 IV range_max; /* last character in range */
f4240379 3150 STRLEN grow;
5ca37d54
KW
3151 Size_t offset_to_min = 0;
3152 Size_t extras = 0;
11327fa1 3153#ifdef EBCDIC
f4240379
KW
3154 bool convert_unicode;
3155 IV real_range_max = 0;
e294cc5d 3156#endif
02cd137d 3157 /* Get the code point values of the range ends. */
0c311b7c
KW
3158 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3159 offset_to_max = max_ptr - SvPVX_const(sv);
ed2893cb 3160 if (d_is_utf8) {
f4240379
KW
3161 /* We know the utf8 is valid, because we just constructed
3162 * it ourselves in previous loop iterations */
3163 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3164 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3165 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3166
3167 /* This compensates for not all code setting
3168 * 'has_above_latin1', so that we don't skip stuff that
3169 * should be executed */
3170 if (range_max > 255) {
3171 has_above_latin1 = TRUE;
3172 }
e294cc5d 3173 }
f4240379
KW
3174 else {
3175 min_ptr = max_ptr - 1;
3176 range_min = * (U8*) min_ptr;
3177 range_max = * (U8*) max_ptr;
3178 }
3179
8efef67c
KW
3180 /* If the range is just a single code point, like tr/a-a/.../,
3181 * that code point is already in the output, twice. We can
3182 * just back up over the second instance and avoid all the rest
3183 * of the work. But if it is a variant character, it's been
218304f9
KW
3184 * counted twice, so decrement. (This unlikely scenario is
3185 * special cased, like the one for a range of 2 code points
3186 * below, only because the main-line code below needs a range
3187 * of 3 or more to work without special casing. Might as well
3188 * get it out of the way now.) */
8efef67c
KW
3189 if (UNLIKELY(range_max == range_min)) {
3190 d = max_ptr;
ed2893cb 3191 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
8efef67c
KW
3192 utf8_variant_count--;
3193 }
3194 goto range_done;
3195 }
3196
e294cc5d 3197#ifdef EBCDIC
f4240379
KW
3198 /* On EBCDIC platforms, we may have to deal with portable
3199 * ranges. These happen if at least one range endpoint is a
3200 * Unicode value (\N{...}), or if the range is a subset of
3201 * [A-Z] or [a-z], and both ends are literal characters,
3202 * like 'A', and not like \x{C1} */
87dd6ea7 3203 convert_unicode =
02cd137d
KW
3204 cBOOL(backslash_N) /* \N{} forces Unicode,
3205 hence portable range */
3206 || ( ! non_portable_endpoint
3207 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3208 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3209 if (convert_unicode) {
f4240379
KW
3210
3211 /* Special handling is needed for these portable ranges.
02cd137d
KW
3212 * They are defined to be in Unicode terms, which includes
3213 * all the Unicode code points between the end points.
f4240379
KW
3214 * Convert to Unicode to get the Unicode range. Later we
3215 * will convert each code point in the range back to
3216 * native. */
3217 range_min = NATIVE_TO_UNI(range_min);
3218 range_max = NATIVE_TO_UNI(range_max);
3219 }
e294cc5d 3220#endif
8ada0baa 3221
f4240379 3222 if (range_min > range_max) {
11327fa1 3223#ifdef EBCDIC
f4240379
KW
3224 if (convert_unicode) {
3225 /* Need to convert back to native for meaningful
3226 * messages for this platform */
3227 range_min = UNI_TO_NATIVE(range_min);
3228 range_max = UNI_TO_NATIVE(range_max);
3229 }
11327fa1 3230#endif
f4240379
KW
3231 /* Use the characters themselves for the error message if
3232 * ASCII printables; otherwise some visible representation
3233 * of them */
3234 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3235 Perl_croak(aTHX_
3236 "Invalid range \"%c-%c\" in transliteration operator",
3237 (char)range_min, (char)range_max);
3238 }
11327fa1 3239#ifdef EBCDIC
f4240379 3240 else if (convert_unicode) {
02cd137d 3241 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3242 Perl_croak(aTHX_
02cd137d
KW
3243 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3244 UVXf "}\" in transliteration operator",
3245 range_min, range_max);
f4240379 3246 }
11327fa1 3247#endif
f4240379 3248 else {
02cd137d 3249 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3250 Perl_croak(aTHX_
02cd137d
KW
3251 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3252 " in transliteration operator",
3253 range_min, range_max);
f4240379 3254 }
c2e66d9e
GS
3255 }
3256
aca41667
KW
3257 /* If the range is exactly two code points long, they are
3258 * already both in the output */
3259 if (UNLIKELY(range_min + 1 == range_max)) {
3260 goto range_done;
3261 }
3262
3263 /* Here the range contains at least 3 code points */
3264
ed2893cb 3265 if (d_is_utf8) {
f4240379 3266
188d22cf
KW
3267 /* If everything in the transliteration is below 256, we
3268 * can avoid special handling later. A translation table
02cd137d
KW
3269 * for each of those bytes is created by op.c. So we
3270 * expand out all ranges to their constituent code points.
3271 * But if we've encountered something above 255, the
3272 * expanding won't help, so skip doing that. But if it's
3273 * EBCDIC, we may have to look at each character below 256
3274 * if we have to convert to/from Unicode values */
188d22cf 3275 if ( has_above_latin1
c7f1f016 3276#ifdef EBCDIC
f4240379 3277 && (range_min > 255 || ! convert_unicode)
8ada0baa 3278#endif
f4240379 3279 ) {
3fdfceb3
KW
3280 const STRLEN off = d - SvPVX(sv);
3281 const STRLEN extra = 1 + (send - s) + 1;
3282 char *e;
3283
f4240379
KW
3284 /* Move the high character one byte to the right; then
3285 * insert between it and the range begin, an illegal
3286 * byte which serves to indicate this is a range (using
02cd137d 3287 * a '-' would be ambiguous). */
3fdfceb3
KW
3288
3289 if (off + extra > SvLEN(sv)) {
3290 d = off + SvGROW(sv, off + extra);
3291 max_ptr = d - off + offset_to_max;
3292 }
3293
3294 e = d++;
f4240379
KW
3295 while (e-- > max_ptr) {
3296 *(e + 1) = *e;
e294cc5d 3297 }
dc8faf6b 3298 *(e + 1) = (char) RANGE_INDICATOR;
f4240379
KW
3299 goto range_done;
3300 }
3301
3302 /* Here, we're going to expand out the range. For EBCDIC
3303 * the range can extend above 255 (not so in ASCII), so
3304 * for EBCDIC, split it into the parts above and below
3305 * 255/256 */
e294cc5d 3306#ifdef EBCDIC
f4240379
KW
3307 if (range_max > 255) {
3308 real_range_max = range_max;
3309 range_max = 255;
3310 }
e294cc5d 3311#endif
f4240379 3312 }
02aa26ce 3313
f4240379 3314 /* Here we need to expand out the string to contain each
5ca37d54
KW
3315 * character in the range. Grow the output to handle this.
3316 * For non-UTF8, we need a byte for each code point in the
3317 * range, minus the three that we've already allocated for: the
3318 * hyphen, the min, and the max. For UTF-8, we need this
3319 * plus an extra byte for each code point that occupies two
3320 * bytes (is variant) when in UTF-8 (except we've already
3321 * allocated for the end points, including if they are
3322 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3323 * platforms, it's easy to calculate a precise number. To
3324 * start, we count the variants in the range, which we need
3325 * elsewhere in this function anyway. (For the case where it
3326 * isn't easy to calculate, 'extras' has been initialized to 0,
3327 * and the calculation is done in a loop further down.) */
3328#ifdef EBCDIC
3329 if (convert_unicode)
3330#endif
3331 {
3332 /* This is executed unconditionally on ASCII, and for
3333 * Unicode ranges on EBCDIC. Under these conditions, all
3334 * code points above a certain value are variant; and none
3335 * under that value are. We just need to find out how much
3336 * of the range is above that value. We don't count the
3337 * end points here, as they will already have been counted
3338 * as they were parsed. */
3339 if (range_min >= UTF_CONTINUATION_MARK) {
3340
3341 /* The whole range is made up of variants */
3342 extras = (range_max - 1) - (range_min + 1) + 1;
3343 }
3344 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3345
5ca37d54
KW
3346 /* Only the higher portion of the range is variants */
3347 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3348 }
f4240379 3349
5ca37d54
KW
3350 utf8_variant_count += extras;
3351 }
3352
3353 /* The base growth is the number of code points in the range,
3354 * not including the endpoints, which have already been sized
3355 * for (and output). We don't subtract for the hyphen, as it
3356 * has been parsed but not output, and the SvGROW below is
3357 * based only on what's been output plus what's left to parse.
3358 * */
3359 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3360
ed2893cb 3361 if (d_is_utf8) {
4c3a8340 3362#ifdef EBCDIC
5ca37d54
KW
3363 /* In some cases in EBCDIC, we haven't yet calculated a
3364 * precise amount needed for the UTF-8 variants. Just
3365 * assume the worst case, that everything will expand by a
3366 * byte */
3367 if (! convert_unicode) {
f4240379
KW
3368 grow *= 2;
3369 }
5ca37d54 3370 else
4c3a8340 3371#endif
5ca37d54
KW
3372 {
3373 /* Otherwise we know exactly how many variants there
3374 * are in the range. */
3375 grow += extras;
3376 }
f4240379
KW
3377 }
3378
5ca37d54
KW
3379 /* Grow, but position the output to overwrite the range min end
3380 * point, because in some cases we overwrite that */
3381 SvCUR_set(sv, d - SvPVX_const(sv));
3382 offset_to_min = min_ptr - SvPVX_const(sv);
3383
3384 /* See Note on sizing above. */
3385 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3386 + (send - s)
3387 + grow
3388 + 1 /* Trailing NUL */ );
f4240379 3389
5ca37d54 3390 /* Now, we can expand out the range. */
11327fa1 3391#ifdef EBCDIC
f4240379 3392 if (convert_unicode) {
5ca37d54 3393 SSize_t i;
02aa26ce 3394
f4240379
KW
3395 /* Recall that the min and max are now in Unicode terms, so
3396 * we have to convert each character to its native
3397 * equivalent */
ed2893cb 3398 if (d_is_utf8) {
f4240379 3399 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3400 append_utf8_from_native_byte(
3401 LATIN1_TO_NATIVE((U8) i),
3402 (U8 **) &d);
f4240379
KW
3403 }
3404 }
3405 else {
3406 for (i = range_min; i <= range_max; i++) {
81324705 3407 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3408 }
3409 }
01ec43d0 3410 }
11327fa1
AL
3411 else
3412#endif
3413 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3414 {
f4240379
KW
3415 /* Here, no conversions are necessary, which means that the
3416 * first character in the range is already in 'd' and
3417 * valid, so we can skip overwriting it */
ed2893cb 3418 if (d_is_utf8) {
19742f39 3419 SSize_t i;
f4240379
KW
3420 d += UTF8SKIP(d);
3421 for (i = range_min + 1; i <= range_max; i++) {
81324705 3422 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3423 }
3424 }
3425 else {
19742f39 3426 SSize_t i;
f4240379 3427 d++;
5ca37d54
KW
3428 assert(range_min + 1 <= range_max);
3429 for (i = range_min + 1; i < range_max; i++) {
3430#ifdef EBCDIC
3431 /* In this case on EBCDIC, we haven't calculated
3432 * the variants. Do it here, as we go along */
3433 if (! UVCHR_IS_INVARIANT(i)) {
3434 utf8_variant_count++;
3435 }
3436#endif
f4240379
KW
3437 *d++ = (char)i;
3438 }
5ca37d54
KW
3439
3440 /* The range_max is done outside the loop so as to
3441 * avoid having to special case not incrementing
3442 * 'utf8_variant_count' on EBCDIC (it's already been
3443 * counted when originally parsed) */
3444 *d++ = (char) range_max;
f4240379 3445 }
a0ed51b3 3446 }
02aa26ce 3447
11327fa1 3448#ifdef EBCDIC
02cd137d
KW
3449 /* If the original range extended above 255, add in that
3450 * portion. */
f4240379
KW
3451 if (real_range_max) {
3452 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3453 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3454 if (real_range_max > 0x100) {
3455 if (real_range_max > 0x101) {
dc8faf6b 3456 *d++ = (char) RANGE_INDICATOR;
043af6f6 3457 }
f4240379 3458 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3459 }
f4240379 3460 }
11327fa1 3461#endif
02aa26ce 3462
f4240379
KW
3463 range_done:
3464 /* mark the range as done, and continue */
3465 didrange = TRUE;
3466 dorange = FALSE;
3467#ifdef EBCDIC
3468 non_portable_endpoint = 0;
3469 backslash_N = 0;
3470#endif
3471 continue;
3472 } /* End of is a range */
3473 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3474 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3475 char *s1 = s-1;
3476 int esc = 0;
3477 while (s1 >= start && *s1-- == '\\')
3478 esc = !esc;
3479 if (!esc)
3480 in_charclass = TRUE;
3481 }
1e02a175 3482 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3483 char *s1 = s-1;
3484 int esc = 0;
3485 while (s1 >= start && *s1-- == '\\')
3486 esc = !esc;
3487 if (!esc)
3488 in_charclass = FALSE;
3489 }
02cd137d
KW
3490 /* skip for regexp comments /(?#comment)/, except for the last
3491 * char, which will be done separately. Stop on (?{..}) and
3492 * friends */
c30fc27b 3493 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3494 if (s[2] == '#') {
f339d50e
KW
3495 if (s_is_utf8) {
3496 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3497
3498 while (s + len < send && *s != ')') {
3499 Copy(s, d, len, U8);
3500 d += len;
3501 s += len;
3502 len = UTF8_SAFE_SKIP(s, send);
3503 }
3504 }
3505 else while (s+1 < send && *s != ')') {
3506 *d++ = *s++;
3507 }
155aba94 3508 }
407f8cf2
KW
3509 else if (!PL_lex_casemods
3510 && ( s[2] == '{' /* This should match regcomp.c */
3511 || (s[2] == '?' && s[3] == '{')))
155aba94 3512 {
9da1dd8f 3513 break;
cc6b7395 3514 }
748a9306 3515 }
02cd137d 3516 /* likewise skip #-initiated comments in //x patterns */
407f8cf2
KW
3517 else if (*s == '#'
3518 && PL_lex_inpat
3519 && !in_charclass
3520 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3521 {
8faf4f30 3522 while (s < send && *s != '\n')
5ff03569 3523 *d++ = *s++;
748a9306 3524 }
02cd137d 3525 /* no further processing of single-quoted regex */
9da1dd8f
DM
3526 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3527 goto default_action;
3528
02cd137d
KW
3529 /* check for embedded arrays
3530 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3531 */
1749ea0d 3532 else if (*s == '@' && s[1]) {
fac0f7a3
KW
3533 if (UTF
3534 ? isIDFIRST_utf8_safe(s+1, send)
3535 : isWORDCHAR_A(s[1]))