This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Distinguish C- and perly- literals - PERLY_BRACE_CLOSE
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
78342678 28=for apidoc AmnU|yy_parser *|PL_parser
f0e67a1d
Z
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f1bcae08 41#include "invlist_inline.h"
378cc40b 42
164e423c
KW
43#define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
eb0d8d16 45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
60267e1d 97static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
8903cb82 98
9ded7720 99# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 100
a7aaec61
Z
101#define XENUMMASK 0x3f
102#define XFAKEEOF 0x40
103#define XFAKEBRACK 0x80
9059aa12 104
39e02b42 105#ifdef USE_UTF8_SCRIPTS
b3041197 106# define UTF cBOOL(!IN_BYTES)
2b9d42f0 107#else
b3041197 108# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 109#endif
a0ed51b3 110
b1fc3636
CJ
111/* The maximum number of characters preceding the unrecognized one to display */
112#define UNRECOGNIZED_PRECEDE_COUNT 10
113
61f0cdd9 114/* In variables named $^X, these are the legal values for X.
2b92dfce 115 * 1999-02-27 mjd-perl-patch@plover.com */
4aada8b9 116#define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
2b92dfce 117
14bd96d0 118#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 119
9ff909cf
JH
120#define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
124
ffb4593c
NT
125/* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
79072805 127 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
79072805
LW
131 */
132
fb73857a
PP
133/* #define LEX_NOTPARSING 11 is done in perl.h. */
134
b6007c36
DM
135#define LEX_NORMAL 10 /* normal code (ie not within "...") */
136#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139#define LEX_INTERPSTART 6 /* expecting the start of a $var */
140
141 /* at end of code, eg "$x" followed by: */
142#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144
145#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147#define LEX_INTERPCONST 2 /* NOT USED */
148#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 149
56d9fe2f
TC
150/* returned to yyl_try() to request it to retry the parse loop, expected to only
151 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
152 can also return it.
153
154 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
155 other token values are 258 or higher (see perly.h), so -1 should be
156 a safe value here.
157*/
158#define YYL_RETRY (-1)
79072805 159
bbf60fe6 160#ifdef DEBUGGING
27da23d5 161static const char* const lex_state_names[] = {
bbf60fe6
DM
162 "KNOWNEXT",
163 "FORMLINE",
164 "INTERPCONST",
165 "INTERPCONCAT",
166 "INTERPENDMAYBE",
167 "INTERPEND",
168 "INTERPSTART",
169 "INTERPPUSH",
170 "INTERPCASEMOD",
171 "INTERPNORMAL",
172 "NORMAL"
173};
174#endif
175
79072805 176#include "keywords.h"
fe14fcc3 177
ffb4593c
NT
178/* CLINE is a macro that ensures PL_copline has a sane value */
179
57843af0 180#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 181
ffb4593c
NT
182/*
183 * Convenience functions to return different tokens and prime the
9cbb5ea2 184 * lexer for the next token. They all take an argument.
ffb4593c
NT
185 *
186 * TOKEN : generic token (used for '(', DOLSHARP, etc)
187 * OPERATOR : generic operator
188 * AOPERATOR : assignment operator
189 * PREBLOCK : beginning the block after an if, while, foreach, ...
190 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
191 * PREREF : *EXPR where EXPR is not a simple identifier
192 * TERM : expression term
89f35911 193 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
194 * LOOPX : loop exiting command (goto, last, dump, etc)
195 * FTST : file test operator
196 * FUN0 : zero-argument function
7eb971ee 197 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 198 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
199 * BOop : bitwise or or xor
200 * BAop : bitwise and
8823cb89 201 * BCop : bitwise complement
ffb4593c
NT
202 * SHop : shift operator
203 * PWop : power operator
9cbb5ea2 204 * PMop : pattern-matching operator
ffb4593c 205 * Aop : addition-level operator
e4916dd1 206 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c 207 * Mop : multiplication-level operator
02b85d3d
Z
208 * ChEop : chaining equality-testing operator
209 * NCEop : non-chaining comparison operator at equality precedence
210 * ChRop : chaining relational operator <= != gt
211 * NCRop : non-chaining relational operator isa
ffb4593c
NT
212 *
213 * Also see LOP and lop() below.
214 */
215
998054bd 216#ifdef DEBUGGING /* Serve -DT. */
704d4215 217# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 218#else
bbf60fe6 219# define REPORT(retval) (retval)
998054bd
SC
220#endif
221
bbf60fe6
DM
222#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
223#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 224#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
225#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
226#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
227#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
228#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 229#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 230#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 231 pl_yylval.ival=f, \
a49203fd 232 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 233 REPORT((int)LOOPEX))
6154021b
RGS
234#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
235#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 236#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 237#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
238#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
239#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
240#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
241 REPORT('~')
b1764551
FC
242#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
243#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 244#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 245#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 246#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 247#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
02b85d3d
Z
248#define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
249#define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
250#define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
251#define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
2f3197b3 252
a687059c
LW
253/* This bit of chicanery makes a unary function followed by
254 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
255 * The UNIDOR macro is for unary functions that can be followed by the //
256 * operator (such as C<shift // 0>).
a687059c 257 */
d68ce4ac 258#define UNI3(f,x,have_x) { \
6154021b 259 pl_yylval.ival = f; \
d68ce4ac 260 if (have_x) PL_expect = x; \
376fcdbf
AL
261 PL_bufptr = s; \
262 PL_last_uni = PL_oldbufptr; \
0af40c75 263 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
264 if (*s == '(') \
265 return REPORT( (int)FUNC1 ); \
294a536f 266 s = skipspace(s); \
376fcdbf
AL
267 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
268 }
d68ce4ac
FC
269#define UNI(f) UNI3(f,XTERM,1)
270#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
271#define UNIPROTO(f,optional) { \
272 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
273 OPERATOR(f); \
274 }
a687059c 275
d68ce4ac 276#define UNIBRACK(f) UNI3(f,0,0)
79072805 277
9f68db38 278/* grandfather return to old style */
78cdf107
Z
279#define OLDLOP(f) \
280 do { \
281 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
282 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
283 pl_yylval.ival = (f); \
284 PL_expect = XTERM; \
285 PL_bufptr = s; \
286 return (int)LSTOP; \
287 } while(0)
79072805 288
83944c01
FC
289#define COPLINE_INC_WITH_HERELINES \
290 STMT_START { \
291 CopLINE_inc(PL_curcop); \
851b527a
FC
292 if (PL_parser->herelines) \
293 CopLINE(PL_curcop) += PL_parser->herelines, \
294 PL_parser->herelines = 0; \
83944c01 295 } STMT_END
ffdb8b16
FC
296/* Called after scan_str to update CopLINE(PL_curcop), but only when there
297 * is no sublex_push to follow. */
298#define COPLINE_SET_FROM_MULTI_END \
299 STMT_START { \
300 CopLINE_set(PL_curcop, PL_multi_end); \
301 if (PL_multi_end != PL_multi_start) \
851b527a 302 PL_parser->herelines = 0; \
ffdb8b16 303 } STMT_END
83944c01
FC
304
305
539c839f
AC
306/* A file-local structure for passing around information about subroutines and
307 * related definable words */
308struct code {
309 SV *sv;
310 CV *cv;
311 GV *gv, **gvp;
312 OP *rv2cv_op;
313 PADOFFSET off;
314 bool lex;
315};
316
317static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
318
8fa7f367
JH
319#ifdef DEBUGGING
320
6154021b 321/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
322enum token_type {
323 TOKENTYPE_NONE,
324 TOKENTYPE_IVAL,
6154021b 325 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 326 TOKENTYPE_PVAL,
aeaef349 327 TOKENTYPE_OPVAL
bbf60fe6
DM
328};
329
c588171e
BZ
330#define DEBUG_TOKEN(Type, Name) \
331 { Name, TOKENTYPE_##Type, #Name }
332
6d4a66ac
NC
333static struct debug_tokens {
334 const int token;
335 enum token_type type;
336 const char *name;
337} const debug_tokens[] =
9041c2e3 338{
bbf60fe6
DM
339 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
340 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
341 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
342 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 343 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
344 { ARROW, TOKENTYPE_NONE, "ARROW" },
345 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
346 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
347 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
02b85d3d
Z
348 { CHEQOP, TOKENTYPE_OPNUM, "CHEQOP" },
349 { CHRELOP, TOKENTYPE_OPNUM, "CHRELOP" },
bbf60fe6
DM
350 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
351 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 352 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
353 { DO, TOKENTYPE_NONE, "DO" },
354 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
355 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
356 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
357 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
358 { ELSE, TOKENTYPE_NONE, "ELSE" },
359 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
bbf60fe6
DM
360 { FOR, TOKENTYPE_IVAL, "FOR" },
361 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
362 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
363 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
364 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
365 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 366 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
367 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
368 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
369 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 370 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
371 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
372 { IF, TOKENTYPE_IVAL, "IF" },
01719201 373 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
374 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
375 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
376 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
377 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
378 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
379 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
380 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
381 { MY, TOKENTYPE_IVAL, "MY" },
02b85d3d
Z
382 { NCEQOP, TOKENTYPE_OPNUM, "NCEQOP" },
383 { NCRELOP, TOKENTYPE_OPNUM, "NCRELOP" },
bbf60fe6
DM
384 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
385 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
386 { OROP, TOKENTYPE_IVAL, "OROP" },
387 { OROR, TOKENTYPE_NONE, "OROR" },
388 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
d0a6a9c7 389 DEBUG_TOKEN (IVAL, PERLY_BRACE_CLOSE),
c588171e 390 DEBUG_TOKEN (IVAL, PERLY_BRACE_OPEN),
88e1f1a2
JV
391 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
392 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 393 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 394 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
395 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
396 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
397 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
398 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
399 { PREINC, TOKENTYPE_NONE, "PREINC" },
400 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 401 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6 402 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
f3f204dc 403 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 404 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 405 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6 406 { SUB, TOKENTYPE_NONE, "SUB" },
69afcc21
TC
407 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
408 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
bbf60fe6
DM
409 { THING, TOKENTYPE_OPVAL, "THING" },
410 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
411 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
412 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
413 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
414 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
415 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 416 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 417 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 418 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 419 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 420 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
421};
422
c588171e
BZ
423#undef DEBUG_TOKEN
424
6154021b 425/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 426
bbf60fe6 427STATIC int
704d4215 428S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 429{
7918f24d
NC
430 PERL_ARGS_ASSERT_TOKEREPORT;
431
bbf60fe6 432 if (DEBUG_T_TEST) {
bd61b366 433 const char *name = NULL;
bbf60fe6 434 enum token_type type = TOKENTYPE_NONE;
f54cb97a 435 const struct debug_tokens *p;
396482e1 436 SV* const report = newSVpvs("<== ");
bbf60fe6 437
f54cb97a 438 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
439 if (p->token == (int)rv) {
440 name = p->name;
441 type = p->type;
442 break;
443 }
444 }
445 if (name)
54667de8 446 Perl_sv_catpv(aTHX_ report, name);
239f83d5 447 else if (isGRAPH(rv))
4ebc7986 448 {
bbf60fe6 449 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
450 if ((char)rv == 'p')
451 sv_catpvs(report, " (pending identifier)");
452 }
bbf60fe6 453 else if (!rv)
396482e1 454 sv_catpvs(report, "EOF");
bbf60fe6 455 else
147e3846 456 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
457 switch (type) {
458 case TOKENTYPE_NONE:
bbf60fe6
DM
459 break;
460 case TOKENTYPE_IVAL:
147e3846 461 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
462 break;
463 case TOKENTYPE_OPNUM:
464 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 465 PL_op_name[lvalp->ival]);
bbf60fe6
DM
466 break;
467 case TOKENTYPE_PVAL:
704d4215 468 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
469 break;
470 case TOKENTYPE_OPVAL:
704d4215 471 if (lvalp->opval) {
401441c0 472 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
473 PL_op_name[lvalp->opval->op_type]);
474 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 475 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 476 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
477 }
478
479 }
401441c0 480 else
396482e1 481 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
482 break;
483 }
b6007c36 484 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
485 };
486 return (int)rv;
998054bd
SC
487}
488
b6007c36
DM
489
490/* print the buffer with suitable escapes */
491
492STATIC void
15f169a1 493S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 494{
396482e1 495 SV* const tmp = newSVpvs("");
7918f24d
NC
496
497 PERL_ARGS_ASSERT_PRINTBUF;
498
7347ee54 499 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 500 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 501 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
502 SvREFCNT_dec(tmp);
503}
504
8fa7f367
JH
505#endif
506
ffb4593c
NT
507/*
508 * S_ao
509 *
f393a21a
FC
510 * This subroutine looks for an '=' next to the operator that has just been
511 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
512 */
513
76e3520e 514STATIC int
cea2e8a9 515S_ao(pTHX_ int toketype)
a0d0e21e 516{
3280af22
NIS
517 if (*PL_bufptr == '=') {
518 PL_bufptr++;
a0d0e21e 519 if (toketype == ANDAND)
6154021b 520 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 521 else if (toketype == OROR)
6154021b 522 pl_yylval.ival = OP_ORASSIGN;
c963b151 523 else if (toketype == DORDOR)
6154021b 524 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
525 toketype = ASSIGNOP;
526 }
b1764551 527 return REPORT(toketype);
a0d0e21e
LW
528}
529
ffb4593c
NT
530/*
531 * S_no_op
532 * When Perl expects an operator and finds something else, no_op
533 * prints the warning. It always prints "<something> found where
534 * operator expected. It prints "Missing semicolon on previous line?"
535 * if the surprise occurs at the start of the line. "do you need to
536 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
537 * where the compiler doesn't know if foo is a method call or a function.
538 * It prints "Missing operator before end of line" if there's nothing
539 * after the missing operator, or "... before <...>" if there is something
540 * after the missing operator.
488bc579
FC
541 *
542 * PL_bufptr is expected to point to the start of the thing that was found,
543 * and s after the next token or partial token.
ffb4593c
NT
544 */
545
76e3520e 546STATIC void
15f169a1 547S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 548{
9d4ba2ae
AL
549 char * const oldbp = PL_bufptr;
550 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 551
7918f24d
NC
552 PERL_ARGS_ASSERT_NO_OP;
553
1189a94a
GS
554 if (!s)
555 s = oldbp;
07c798fb 556 else
1189a94a 557 PL_bufptr = s;
734ab321 558 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
559 if (ckWARN_d(WARN_SYNTAX)) {
560 if (is_first)
561 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
562 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
563 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
564 PL_bufend,
565 UTF))
566 {
f54cb97a 567 const char *t;
fac0f7a3
KW
568 for (t = PL_oldoldbufptr;
569 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
570 t += UTF ? UTF8SKIP(t) : 1)
571 {
c35e046a 572 NOOP;
fac0f7a3 573 }
56da5a46
RGS
574 if (t < PL_bufptr && isSPACE(*t))
575 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 576 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 577 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
578 }
579 else {
580 assert(s >= oldbp);
581 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 582 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 583 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 584 }
07c798fb 585 }
3280af22 586 PL_bufptr = oldbp;
8990e307
LW
587}
588
ffb4593c
NT
589/*
590 * S_missingterm
591 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 592 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
593 * If we're in a delimited string and the delimiter is a control
594 * character, it's reformatted into a two-char sequence like ^C.
595 * This is fatal.
596 */
597
76e3520e 598STATIC void
1b8d3e0e 599S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 600{
e487ff5e 601 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 602 char q;
cb650135
FC
603 bool uni = FALSE;
604 SV *sv;
8990e307 605 if (s) {
f88b23d7 606 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
607 if (nl) {
608 *nl = '\0';
609 len = nl - s;
610 }
7f7f7d08 611 uni = UTF;
8990e307 612 }
cb650135 613 else if (PL_multi_close < 32) {
8990e307 614 *tmpbuf = '^';
585ec06d 615 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
616 tmpbuf[2] = '\0';
617 s = tmpbuf;
1b8d3e0e 618 len = 2;
8990e307
LW
619 }
620 else {
cb650135
FC
621 if (LIKELY(PL_multi_close < 256)) {
622 *tmpbuf = (char)PL_multi_close;
623 tmpbuf[1] = '\0';
1b8d3e0e 624 len = 1;
cb650135
FC
625 }
626 else {
1b8d3e0e
LM
627 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
628 *end = '\0';
629 len = end - tmpbuf;
cb650135 630 uni = TRUE;
cb650135 631 }
8990e307
LW
632 s = tmpbuf;
633 }
9f8d27b7 634 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 635 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
636 if (uni)
637 SvUTF8_on(sv);
1b8d3e0e
LM
638 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
639 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 640}
79072805 641
dd0ac2b9
FC
642#include "feature.h"
643
0d863452 644/*
9cbb5ea2
GS
645 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
646 * utf16-to-utf8-reversed.
ffb4593c
NT
647 */
648
c39cd008
GS
649#ifdef PERL_CR_FILTER
650static void
651strip_return(SV *sv)
652{
eb578fdb
KW
653 const char *s = SvPVX_const(sv);
654 const char * const e = s + SvCUR(sv);
7918f24d
NC
655
656 PERL_ARGS_ASSERT_STRIP_RETURN;
657
c39cd008
GS
658 /* outer loop optimized to do nothing if there are no CR-LFs */
659 while (s < e) {
660 if (*s++ == '\r' && *s == '\n') {
661 /* hit a CR-LF, need to copy the rest */
eb578fdb 662 char *d = s - 1;
c39cd008
GS
663 *d++ = *s++;
664 while (s < e) {
665 if (*s == '\r' && s[1] == '\n')
666 s++;
667 *d++ = *s++;
668 }
669 SvCUR(sv) -= s - d;
670 return;
671 }
672 }
673}
a868473f 674
76e3520e 675STATIC I32
c39cd008 676S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 677{
f54cb97a 678 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
679 if (count > 0 && !maxlen)
680 strip_return(sv);
681 return count;
a868473f
NIS
682}
683#endif
684
ffb4593c 685/*
44170c9a 686=for apidoc lex_start
8eaa0acf
Z
687
688Creates and initialises a new lexer/parser state object, supplying
689a context in which to lex and parse from a new source of Perl code.
690A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 691is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
692will be destroyed and the former value of L</PL_parser> will be restored.
693Nothing else need be done to clean up the parsing context.
694
2d7f6611 695The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 696non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
697A copy of the string is made, so subsequent modification of C<line>
698does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 699from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
700code in C<line> comes first and must consist of complete lines of input,
701and C<rsfp> supplies the remainder of the source.
8eaa0acf 702
2d7f6611 703The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 704used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
705
706=cut
707*/
ffb4593c 708
27fcb6ee 709/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
710 can share filters with the current parser.
711 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
712 caller, hence isn't owned by the parser, so shouldn't be closed on parser
713 destruction. This is used to handle the case of defaulting to reading the
714 script from the standard input because no filename was given on the command
715 line (without getting confused by situation where STDIN has been closed, so
716 the script handle is opened on fd 0) */
27fcb6ee 717
a0d0e21e 718void
8eaa0acf 719Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 720{
6ef55633 721 const char *s = NULL;
5486870f 722 yy_parser *parser, *oparser;
90b58c70 723
60d63348 724 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 725 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
726
727 /* create and initialise a parser */
728
199e78b7 729 Newxz(parser, 1, yy_parser);
5486870f 730 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
731 PL_parser = parser;
732
28ac2b49 733 parser->stack = NULL;
df13534a 734 parser->stack_max1 = NULL;
28ac2b49 735 parser->ps = NULL;
acdf0a21 736
e3abe207
DM
737 /* on scope exit, free this parser and restore any outer one */
738 SAVEPARSER(parser);
7c4baf47 739 parser->saved_curcop = PL_curcop;
e3abe207 740
acdf0a21 741 /* initialise lexer state */
8990e307 742
fb205e7a 743 parser->nexttoke = 0;
ca4cfd28 744 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 745 parser->copline = parser->preambling = NOLINE;
5afb0a62 746 parser->lex_state = LEX_NORMAL;
c2598295 747 parser->expect = XSTATE;
2f9285f8 748 parser->rsfp = rsfp;
b7b52646 749 parser->recheck_utf8_validity = TRUE;
27fcb6ee
FC
750 parser->rsfp_filters =
751 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
752 ? NULL
753 : MUTABLE_AV(SvREFCNT_inc(
754 oparser->rsfp_filters
755 ? oparser->rsfp_filters
756 : (oparser->rsfp_filters = newAV())
757 ));
2f9285f8 758
199e78b7
DM
759 Newx(parser->lex_brackstack, 120, char);
760 Newx(parser->lex_casestack, 12, char);
761 *parser->lex_casestack = '\0';
d794b522 762 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 763
10efb74f 764 if (line) {
0528fd32 765 STRLEN len;
a6909bd2
KW
766 const U8* first_bad_char_loc;
767
10efb74f 768 s = SvPV_const(line, len);
90b58c70 769
0aab20f2
KW
770 if ( SvUTF8(line)
771 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
772 SvCUR(line),
773 &first_bad_char_loc)))
90b58c70
KW
774 {
775 _force_out_malformed_utf8_message(first_bad_char_loc,
776 (U8 *) s + SvCUR(line),
777 0,
778 1 /* 1 means die */ );
779 NOT_REACHED; /* NOTREACHED */
780 }
781
0abcdfa4
FC
782 parser->linestr = flags & LEX_START_COPIED
783 ? SvREFCNT_inc_simple_NN(line)
784 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
785 if (!rsfp)
786 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 787 } else {
bf1b738b 788 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 789 }
218304f9 790
f06b5848
DM
791 parser->oldoldbufptr =
792 parser->oldbufptr =
793 parser->bufptr =
794 parser->linestart = SvPVX(parser->linestr);
795 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
796 parser->last_lop = parser->last_uni = NULL;
b54f893d 797
6d59e610 798 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
799 |LEX_DONT_CLOSE_RSFP));
800 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
801 |LEX_DONT_CLOSE_RSFP));
737c24fc 802
60d63348 803 parser->in_pod = parser->filtered = 0;
79072805 804}
a687059c 805
e3abe207
DM
806
807/* delete a parser object */
808
809void
810Perl_parser_free(pTHX_ const yy_parser *parser)
811{
7918f24d
NC
812 PERL_ARGS_ASSERT_PARSER_FREE;
813
7c4baf47 814 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
815 SvREFCNT_dec(parser->linestr);
816
87606032 817 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 818 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
819 else if (parser->rsfp && (!parser->old_parser
820 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 821 PerlIO_close(parser->rsfp);
5486870f 822 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 823 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 824 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
825
826 Safefree(parser->lex_brackstack);
827 Safefree(parser->lex_casestack);
828 Safefree(parser->lex_shared);
829 PL_parser = parser->old_parser;
830 Safefree(parser);
831}
832
833void
834Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
835{
3ac7ff8f 836 I32 nexttoke = parser->nexttoke;
3ac7ff8f 837 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 838 while (nexttoke--) {
3ac7ff8f
FC
839 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
840 && parser->nextval[nexttoke].opval
841 && parser->nextval[nexttoke].opval->op_slabbed
842 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 843 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
844 parser->nextval[nexttoke].opval = NULL;
845 }
3ce3dcd9 846 }
e3abe207
DM
847}
848
849
ffb4593c 850/*
78342678 851=for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
f0e67a1d
Z
852
853Buffer scalar containing the chunk currently under consideration of the
854text currently being lexed. This is always a plain string scalar (for
855which C<SvPOK> is true). It is not intended to be used as a scalar by
856normal scalar means; instead refer to the buffer directly by the pointer
857variables described below.
858
859The lexer maintains various C<char*> pointers to things in the
860C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
861reallocated, all of these pointers must be updated. Don't attempt to
862do this manually, but rather use L</lex_grow_linestr> if you need to
863reallocate the buffer.
864
865The content of the text chunk in the buffer is commonly exactly one
866complete line of input, up to and including a newline terminator,
867but there are situations where it is otherwise. The octets of the
868buffer may be intended to be interpreted as either UTF-8 or Latin-1.
869The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
870flag on this scalar, which may disagree with it.
871
872For direct examination of the buffer, the variable
873L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
874lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
875of these pointers is usually preferable to examination of the scalar
876through normal scalar means.
877
78342678 878=for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
f0e67a1d
Z
879
880Direct pointer to the end of the chunk of text currently being lexed, the
881end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 882+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
883always located at the end of the buffer, and does not count as part of
884the buffer's contents.
885
78342678 886=for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
f0e67a1d
Z
887
888Points to the current position of lexing inside the lexer buffer.
889Characters around this point may be freely examined, within
890the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
891L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
892interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
893
894Lexing code (whether in the Perl core or not) moves this pointer past
895the characters that it consumes. It is also expected to perform some
896bookkeeping whenever a newline character is consumed. This movement
897can be more conveniently performed by the function L</lex_read_to>,
898which handles newlines appropriately.
899
900Interpretation of the buffer's octets can be abstracted out by
901using the slightly higher-level functions L</lex_peek_unichar> and
902L</lex_read_unichar>.
903
78342678 904=for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
f0e67a1d
Z
905
906Points to the start of the current line inside the lexer buffer.
907This is useful for indicating at which column an error occurred, and
908not much else. This must be updated by any lexing code that consumes
909a newline; the function L</lex_read_to> handles this detail.
910
911=cut
912*/
913
914/*
44170c9a 915=for apidoc lex_bufutf8
f0e67a1d
Z
916
917Indicates whether the octets in the lexer buffer
918(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
919of Unicode characters. If not, they should be interpreted as Latin-1
920characters. This is analogous to the C<SvUTF8> flag for scalars.
921
922In UTF-8 mode, it is not guaranteed that the lexer buffer actually
923contains valid UTF-8. Lexing code must be robust in the face of invalid
924encoding.
925
926The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
927is significant, but not the whole story regarding the input character
928encoding. Normally, when a file is being read, the scalar contains octets
929and its C<SvUTF8> flag is off, but the octets should be interpreted as
930UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
931however, the scalar may have the C<SvUTF8> flag on, and in this case its
932octets should be interpreted as UTF-8 unless the C<use bytes> pragma
933is in effect. This logic may change in the future; use this function
934instead of implementing the logic yourself.
935
936=cut
937*/
938
939bool
940Perl_lex_bufutf8(pTHX)
941{
942 return UTF;
943}
944
945/*
44170c9a 946=for apidoc lex_grow_linestr
f0e67a1d
Z
947
948Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 949at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
950pointer to the reallocated buffer. This is necessary before making
951any direct modification of the buffer that would increase its length.
952L</lex_stuff_pvn> provides a more convenient way to insert text into
953the buffer.
954
955Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
956this function updates all of the lexer's variables that point directly
957into the buffer.
958
959=cut
960*/
961
962char *
963Perl_lex_grow_linestr(pTHX_ STRLEN len)
964{
965 SV *linestr;
966 char *buf;
967 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 968 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
969 bool current;
970
f0e67a1d
Z
971 linestr = PL_parser->linestr;
972 buf = SvPVX(linestr);
973 if (len <= SvLEN(linestr))
974 return buf;
98d5e3ef
DM
975
976 /* Is the lex_shared linestr SV the same as the current linestr SV?
977 * Only in this case does re_eval_start need adjusting, since it
978 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
979 current = ( !PL_parser->lex_shared->ls_linestr
980 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 981
f0e67a1d
Z
982 bufend_pos = PL_parser->bufend - buf;
983 bufptr_pos = PL_parser->bufptr - buf;
984 oldbufptr_pos = PL_parser->oldbufptr - buf;
985 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
986 linestart_pos = PL_parser->linestart - buf;
987 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
988 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 989 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 990 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 991
f0e67a1d 992 buf = sv_grow(linestr, len);
c7641931 993
f0e67a1d
Z
994 PL_parser->bufend = buf + bufend_pos;
995 PL_parser->bufptr = buf + bufptr_pos;
996 PL_parser->oldbufptr = buf + oldbufptr_pos;
997 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
998 PL_parser->linestart = buf + linestart_pos;
999 if (PL_parser->last_uni)
1000 PL_parser->last_uni = buf + last_uni_pos;
1001 if (PL_parser->last_lop)
1002 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 1003 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 1004 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
1005 return buf;
1006}
1007
1008/*
44170c9a 1009=for apidoc lex_stuff_pvn
f0e67a1d
Z
1010
1011Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1012immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1013reallocating the buffer if necessary. This means that lexing code that
1014runs later will see the characters as if they had appeared in the input.
1015It is not recommended to do this as part of normal parsing, and most
1016uses of this facility run the risk of the inserted characters being
1017interpreted in an unintended manner.
1018
2d7f6611
KW
1019The string to be inserted is represented by C<len> octets starting
1020at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1021according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1022The characters are recoded for the lexer buffer, according to how the
1023buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1024to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1025function is more convenient.
1026
5af38e47
KW
1027=for apidoc Amnh||LEX_STUFF_UTF8
1028
f0e67a1d
Z
1029=cut
1030*/
1031
1032void
83aa740e 1033Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d
Z
1034{
1035 char *bufptr;
1036 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1037 if (flags & ~(LEX_STUFF_UTF8))
1038 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1039 if (UTF) {
1040 if (flags & LEX_STUFF_UTF8) {
1041 goto plain_copy;
1042 } else {
01ccd497
KW
1043 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1044 (U8 *) pv + len);
1045 const char *p, *e = pv+len;;
f0e67a1d
Z
1046 if (!highhalf)
1047 goto plain_copy;
1048 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1049 bufptr = PL_parser->bufptr;
1050 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1051 SvCUR_set(PL_parser->linestr,
1052 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1053 PL_parser->bufend += len+highhalf;
1054 for (p = pv; p != e; p++) {
27d0a47c 1055 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1056 }
1057 }
1058 } else {
1059 if (flags & LEX_STUFF_UTF8) {
1060 STRLEN highhalf = 0;
83aa740e 1061 const char *p, *e = pv+len;
f0e67a1d
Z
1062 for (p = pv; p != e; p++) {
1063 U8 c = (U8)*p;
54d004e8 1064 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1065 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1066 "non-Latin-1 character into Latin-1 input");
54d004e8 1067 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1068 p++;
1069 highhalf++;
f88c6466 1070 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1071 }
1072 if (!highhalf)
1073 goto plain_copy;
1074 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1075 bufptr = PL_parser->bufptr;
1076 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1077 SvCUR_set(PL_parser->linestr,
1078 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1079 PL_parser->bufend += len-highhalf;
54d004e8
KW
1080 p = pv;
1081 while (p < e) {
1082 if (UTF8_IS_INVARIANT(*p)) {
1083 *bufptr++ = *p;
1084 p++;
f0e67a1d 1085 }
54d004e8
KW
1086 else {
1087 assert(p < e -1 );
a62b247b 1088 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1089 p += 2;
1090 }
f0e67a1d
Z
1091 }
1092 } else {
54d004e8 1093 plain_copy:
f0e67a1d
Z
1094 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1095 bufptr = PL_parser->bufptr;
1096 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1097 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1098 PL_parser->bufend += len;
1099 Copy(pv, bufptr, len, char);
1100 }
1101 }
1102}
1103
1104/*
44170c9a 1105=for apidoc lex_stuff_pv
9dcc53ea
Z
1106
1107Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1108immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1109reallocating the buffer if necessary. This means that lexing code that
1110runs later will see the characters as if they had appeared in the input.
1111It is not recommended to do this as part of normal parsing, and most
1112uses of this facility run the risk of the inserted characters being
1113interpreted in an unintended manner.
1114
2d7f6611 1115The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1116and continuing to the first nul. These octets are interpreted as either
1117UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1118in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1119to how the buffer is currently being interpreted (L</lex_bufutf8>).
1120If it is not convenient to nul-terminate a string to be inserted, the
1121L</lex_stuff_pvn> function is more appropriate.
1122
1123=cut
1124*/
1125
1126void
1127Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1128{
1129 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1130 lex_stuff_pvn(pv, strlen(pv), flags);
1131}
1132
1133/*
44170c9a 1134=for apidoc lex_stuff_sv
f0e67a1d
Z
1135
1136Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1137immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1138reallocating the buffer if necessary. This means that lexing code that
1139runs later will see the characters as if they had appeared in the input.
1140It is not recommended to do this as part of normal parsing, and most
1141uses of this facility run the risk of the inserted characters being
1142interpreted in an unintended manner.
1143
2d7f6611 1144The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1145are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1146being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1147not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1148need to construct a scalar.
1149
1150=cut
1151*/
1152
1153void
1154Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1155{
1156 char *pv;
1157 STRLEN len;
1158 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1159 if (flags)
1160 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1161 pv = SvPV(sv, len);
1162 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1163}
1164
1165/*
44170c9a 1166=for apidoc lex_unstuff
f0e67a1d
Z
1167
1168Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1169C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1170This hides the discarded text from any lexing code that runs later,
1171as if the text had never appeared.
1172
1173This is not the normal way to consume lexed text. For that, use
1174L</lex_read_to>.
1175
1176=cut
1177*/
1178
1179void
1180Perl_lex_unstuff(pTHX_ char *ptr)
1181{
1182 char *buf, *bufend;
1183 STRLEN unstuff_len;
1184 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1185 buf = PL_parser->bufptr;
1186 if (ptr < buf)
1187 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1188 if (ptr == buf)
1189 return;
1190 bufend = PL_parser->bufend;
1191 if (ptr > bufend)
1192 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1193 unstuff_len = ptr - buf;
1194 Move(ptr, buf, bufend+1-ptr, char);
1195 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1196 PL_parser->bufend = bufend - unstuff_len;
1197}
1198
1199/*
44170c9a 1200=for apidoc lex_read_to
f0e67a1d
Z
1201
1202Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1203to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1204performing the correct bookkeeping whenever a newline character is passed.
1205This is the normal way to consume lexed text.
1206
1207Interpretation of the buffer's octets can be abstracted out by
1208using the slightly higher-level functions L</lex_peek_unichar> and
1209L</lex_read_unichar>.
1210
1211=cut
1212*/
1213
1214void
1215Perl_lex_read_to(pTHX_ char *ptr)
1216{
1217 char *s;
1218 PERL_ARGS_ASSERT_LEX_READ_TO;
1219 s = PL_parser->bufptr;
1220 if (ptr < s || ptr > PL_parser->bufend)
1221 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1222 for (; s != ptr; s++)
1223 if (*s == '\n') {
83944c01 1224 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1225 PL_parser->linestart = s+1;
1226 }
1227 PL_parser->bufptr = ptr;
1228}
1229
1230/*
44170c9a 1231=for apidoc lex_discard_to
f0e67a1d
Z
1232
1233Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1234up to C<ptr>. The remaining content of the buffer will be moved, and
1235all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1236be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1237it is not permitted to discard text that has yet to be lexed.
1238
1239Normally it is not necessarily to do this directly, because it suffices to
1240use the implicit discarding behaviour of L</lex_next_chunk> and things
1241based on it. However, if a token stretches across multiple lines,
1f317c95 1242and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1243that purpose, then after completion of the token it would be wise to
1244explicitly discard the now-unneeded earlier lines, to avoid future
1245multi-line tokens growing the buffer without bound.
1246
1247=cut
1248*/
1249
1250void
1251Perl_lex_discard_to(pTHX_ char *ptr)
1252{
1253 char *buf;
1254 STRLEN discard_len;
1255 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1256 buf = SvPVX(PL_parser->linestr);
1257 if (ptr < buf)
1258 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1259 if (ptr == buf)
1260 return;
1261 if (ptr > PL_parser->bufptr)
1262 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1263 discard_len = ptr - buf;
1264 if (PL_parser->oldbufptr < ptr)
1265 PL_parser->oldbufptr = ptr;
1266 if (PL_parser->oldoldbufptr < ptr)
1267 PL_parser->oldoldbufptr = ptr;
1268 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1269 PL_parser->last_uni = NULL;
1270 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1271 PL_parser->last_lop = NULL;
1272 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1273 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1274 PL_parser->bufend -= discard_len;
1275 PL_parser->bufptr -= discard_len;
1276 PL_parser->oldbufptr -= discard_len;
1277 PL_parser->oldoldbufptr -= discard_len;
1278 if (PL_parser->last_uni)
1279 PL_parser->last_uni -= discard_len;
1280 if (PL_parser->last_lop)
1281 PL_parser->last_lop -= discard_len;
1282}
1283
efa571ab
KW
1284void
1285Perl_notify_parser_that_changed_to_utf8(pTHX)
1286{
1287 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1288 * off to on. At compile time, this has the effect of entering a 'use
1289 * utf8' section. This means that any input was not previously checked for
1290 * UTF-8 (because it was off), but now we do need to check it, or our
1291 * assumptions about the input being sane could be wrong, and we could
1292 * segfault. This routine just sets a flag so that the next time we look
1293 * at the input we do the well-formed UTF-8 check. If we aren't in the
1294 * proper phase, there may not be a parser object, but if there is, setting
1295 * the flag is harmless */
1296
1297 if (PL_parser) {
1298 PL_parser->recheck_utf8_validity = TRUE;
1299 }
1300}
1301
f0e67a1d 1302/*
44170c9a 1303=for apidoc lex_next_chunk
f0e67a1d
Z
1304
1305Reads in the next chunk of text to be lexed, appending it to
1306L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1307looked to the end of the current chunk and wants to know more. It is
1308usual, but not necessary, for lexing to have consumed the entirety of
1309the current chunk at this time.
1310
1311If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1312chunk (i.e., the current chunk has been entirely consumed), normally the
1313current chunk will be discarded at the same time that the new chunk is
c5608a1f 1314read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1315will not be discarded. If the current chunk has not been entirely
1316consumed, then it will not be discarded regardless of the flag.
1317
1318Returns true if some new text was added to the buffer, or false if the
1319buffer has reached the end of the input text.
1320
5af38e47
KW
1321=for apidoc Amnh||LEX_KEEP_PREVIOUS
1322
f0e67a1d
Z
1323=cut
1324*/
1325
1326#define LEX_FAKE_EOF 0x80000000
e47d32dc 1327#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1328
1329bool
1330Perl_lex_next_chunk(pTHX_ U32 flags)
1331{
1332 SV *linestr;
1333 char *buf;
1334 STRLEN old_bufend_pos, new_bufend_pos;
1335 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1336 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1337 bool got_some_for_debugger = 0;
f0e67a1d 1338 bool got_some;
6cdc5cd8 1339
112d1284 1340 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1341 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1342 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1343 return FALSE;
f0e67a1d
Z
1344 linestr = PL_parser->linestr;
1345 buf = SvPVX(linestr);
407f8cf2
KW
1346 if (!(flags & LEX_KEEP_PREVIOUS)
1347 && PL_parser->bufptr == PL_parser->bufend)
1348 {
f0e67a1d
Z
1349 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1350 linestart_pos = 0;
1351 if (PL_parser->last_uni != PL_parser->bufend)
1352 PL_parser->last_uni = NULL;
1353 if (PL_parser->last_lop != PL_parser->bufend)
1354 PL_parser->last_lop = NULL;
1355 last_uni_pos = last_lop_pos = 0;
1356 *buf = 0;
2324bdb9 1357 SvCUR_set(linestr, 0);
f0e67a1d
Z
1358 } else {
1359 old_bufend_pos = PL_parser->bufend - buf;
1360 bufptr_pos = PL_parser->bufptr - buf;
1361 oldbufptr_pos = PL_parser->oldbufptr - buf;
1362 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1363 linestart_pos = PL_parser->linestart - buf;
1364 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1365 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1366 }
1367 if (flags & LEX_FAKE_EOF) {
1368 goto eof;
60d63348 1369 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1370 got_some = 0;
1371 } else if (filter_gets(linestr, old_bufend_pos)) {
1372 got_some = 1;
17cc9359 1373 got_some_for_debugger = 1;
112d1284
FC
1374 } else if (flags & LEX_NO_TERM) {
1375 got_some = 0;
f0e67a1d 1376 } else {
580561a3 1377 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1378 SvPVCLEAR(linestr);
f0e67a1d
Z
1379 eof:
1380 /* End of real input. Close filehandle (unless it was STDIN),
1381 * then add implicit termination.
1382 */
87606032 1383 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1384 PerlIO_clearerr(PL_parser->rsfp);
1385 else if (PL_parser->rsfp)
1386 (void)PerlIO_close(PL_parser->rsfp);
1387 PL_parser->rsfp = NULL;
60d63348 1388 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1389 if (!PL_in_eval && PL_minus_p) {
1390 sv_catpvs(linestr,
1391 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1392 PL_minus_n = PL_minus_p = 0;
1393 } else if (!PL_in_eval && PL_minus_n) {
1394 sv_catpvs(linestr, /*{*/";}");
1395 PL_minus_n = 0;
1396 } else
1397 sv_catpvs(linestr, ";");
1398 got_some = 1;
1399 }
1400 buf = SvPVX(linestr);
1401 new_bufend_pos = SvCUR(linestr);
1402 PL_parser->bufend = buf + new_bufend_pos;
1403 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1404
07337b95
KW
1405 if (UTF) {
1406 const U8* first_bad_char_loc;
1407 if (UNLIKELY(! is_utf8_string_loc(
1408 (U8 *) PL_parser->bufptr,
1409 PL_parser->bufend - PL_parser->bufptr,
1410 &first_bad_char_loc)))
1411 {
1412 _force_out_malformed_utf8_message(first_bad_char_loc,
1413 (U8 *) PL_parser->bufend,
1414 0,
1415 1 /* 1 means die */ );
1416 NOT_REACHED; /* NOTREACHED */
1417 }
6cdc5cd8
KW
1418 }
1419
f0e67a1d
Z
1420 PL_parser->oldbufptr = buf + oldbufptr_pos;
1421 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1422 PL_parser->linestart = buf + linestart_pos;
1423 if (PL_parser->last_uni)
1424 PL_parser->last_uni = buf + last_uni_pos;
1425 if (PL_parser->last_lop)
1426 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1427 if (PL_parser->preambling != NOLINE) {
1428 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1429 PL_parser->preambling = NOLINE;
1430 }
407f8cf2
KW
1431 if ( got_some_for_debugger
1432 && PERLDB_LINE_OR_SAVESRC
1433 && PL_curstash != PL_debstash)
1434 {
f0e67a1d
Z
1435 /* debugger active and we're not compiling the debugger code,
1436 * so store the line into the debugger's array of lines
1437 */
1438 update_debugger_info(NULL, buf+old_bufend_pos,
1439 new_bufend_pos-old_bufend_pos);
1440 }
1441 return got_some;
1442}
1443
1444/*
44170c9a 1445=for apidoc lex_peek_unichar
f0e67a1d
Z
1446
1447Looks ahead one (Unicode) character in the text currently being lexed.
1448Returns the codepoint (unsigned integer value) of the next character,
1449or -1 if lexing has reached the end of the input text. To consume the
1450peeked character, use L</lex_read_unichar>.
1451
1452If the next character is in (or extends into) the next chunk of input
1453text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1454discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1455bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1456
1457If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1458is encountered, an exception is generated.
1459
1460=cut
1461*/
1462
1463I32
1464Perl_lex_peek_unichar(pTHX_ U32 flags)
1465{
1466 char *s, *bufend;
1467 if (flags & ~(LEX_KEEP_PREVIOUS))
1468 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1469 s = PL_parser->bufptr;
1470 bufend = PL_parser->bufend;
1471 if (UTF) {
1472 U8 head;
1473 I32 unichar;
1474 STRLEN len, retlen;
1475 if (s == bufend) {
1476 if (!lex_next_chunk(flags))
1477 return -1;
1478 s = PL_parser->bufptr;
1479 bufend = PL_parser->bufend;
1480 }
1481 head = (U8)*s;
54d004e8 1482 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1483 return head;
54d004e8
KW
1484 if (UTF8_IS_START(head)) {
1485 len = UTF8SKIP(&head);
f0e67a1d
Z
1486 while ((STRLEN)(bufend-s) < len) {
1487 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1488 break;
1489 s = PL_parser->bufptr;
1490 bufend = PL_parser->bufend;
1491 }
1492 }
c80e42f3 1493 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1494 if (retlen == (STRLEN)-1) {
75219bac
KW
1495 _force_out_malformed_utf8_message((U8 *) s,
1496 (U8 *) bufend,
1497 0,
1498 1 /* 1 means die */ );
1499 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1500 }
1501 return unichar;
1502 } else {
1503 if (s == bufend) {
1504 if (!lex_next_chunk(flags))
1505 return -1;
1506 s = PL_parser->bufptr;
1507 }
1508 return (U8)*s;
1509 }
1510}
1511
1512/*
44170c9a 1513=for apidoc lex_read_unichar
f0e67a1d
Z
1514
1515Reads the next (Unicode) character in the text currently being lexed.
1516Returns the codepoint (unsigned integer value) of the character read,
1517and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1518if lexing has reached the end of the input text. To non-destructively
1519examine the next character, use L</lex_peek_unichar> instead.
1520
1521If the next character is in (or extends into) the next chunk of input
1522text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1523discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1524bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1525
1526If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1527is encountered, an exception is generated.
1528
1529=cut
1530*/
1531
1532I32
1533Perl_lex_read_unichar(pTHX_ U32 flags)
1534{
1535 I32 c;
1536 if (flags & ~(LEX_KEEP_PREVIOUS))
1537 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1538 c = lex_peek_unichar(flags);
1539 if (c != -1) {
1540 if (c == '\n')
83944c01 1541 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1542 if (UTF)
1543 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1544 else
1545 ++(PL_parser->bufptr);
f0e67a1d
Z
1546 }
1547 return c;
1548}
1549
1550/*
44170c9a 1551=for apidoc lex_read_space
f0e67a1d
Z
1552
1553Reads optional spaces, in Perl style, in the text currently being
1554lexed. The spaces may include ordinary whitespace characters and
1555Perl-style comments. C<#line> directives are processed if encountered.
1556L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1557at a non-space character (or the end of the input text).
1558
1559If spaces extend into the next chunk of input text, the next chunk will
1560be read in. Normally the current chunk will be discarded at the same
c5608a1f 1561time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1562chunk will not be discarded.
1563
1564=cut
1565*/
1566
21791330 1567#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1568#define LEX_NO_NEXT_CHUNK 0x80000000
1569
f0e67a1d
Z
1570void
1571Perl_lex_read_space(pTHX_ U32 flags)
1572{
1573 char *s, *bufend;
21791330 1574 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1575 bool need_incline = 0;
21791330 1576 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1577 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1578 s = PL_parser->bufptr;
1579 bufend = PL_parser->bufend;
1580 while (1) {
1581 char c = *s;
1582 if (c == '#') {
1583 do {
1584 c = *++s;
1585 } while (!(c == '\n' || (c == 0 && s == bufend)));
1586 } else if (c == '\n') {
1587 s++;
21791330
FC
1588 if (can_incline) {
1589 PL_parser->linestart = s;
1590 if (s == bufend)
1591 need_incline = 1;
1592 else
c6875f94 1593 incline(s, bufend);
21791330 1594 }
f0e67a1d
Z
1595 } else if (isSPACE(c)) {
1596 s++;
1597 } else if (c == 0 && s == bufend) {
1598 bool got_more;
65c68e17 1599 line_t l;
f0998909
Z
1600 if (flags & LEX_NO_NEXT_CHUNK)
1601 break;
f0e67a1d 1602 PL_parser->bufptr = s;
65c68e17 1603 l = CopLINE(PL_curcop);
851b527a 1604 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1605 got_more = lex_next_chunk(flags);
65c68e17 1606 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1607 s = PL_parser->bufptr;
1608 bufend = PL_parser->bufend;
1609 if (!got_more)
1610 break;
21791330 1611 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1612 incline(s, bufend);
f0e67a1d
Z
1613 need_incline = 0;
1614 }
3c47da3c
FC
1615 } else if (!c) {
1616 s++;
f0e67a1d
Z
1617 } else {
1618 break;
1619 }
1620 }
f0e67a1d
Z
1621 PL_parser->bufptr = s;
1622}
1623
1624/*
fe788d6b 1625
44170c9a 1626=for apidoc validate_proto
fe788d6b
PM
1627
1628This function performs syntax checking on a prototype, C<proto>.
1629If C<warn> is true, any illegal characters or mismatched brackets
1630will trigger illegalproto warnings, declaring that they were
1631detected in the prototype for C<name>.
1632
1633The return value is C<true> if this is a valid prototype, and
1634C<false> if it is not, regardless of whether C<warn> was C<true> or
1635C<false>.
1636
1637Note that C<NULL> is a valid C<proto> and will always return C<true>.
1638
1639=cut
1640
1641 */
1642
1643bool
5783dc51 1644Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1645{
1646 STRLEN len, origlen;
11327fa1 1647 char *p;
fe788d6b
PM
1648 bool bad_proto = FALSE;
1649 bool in_brackets = FALSE;
1650 bool after_slash = FALSE;
1651 char greedy_proto = ' ';
1652 bool proto_after_greedy_proto = FALSE;
1653 bool must_be_last = FALSE;
1654 bool underscore = FALSE;
f791a21a 1655 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1656
1657 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1658
1659 if (!proto)
1660 return TRUE;
1661
11327fa1 1662 p = SvPV(proto, len);
fe788d6b
PM
1663 origlen = len;
1664 for (; len--; p++) {
1665 if (!isSPACE(*p)) {
1666 if (must_be_last)
1667 proto_after_greedy_proto = TRUE;
f791a21a 1668 if (underscore) {
4aada8b9 1669 if (!memCHRs(";@%", *p))
f791a21a
PM
1670 bad_proto_after_underscore = TRUE;
1671 underscore = FALSE;
1672 }
4aada8b9 1673 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
fe788d6b
PM
1674 bad_proto = TRUE;
1675 }
1676 else {
fe788d6b
PM
1677 if (*p == '[')
1678 in_brackets = TRUE;
1679 else if (*p == ']')
1680 in_brackets = FALSE;
407f8cf2
KW
1681 else if ((*p == '@' || *p == '%')
1682 && !after_slash
1683 && !in_brackets )
1684 {
fe788d6b
PM
1685 must_be_last = TRUE;
1686 greedy_proto = *p;
1687 }
1688 else if (*p == '_')
f791a21a 1689 underscore = TRUE;
fe788d6b
PM
1690 }
1691 if (*p == '\\')
1692 after_slash = TRUE;
1693 else
1694 after_slash = FALSE;
1695 }
1696 }
1697
1698 if (warn) {
b54d603d 1699 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1700 p -= origlen;
b54d603d
PM
1701 p = SvUTF8(proto)
1702 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1703 origlen, UNI_DISPLAY_ISPRINT)
1704 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1705
cbf83791
FC
1706 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1707 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1708 sv_catpvs(name2, "::");
1709 sv_catsv(name2, (SV *)name);
1710 name = name2;
1711 }
1712
fe788d6b
PM
1713 if (proto_after_greedy_proto)
1714 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1715 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1716 greedy_proto, SVfARG(name), p);
50278ed0
PM
1717 if (in_brackets)
1718 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1719 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1720 SVfARG(name), p);
b54d603d 1721 if (bad_proto)
fe788d6b 1722 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1723 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1724 SVfARG(name), p);
1725 if (bad_proto_after_underscore)
1726 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1727 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1728 SVfARG(name), p);
fe788d6b
PM
1729 }
1730
1731 return (! (proto_after_greedy_proto || bad_proto) );
1732}
1733
1734/*
ffb4593c
NT
1735 * S_incline
1736 * This subroutine has nothing to do with tilting, whether at windmills
1737 * or pinball tables. Its name is short for "increment line". It
57843af0 1738 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1739 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1740 * # line 500 "foo.pm"
1741 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1742 */
1743
76e3520e 1744STATIC void
c6875f94 1745S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1746{
d9095cec
NC
1747 const char *t;
1748 const char *n;
1749 const char *e;
8818d409 1750 line_t line_num;
22ff3130 1751 UV uv;
463ee0b2 1752
7918f24d
NC
1753 PERL_ARGS_ASSERT_INCLINE;
1754
d77eff5d
KW
1755 assert(end >= s);
1756
83944c01 1757 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1758 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1759 && s+1 == PL_bufend && *s == ';') {
1760 /* fake newline in string eval */
1761 CopLINE_dec(PL_curcop);
1762 return;
1763 }
463ee0b2
LW
1764 if (*s++ != '#')
1765 return;
d4c19fe8
AL
1766 while (SPACE_OR_TAB(*s))
1767 s++;
d77eff5d
KW
1768 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1769 s += sizeof("line") - 1;
73659bf1
GS
1770 else
1771 return;
084592ab 1772 if (SPACE_OR_TAB(*s))
73659bf1 1773 s++;
4e553d73 1774 else
73659bf1 1775 return;
d4c19fe8
AL
1776 while (SPACE_OR_TAB(*s))
1777 s++;
463ee0b2
LW
1778 if (!isDIGIT(*s))
1779 return;
d4c19fe8 1780
463ee0b2
LW
1781 n = s;
1782 while (isDIGIT(*s))
1783 s++;
07714eb4 1784 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1785 return;
bf4acbe4 1786 while (SPACE_OR_TAB(*s))
463ee0b2 1787 s++;
85bb8b90 1788 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1789 s++;
73659bf1
GS
1790 e = t + 1;
1791 }
463ee0b2 1792 else {
c35e046a 1793 t = s;
1bb1a3d6 1794 while (*t && !isSPACE(*t))
c35e046a 1795 t++;
73659bf1 1796 e = t;
463ee0b2 1797 }
bf4acbe4 1798 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1799 e++;
1800 if (*e != '\n' && *e != '\0')
1801 return; /* false alarm */
1802
22ff3130
HS
1803 if (!grok_atoUV(n, &uv, &e))
1804 return;
1805 line_num = ((line_t)uv) - 1;
8818d409 1806
f4dd75d9 1807 if (t - s > 0) {
d9095cec 1808 const STRLEN len = t - s;
3df32bda 1809
d36ee5be 1810 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1811 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1812 * to *{"::_<newfilename"} */
44867030
NC
1813 /* However, the long form of evals is only turned on by the
1814 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1815 GV * const cfgv = CopFILEGV(PL_curcop);
1816 if (cfgv) {
38bd7ad8
FC
1817 char smallbuf[128];
1818 STRLEN tmplen2 = len;
44867030 1819 char *tmpbuf2;
449dd039 1820 GV *gv2;
44867030
NC
1821
1822 if (tmplen2 + 2 <= sizeof smallbuf)
1823 tmpbuf2 = smallbuf;
1824 else
1825 Newx(tmpbuf2, tmplen2 + 2, char);
1826
38bd7ad8
FC
1827 tmpbuf2[0] = '_';
1828 tmpbuf2[1] = '<';
44867030
NC
1829
1830 memcpy(tmpbuf2 + 2, s, tmplen2);
1831 tmplen2 += 2;
1832
8a5ee598 1833 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1834 if (!isGV(gv2)) {
8a5ee598 1835 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1836 /* adjust ${"::_<newfilename"} to store the new file name */
1837 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1838 /* The line number may differ. If that is the case,
1839 alias the saved lines that are in the array.
1840 Otherwise alias the whole array. */
1841 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1842 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1843 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1844 }
38bd7ad8
FC
1845 else if (GvAV(cfgv)) {
1846 AV * const av = GvAV(cfgv);
515c395b
TC
1847 const line_t start = CopLINE(PL_curcop)+1;
1848 SSize_t items = AvFILLp(av) - start;
8818d409
FC
1849 if (items > 0) {
1850 AV * const av2 = GvAVn(gv2);
1851 SV **svp = AvARRAY(av) + start;
515c395b
TC
1852 Size_t l = line_num+1;
1853 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1854 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
8818d409
FC
1855 }
1856 }
e5527e4b 1857 }
44867030
NC
1858
1859 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1860 }
e66cf94c 1861 }
05ec9bb3 1862 CopFILE_free(PL_curcop);
449dd039 1863 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1864 }
8818d409 1865 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1866}
1867
80a702cd 1868STATIC void
15f169a1 1869S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1870{
1871 AV *av = CopFILEAVx(PL_curcop);
1872 if (av) {
7f1c3e8c
FC
1873 SV * sv;
1874 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1875 else {
1876 sv = *av_fetch(av, 0, 1);
1877 SvUPGRADE(sv, SVt_PVMG);
1878 }
847cc851 1879 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1880 if (orig_sv)
7f1c3e8c 1881 sv_catsv(sv, orig_sv);
5fa550fb 1882 else
7f1c3e8c
FC
1883 sv_catpvn(sv, buf, len);
1884 if (!SvIOK(sv)) {
1885 (void)SvIOK_on(sv);
1886 SvIV_set(sv, 0);
1887 }
1888 if (PL_parser->preambling == NOLINE)
1889 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1890 }
1891}
1892
ffb4593c 1893/*
8c6b0c7d 1894 * skipspace
ffb4593c
NT
1895 * Called to gobble the appropriate amount and type of whitespace.
1896 * Skips comments as well.
71fff7cb 1897 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1898 *
1899 * peekspace
1900 * Same thing, but look ahead without incrementing line numbers or
1901 * adjusting PL_linestart.
ffb4593c
NT
1902 */
1903
3218e223 1904#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1905#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1906
aabfeadc
KW
1907char *
1908Perl_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1909{
21791330 1910 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1911 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1912 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1913 s++;
f0e67a1d
Z
1914 } else {
1915 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1916 PL_bufptr = s;
21791330 1917 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1918 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1919 LEX_NO_NEXT_CHUNK : 0));
3280af22 1920 s = PL_bufptr;
f0e67a1d
Z
1921 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1922 if (PL_linestart > PL_bufptr)
1923 PL_bufptr = PL_linestart;
1924 return s;
463ee0b2 1925 }
5db06880 1926 return s;
a687059c 1927}
378cc40b 1928
ffb4593c
NT
1929/*
1930 * S_check_uni
1931 * Check the unary operators to ensure there's no ambiguity in how they're
1932 * used. An ambiguous piece of code would be:
1933 * rand + 5
1934 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1935 * the +5 is its argument.
1936 */
1937
76e3520e 1938STATIC void
cea2e8a9 1939S_check_uni(pTHX)
ba106d47 1940{
d4c19fe8 1941 const char *s;
2f3197b3 1942
3280af22 1943 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1944 return;
3280af22
NIS
1945 while (isSPACE(*PL_last_uni))
1946 PL_last_uni++;
c35e046a 1947 s = PL_last_uni;
fac0f7a3 1948 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1949 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1950 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1951 return;
6136c704 1952
9b387841 1953 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1954 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1955 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1956}
1957
ffb4593c
NT
1958/*
1959 * LOP : macro to build a list operator. Its behaviour has been replaced
1960 * with a subroutine, S_lop() for which LOP is just another name.
1961 */
1962
a0d0e21e
LW
1963#define LOP(f,x) return lop(f,x,s)
1964
ffb4593c
NT
1965/*
1966 * S_lop
1967 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1968 * - if we have a next token, then it's a list operator (no parens) for
1969 * which the next token has already been parsed; e.g.,
1970 * sort foo @args
1971 * sort foo (@args)
ffb4593c
NT
1972 * - if the next thing is an opening paren, then it's a function
1973 * - else it's a list operator
1974 */
1975
76e3520e 1976STATIC I32
11288bb3 1977S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1978{
7918f24d
NC
1979 PERL_ARGS_ASSERT_LOP;
1980
6154021b 1981 pl_yylval.ival = f;
35c8bce7 1982 CLINE;
3280af22
NIS
1983 PL_bufptr = s;
1984 PL_last_lop = PL_oldbufptr;
eb160463 1985 PL_last_lop_op = (OPCODE)f;
3280af22 1986 if (PL_nexttoke)
78cdf107 1987 goto lstop;
19f1898a 1988 PL_expect = x;
79072805 1989 if (*s == '(')
bbf60fe6 1990 return REPORT(FUNC);
294a536f 1991 s = skipspace(s);
79072805 1992 if (*s == '(')
bbf60fe6 1993 return REPORT(FUNC);
78cdf107
Z
1994 else {
1995 lstop:
1996 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1997 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1998 return REPORT(LSTOP);
78cdf107 1999 }
79072805
LW
2000}
2001
ffb4593c
NT
2002/*
2003 * S_force_next
9cbb5ea2 2004 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2005 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2006 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
2007 * will need to set PL_nextval[] and possibly PL_expect to ensure
2008 * the lexer handles the token correctly.
ffb4593c
NT
2009 */
2010
4e553d73 2011STATIC void
cea2e8a9 2012S_force_next(pTHX_ I32 type)
79072805 2013{
704d4215
GG
2014#ifdef DEBUGGING
2015 if (DEBUG_T_TEST) {
2016 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2017 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2018 }
2019#endif
1f7c3e7c 2020 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2021 PL_nexttype[PL_nexttoke] = type;
2022 PL_nexttoke++;
79072805
LW
2023}
2024
89f35911
FC
2025/*
2026 * S_postderef
2027 *
2028 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2029 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2030 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2031 * only the first, leaving yylex to find the next.
89f35911
FC
2032 */
2033
2034static int
ff25e5db 2035S_postderef(pTHX_ int const funny, char const next)
89f35911 2036{
4aada8b9 2037 assert(funny == DOLSHARP || memCHRs("$@%&*", funny));
89f35911
FC
2038 if (next == '*') {
2039 PL_expect = XOPERATOR;
cc624add 2040 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 2041 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 2042 PL_lex_state = LEX_INTERPEND;
c3492809
FC
2043 if ('@' == funny)
2044 force_next(POSTJOIN);
cc624add 2045 }
89f35911
FC
2046 force_next(next);
2047 PL_bufptr+=2;
2048 }
2049 else {
760ca746
FC
2050 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2051 && !PL_lex_brackets)
2052 PL_lex_dojoin = 2;
89f35911
FC
2053 PL_expect = XOPERATOR;
2054 PL_bufptr++;
2055 }
2056 return funny;
2057}
2058
28ac2b49
Z
2059void
2060Perl_yyunlex(pTHX)
2061{
a7aaec61
Z
2062 int yyc = PL_parser->yychar;
2063 if (yyc != YYEMPTY) {
2064 if (yyc) {
a7aaec61 2065 NEXTVAL_NEXTTOKE = PL_parser->yylval;
c588171e 2066 if (yyc == PERLY_BRACE_OPEN || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2067 PL_lex_allbrackets--;
a7aaec61 2068 PL_lex_brackets--;
78cdf107
Z
2069 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2070 } else if (yyc == '('/*)*/) {
2071 PL_lex_allbrackets--;
2072 yyc |= (2<<24);
a7aaec61
Z
2073 }
2074 force_next(yyc);
2075 }
28ac2b49
Z
2076 PL_parser->yychar = YYEMPTY;
2077 }
2078}
2079
d0a148a6 2080STATIC SV *
15f169a1 2081S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2082{
740cce10 2083 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2084 ! IN_BYTES
2085 && UTF
53463649 2086 && len != 0
6d492fac 2087 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2088 return sv;
2089}
2090
ffb4593c
NT
2091/*
2092 * S_force_word
2093 * When the lexer knows the next thing is a word (for instance, it has
2094 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2095 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2096 * lookahead.
ffb4593c
NT
2097 *
2098 * Arguments:
b1b65b59 2099 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2100 * int token : PL_next* will be this type of bare word
2101 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2102 * int check_keyword : if true, Perl checks to make sure the word isn't
2103 * a keyword (do this if the word is a label, e.g. goto FOO)
2104 * int allow_pack : if true, : characters will also be allowed (require,
2105 * use, etc. do this)
ffb4593c
NT
2106 */
2107
76e3520e 2108STATIC char *
345b3785 2109S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2110{
eb578fdb 2111 char *s;
463ee0b2 2112 STRLEN len;
4e553d73 2113
7918f24d
NC
2114 PERL_ARGS_ASSERT_FORCE_WORD;
2115
294a536f 2116 start = skipspace(start);
463ee0b2 2117 s = start;
fac0f7a3 2118 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2119 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2120 {
3280af22 2121 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2122 if (check_keyword) {
2123 char *s2 = PL_tokenbuf;
487e470d 2124 STRLEN len2 = len;
de627158
KW
2125 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2126 s2 += sizeof("CORE::") - 1;
2127 len2 -= sizeof("CORE::") - 1;
2128 }
487e470d 2129 if (keyword(s2, len2, 0))
463ee0b2 2130 return start;
01b5ef50 2131 }
463ee0b2 2132 if (token == METHOD) {
294a536f 2133 s = skipspace(s);
463ee0b2 2134 if (*s == '(')
3280af22 2135 PL_expect = XTERM;
463ee0b2 2136 else {
3280af22 2137 PL_expect = XOPERATOR;
463ee0b2 2138 }
79072805 2139 }
9ded7720 2140 NEXTVAL_NEXTTOKE.opval
275103cd 2141 = newSVOP(OP_CONST,0,
d0a148a6 2142 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2143 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2144 force_next(token);
2145 }
2146 return s;
2147}
2148
ffb4593c
NT
2149/*
2150 * S_force_ident
9cbb5ea2 2151 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2152 * text only contains the "foo" portion. The first argument is a pointer
2153 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2154 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2155 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2156 */
2157
76e3520e 2158STATIC void
5aaab254 2159S_force_ident(pTHX_ const char *s, int kind)
79072805 2160{
7918f24d
NC
2161 PERL_ARGS_ASSERT_FORCE_IDENT;
2162
c9b48522
DD
2163 if (s[0]) {
2164 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2165 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2166 UTF ? SVf_UTF8 : 0));
9ded7720 2167 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2168 force_next(BAREWORD);
748a9306 2169 if (kind) {
11343788 2170 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2171 /* XXX see note in pp_entereval() for why we forgo typo
2172 warnings if the symbol must be introduced in an eval.
2173 GSAR 96-10-12 */
90e5519e 2174 gv_fetchpvn_flags(s, len,
4bff32c5 2175 (PL_in_eval ? GV_ADDMULTI
728847b1 2176 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2177 kind == '$' ? SVt_PV :
2178 kind == '@' ? SVt_PVAV :
2179 kind == '%' ? SVt_PVHV :
a0d0e21e 2180 SVt_PVGV
90e5519e 2181 );
748a9306 2182 }
79072805
LW
2183 }
2184}
2185
3f33d153
FC
2186static void
2187S_force_ident_maybe_lex(pTHX_ char pit)
2188{
3f33d153
FC
2189 NEXTVAL_NEXTTOKE.ival = pit;
2190 force_next('p');
2191}
2192
1571675a
GS
2193NV
2194Perl_str_to_version(pTHX_ SV *sv)
2195{
2196 NV retval = 0.0;
2197 NV nshift = 1.0;
2198 STRLEN len;
cfd0369c 2199 const char *start = SvPV_const(sv,len);
9d4ba2ae 2200 const char * const end = start + len;
8298454c 2201 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2202
2203 PERL_ARGS_ASSERT_STR_TO_VERSION;
2204
1571675a 2205 while (start < end) {
ba210ebe 2206 STRLEN skip;
1571675a
GS
2207 UV n;
2208 if (utf)
9041c2e3 2209 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2210 else {
2211 n = *(U8*)start;
2212 skip = 1;
2213 }
2214 retval += ((NV)n)/nshift;
2215 start += skip;
2216 nshift *= 1000;
2217 }
2218 return retval;
2219}
2220
4e553d73 2221/*
ffb4593c
NT
2222 * S_force_version
2223 * Forces the next token to be a version number.
e759cc13
RGS
2224 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2225 * and if "guessing" is TRUE, then no new token is created (and the caller
2226 * must use an alternative parsing method).
ffb4593c
NT
2227 */
2228
76e3520e 2229STATIC char *
e759cc13 2230S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2231{
5f66b61c 2232 OP *version = NULL;
44dcb63b 2233 char *d;
89bfa8cd 2234
7918f24d
NC
2235 PERL_ARGS_ASSERT_FORCE_VERSION;
2236
294a536f 2237 s = skipspace(s);
89bfa8cd 2238
44dcb63b 2239 d = s;
dd629d5b 2240 if (*d == 'v')
44dcb63b 2241 d++;
44dcb63b 2242 if (isDIGIT(*d)) {
e759cc13
RGS
2243 while (isDIGIT(*d) || *d == '_' || *d == '.')
2244 d++;
4e4da3ac 2245 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2246 SV *ver;
6154021b
RGS
2247 s = scan_num(s, &pl_yylval);
2248 version = pl_yylval.opval;
dd629d5b
GS
2249 ver = cSVOPx(version)->op_sv;
2250 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2251 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2252 SvNV_set(ver, str_to_version(ver));
1571675a 2253 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2254 }
89bfa8cd 2255 }
5db06880 2256 else if (guessing) {
e759cc13 2257 return s;
5db06880 2258 }
89bfa8cd
PP
2259 }
2260
2261 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2262 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2263 force_next(BAREWORD);
89bfa8cd 2264
e759cc13 2265 return s;
89bfa8cd
PP
2266}
2267
ffb4593c 2268/*
91152fc1
DG
2269 * S_force_strict_version
2270 * Forces the next token to be a version number using strict syntax rules.
2271 */
2272
2273STATIC char *
2274S_force_strict_version(pTHX_ char *s)
2275{
91152fc1 2276 OP *version = NULL;
91152fc1
DG
2277 const char *errstr = NULL;
2278
2279 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2280
2281 while (isSPACE(*s)) /* leading whitespace */
2282 s++;
2283
2284 if (is_STRICT_VERSION(s,&errstr)) {
2285 SV *ver = newSV(0);
2286 s = (char *)scan_version(s, ver, 0);
2287 version = newSVOP(OP_CONST, 0, ver);
2288 }
407f8cf2
KW
2289 else if ((*s != ';' && *s != '{' && *s != '}' )
2290 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2291 {
91152fc1
DG
2292 PL_bufptr = s;
2293 if (errstr)
2294 yyerror(errstr); /* version required */
2295 return s;
2296 }
2297
91152fc1 2298 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2299 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2300 force_next(BAREWORD);
91152fc1
DG
2301
2302 return s;
2303}
2304
2305/*
ffb4593c 2306 * S_tokeq
ef3ff34d
KW
2307 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2308 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2309 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2310 */
2311
76e3520e 2312STATIC SV *
cea2e8a9 2313S_tokeq(pTHX_ SV *sv)
79072805 2314{
eb578fdb
KW
2315 char *s;
2316 char *send;
2317 char *d;
b3ac6de7 2318 SV *pv = sv;
79072805 2319
7918f24d
NC
2320 PERL_ARGS_ASSERT_TOKEQ;
2321
279b35ad
FC
2322 assert (SvPOK(sv));
2323 assert (SvLEN(sv));
2324 assert (!SvIsCOW(sv));
307ed071 2325 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2326 goto finish;
279b35ad
FC
2327 s = SvPVX(sv);
2328 send = SvEND(sv);
dcb21ed6
NC
2329 /* This is relying on the SV being "well formed" with a trailing '\0' */
2330 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2331 s++;
2332 if (s == send)
b3ac6de7 2333 goto finish;
79072805 2334 d = s;
be4731d2 2335 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2336 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2337 SVs_TEMP | SvUTF8(sv));
be4731d2 2338 }
79072805
LW
2339 while (s < send) {
2340 if (*s == '\\') {
a0d0e21e 2341 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2342 s++; /* all that, just for this */
2343 }
2344 *d++ = *s++;
2345 }
2346 *d = '\0';
95a20fc0 2347 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2348 finish:
3280af22 2349 if ( PL_hints & HINT_NEW_STRING )
164e423c 2350 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
79072805
LW
2351 return sv;
2352}
2353
ffb4593c
NT
2354/*
2355 * Now come three functions related to double-quote context,
2356 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2357 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2358 * interact with PL_lex_state, and create fake ( ... ) argument lists
2359 * to handle functions and concatenation.
ecd24171
DM
2360 * For example,
2361 * "foo\lbar"
2362 * is tokenised as
2363 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2364 */
2365
2366/*
2367 * S_sublex_start
6154021b 2368 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2369 *
2370 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2371 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2372 *
16d1d8bd 2373 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2374 *
2375 * Everything else becomes a FUNC.
2376 *
16d1d8bd
LM
2377 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2378 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2379 * call to S_sublex_push().
2380 */
2381
76e3520e 2382STATIC I32
cea2e8a9 2383S_sublex_start(pTHX)
79072805 2384{
eb578fdb 2385 const I32 op_type = pl_yylval.ival;
79072805
LW
2386
2387 if (op_type == OP_NULL) {
6154021b 2388 pl_yylval.opval = PL_lex_op;
5f66b61c 2389 PL_lex_op = NULL;
79072805
LW
2390 return THING;
2391 }
466112bb 2392 if (op_type == OP_CONST) {
67c71cbb
FC
2393 SV *sv = PL_lex_stuff;
2394 PL_lex_stuff = NULL;
2395 sv = tokeq(sv);
b3ac6de7
IZ
2396
2397 if (SvTYPE(sv) == SVt_PVIV) {
2398 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2399 STRLEN len;
96a5add6 2400 const char * const p = SvPV_const(sv, len);
740cce10 2401 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2402 SvREFCNT_dec(sv);
2403 sv = nsv;
4e553d73 2404 }
275103cd 2405 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2406 return THING;
2407 }
2408
7ef70b3d
FC
2409 PL_parser->lex_super_state = PL_lex_state;
2410 PL_parser->lex_sub_inwhat = (U16)op_type;
2411 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2412 PL_parser->sub_no_recover = FALSE;
2413 PL_parser->sub_error_count = PL_error_count;
3280af22 2414 PL_lex_state = LEX_INTERPPUSH;
55497cff 2415
3280af22
NIS
2416 PL_expect = XTERM;
2417 if (PL_lex_op) {
6154021b 2418 pl_yylval.opval = PL_lex_op;
5f66b61c 2419 PL_lex_op = NULL;
55497cff
PP
2420 return PMFUNC;
2421 }
2422 else
2423 return FUNC;
2424}
2425
ffb4593c
NT
2426/*
2427 * S_sublex_push
2428 * Create a new scope to save the lexing state. The scope will be
2429 * ended in S_sublex_done. Returns a '(', starting the function arguments
2430 * to the uc, lc, etc. found before.
2431 * Sets PL_lex_state to LEX_INTERPCONCAT.
2432 */
2433
76e3520e 2434STATIC I32
cea2e8a9 2435S_sublex_push(pTHX)
55497cff 2436{
78a635de 2437 LEXSHARED *shared;
801d32ac 2438 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2439 ENTER;
55497cff 2440
7ef70b3d 2441 PL_lex_state = PL_parser->lex_super_state;
cc624add 2442 SAVEI8(PL_lex_dojoin);
3280af22 2443 SAVEI32(PL_lex_brackets);
78cdf107 2444 SAVEI32(PL_lex_allbrackets);
b27dce25 2445 SAVEI32(PL_lex_formbrack);
78cdf107 2446 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2447 SAVEI32(PL_lex_casemods);
2448 SAVEI32(PL_lex_starts);
651b5b28 2449 SAVEI8(PL_lex_state);
7cc34111 2450 SAVESPTR(PL_lex_repl);
7766f137 2451 SAVEVPTR(PL_lex_inpat);
98246f1e 2452 SAVEI16(PL_lex_inwhat);
ffdb8b16 2453 if (is_heredoc)
b42366d4 2454 {
ffdb8b16 2455 SAVECOPLINE(PL_curcop);
b42366d4 2456 SAVEI32(PL_multi_end);
851b527a
FC
2457 SAVEI32(PL_parser->herelines);
2458 PL_parser->herelines = 0;
b42366d4 2459 }
2ca4363d 2460 SAVEIV(PL_multi_close);
3280af22 2461 SAVEPPTR(PL_bufptr);
8452ff4b 2462 SAVEPPTR(PL_bufend);
3280af22
NIS
2463 SAVEPPTR(PL_oldbufptr);
2464 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2465 SAVEPPTR(PL_last_lop);
2466 SAVEPPTR(PL_last_uni);
3280af22
NIS
2467 SAVEPPTR(PL_linestart);
2468 SAVESPTR(PL_linestr);
8edd5f42
RGS
2469 SAVEGENERICPV(PL_lex_brackstack);
2470 SAVEGENERICPV(PL_lex_casestack);
78a635de 2471 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2472 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2473 SAVEI32(PL_copline);
3280af22 2474
99bd9d90 2475 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2476 scopes to find the body of the here-doc. So we put PL_linestr and
2477 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2478 */
60f40a38
FC
2479 PL_parser->lex_shared->ls_linestr = PL_linestr;
2480 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2481
3280af22 2482 PL_linestr = PL_lex_stuff;
7ef70b3d 2483 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2484 PL_lex_stuff = NULL;
7ef70b3d 2485 PL_parser->lex_sub_repl = NULL;
3280af22 2486
eabab8bc
FC
2487 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2488 set for an inner quote-like operator and then an error causes scope-
2489 popping. We must not have a PL_lex_stuff value left dangling, as
2490 that breaks assumptions elsewhere. See bug #123617. */
2491 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2492 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2493
9cbb5ea2
GS
2494 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2495 = SvPVX(PL_linestr);
3280af22 2496 PL_bufend += SvCUR(PL_linestr);
bd61b366 2497 PL_last_lop = PL_last_uni = NULL;
3280af22 2498 SAVEFREESV(PL_linestr);
4dc843bc 2499 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2500
2501 PL_lex_dojoin = FALSE;
b27dce25 2502 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2503 PL_lex_allbrackets = 0;
2504 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2505 Newx(PL_lex_brackstack, 120, char);
2506 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2507 PL_lex_casemods = 0;
2508 *PL_lex_casestack = '\0';
2509 PL_lex_starts = 0;
2510 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2511 if (is_heredoc)
6ddcf93b 2512 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2513 PL_copline = NOLINE;
caae0700 2514
78a635de
FC
2515 Newxz(shared, 1, LEXSHARED);
2516 shared->ls_prev = PL_parser->lex_shared;
2517 PL_parser->lex_shared = shared;
3280af22 2518
7ef70b3d 2519 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2520 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2521 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2522 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2523 else
5f66b61c 2524 PL_lex_inpat = NULL;
79072805 2525
3a54fd60
DM
2526 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2527 PL_in_eval &= ~EVAL_RE_REPARSING;
2528
69afcc21 2529 return SUBLEXSTART;
79072805
LW
2530}
2531
ffb4593c
NT
2532/*
2533 * S_sublex_done
2534 * Restores lexer state after a S_sublex_push.
2535 */
2536
76e3520e 2537STATIC I32
cea2e8a9 2538S_sublex_done(pTHX)
79072805 2539{
3280af22 2540 if (!PL_lex_starts++) {
396482e1 2541 SV * const sv = newSVpvs("");
9aa983d2
JH
2542 if (SvUTF8(PL_linestr))
2543 SvUTF8_on(sv);
3280af22 2544 PL_expect = XOPERATOR;
275103cd 2545 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2546 return THING;
2547 }
2548
3280af22
NIS
2549 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2550 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2551 return yylex();
79072805
LW
2552 }
2553
ffb4593c 2554 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2555 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2556 if (PL_lex_repl) {
2557 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2558 PL_linestr = PL_lex_repl;
2559 PL_lex_inpat = 0;
2560 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2561 PL_bufend += SvCUR(PL_linestr);
bd61b366 2562 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2563 PL_lex_dojoin = FALSE;
2564 PL_lex_brackets = 0;
78cdf107
Z
2565 PL_lex_allbrackets = 0;
2566 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2567 PL_lex_casemods = 0;
2568 *PL_lex_casestack = '\0';
2569 PL_lex_starts = 0;
25da4f38 2570 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2571 PL_lex_state = LEX_INTERPNORMAL;
2572 PL_lex_starts++;
e9fa98b2
HS
2573 /* we don't clear PL_lex_repl here, so that we can check later
2574 whether this is an evalled subst; that means we rely on the
2575 logic to ensure sublex_done() is called again only via the
2576 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2577 }
e9fa98b2 2578 else {
3280af22 2579 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2580 PL_lex_repl = NULL;
e9fa98b2 2581 }
ffdb8b16
FC
2582 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2583 CopLINE(PL_curcop) +=
9420b268 2584 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2585 + PL_parser->herelines;
2586 PL_parser->herelines = 0;
ffdb8b16 2587 }
9b6b7be8 2588 return '/';
ffed7fef
LW
2589 }
2590 else {
b42366d4 2591 const line_t l = CopLINE(PL_curcop);
f46d017c 2592 LEAVE;
bb4e4c38 2593 if (PL_parser->sub_error_count != PL_error_count) {
bb4e4c38 2594 if (PL_parser->sub_no_recover) {
ad1ecdf7 2595 yyquit();
bb4e4c38
TC
2596 NOT_REACHED;
2597 }
2598 }
b42366d4 2599 if (PL_multi_close == '<')
851b527a 2600 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2601 PL_bufend = SvPVX(PL_linestr);
2602 PL_bufend += SvCUR(PL_linestr);
2603 PL_expect = XOPERATOR;
69afcc21 2604 return SUBLEXEND;
ffed7fef
LW
2605 }
2606}
2607
4e8ee35f
KW
2608HV *
2609Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2610 const STRLEN context_len, const char ** error_msg)
2611{
2612 /* Load the official _charnames module if not already there. The
2613 * parameters are just to give info for any error messages generated:
2614 * char_name a name to look up which is the reason for loading this
2615 * context 'char_name' in the context in the input in which it appears
2616 * context_len how many bytes 'context' occupies
2617 * error_msg *error_msg will be set to any error
2618 *
2619 * Returns the ^H table if success; otherwise NULL */
2620
2621 unsigned int i;
2622 HV * table;
2623 SV **cvp;
2624 SV * res;
2625
2626 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2627
2628 /* This loop is executed 1 1/2 times. On the first time through, if it
2629 * isn't already loaded, try loading it, and iterate just once to see if it
2630 * worked. */
2631 for (i = 0; i < 2; i++) {
2632 table = GvHV(PL_hintgv); /* ^H */
2633
2634 if ( table
2635 && (PL_hints & HINT_LOCALIZE_HH)
2636 && (cvp = hv_fetchs(table, "charnames", FALSE))
2637 && SvOK(*cvp))
2638 {
2639 return table; /* Quit if already loaded */
2640 }
2641
2642 if (i == 0) {
2643 Perl_load_module(aTHX_
2644 0,
2645 newSVpvs("_charnames"),
2646
2647 /* version parameter; no need to specify it, as if we get too early
2648 * a version, will fail anyway, not being able to find 'charnames'
2649 * */
2650 NULL,
2651 newSVpvs(":full"),
2652 newSVpvs(":short"),
2653 NULL);
2654 }
2655 }
2656
2657 /* Here, it failed; new_constant will give appropriate error messages */
2658 *error_msg = NULL;
2659 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2660 context, context_len, error_msg);
2661 SvREFCNT_dec(res);
2662
2663 return NULL;
2664}
2665
7a7d14f3 2666STATIC SV*
2c43c309
KW
2667S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2668{
2669 /* This justs wraps get_and_check_backslash_N_name() to output any error
2670 * message it returns. */
2671
2672 const char * error_msg = NULL;
25c7fb78 2673 SV * result;
2c43c309
KW
2674
2675 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2676
25c7fb78
KW
2677 /* charnames doesn't work well if there have been errors found */
2678 if (PL_error_count > 0) {
2679 return NULL;
2680 }
2681
2682 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2683
2c43c309
KW
2684 if (error_msg) {
2685 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2686 }
2687
2688 return result;
2689}
2690
25c7fb78
KW
2691SV*
2692Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2693 const char* const e,
2694 const bool is_utf8,
2695 const char ** error_msg)
6f613c73 2696{
140b12ad
KW
2697 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2698 * interior, hence to the "}". Finds what the name resolves to, returning
25c7fb78
KW
2699 * an SV* containing it; NULL if no valid one found.
2700 *
2701 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2702 * doesn't have to be. */
6f613c73 2703
7303cc1f 2704 SV* char_name;
25c7fb78 2705 SV* res;
0c415a79
KW
2706 HV * table;
2707 SV **cvp;
2708 SV *cv;
2709 SV *rv;
2710 HV *stash;
8b0cce63
KW
2711
2712 /* Points to the beginning of the \N{... so that any messages include the
2713 * context of what's failing*/
2714 const char* context = s - 3;
2715 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2716
0c415a79 2717
6f613c73
KW
2718 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2719
25c7fb78
KW
2720 assert(e >= s);
2721 assert(s > (char *) 3);
2722
7303cc1f 2723 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
25c7fb78 2724
7303cc1f
KW
2725 if (!SvCUR(char_name)) {
2726 SvREFCNT_dec_NN(char_name);
be332ba0 2727 /* diag_listed_as: Unknown charname '%s' */
2c43c309 2728 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
be332ba0 2729 return NULL;
d8d26cac 2730 }
b7e6151c 2731
7303cc1f 2732 /* Autoload the charnames module */
7303cc1f 2733
4e8ee35f
KW
2734 table = load_charnames(char_name, context, context_len, error_msg);
2735 if (table == NULL) {
2736 return NULL;
7303cc1f
KW
2737 }
2738
2739 *error_msg = NULL;
8b0cce63
KW
2740 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2741 context, context_len, error_msg);
7303cc1f 2742 if (*error_msg) {
4e8ee35f 2743 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
7303cc1f
KW
2744
2745 SvREFCNT_dec(res);
6f613c73
KW
2746 return NULL;
2747 }
2748
0c415a79
KW
2749 /* See if the charnames handler is the Perl core's, and if so, we can skip
2750 * the validation needed for a user-supplied one, as Perl's does its own
2751 * validation. */
0c415a79 2752 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2753 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2754 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2755 {
2756 const char * const name = HvNAME(stash);
b59bf0b2 2757 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2758 return res;
2759 }
2760 }
2761
bde9e88d
KW
2762 /* Here, it isn't Perl's charname handler. We can't rely on a
2763 * user-supplied handler to validate the input name. For non-ut8 input,
2764 * look to see that the first character is legal. Then loop through the
2765 * rest checking that each is a continuation */
6f613c73 2766
36897d64
KW
2767 /* This code makes the reasonable assumption that the only Latin1-range
2768 * characters that begin a character name alias are alphabetic, otherwise
2769 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2770
25c7fb78 2771 if (! is_utf8) {
bde9e88d 2772 if (! isALPHAU(*s)) {
b6ba1137
KW
2773 goto bad_charname;
2774 }
bde9e88d
KW
2775 s++;
2776 while (s < e) {
2777 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2778 goto bad_charname;
2779 }
2d8eb851
KW
2780 if (*s == ' ' && *(s-1) == ' ') {
2781 goto multi_spaces;
bd299e29 2782 }
bde9e88d 2783 s++;
b6ba1137
KW
2784 }
2785 }
2786 else {
bde9e88d 2787 /* Similarly for utf8. For invariants can check directly; for other
58aa6738
KW
2788 * Latin1, can calculate their code point and check; otherwise use an
2789 * inversion list */
bde9e88d
KW
2790 if (UTF8_IS_INVARIANT(*s)) {
2791 if (! isALPHAU(*s)) {
140b12ad
KW
2792 goto bad_charname;
2793 }
bde9e88d
KW
2794 s++;
2795 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2796 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2797 goto bad_charname;
6f613c73 2798 }
bde9e88d 2799 s += 2;
6f613c73 2800 }
bde9e88d 2801 else {
f1bcae08
KW
2802 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2803 utf8_to_uvchr_buf((U8 *) s,
2804 (U8 *) e,
2805 NULL)))
2806 {
bde9e88d
KW
2807 goto bad_charname;
2808 }
2809 s += UTF8SKIP(s);
2810 }
2811
2812 while (s < e) {
2813 if (UTF8_IS_INVARIANT(*s)) {
2814 if (! isCHARNAME_CONT(*s)) {
2815 goto bad_charname;
2816 }
2d8eb851
KW
2817 if (*s == ' ' && *(s-1) == ' ') {
2818 goto multi_spaces;
bd299e29 2819 }
bde9e88d
KW
2820 s++;
2821 }
2822 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2823 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2824 {
2825 goto bad_charname;
2826 }
2827 s += 2;
2828 }
2829 else {
f1bcae08
KW
2830 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2831 utf8_to_uvchr_buf((U8 *) s,
2832 (U8 *) e,
2833 NULL)))
2834 {
bde9e88d
KW
2835 goto bad_charname;
2836 }
2837 s += UTF8SKIP(s);
6f613c73
KW
2838 }
2839 }
2d8eb851
KW
2840 }
2841 if (*(s-1) == ' ') {
8d9d0498
FC
2842 /* diag_listed_as: charnames alias definitions may not contain
2843 trailing white-space; marked by <-- HERE in %s
2844 */
2c43c309 2845 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2846 "charnames alias definitions may not contain trailing "
2847 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2848 (int)(s - context + 1), context,
2c43c309 2849 (int)(e - s + 1), s + 1);
2d8eb851 2850 return NULL;
6f613c73
KW
2851 }
2852
870fe793 2853 if (SvUTF8(res)) { /* Don't accept malformed charname value */
bde9e88d
KW
2854 const U8* first_bad_char_loc;
2855 STRLEN len;
2856 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2857 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2858 &first_bad_char_loc)))
2859 {
75219bac
KW
2860 _force_out_malformed_utf8_message(first_bad_char_loc,
2861 (U8 *) PL_parser->bufend,
2862 0,
2863 0 /* 0 means don't die */ );
8d9d0498
FC
2864 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2865 immediately after '%s' */
2c43c309 2866 *error_msg = Perl_form(aTHX_
bde9e88d 2867 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
8b0cce63 2868 (int) context_len, context,
2c43c309 2869 (int) ((char *) first_bad_char_loc - str), str);
bde9e88d
KW
2870 return NULL;
2871 }
2872 }
140b12ad 2873
bde9e88d 2874 return res;
140b12ad 2875
bde9e88d 2876 bad_charname: {
bde9e88d
KW
2877
2878 /* The final %.*s makes sure that should the trailing NUL be missing
2879 * that this print won't run off the end of the string */
8d9d0498
FC
2880 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2881 in \N{%s} */
2c43c309 2882 *error_msg = Perl_form(aTHX_
bde9e88d 2883 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2884 (int)(s - context + 1), context,
2c43c309 2885 (int)(e - s + 1), s + 1);
bde9e88d
KW
2886 return NULL;
2887 }
2d8eb851
KW
2888
2889 multi_spaces:
8d9d0498
FC
2890 /* diag_listed_as: charnames alias definitions may not contain a
2891 sequence of multiple spaces; marked by <-- HERE
2892 in %s */
2c43c309 2893 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2894 "charnames alias definitions may not contain a sequence of "
2895 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2896 (int)(s - context + 1), context,
2c43c309 2897 (int)(e - s + 1), s + 1);
2d8eb851 2898 return NULL;
6f613c73
KW
2899}
2900
02aa26ce
NT
2901/*
2902 scan_const
2903
9da1dd8f
DM
2904 Extracts the next constant part of a pattern, double-quoted string,
2905 or transliteration. This is terrifying code.
2906
2907 For example, in parsing the double-quoted string "ab\x63$d", it would
2908 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2909
94def140 2910 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2911 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2912 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2913
94def140
ST
2914 Returns a pointer to the character scanned up to. If this is
2915 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2916 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2917 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2918 by looking at the next characters herself.
2919
02aa26ce 2920 In patterns:
9da1dd8f 2921 expand:
537124e4
KW
2922 \N{FOO} => \N{U+hex_for_character_FOO}
2923 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2924
2925 pass through:
2926 all other \-char, including \N and \N{ apart from \N{ABC}
2927
2928 stops on:
2929 @ and $ where it appears to be a var, but not for $ as tail anchor
2930 \l \L \u \U \Q \E
2931 (?{ or (??{
2932
02aa26ce
NT
2933 In transliterations:
2934 characters are VERY literal, except for - not at the start or end
66b09263
KW
2935 of the string, which indicates a range. However some backslash sequences
2936 are recognized: \r, \n, and the like
2937 \007 \o{}, \x{}, \N{}
2938 If all elements in the transliteration are below 256,
94def140
ST
2939 scan_const expands the range to the full set of intermediate
2940 characters. If the range is in utf8, the hyphen is replaced with
2941 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2942
2943 In double-quoted strings:
2944 backslashes:
66b09263 2945 all those recognized in transliterations
94def140 2946 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2947 case and quoting: \U \Q \E
2948 stops on @ and $
2949
2950 scan_const does *not* construct ops to handle interpolated strings.
2951 It stops processing as soon as it finds an embedded $ or @ variable
2952 and leaves it to the caller to work out what's going on.
2953
94def140
ST
2954 embedded arrays (whether in pattern or not) could be:
2955 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2956
2957 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2958
2959 $ in pattern could be $foo or could be tail anchor. Assumption:
2960 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2961 followed by one of "()| \r\n\t"
02aa26ce 2962
9da1dd8f 2963 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2964
2965 The structure of the code is
2966 while (there's a character to process) {
94def140
ST
2967 handle transliteration ranges
2968 skip regexp comments /(?#comment)/ and codes /(?{code})/
2969 skip #-initiated comments in //x patterns
2970 check for embedded arrays
02aa26ce
NT
2971 check for embedded scalars
2972 if (backslash) {
94def140 2973 deprecate \1 in substitution replacements
02aa26ce
NT
2974 handle string-changing backslashes \l \U \Q \E, etc.
2975 switch (what was escaped) {
94def140 2976 handle \- in a transliteration (becomes a literal -)
ff3f963a 2977 if a pattern and not \N{, go treat as regular character
94def140
ST
2978 handle \132 (octal characters)
2979 handle \x15 and \x{1234} (hex characters)
ff3f963a 2980 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2981 handle \cV (control characters)
2982 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2983 } (end switch)
77a135fe 2984 continue
02aa26ce 2985 } (end if backslash)
77a135fe 2986 handle regular character
02aa26ce 2987 } (end while character to read)
02cd137d 2988
02aa26ce
NT
2989*/
2990
76e3520e 2991STATIC char *
cea2e8a9 2992S_scan_const(pTHX_ char *start)
79072805 2993{
eb578fdb 2994 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2995 SV *sv = newSV(send - start); /* sv for the constant. See note below
2996 on sizing. */
eb578fdb
KW
2997 char *s = start; /* start of the constant */
2998 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2999 bool dorange = FALSE; /* are we in a translit range? */
3000 bool didrange = FALSE; /* did we just finish a range? */
3001 bool in_charclass = FALSE; /* within /[...]/ */
ed2893cb 3002 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
dc023dbb
KW
3003 UTF8? But, this can show as true
3004 when the source isn't utf8, as for
3005 example when it is entirely composed
3006 of hex constants */
f34acfec 3007 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
af9be36c
KW
3008 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3009 number of characters found so far
3010 that will expand (into 2 bytes)
3011 should we have to convert to
3012 UTF-8) */
6f613c73 3013 SV *res; /* result from charnames */
c89db733
JH
3014 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3015 high-end character is temporarily placed */
77a135fe 3016
fe2ba0a2
KW
3017 /* Does something require special handling in tr/// ? This avoids extra
3018 * work in a less likely case. As such, khw didn't feel it was worth
3019 * adding any branches to the more mainline code to handle this, which
3020 * means that this doesn't get set in some circumstances when things like
3021 * \x{100} get expanded out. As a result there needs to be extra testing
3022 * done in the tr code */
3023 bool has_above_latin1 = FALSE;
3024
77a135fe
KW
3025 /* Note on sizing: The scanned constant is placed into sv, which is
3026 * initialized by newSV() assuming one byte of output for every byte of
3027 * input. This routine expects newSV() to allocate an extra byte for a
3028 * trailing NUL, which this routine will append if it gets to the end of
3029 * the input. There may be more bytes of input than output (eg., \N{LATIN
3030 * CAPITAL LETTER A}), or more output than input if the constant ends up
3031 * recoded to utf8, but each time a construct is found that might increase
3032 * the needed size, SvGROW() is called. Its size parameter each time is
3033 * based on the best guess estimate at the time, namely the length used so
3034 * far, plus the length the current construct will occupy, plus room for
caae0700 3035 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 3036
c3320c2a
KW
3037 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3038 before set */
4c3a8340 3039#ifdef EBCDIC
f4240379
KW
3040 int backslash_N = 0; /* ? was the character from \N{} */
3041 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3042 platform-specific like \x65 */
4c3a8340 3043#endif
012bcf8d 3044
7918f24d
NC
3045 PERL_ARGS_ASSERT_SCAN_CONST;
3046
bb16bae8 3047 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0 3048
b899e89d
FC
3049 /* Protect sv from errors and fatal warnings. */
3050 ENTER_with_name("scan_const");
3051 SAVEFREESV(sv);
2b9d42f0 3052
1759517a
KW
3053 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3054 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3055 * valid */
3056 assert(*send == '\0');
3057
f4240379
KW
3058 while (s < send
3059 || dorange /* Handle tr/// range at right edge of input */
3060 ) {
ff3f963a 3061
02aa26ce 3062 /* get transliterations out of the way (they're most literal) */
3280af22 3063 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3064
f4240379
KW
3065 /* But there isn't any special handling necessary unless there is a
3066 * range, so for most cases we just drop down and handle the value
3067 * as any other. There are two exceptions.
3068 *
02cd137d
KW
3069 * 1. A hyphen indicates that we are actually going to have a
3070 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
3071 * down to handle what should be the end range value.
3072 * 2. After we've handled that value, the next time through, that
3073 * flag is set and we fix up the range.
3074 *
3075 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
3076 * order to make the transliteration a simple table look-up.
3077 * Ranges that extend above Latin1 have to be done differently, so
3078 * there is no advantage to expanding them here, so they are
dc8faf6b
KW
3079 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3080 * a byte that can't occur in legal UTF-8, and hence can signify a
3081 * hyphen without any possible ambiguity. On EBCDIC machines, if
3082 * the range is expressed as Unicode, the Latin1 portion is
3083 * expanded out even if the range extends above Latin1. This is
3084 * because each code point in it has to be processed here
3085 * individually to get its native translation */
f4240379
KW
3086
3087 if (! dorange) {
3088
02cd137d
KW
3089 /* Here, we don't think we're in a range. If the new character
3090 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
3091 * either edge to indicate a range, or if we haven't output any
3092 * characters yet then it's a regular character. */
04863ba1
KW
3093 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3094 {
f4240379
KW
3095
3096 /* A regular character. Process like any other, but first
3097 * clear any flags */
3098 didrange = FALSE;
3099 dorange = FALSE;
e294cc5d 3100#ifdef EBCDIC
f4240379
KW
3101 non_portable_endpoint = 0;
3102 backslash_N = 0;
e294cc5d 3103#endif
02cd137d
KW
3104 /* The tests here for being above Latin1 and similar ones
3105 * in the following 'else' suffice to find all such
3106 * occurences in the constant, except those added by a
fe2ba0a2 3107 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3108 * set 'has_above_latin1' as appropriate */
ed2893cb 3109 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3110 has_above_latin1 = TRUE;
3111 }
3112
f4240379
KW
3113 /* Drops down to generic code to process current byte */
3114 }
02cd137d 3115 else { /* Is a '-' in the context where it means a range */
f4240379 3116 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3117 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3118 " operator");
f4240379 3119 }
e294cc5d 3120
f4240379 3121 dorange = TRUE;
2b9d42f0 3122
02cd137d 3123 s++; /* Skip past the hyphen */
f4240379
KW
3124
3125 /* d now points to where the end-range character will be
d7f7b0e3
KW
3126 * placed. Drop down to get that character. We'll finish
3127 * processing the range the next time through the loop */
188d22cf 3128
ed2893cb 3129 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3130 has_above_latin1 = TRUE;
3131 }
02cd137d
KW
3132
3133 /* Drops down to generic code to process current byte */
f4240379
KW
3134 }
3135 } /* End of not a range */
3136 else {
3137 /* Here we have parsed a range. Now must handle it. At this
3138 * point:
3139 * 'sv' is a SV* that contains the output string we are
3140 * constructing. The final two characters in that string
3141 * are the range start and range end, in order.
3142 * 'd' points to just beyond the range end in the 'sv' string,
3143 * where we would next place something
f4240379 3144 */
0c311b7c 3145 char * max_ptr;
5ca37d54 3146 char * min_ptr;
f4240379
KW
3147 IV range_min;
3148 IV range_max; /* last character in range */
f4240379 3149 STRLEN grow;
5ca37d54
KW
3150 Size_t offset_to_min = 0;
3151 Size_t extras = 0;
11327fa1 3152#ifdef EBCDIC
f4240379
KW
3153 bool convert_unicode;
3154 IV real_range_max = 0;
e294cc5d 3155#endif
02cd137d 3156 /* Get the code point values of the range ends. */
0c311b7c
KW
3157 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3158 offset_to_max = max_ptr - SvPVX_const(sv);
ed2893cb 3159 if (d_is_utf8) {
f4240379
KW
3160 /* We know the utf8 is valid, because we just constructed
3161 * it ourselves in previous loop iterations */
3162 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3163 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3164 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3165
3166 /* This compensates for not all code setting
3167 * 'has_above_latin1', so that we don't skip stuff that
3168 * should be executed */
3169 if (range_max > 255) {
3170 has_above_latin1 = TRUE;
3171 }
e294cc5d 3172 }
f4240379
KW
3173 else {
3174 min_ptr = max_ptr - 1;
3175 range_min = * (U8*) min_ptr;
3176 range_max = * (U8*) max_ptr;
3177 }
3178
8efef67c
KW
3179 /* If the range is just a single code point, like tr/a-a/.../,
3180 * that code point is already in the output, twice. We can
3181 * just back up over the second instance and avoid all the rest
3182 * of the work. But if it is a variant character, it's been
218304f9
KW
3183 * counted twice, so decrement. (This unlikely scenario is
3184 * special cased, like the one for a range of 2 code points
3185 * below, only because the main-line code below needs a range
3186 * of 3 or more to work without special casing. Might as well
3187 * get it out of the way now.) */
8efef67c
KW
3188 if (UNLIKELY(range_max == range_min)) {
3189 d = max_ptr;
ed2893cb 3190 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
8efef67c
KW
3191 utf8_variant_count--;
3192 }
3193 goto range_done;
3194 }
3195
e294cc5d 3196#ifdef EBCDIC
f4240379
KW
3197 /* On EBCDIC platforms, we may have to deal with portable
3198 * ranges. These happen if at least one range endpoint is a
3199 * Unicode value (\N{...}), or if the range is a subset of
3200 * [A-Z] or [a-z], and both ends are literal characters,
3201 * like 'A', and not like \x{C1} */
87dd6ea7 3202 convert_unicode =
02cd137d
KW
3203 cBOOL(backslash_N) /* \N{} forces Unicode,
3204 hence portable range */
3205 || ( ! non_portable_endpoint
3206 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3207 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3208 if (convert_unicode) {
f4240379
KW
3209
3210 /* Special handling is needed for these portable ranges.
02cd137d
KW
3211 * They are defined to be in Unicode terms, which includes
3212 * all the Unicode code points between the end points.
f4240379
KW
3213 * Convert to Unicode to get the Unicode range. Later we
3214 * will convert each code point in the range back to
3215 * native. */
3216 range_min = NATIVE_TO_UNI(range_min);
3217 range_max = NATIVE_TO_UNI(range_max);
3218 }
e294cc5d 3219#endif
8ada0baa 3220
f4240379 3221 if (range_min > range_max) {
11327fa1 3222#ifdef EBCDIC
f4240379
KW
3223 if (convert_unicode) {
3224 /* Need to convert back to native for meaningful
3225 * messages for this platform */
3226 range_min = UNI_TO_NATIVE(range_min);
3227 range_max = UNI_TO_NATIVE(range_max);
3228 }
11327fa1 3229#endif
f4240379
KW
3230 /* Use the characters themselves for the error message if
3231 * ASCII printables; otherwise some visible representation
3232 * of them */
3233 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3234 Perl_croak(aTHX_
3235 "Invalid range \"%c-%c\" in transliteration operator",
3236 (char)range_min, (char)range_max);
3237 }
11327fa1 3238#ifdef EBCDIC
f4240379 3239 else if (convert_unicode) {
02cd137d 3240 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3241 Perl_croak(aTHX_
02cd137d
KW
3242 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3243 UVXf "}\" in transliteration operator",
3244 range_min, range_max);
f4240379 3245 }
11327fa1 3246#endif
f4240379 3247 else {
02cd137d 3248 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3249 Perl_croak(aTHX_
02cd137d
KW
3250 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3251 " in transliteration operator",
3252 range_min, range_max);
f4240379 3253 }
c2e66d9e
GS
3254 }
3255
aca41667
KW
3256 /* If the range is exactly two code points long, they are
3257 * already both in the output */
3258 if (UNLIKELY(range_min + 1 == range_max)) {
3259 goto range_done;
3260 }
3261
3262 /* Here the range contains at least 3 code points */
3263
ed2893cb 3264 if (d_is_utf8) {
f4240379 3265
188d22cf
KW
3266 /* If everything in the transliteration is below 256, we
3267 * can avoid special handling later. A translation table
02cd137d
KW
3268 * for each of those bytes is created by op.c. So we
3269 * expand out all ranges to their constituent code points.
3270 * But if we've encountered something above 255, the
3271 * expanding won't help, so skip doing that. But if it's
3272 * EBCDIC, we may have to look at each character below 256
3273 * if we have to convert to/from Unicode values */
188d22cf 3274 if ( has_above_latin1
c7f1f016 3275#ifdef EBCDIC
f4240379 3276 && (range_min > 255 || ! convert_unicode)
8ada0baa 3277#endif
f4240379 3278 ) {
3fdfceb3
KW
3279 const STRLEN off = d - SvPVX(sv);
3280 const STRLEN extra = 1 + (send - s) + 1;
3281 char *e;
3282
f4240379
KW
3283 /* Move the high character one byte to the right; then
3284 * insert between it and the range begin, an illegal
3285 * byte which serves to indicate this is a range (using
02cd137d 3286 * a '-' would be ambiguous). */
3fdfceb3
KW
3287
3288 if (off + extra > SvLEN(sv)) {
3289 d = off + SvGROW(sv, off + extra);
3290 max_ptr = d - off + offset_to_max;
3291 }
3292
3293 e = d++;
f4240379
KW
3294 while (e-- > max_ptr) {
3295 *(e + 1) = *e;
e294cc5d 3296 }
dc8faf6b 3297 *(e + 1) = (char) RANGE_INDICATOR;
f4240379
KW
3298 goto range_done;
3299 }
3300
3301 /* Here, we're going to expand out the range. For EBCDIC
3302 * the range can extend above 255 (not so in ASCII), so
3303 * for EBCDIC, split it into the parts above and below
3304 * 255/256 */
e294cc5d 3305#ifdef EBCDIC
f4240379
KW
3306 if (range_max > 255) {
3307 real_range_max = range_max;
3308 range_max = 255;
3309 }
e294cc5d 3310#endif
f4240379 3311 }
02aa26ce 3312
f4240379 3313 /* Here we need to expand out the string to contain each
5ca37d54
KW
3314 * character in the range. Grow the output to handle this.
3315 * For non-UTF8, we need a byte for each code point in the
3316 * range, minus the three that we've already allocated for: the
3317 * hyphen, the min, and the max. For UTF-8, we need this
3318 * plus an extra byte for each code point that occupies two
3319 * bytes (is variant) when in UTF-8 (except we've already
3320 * allocated for the end points, including if they are
3321 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3322 * platforms, it's easy to calculate a precise number. To
3323 * start, we count the variants in the range, which we need
3324 * elsewhere in this function anyway. (For the case where it
3325 * isn't easy to calculate, 'extras' has been initialized to 0,
3326 * and the calculation is done in a loop further down.) */
3327#ifdef EBCDIC
3328 if (convert_unicode)
3329#endif
3330 {
3331 /* This is executed unconditionally on ASCII, and for
3332 * Unicode ranges on EBCDIC. Under these conditions, all
3333 * code points above a certain value are variant; and none
3334 * under that value are. We just need to find out how much
3335 * of the range is above that value. We don't count the
3336 * end points here, as they will already have been counted
3337 * as they were parsed. */
3338 if (range_min >= UTF_CONTINUATION_MARK) {
3339
3340 /* The whole range is made up of variants */
3341 extras = (range_max - 1) - (range_min + 1) + 1;
3342 }
3343 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3344
5ca37d54
KW
3345 /* Only the higher portion of the range is variants */
3346 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3347 }
f4240379 3348
5ca37d54
KW
3349 utf8_variant_count += extras;
3350 }
3351
3352 /* The base growth is the number of code points in the range,
3353 * not including the endpoints, which have already been sized
3354 * for (and output). We don't subtract for the hyphen, as it
3355 * has been parsed but not output, and the SvGROW below is
3356 * based only on what's been output plus what's left to parse.
3357 * */
3358 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3359
ed2893cb 3360 if (d_is_utf8) {
4c3a8340 3361#ifdef EBCDIC
5ca37d54
KW
3362 /* In some cases in EBCDIC, we haven't yet calculated a
3363 * precise amount needed for the UTF-8 variants. Just
3364 * assume the worst case, that everything will expand by a
3365 * byte */
3366 if (! convert_unicode) {
f4240379
KW
3367 grow *= 2;
3368 }
5ca37d54 3369 else
4c3a8340 3370#endif
5ca37d54
KW
3371 {
3372 /* Otherwise we know exactly how many variants there
3373 * are in the range. */
3374 grow += extras;
3375 }
f4240379
KW
3376 }
3377
5ca37d54
KW
3378 /* Grow, but position the output to overwrite the range min end
3379 * point, because in some cases we overwrite that */
3380 SvCUR_set(sv, d - SvPVX_const(sv));
3381 offset_to_min = min_ptr - SvPVX_const(sv);
3382
3383 /* See Note on sizing above. */
3384 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3385 + (send - s)
3386 + grow
3387 + 1 /* Trailing NUL */ );
f4240379 3388
5ca37d54 3389 /* Now, we can expand out the range. */
11327fa1 3390#ifdef EBCDIC
f4240379 3391 if (convert_unicode) {
5ca37d54 3392 SSize_t i;
02aa26ce 3393
f4240379
KW
3394 /* Recall that the min and max are now in Unicode terms, so
3395 * we have to convert each character to its native
3396 * equivalent */
ed2893cb 3397 if (d_is_utf8) {
f4240379 3398 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3399 append_utf8_from_native_byte(
3400 LATIN1_TO_NATIVE((U8) i),
3401 (U8 **) &d);
f4240379
KW
3402 }
3403 }
3404 else {
3405 for (i = range_min; i <= range_max; i++) {
81324705 3406 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3407 }
3408 }
01ec43d0 3409 }
11327fa1
AL
3410 else
3411#endif
3412 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3413 {
f4240379
KW
3414 /* Here, no conversions are necessary, which means that the
3415 * first character in the range is already in 'd' and
3416 * valid, so we can skip overwriting it */
ed2893cb 3417 if (d_is_utf8) {
19742f39 3418 SSize_t i;
f4240379
KW
3419 d += UTF8SKIP(d);
3420 for (i = range_min + 1; i <= range_max; i++) {
81324705 3421 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3422 }
3423 }
3424 else {
19742f39 3425 SSize_t i;
f4240379 3426 d++;
5ca37d54
KW
3427 assert(range_min + 1 <= range_max);
3428 for (i = range_min + 1; i < range_max; i++) {
3429#ifdef EBCDIC
3430 /* In this case on EBCDIC, we haven't calculated
3431 * the variants. Do it here, as we go along */
3432 if (! UVCHR_IS_INVARIANT(i)) {
3433 utf8_variant_count++;
3434 }
3435#endif
f4240379
KW
3436 *d++ = (char)i;
3437 }
5ca37d54
KW
3438
3439 /* The range_max is done outside the loop so as to
3440 * avoid having to special case not incrementing
3441 * 'utf8_variant_count' on EBCDIC (it's already been
3442 * counted when originally parsed) */
3443 *d++ = (char) range_max;
f4240379 3444 }
a0ed51b3 3445 }
02aa26ce 3446
11327fa1 3447#ifdef EBCDIC
02cd137d
KW
3448 /* If the original range extended above 255, add in that
3449 * portion. */
f4240379
KW
3450 if (real_range_max) {
3451 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3452 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3453 if (real_range_max > 0x100) {
3454 if (real_range_max > 0x101) {
dc8faf6b 3455 *d++ = (char) RANGE_INDICATOR;
043af6f6 3456 }
f4240379 3457 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3458 }
f4240379 3459 }
11327fa1 3460#endif
02aa26ce 3461
f4240379
KW
3462 range_done:
3463 /* mark the range as done, and continue */
3464 didrange = TRUE;
3465 dorange = FALSE;
3466#ifdef EBCDIC
3467 non_portable_endpoint = 0;
3468 backslash_N = 0;
3469#endif
3470 continue;
3471 } /* End of is a range */
3472 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3473 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3474 char *s1 = s-1;
3475 int esc = 0;
3476 while (s1 >= start && *s1-- == '\\')
3477 esc = !esc;
3478 if (!esc)
3479 in_charclass = TRUE;
3480 }
1e02a175 3481 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3482 char *s1 = s-1;
3483 int esc = 0;
3484 while (s1 >= start && *s1-- == '\\')
3485 esc = !esc;
3486 if (!esc)
3487 in_charclass = FALSE;
3488 }
02cd137d
KW
3489 /* skip for regexp comments /(?#comment)/, except for the last
3490 * char, which will be done separately. Stop on (?{..}) and
3491 * friends */
c30fc27b 3492 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3493 if (s[2] == '#') {
f339d50e
KW
3494 if (s_is_utf8) {
3495 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3496
3497 while (s + len < send && *s != ')') {
3498 Copy(s, d, len, U8);
3499 d += len;
3500 s += len;
3501 len = UTF8_SAFE_SKIP(s, send);
3502 }
3503 }
3504 else while (s+1 < send && *s != ')') {
3505 *d++ = *s++;
3506 }
155aba94 3507 }
407f8cf2
KW
3508 else if (!PL_lex_casemods
3509 && ( s[2] == '{' /* This should match regcomp.c */
3510 || (s[2] == '?' && s[3] == '{')))
155aba94 3511 {
9da1dd8f 3512 break;
cc6b7395 3513 }
748a9306 3514 }
02cd137d 3515 /* likewise skip #-initiated comments in //x patterns */
407f8cf2
KW
3516 else if (*s == '#'
3517 && PL_lex_inpat
3518 && !in_charclass
3519 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3520 {
8faf4f30 3521 while (s < send && *s != '\n')
5ff03569 3522 *d++ = *s++;
748a9306 3523 }
02cd137d 3524 /* no further processing of single-quoted regex */
9da1dd8f
DM
3525 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3526 goto default_action;
3527
02cd137d
KW
3528 /* check for embedded arrays
3529 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3530 */
1749ea0d 3531 else if (*s == '@' && s[1]) {
fac0f7a3
KW
3532 if (UTF
3533 ? isIDFIRST_utf8_safe(s+1, send)
3534 : isWORDCHAR_A(s[1]))
3535 {
1749ea0d 3536 break;