This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Allow \N{} handling fcn to be used elsewhere in core
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f7e03a10 41#include "dquote_inline.h"
f1bcae08 42#include "invlist_inline.h"
378cc40b 43
164e423c
KW
44#define new_constant(a,b,c,d,e,f,g, h) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
eb0d8d16 46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 56#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
199e78b7 67#define PL_preambled (PL_parser->preambled)
bdc0bf6f 68#define PL_linestr (PL_parser->linestr)
c2598295
DM
69#define PL_expect (PL_parser->expect)
70#define PL_copline (PL_parser->copline)
f06b5848
DM
71#define PL_bufptr (PL_parser->bufptr)
72#define PL_oldbufptr (PL_parser->oldbufptr)
73#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
74#define PL_linestart (PL_parser->linestart)
75#define PL_bufend (PL_parser->bufend)
76#define PL_last_uni (PL_parser->last_uni)
77#define PL_last_lop (PL_parser->last_lop)
78#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 79#define PL_lex_state (PL_parser->lex_state)
2f9285f8 80#define PL_rsfp (PL_parser->rsfp)
5486870f 81#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
82#define PL_in_my (PL_parser->in_my)
83#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 84#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 85#define PL_multi_end (PL_parser->multi_end)
13765c85 86#define PL_error_count (PL_parser->error_count)
199e78b7 87
fb205e7a
DM
88# define PL_nexttoke (PL_parser->nexttoke)
89# define PL_nexttype (PL_parser->nexttype)
90# define PL_nextval (PL_parser->nextval)
199e78b7 91
6432a58a
DM
92
93#define SvEVALED(sv) \
94 (SvTYPE(sv) >= SVt_PVNV \
95 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
96
a1894d81 97static const char* const ident_too_long = "Identifier too long";
8903cb82 98
9ded7720 99# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 100
a7aaec61
Z
101#define XENUMMASK 0x3f
102#define XFAKEEOF 0x40
103#define XFAKEBRACK 0x80
9059aa12 104
39e02b42 105#ifdef USE_UTF8_SCRIPTS
b3041197 106# define UTF cBOOL(!IN_BYTES)
2b9d42f0 107#else
b3041197 108# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 109#endif
a0ed51b3 110
b1fc3636
CJ
111/* The maximum number of characters preceding the unrecognized one to display */
112#define UNRECOGNIZED_PRECEDE_COUNT 10
113
61f0cdd9 114/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
115 * 1999-02-27 mjd-perl-patch@plover.com */
116#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
117
14bd96d0 118#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 119
9ff909cf
JH
120#define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
124
ffb4593c
NT
125/* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
79072805 127 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
79072805
LW
131 */
132
fb73857a
PP
133/* #define LEX_NOTPARSING 11 is done in perl.h. */
134
b6007c36
DM
135#define LEX_NORMAL 10 /* normal code (ie not within "...") */
136#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139#define LEX_INTERPSTART 6 /* expecting the start of a $var */
140
141 /* at end of code, eg "$x" followed by: */
142#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144
145#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147#define LEX_INTERPCONST 2 /* NOT USED */
148#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 149
79072805 150
bbf60fe6 151#ifdef DEBUGGING
27da23d5 152static const char* const lex_state_names[] = {
bbf60fe6
DM
153 "KNOWNEXT",
154 "FORMLINE",
155 "INTERPCONST",
156 "INTERPCONCAT",
157 "INTERPENDMAYBE",
158 "INTERPEND",
159 "INTERPSTART",
160 "INTERPPUSH",
161 "INTERPCASEMOD",
162 "INTERPNORMAL",
163 "NORMAL"
164};
165#endif
166
79072805 167#include "keywords.h"
fe14fcc3 168
ffb4593c
NT
169/* CLINE is a macro that ensures PL_copline has a sane value */
170
57843af0 171#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 172
ffb4593c
NT
173/*
174 * Convenience functions to return different tokens and prime the
9cbb5ea2 175 * lexer for the next token. They all take an argument.
ffb4593c
NT
176 *
177 * TOKEN : generic token (used for '(', DOLSHARP, etc)
178 * OPERATOR : generic operator
179 * AOPERATOR : assignment operator
180 * PREBLOCK : beginning the block after an if, while, foreach, ...
181 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
182 * PREREF : *EXPR where EXPR is not a simple identifier
183 * TERM : expression term
89f35911 184 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
185 * LOOPX : loop exiting command (goto, last, dump, etc)
186 * FTST : file test operator
187 * FUN0 : zero-argument function
7eb971ee 188 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 189 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
190 * BOop : bitwise or or xor
191 * BAop : bitwise and
8823cb89 192 * BCop : bitwise complement
ffb4593c
NT
193 * SHop : shift operator
194 * PWop : power operator
9cbb5ea2 195 * PMop : pattern-matching operator
ffb4593c 196 * Aop : addition-level operator
e4916dd1 197 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
198 * Mop : multiplication-level operator
199 * Eop : equality-testing operator
e5edeb50 200 * Rop : relational operator <= != gt
ffb4593c
NT
201 *
202 * Also see LOP and lop() below.
203 */
204
998054bd 205#ifdef DEBUGGING /* Serve -DT. */
704d4215 206# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 207#else
bbf60fe6 208# define REPORT(retval) (retval)
998054bd
SC
209#endif
210
bbf60fe6
DM
211#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
212#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 213#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
214#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
215#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
216#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
217#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 218#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 219#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 220 pl_yylval.ival=f, \
a49203fd 221 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 222 REPORT((int)LOOPEX))
6154021b
RGS
223#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
224#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 225#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 226#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
227#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
228#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
229#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
230 REPORT('~')
b1764551
FC
231#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
232#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 233#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 234#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 235#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 236#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
6154021b
RGS
237#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
238#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 239
a687059c
LW
240/* This bit of chicanery makes a unary function followed by
241 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
242 * The UNIDOR macro is for unary functions that can be followed by the //
243 * operator (such as C<shift // 0>).
a687059c 244 */
d68ce4ac 245#define UNI3(f,x,have_x) { \
6154021b 246 pl_yylval.ival = f; \
d68ce4ac 247 if (have_x) PL_expect = x; \
376fcdbf
AL
248 PL_bufptr = s; \
249 PL_last_uni = PL_oldbufptr; \
0af40c75 250 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
251 if (*s == '(') \
252 return REPORT( (int)FUNC1 ); \
294a536f 253 s = skipspace(s); \
376fcdbf
AL
254 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 }
d68ce4ac
FC
256#define UNI(f) UNI3(f,XTERM,1)
257#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
258#define UNIPROTO(f,optional) { \
259 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
260 OPERATOR(f); \
261 }
a687059c 262
d68ce4ac 263#define UNIBRACK(f) UNI3(f,0,0)
79072805 264
9f68db38 265/* grandfather return to old style */
78cdf107
Z
266#define OLDLOP(f) \
267 do { \
268 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
269 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
270 pl_yylval.ival = (f); \
271 PL_expect = XTERM; \
272 PL_bufptr = s; \
273 return (int)LSTOP; \
274 } while(0)
79072805 275
83944c01
FC
276#define COPLINE_INC_WITH_HERELINES \
277 STMT_START { \
278 CopLINE_inc(PL_curcop); \
851b527a
FC
279 if (PL_parser->herelines) \
280 CopLINE(PL_curcop) += PL_parser->herelines, \
281 PL_parser->herelines = 0; \
83944c01 282 } STMT_END
ffdb8b16
FC
283/* Called after scan_str to update CopLINE(PL_curcop), but only when there
284 * is no sublex_push to follow. */
285#define COPLINE_SET_FROM_MULTI_END \
286 STMT_START { \
287 CopLINE_set(PL_curcop, PL_multi_end); \
288 if (PL_multi_end != PL_multi_start) \
851b527a 289 PL_parser->herelines = 0; \
ffdb8b16 290 } STMT_END
83944c01
FC
291
292
8fa7f367
JH
293#ifdef DEBUGGING
294
6154021b 295/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
296enum token_type {
297 TOKENTYPE_NONE,
298 TOKENTYPE_IVAL,
6154021b 299 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 300 TOKENTYPE_PVAL,
aeaef349 301 TOKENTYPE_OPVAL
bbf60fe6
DM
302};
303
6d4a66ac
NC
304static struct debug_tokens {
305 const int token;
306 enum token_type type;
307 const char *name;
308} const debug_tokens[] =
9041c2e3 309{
bbf60fe6
DM
310 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
311 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
312 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
313 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 314 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
315 { ARROW, TOKENTYPE_NONE, "ARROW" },
316 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
317 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
318 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
319 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
320 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 321 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
322 { DO, TOKENTYPE_NONE, "DO" },
323 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
324 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
325 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
326 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
327 { ELSE, TOKENTYPE_NONE, "ELSE" },
328 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
329 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
330 { FOR, TOKENTYPE_IVAL, "FOR" },
331 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
332 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
333 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
334 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
335 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 336 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
337 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
338 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
339 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 340 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
341 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
342 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 343 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
344 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
345 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
346 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
347 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
348 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
349 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
350 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
351 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
352 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
353 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
354 { OROP, TOKENTYPE_IVAL, "OROP" },
355 { OROR, TOKENTYPE_NONE, "OROR" },
356 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
357 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
358 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 359 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 360 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
361 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
362 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
363 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
364 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
365 { PREINC, TOKENTYPE_NONE, "PREINC" },
366 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 367 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
368 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
369 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 370 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 371 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 372 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6
DM
373 { SUB, TOKENTYPE_NONE, "SUB" },
374 { THING, TOKENTYPE_OPVAL, "THING" },
375 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
376 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
377 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
378 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
379 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
380 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 381 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 382 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 383 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 384 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 385 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
386};
387
6154021b 388/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 389
bbf60fe6 390STATIC int
704d4215 391S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 392{
7918f24d
NC
393 PERL_ARGS_ASSERT_TOKEREPORT;
394
bbf60fe6 395 if (DEBUG_T_TEST) {
bd61b366 396 const char *name = NULL;
bbf60fe6 397 enum token_type type = TOKENTYPE_NONE;
f54cb97a 398 const struct debug_tokens *p;
396482e1 399 SV* const report = newSVpvs("<== ");
bbf60fe6 400
f54cb97a 401 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
402 if (p->token == (int)rv) {
403 name = p->name;
404 type = p->type;
405 break;
406 }
407 }
408 if (name)
54667de8 409 Perl_sv_catpv(aTHX_ report, name);
239f83d5 410 else if (isGRAPH(rv))
4ebc7986 411 {
bbf60fe6 412 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
413 if ((char)rv == 'p')
414 sv_catpvs(report, " (pending identifier)");
415 }
bbf60fe6 416 else if (!rv)
396482e1 417 sv_catpvs(report, "EOF");
bbf60fe6 418 else
147e3846 419 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
420 switch (type) {
421 case TOKENTYPE_NONE:
bbf60fe6
DM
422 break;
423 case TOKENTYPE_IVAL:
147e3846 424 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
425 break;
426 case TOKENTYPE_OPNUM:
427 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 428 PL_op_name[lvalp->ival]);
bbf60fe6
DM
429 break;
430 case TOKENTYPE_PVAL:
704d4215 431 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
432 break;
433 case TOKENTYPE_OPVAL:
704d4215 434 if (lvalp->opval) {
401441c0 435 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
436 PL_op_name[lvalp->opval->op_type]);
437 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 438 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 439 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
440 }
441
442 }
401441c0 443 else
396482e1 444 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
445 break;
446 }
b6007c36 447 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
448 };
449 return (int)rv;
998054bd
SC
450}
451
b6007c36
DM
452
453/* print the buffer with suitable escapes */
454
455STATIC void
15f169a1 456S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 457{
396482e1 458 SV* const tmp = newSVpvs("");
7918f24d
NC
459
460 PERL_ARGS_ASSERT_PRINTBUF;
461
7347ee54 462 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 463 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 464 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
465 SvREFCNT_dec(tmp);
466}
467
8fa7f367
JH
468#endif
469
ffb4593c
NT
470/*
471 * S_ao
472 *
f393a21a
FC
473 * This subroutine looks for an '=' next to the operator that has just been
474 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
475 */
476
76e3520e 477STATIC int
cea2e8a9 478S_ao(pTHX_ int toketype)
a0d0e21e 479{
3280af22
NIS
480 if (*PL_bufptr == '=') {
481 PL_bufptr++;
a0d0e21e 482 if (toketype == ANDAND)
6154021b 483 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 484 else if (toketype == OROR)
6154021b 485 pl_yylval.ival = OP_ORASSIGN;
c963b151 486 else if (toketype == DORDOR)
6154021b 487 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
488 toketype = ASSIGNOP;
489 }
b1764551 490 return REPORT(toketype);
a0d0e21e
LW
491}
492
ffb4593c
NT
493/*
494 * S_no_op
495 * When Perl expects an operator and finds something else, no_op
496 * prints the warning. It always prints "<something> found where
497 * operator expected. It prints "Missing semicolon on previous line?"
498 * if the surprise occurs at the start of the line. "do you need to
499 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
500 * where the compiler doesn't know if foo is a method call or a function.
501 * It prints "Missing operator before end of line" if there's nothing
502 * after the missing operator, or "... before <...>" if there is something
503 * after the missing operator.
488bc579
FC
504 *
505 * PL_bufptr is expected to point to the start of the thing that was found,
506 * and s after the next token or partial token.
ffb4593c
NT
507 */
508
76e3520e 509STATIC void
15f169a1 510S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 511{
9d4ba2ae
AL
512 char * const oldbp = PL_bufptr;
513 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 514
7918f24d
NC
515 PERL_ARGS_ASSERT_NO_OP;
516
1189a94a
GS
517 if (!s)
518 s = oldbp;
07c798fb 519 else
1189a94a 520 PL_bufptr = s;
734ab321 521 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
522 if (ckWARN_d(WARN_SYNTAX)) {
523 if (is_first)
524 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
525 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
526 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
527 PL_bufend,
528 UTF))
529 {
f54cb97a 530 const char *t;
fac0f7a3
KW
531 for (t = PL_oldoldbufptr;
532 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
533 t += UTF ? UTF8SKIP(t) : 1)
534 {
c35e046a 535 NOOP;
fac0f7a3 536 }
56da5a46
RGS
537 if (t < PL_bufptr && isSPACE(*t))
538 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 539 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 540 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
541 }
542 else {
543 assert(s >= oldbp);
544 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 545 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 546 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 547 }
07c798fb 548 }
3280af22 549 PL_bufptr = oldbp;
8990e307
LW
550}
551
ffb4593c
NT
552/*
553 * S_missingterm
554 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 555 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
556 * If we're in a delimited string and the delimiter is a control
557 * character, it's reformatted into a two-char sequence like ^C.
558 * This is fatal.
559 */
560
76e3520e 561STATIC void
1b8d3e0e 562S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 563{
e487ff5e 564 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 565 char q;
cb650135
FC
566 bool uni = FALSE;
567 SV *sv;
8990e307 568 if (s) {
f88b23d7 569 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
570 if (nl) {
571 *nl = '\0';
572 len = nl - s;
573 }
7f7f7d08 574 uni = UTF;
8990e307 575 }
cb650135 576 else if (PL_multi_close < 32) {
8990e307 577 *tmpbuf = '^';
585ec06d 578 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
579 tmpbuf[2] = '\0';
580 s = tmpbuf;
1b8d3e0e 581 len = 2;
8990e307
LW
582 }
583 else {
cb650135
FC
584 if (LIKELY(PL_multi_close < 256)) {
585 *tmpbuf = (char)PL_multi_close;
586 tmpbuf[1] = '\0';
1b8d3e0e 587 len = 1;
cb650135
FC
588 }
589 else {
1b8d3e0e
LM
590 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
591 *end = '\0';
592 len = end - tmpbuf;
cb650135 593 uni = TRUE;
cb650135 594 }
8990e307
LW
595 s = tmpbuf;
596 }
9f8d27b7 597 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 598 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
599 if (uni)
600 SvUTF8_on(sv);
1b8d3e0e
LM
601 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
602 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 603}
79072805 604
dd0ac2b9
FC
605#include "feature.h"
606
0d863452 607/*
0d863452
RH
608 * Check whether the named feature is enabled.
609 */
26ea9e12 610bool
3fff3427 611Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 612{
4a731d7b 613 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
614
615 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
616
617 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 618
26ea9e12
NC
619 if (namelen > MAX_FEATURE_LEN)
620 return FALSE;
3fff3427 621 memcpy(&he_name[8], name, namelen);
7d69d4a6 622
c8ca97b0
NC
623 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
624 REFCOUNTED_HE_EXISTS));
0d863452
RH
625}
626
ffb4593c 627/*
9cbb5ea2
GS
628 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
629 * utf16-to-utf8-reversed.
ffb4593c
NT
630 */
631
c39cd008
GS
632#ifdef PERL_CR_FILTER
633static void
634strip_return(SV *sv)
635{
eb578fdb
KW
636 const char *s = SvPVX_const(sv);
637 const char * const e = s + SvCUR(sv);
7918f24d
NC
638
639 PERL_ARGS_ASSERT_STRIP_RETURN;
640
c39cd008
GS
641 /* outer loop optimized to do nothing if there are no CR-LFs */
642 while (s < e) {
643 if (*s++ == '\r' && *s == '\n') {
644 /* hit a CR-LF, need to copy the rest */
eb578fdb 645 char *d = s - 1;
c39cd008
GS
646 *d++ = *s++;
647 while (s < e) {
648 if (*s == '\r' && s[1] == '\n')
649 s++;
650 *d++ = *s++;
651 }
652 SvCUR(sv) -= s - d;
653 return;
654 }
655 }
656}
a868473f 657
76e3520e 658STATIC I32
c39cd008 659S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 660{
f54cb97a 661 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
662 if (count > 0 && !maxlen)
663 strip_return(sv);
664 return count;
a868473f
NIS
665}
666#endif
667
ffb4593c 668/*
8eaa0acf
Z
669=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
670
671Creates and initialises a new lexer/parser state object, supplying
672a context in which to lex and parse from a new source of Perl code.
673A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 674is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
675will be destroyed and the former value of L</PL_parser> will be restored.
676Nothing else need be done to clean up the parsing context.
677
2d7f6611 678The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 679non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
680A copy of the string is made, so subsequent modification of C<line>
681does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 682from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
683code in C<line> comes first and must consist of complete lines of input,
684and C<rsfp> supplies the remainder of the source.
8eaa0acf 685
2d7f6611 686The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 687used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
688
689=cut
690*/
ffb4593c 691
27fcb6ee 692/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
693 can share filters with the current parser.
694 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
695 caller, hence isn't owned by the parser, so shouldn't be closed on parser
696 destruction. This is used to handle the case of defaulting to reading the
697 script from the standard input because no filename was given on the command
698 line (without getting confused by situation where STDIN has been closed, so
699 the script handle is opened on fd 0) */
27fcb6ee 700
a0d0e21e 701void
8eaa0acf 702Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 703{
6ef55633 704 const char *s = NULL;
5486870f 705 yy_parser *parser, *oparser;
90b58c70 706
60d63348 707 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 708 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
709
710 /* create and initialise a parser */
711
199e78b7 712 Newxz(parser, 1, yy_parser);
5486870f 713 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
714 PL_parser = parser;
715
28ac2b49 716 parser->stack = NULL;
df13534a 717 parser->stack_max1 = NULL;
28ac2b49 718 parser->ps = NULL;
acdf0a21 719
e3abe207
DM
720 /* on scope exit, free this parser and restore any outer one */
721 SAVEPARSER(parser);
7c4baf47 722 parser->saved_curcop = PL_curcop;
e3abe207 723
acdf0a21 724 /* initialise lexer state */
8990e307 725
fb205e7a 726 parser->nexttoke = 0;
ca4cfd28 727 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 728 parser->copline = parser->preambling = NOLINE;
5afb0a62 729 parser->lex_state = LEX_NORMAL;
c2598295 730 parser->expect = XSTATE;
2f9285f8 731 parser->rsfp = rsfp;
efa571ab 732 parser->recheck_utf8_validity = FALSE;
27fcb6ee
FC
733 parser->rsfp_filters =
734 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
735 ? NULL
736 : MUTABLE_AV(SvREFCNT_inc(
737 oparser->rsfp_filters
738 ? oparser->rsfp_filters
739 : (oparser->rsfp_filters = newAV())
740 ));
2f9285f8 741
199e78b7
DM
742 Newx(parser->lex_brackstack, 120, char);
743 Newx(parser->lex_casestack, 12, char);
744 *parser->lex_casestack = '\0';
d794b522 745 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 746
10efb74f 747 if (line) {
0528fd32 748 STRLEN len;
a6909bd2
KW
749 const U8* first_bad_char_loc;
750
10efb74f 751 s = SvPV_const(line, len);
90b58c70 752
0aab20f2
KW
753 if ( SvUTF8(line)
754 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
755 SvCUR(line),
756 &first_bad_char_loc)))
90b58c70
KW
757 {
758 _force_out_malformed_utf8_message(first_bad_char_loc,
759 (U8 *) s + SvCUR(line),
760 0,
761 1 /* 1 means die */ );
762 NOT_REACHED; /* NOTREACHED */
763 }
764
0abcdfa4
FC
765 parser->linestr = flags & LEX_START_COPIED
766 ? SvREFCNT_inc_simple_NN(line)
767 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
768 if (!rsfp)
769 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 770 } else {
bf1b738b 771 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 772 }
218304f9 773
f06b5848
DM
774 parser->oldoldbufptr =
775 parser->oldbufptr =
776 parser->bufptr =
777 parser->linestart = SvPVX(parser->linestr);
778 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
779 parser->last_lop = parser->last_uni = NULL;
b54f893d 780
6d59e610 781 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
782 |LEX_DONT_CLOSE_RSFP));
783 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
784 |LEX_DONT_CLOSE_RSFP));
737c24fc 785
60d63348 786 parser->in_pod = parser->filtered = 0;
79072805 787}
a687059c 788
e3abe207
DM
789
790/* delete a parser object */
791
792void
793Perl_parser_free(pTHX_ const yy_parser *parser)
794{
7918f24d
NC
795 PERL_ARGS_ASSERT_PARSER_FREE;
796
7c4baf47 797 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
798 SvREFCNT_dec(parser->linestr);
799
87606032 800 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 801 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
802 else if (parser->rsfp && (!parser->old_parser
803 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 804 PerlIO_close(parser->rsfp);
5486870f 805 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 806 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 807 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
808
809 Safefree(parser->lex_brackstack);
810 Safefree(parser->lex_casestack);
811 Safefree(parser->lex_shared);
812 PL_parser = parser->old_parser;
813 Safefree(parser);
814}
815
816void
817Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
818{
3ac7ff8f 819 I32 nexttoke = parser->nexttoke;
3ac7ff8f 820 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 821 while (nexttoke--) {
3ac7ff8f
FC
822 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
823 && parser->nextval[nexttoke].opval
824 && parser->nextval[nexttoke].opval->op_slabbed
825 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 826 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
827 parser->nextval[nexttoke].opval = NULL;
828 }
3ce3dcd9 829 }
e3abe207
DM
830}
831
832
ffb4593c 833/*
f0e67a1d
Z
834=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
835
836Buffer scalar containing the chunk currently under consideration of the
837text currently being lexed. This is always a plain string scalar (for
838which C<SvPOK> is true). It is not intended to be used as a scalar by
839normal scalar means; instead refer to the buffer directly by the pointer
840variables described below.
841
842The lexer maintains various C<char*> pointers to things in the
843C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
844reallocated, all of these pointers must be updated. Don't attempt to
845do this manually, but rather use L</lex_grow_linestr> if you need to
846reallocate the buffer.
847
848The content of the text chunk in the buffer is commonly exactly one
849complete line of input, up to and including a newline terminator,
850but there are situations where it is otherwise. The octets of the
851buffer may be intended to be interpreted as either UTF-8 or Latin-1.
852The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
853flag on this scalar, which may disagree with it.
854
855For direct examination of the buffer, the variable
856L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
857lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
858of these pointers is usually preferable to examination of the scalar
859through normal scalar means.
860
861=for apidoc AmxU|char *|PL_parser-E<gt>bufend
862
863Direct pointer to the end of the chunk of text currently being lexed, the
864end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 865+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
866always located at the end of the buffer, and does not count as part of
867the buffer's contents.
868
869=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
870
871Points to the current position of lexing inside the lexer buffer.
872Characters around this point may be freely examined, within
873the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
874L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
875interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
876
877Lexing code (whether in the Perl core or not) moves this pointer past
878the characters that it consumes. It is also expected to perform some
879bookkeeping whenever a newline character is consumed. This movement
880can be more conveniently performed by the function L</lex_read_to>,
881which handles newlines appropriately.
882
883Interpretation of the buffer's octets can be abstracted out by
884using the slightly higher-level functions L</lex_peek_unichar> and
885L</lex_read_unichar>.
886
887=for apidoc AmxU|char *|PL_parser-E<gt>linestart
888
889Points to the start of the current line inside the lexer buffer.
890This is useful for indicating at which column an error occurred, and
891not much else. This must be updated by any lexing code that consumes
892a newline; the function L</lex_read_to> handles this detail.
893
894=cut
895*/
896
897/*
898=for apidoc Amx|bool|lex_bufutf8
899
900Indicates whether the octets in the lexer buffer
901(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
902of Unicode characters. If not, they should be interpreted as Latin-1
903characters. This is analogous to the C<SvUTF8> flag for scalars.
904
905In UTF-8 mode, it is not guaranteed that the lexer buffer actually
906contains valid UTF-8. Lexing code must be robust in the face of invalid
907encoding.
908
909The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
910is significant, but not the whole story regarding the input character
911encoding. Normally, when a file is being read, the scalar contains octets
912and its C<SvUTF8> flag is off, but the octets should be interpreted as
913UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
914however, the scalar may have the C<SvUTF8> flag on, and in this case its
915octets should be interpreted as UTF-8 unless the C<use bytes> pragma
916is in effect. This logic may change in the future; use this function
917instead of implementing the logic yourself.
918
919=cut
920*/
921
922bool
923Perl_lex_bufutf8(pTHX)
924{
925 return UTF;
926}
927
928/*
929=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
930
931Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 932at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
933pointer to the reallocated buffer. This is necessary before making
934any direct modification of the buffer that would increase its length.
935L</lex_stuff_pvn> provides a more convenient way to insert text into
936the buffer.
937
938Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
939this function updates all of the lexer's variables that point directly
940into the buffer.
941
942=cut
943*/
944
945char *
946Perl_lex_grow_linestr(pTHX_ STRLEN len)
947{
948 SV *linestr;
949 char *buf;
950 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 951 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
952 bool current;
953
f0e67a1d
Z
954 linestr = PL_parser->linestr;
955 buf = SvPVX(linestr);
956 if (len <= SvLEN(linestr))
957 return buf;
98d5e3ef
DM
958
959 /* Is the lex_shared linestr SV the same as the current linestr SV?
960 * Only in this case does re_eval_start need adjusting, since it
961 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
962 current = ( !PL_parser->lex_shared->ls_linestr
963 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 964
f0e67a1d
Z
965 bufend_pos = PL_parser->bufend - buf;
966 bufptr_pos = PL_parser->bufptr - buf;
967 oldbufptr_pos = PL_parser->oldbufptr - buf;
968 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
969 linestart_pos = PL_parser->linestart - buf;
970 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
971 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 972 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 973 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 974
f0e67a1d 975 buf = sv_grow(linestr, len);
c7641931 976
f0e67a1d
Z
977 PL_parser->bufend = buf + bufend_pos;
978 PL_parser->bufptr = buf + bufptr_pos;
979 PL_parser->oldbufptr = buf + oldbufptr_pos;
980 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
981 PL_parser->linestart = buf + linestart_pos;
982 if (PL_parser->last_uni)
983 PL_parser->last_uni = buf + last_uni_pos;
984 if (PL_parser->last_lop)
985 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 986 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 987 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
988 return buf;
989}
990
991/*
83aa740e 992=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
993
994Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
995immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
996reallocating the buffer if necessary. This means that lexing code that
997runs later will see the characters as if they had appeared in the input.
998It is not recommended to do this as part of normal parsing, and most
999uses of this facility run the risk of the inserted characters being
1000interpreted in an unintended manner.
1001
2d7f6611
KW
1002The string to be inserted is represented by C<len> octets starting
1003at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1004according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1005The characters are recoded for the lexer buffer, according to how the
1006buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1007to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1008function is more convenient.
1009
1010=cut
1011*/
1012
1013void
83aa740e 1014Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 1015{
749123ff 1016 dVAR;
f0e67a1d
Z
1017 char *bufptr;
1018 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1019 if (flags & ~(LEX_STUFF_UTF8))
1020 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1021 if (UTF) {
1022 if (flags & LEX_STUFF_UTF8) {
1023 goto plain_copy;
1024 } else {
54d004e8 1025 STRLEN highhalf = 0; /* Count of variants */
83aa740e 1026 const char *p, *e = pv+len;
54d004e8
KW
1027 for (p = pv; p != e; p++) {
1028 if (! UTF8_IS_INVARIANT(*p)) {
1029 highhalf++;
1030 }
1031 }
f0e67a1d
Z
1032 if (!highhalf)
1033 goto plain_copy;
1034 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1035 bufptr = PL_parser->bufptr;
1036 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1037 SvCUR_set(PL_parser->linestr,
1038 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1039 PL_parser->bufend += len+highhalf;
1040 for (p = pv; p != e; p++) {
27d0a47c 1041 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1042 }
1043 }
1044 } else {
1045 if (flags & LEX_STUFF_UTF8) {
1046 STRLEN highhalf = 0;
83aa740e 1047 const char *p, *e = pv+len;
f0e67a1d
Z
1048 for (p = pv; p != e; p++) {
1049 U8 c = (U8)*p;
54d004e8 1050 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1051 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1052 "non-Latin-1 character into Latin-1 input");
54d004e8 1053 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1054 p++;
1055 highhalf++;
f88c6466 1056 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1057 }
1058 if (!highhalf)
1059 goto plain_copy;
1060 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1061 bufptr = PL_parser->bufptr;
1062 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1063 SvCUR_set(PL_parser->linestr,
1064 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1065 PL_parser->bufend += len-highhalf;
54d004e8
KW
1066 p = pv;
1067 while (p < e) {
1068 if (UTF8_IS_INVARIANT(*p)) {
1069 *bufptr++ = *p;
1070 p++;
f0e67a1d 1071 }
54d004e8
KW
1072 else {
1073 assert(p < e -1 );
a62b247b 1074 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1075 p += 2;
1076 }
f0e67a1d
Z
1077 }
1078 } else {
54d004e8 1079 plain_copy:
f0e67a1d
Z
1080 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1081 bufptr = PL_parser->bufptr;
1082 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1083 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1084 PL_parser->bufend += len;
1085 Copy(pv, bufptr, len, char);
1086 }
1087 }
1088}
1089
1090/*
9dcc53ea
Z
1091=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1092
1093Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1094immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1095reallocating the buffer if necessary. This means that lexing code that
1096runs later will see the characters as if they had appeared in the input.
1097It is not recommended to do this as part of normal parsing, and most
1098uses of this facility run the risk of the inserted characters being
1099interpreted in an unintended manner.
1100
2d7f6611 1101The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1102and continuing to the first nul. These octets are interpreted as either
1103UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1104in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1105to how the buffer is currently being interpreted (L</lex_bufutf8>).
1106If it is not convenient to nul-terminate a string to be inserted, the
1107L</lex_stuff_pvn> function is more appropriate.
1108
1109=cut
1110*/
1111
1112void
1113Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1114{
1115 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1116 lex_stuff_pvn(pv, strlen(pv), flags);
1117}
1118
1119/*
f0e67a1d
Z
1120=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1121
1122Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1123immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1124reallocating the buffer if necessary. This means that lexing code that
1125runs later will see the characters as if they had appeared in the input.
1126It is not recommended to do this as part of normal parsing, and most
1127uses of this facility run the risk of the inserted characters being
1128interpreted in an unintended manner.
1129
2d7f6611 1130The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1131are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1132being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1133not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1134need to construct a scalar.
1135
1136=cut
1137*/
1138
1139void
1140Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1141{
1142 char *pv;
1143 STRLEN len;
1144 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1145 if (flags)
1146 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1147 pv = SvPV(sv, len);
1148 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1149}
1150
1151/*
1152=for apidoc Amx|void|lex_unstuff|char *ptr
1153
1154Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1155C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1156This hides the discarded text from any lexing code that runs later,
1157as if the text had never appeared.
1158
1159This is not the normal way to consume lexed text. For that, use
1160L</lex_read_to>.
1161
1162=cut
1163*/
1164
1165void
1166Perl_lex_unstuff(pTHX_ char *ptr)
1167{
1168 char *buf, *bufend;
1169 STRLEN unstuff_len;
1170 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1171 buf = PL_parser->bufptr;
1172 if (ptr < buf)
1173 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1174 if (ptr == buf)
1175 return;
1176 bufend = PL_parser->bufend;
1177 if (ptr > bufend)
1178 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1179 unstuff_len = ptr - buf;
1180 Move(ptr, buf, bufend+1-ptr, char);
1181 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1182 PL_parser->bufend = bufend - unstuff_len;
1183}
1184
1185/*
1186=for apidoc Amx|void|lex_read_to|char *ptr
1187
1188Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1189to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1190performing the correct bookkeeping whenever a newline character is passed.
1191This is the normal way to consume lexed text.
1192
1193Interpretation of the buffer's octets can be abstracted out by
1194using the slightly higher-level functions L</lex_peek_unichar> and
1195L</lex_read_unichar>.
1196
1197=cut
1198*/
1199
1200void
1201Perl_lex_read_to(pTHX_ char *ptr)
1202{
1203 char *s;
1204 PERL_ARGS_ASSERT_LEX_READ_TO;
1205 s = PL_parser->bufptr;
1206 if (ptr < s || ptr > PL_parser->bufend)
1207 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1208 for (; s != ptr; s++)
1209 if (*s == '\n') {
83944c01 1210 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1211 PL_parser->linestart = s+1;
1212 }
1213 PL_parser->bufptr = ptr;
1214}
1215
1216/*
1217=for apidoc Amx|void|lex_discard_to|char *ptr
1218
1219Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1220up to C<ptr>. The remaining content of the buffer will be moved, and
1221all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1222be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1223it is not permitted to discard text that has yet to be lexed.
1224
1225Normally it is not necessarily to do this directly, because it suffices to
1226use the implicit discarding behaviour of L</lex_next_chunk> and things
1227based on it. However, if a token stretches across multiple lines,
1f317c95 1228and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1229that purpose, then after completion of the token it would be wise to
1230explicitly discard the now-unneeded earlier lines, to avoid future
1231multi-line tokens growing the buffer without bound.
1232
1233=cut
1234*/
1235
1236void
1237Perl_lex_discard_to(pTHX_ char *ptr)
1238{
1239 char *buf;
1240 STRLEN discard_len;
1241 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1242 buf = SvPVX(PL_parser->linestr);
1243 if (ptr < buf)
1244 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1245 if (ptr == buf)
1246 return;
1247 if (ptr > PL_parser->bufptr)
1248 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1249 discard_len = ptr - buf;
1250 if (PL_parser->oldbufptr < ptr)
1251 PL_parser->oldbufptr = ptr;
1252 if (PL_parser->oldoldbufptr < ptr)
1253 PL_parser->oldoldbufptr = ptr;
1254 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1255 PL_parser->last_uni = NULL;
1256 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1257 PL_parser->last_lop = NULL;
1258 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1259 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1260 PL_parser->bufend -= discard_len;
1261 PL_parser->bufptr -= discard_len;
1262 PL_parser->oldbufptr -= discard_len;
1263 PL_parser->oldoldbufptr -= discard_len;
1264 if (PL_parser->last_uni)
1265 PL_parser->last_uni -= discard_len;
1266 if (PL_parser->last_lop)
1267 PL_parser->last_lop -= discard_len;
1268}
1269
efa571ab
KW
1270void
1271Perl_notify_parser_that_changed_to_utf8(pTHX)
1272{
1273 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1274 * off to on. At compile time, this has the effect of entering a 'use
1275 * utf8' section. This means that any input was not previously checked for
1276 * UTF-8 (because it was off), but now we do need to check it, or our
1277 * assumptions about the input being sane could be wrong, and we could
1278 * segfault. This routine just sets a flag so that the next time we look
1279 * at the input we do the well-formed UTF-8 check. If we aren't in the
1280 * proper phase, there may not be a parser object, but if there is, setting
1281 * the flag is harmless */
1282
1283 if (PL_parser) {
1284 PL_parser->recheck_utf8_validity = TRUE;
1285 }
1286}
1287
f0e67a1d
Z
1288/*
1289=for apidoc Amx|bool|lex_next_chunk|U32 flags
1290
1291Reads in the next chunk of text to be lexed, appending it to
1292L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1293looked to the end of the current chunk and wants to know more. It is
1294usual, but not necessary, for lexing to have consumed the entirety of
1295the current chunk at this time.
1296
1297If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1298chunk (i.e., the current chunk has been entirely consumed), normally the
1299current chunk will be discarded at the same time that the new chunk is
c5608a1f 1300read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1301will not be discarded. If the current chunk has not been entirely
1302consumed, then it will not be discarded regardless of the flag.
1303
1304Returns true if some new text was added to the buffer, or false if the
1305buffer has reached the end of the input text.
1306
1307=cut
1308*/
1309
1310#define LEX_FAKE_EOF 0x80000000
e47d32dc 1311#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1312
1313bool
1314Perl_lex_next_chunk(pTHX_ U32 flags)
1315{
1316 SV *linestr;
1317 char *buf;
1318 STRLEN old_bufend_pos, new_bufend_pos;
1319 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1320 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1321 bool got_some_for_debugger = 0;
f0e67a1d 1322 bool got_some;
6cdc5cd8 1323
112d1284 1324 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1325 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1326 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1327 return FALSE;
f0e67a1d
Z
1328 linestr = PL_parser->linestr;
1329 buf = SvPVX(linestr);
407f8cf2
KW
1330 if (!(flags & LEX_KEEP_PREVIOUS)
1331 && PL_parser->bufptr == PL_parser->bufend)
1332 {
f0e67a1d
Z
1333 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1334 linestart_pos = 0;
1335 if (PL_parser->last_uni != PL_parser->bufend)
1336 PL_parser->last_uni = NULL;
1337 if (PL_parser->last_lop != PL_parser->bufend)
1338 PL_parser->last_lop = NULL;
1339 last_uni_pos = last_lop_pos = 0;
1340 *buf = 0;
1341 SvCUR(linestr) = 0;
1342 } else {
1343 old_bufend_pos = PL_parser->bufend - buf;
1344 bufptr_pos = PL_parser->bufptr - buf;
1345 oldbufptr_pos = PL_parser->oldbufptr - buf;
1346 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1347 linestart_pos = PL_parser->linestart - buf;
1348 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1349 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1350 }
1351 if (flags & LEX_FAKE_EOF) {
1352 goto eof;
60d63348 1353 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1354 got_some = 0;
1355 } else if (filter_gets(linestr, old_bufend_pos)) {
1356 got_some = 1;
17cc9359 1357 got_some_for_debugger = 1;
112d1284
FC
1358 } else if (flags & LEX_NO_TERM) {
1359 got_some = 0;
f0e67a1d 1360 } else {
580561a3 1361 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1362 SvPVCLEAR(linestr);
f0e67a1d
Z
1363 eof:
1364 /* End of real input. Close filehandle (unless it was STDIN),
1365 * then add implicit termination.
1366 */
87606032 1367 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1368 PerlIO_clearerr(PL_parser->rsfp);
1369 else if (PL_parser->rsfp)
1370 (void)PerlIO_close(PL_parser->rsfp);
1371 PL_parser->rsfp = NULL;
60d63348 1372 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1373 if (!PL_in_eval && PL_minus_p) {
1374 sv_catpvs(linestr,
1375 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1376 PL_minus_n = PL_minus_p = 0;
1377 } else if (!PL_in_eval && PL_minus_n) {
1378 sv_catpvs(linestr, /*{*/";}");
1379 PL_minus_n = 0;
1380 } else
1381 sv_catpvs(linestr, ";");
1382 got_some = 1;
1383 }
1384 buf = SvPVX(linestr);
1385 new_bufend_pos = SvCUR(linestr);
1386 PL_parser->bufend = buf + new_bufend_pos;
1387 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1388
07337b95
KW
1389 if (UTF) {
1390 const U8* first_bad_char_loc;
1391 if (UNLIKELY(! is_utf8_string_loc(
1392 (U8 *) PL_parser->bufptr,
1393 PL_parser->bufend - PL_parser->bufptr,
1394 &first_bad_char_loc)))
1395 {
1396 _force_out_malformed_utf8_message(first_bad_char_loc,
1397 (U8 *) PL_parser->bufend,
1398 0,
1399 1 /* 1 means die */ );
1400 NOT_REACHED; /* NOTREACHED */
1401 }
6cdc5cd8
KW
1402 }
1403
f0e67a1d
Z
1404 PL_parser->oldbufptr = buf + oldbufptr_pos;
1405 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1406 PL_parser->linestart = buf + linestart_pos;
1407 if (PL_parser->last_uni)
1408 PL_parser->last_uni = buf + last_uni_pos;
1409 if (PL_parser->last_lop)
1410 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1411 if (PL_parser->preambling != NOLINE) {
1412 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1413 PL_parser->preambling = NOLINE;
1414 }
407f8cf2
KW
1415 if ( got_some_for_debugger
1416 && PERLDB_LINE_OR_SAVESRC
1417 && PL_curstash != PL_debstash)
1418 {
f0e67a1d
Z
1419 /* debugger active and we're not compiling the debugger code,
1420 * so store the line into the debugger's array of lines
1421 */
1422 update_debugger_info(NULL, buf+old_bufend_pos,
1423 new_bufend_pos-old_bufend_pos);
1424 }
1425 return got_some;
1426}
1427
1428/*
1429=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1430
1431Looks ahead one (Unicode) character in the text currently being lexed.
1432Returns the codepoint (unsigned integer value) of the next character,
1433or -1 if lexing has reached the end of the input text. To consume the
1434peeked character, use L</lex_read_unichar>.
1435
1436If the next character is in (or extends into) the next chunk of input
1437text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1438discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1439bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1440
1441If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1442is encountered, an exception is generated.
1443
1444=cut
1445*/
1446
1447I32
1448Perl_lex_peek_unichar(pTHX_ U32 flags)
1449{
749123ff 1450 dVAR;
f0e67a1d
Z
1451 char *s, *bufend;
1452 if (flags & ~(LEX_KEEP_PREVIOUS))
1453 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1454 s = PL_parser->bufptr;
1455 bufend = PL_parser->bufend;
1456 if (UTF) {
1457 U8 head;
1458 I32 unichar;
1459 STRLEN len, retlen;
1460 if (s == bufend) {
1461 if (!lex_next_chunk(flags))
1462 return -1;
1463 s = PL_parser->bufptr;
1464 bufend = PL_parser->bufend;
1465 }
1466 head = (U8)*s;
54d004e8 1467 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1468 return head;
54d004e8
KW
1469 if (UTF8_IS_START(head)) {
1470 len = UTF8SKIP(&head);
f0e67a1d
Z
1471 while ((STRLEN)(bufend-s) < len) {
1472 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1473 break;
1474 s = PL_parser->bufptr;
1475 bufend = PL_parser->bufend;
1476 }
1477 }
c80e42f3 1478 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1479 if (retlen == (STRLEN)-1) {
75219bac
KW
1480 _force_out_malformed_utf8_message((U8 *) s,
1481 (U8 *) bufend,
1482 0,
1483 1 /* 1 means die */ );
1484 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1485 }
1486 return unichar;
1487 } else {
1488 if (s == bufend) {
1489 if (!lex_next_chunk(flags))
1490 return -1;
1491 s = PL_parser->bufptr;
1492 }
1493 return (U8)*s;
1494 }
1495}
1496
1497/*
1498=for apidoc Amx|I32|lex_read_unichar|U32 flags
1499
1500Reads the next (Unicode) character in the text currently being lexed.
1501Returns the codepoint (unsigned integer value) of the character read,
1502and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1503if lexing has reached the end of the input text. To non-destructively
1504examine the next character, use L</lex_peek_unichar> instead.
1505
1506If the next character is in (or extends into) the next chunk of input
1507text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1508discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1509bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1510
1511If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1512is encountered, an exception is generated.
1513
1514=cut
1515*/
1516
1517I32
1518Perl_lex_read_unichar(pTHX_ U32 flags)
1519{
1520 I32 c;
1521 if (flags & ~(LEX_KEEP_PREVIOUS))
1522 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1523 c = lex_peek_unichar(flags);
1524 if (c != -1) {
1525 if (c == '\n')
83944c01 1526 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1527 if (UTF)
1528 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1529 else
1530 ++(PL_parser->bufptr);
f0e67a1d
Z
1531 }
1532 return c;
1533}
1534
1535/*
1536=for apidoc Amx|void|lex_read_space|U32 flags
1537
1538Reads optional spaces, in Perl style, in the text currently being
1539lexed. The spaces may include ordinary whitespace characters and
1540Perl-style comments. C<#line> directives are processed if encountered.
1541L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1542at a non-space character (or the end of the input text).
1543
1544If spaces extend into the next chunk of input text, the next chunk will
1545be read in. Normally the current chunk will be discarded at the same
c5608a1f 1546time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1547chunk will not be discarded.
1548
1549=cut
1550*/
1551
21791330 1552#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1553#define LEX_NO_NEXT_CHUNK 0x80000000
1554
f0e67a1d
Z
1555void
1556Perl_lex_read_space(pTHX_ U32 flags)
1557{
1558 char *s, *bufend;
21791330 1559 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1560 bool need_incline = 0;
21791330 1561 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1562 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1563 s = PL_parser->bufptr;
1564 bufend = PL_parser->bufend;
1565 while (1) {
1566 char c = *s;
1567 if (c == '#') {
1568 do {
1569 c = *++s;
1570 } while (!(c == '\n' || (c == 0 && s == bufend)));
1571 } else if (c == '\n') {
1572 s++;
21791330
FC
1573 if (can_incline) {
1574 PL_parser->linestart = s;
1575 if (s == bufend)
1576 need_incline = 1;
1577 else
c6875f94 1578 incline(s, bufend);
21791330 1579 }
f0e67a1d
Z
1580 } else if (isSPACE(c)) {
1581 s++;
1582 } else if (c == 0 && s == bufend) {
1583 bool got_more;
65c68e17 1584 line_t l;
f0998909
Z
1585 if (flags & LEX_NO_NEXT_CHUNK)
1586 break;
f0e67a1d 1587 PL_parser->bufptr = s;
65c68e17 1588 l = CopLINE(PL_curcop);
851b527a 1589 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1590 got_more = lex_next_chunk(flags);
65c68e17 1591 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1592 s = PL_parser->bufptr;
1593 bufend = PL_parser->bufend;
1594 if (!got_more)
1595 break;
21791330 1596 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1597 incline(s, bufend);
f0e67a1d
Z
1598 need_incline = 0;
1599 }
3c47da3c
FC
1600 } else if (!c) {
1601 s++;
f0e67a1d
Z
1602 } else {
1603 break;
1604 }
1605 }
f0e67a1d
Z
1606 PL_parser->bufptr = s;
1607}
1608
1609/*
fe788d6b
PM
1610
1611=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1612
1613This function performs syntax checking on a prototype, C<proto>.
1614If C<warn> is true, any illegal characters or mismatched brackets
1615will trigger illegalproto warnings, declaring that they were
1616detected in the prototype for C<name>.
1617
1618The return value is C<true> if this is a valid prototype, and
1619C<false> if it is not, regardless of whether C<warn> was C<true> or
1620C<false>.
1621
1622Note that C<NULL> is a valid C<proto> and will always return C<true>.
1623
1624=cut
1625
1626 */
1627
1628bool
5783dc51 1629Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1630{
1631 STRLEN len, origlen;
11327fa1 1632 char *p;
fe788d6b
PM
1633 bool bad_proto = FALSE;
1634 bool in_brackets = FALSE;
1635 bool after_slash = FALSE;
1636 char greedy_proto = ' ';
1637 bool proto_after_greedy_proto = FALSE;
1638 bool must_be_last = FALSE;
1639 bool underscore = FALSE;
f791a21a 1640 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1641
1642 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1643
1644 if (!proto)
1645 return TRUE;
1646
11327fa1 1647 p = SvPV(proto, len);
fe788d6b
PM
1648 origlen = len;
1649 for (; len--; p++) {
1650 if (!isSPACE(*p)) {
1651 if (must_be_last)
1652 proto_after_greedy_proto = TRUE;
f791a21a
PM
1653 if (underscore) {
1654 if (!strchr(";@%", *p))
1655 bad_proto_after_underscore = TRUE;
1656 underscore = FALSE;
1657 }
fe788d6b
PM
1658 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1659 bad_proto = TRUE;
1660 }
1661 else {
fe788d6b
PM
1662 if (*p == '[')
1663 in_brackets = TRUE;
1664 else if (*p == ']')
1665 in_brackets = FALSE;
407f8cf2
KW
1666 else if ((*p == '@' || *p == '%')
1667 && !after_slash
1668 && !in_brackets )
1669 {
fe788d6b
PM
1670 must_be_last = TRUE;
1671 greedy_proto = *p;
1672 }
1673 else if (*p == '_')
f791a21a 1674 underscore = TRUE;
fe788d6b
PM
1675 }
1676 if (*p == '\\')
1677 after_slash = TRUE;
1678 else
1679 after_slash = FALSE;
1680 }
1681 }
1682
1683 if (warn) {
b54d603d 1684 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1685 p -= origlen;
b54d603d
PM
1686 p = SvUTF8(proto)
1687 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1688 origlen, UNI_DISPLAY_ISPRINT)
1689 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1690
cbf83791
FC
1691 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1692 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1693 sv_catpvs(name2, "::");
1694 sv_catsv(name2, (SV *)name);
1695 name = name2;
1696 }
1697
fe788d6b
PM
1698 if (proto_after_greedy_proto)
1699 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1700 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1701 greedy_proto, SVfARG(name), p);
50278ed0
PM
1702 if (in_brackets)
1703 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1704 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1705 SVfARG(name), p);
b54d603d 1706 if (bad_proto)
fe788d6b 1707 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1708 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1709 SVfARG(name), p);
1710 if (bad_proto_after_underscore)
1711 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1712 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1713 SVfARG(name), p);
fe788d6b
PM
1714 }
1715
1716 return (! (proto_after_greedy_proto || bad_proto) );
1717}
1718
1719/*
ffb4593c
NT
1720 * S_incline
1721 * This subroutine has nothing to do with tilting, whether at windmills
1722 * or pinball tables. Its name is short for "increment line". It
57843af0 1723 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1724 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1725 * # line 500 "foo.pm"
1726 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1727 */
1728
76e3520e 1729STATIC void
c6875f94 1730S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1731{
d9095cec
NC
1732 const char *t;
1733 const char *n;
1734 const char *e;
8818d409 1735 line_t line_num;
22ff3130 1736 UV uv;
463ee0b2 1737
7918f24d
NC
1738 PERL_ARGS_ASSERT_INCLINE;
1739
d77eff5d
KW
1740 assert(end >= s);
1741
83944c01 1742 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1743 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1744 && s+1 == PL_bufend && *s == ';') {
1745 /* fake newline in string eval */
1746 CopLINE_dec(PL_curcop);
1747 return;
1748 }
463ee0b2
LW
1749 if (*s++ != '#')
1750 return;
d4c19fe8
AL
1751 while (SPACE_OR_TAB(*s))
1752 s++;
d77eff5d
KW
1753 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1754 s += sizeof("line") - 1;
73659bf1
GS
1755 else
1756 return;
084592ab 1757 if (SPACE_OR_TAB(*s))
73659bf1 1758 s++;
4e553d73 1759 else
73659bf1 1760 return;
d4c19fe8
AL
1761 while (SPACE_OR_TAB(*s))
1762 s++;
463ee0b2
LW
1763 if (!isDIGIT(*s))
1764 return;
d4c19fe8 1765
463ee0b2
LW
1766 n = s;
1767 while (isDIGIT(*s))
1768 s++;
07714eb4 1769 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1770 return;
bf4acbe4 1771 while (SPACE_OR_TAB(*s))
463ee0b2 1772 s++;
85bb8b90 1773 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1774 s++;
73659bf1
GS
1775 e = t + 1;
1776 }
463ee0b2 1777 else {
c35e046a 1778 t = s;
1bb1a3d6 1779 while (*t && !isSPACE(*t))
c35e046a 1780 t++;
73659bf1 1781 e = t;
463ee0b2 1782 }
bf4acbe4 1783 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1784 e++;
1785 if (*e != '\n' && *e != '\0')
1786 return; /* false alarm */
1787
22ff3130
HS
1788 if (!grok_atoUV(n, &uv, &e))
1789 return;
1790 line_num = ((line_t)uv) - 1;
8818d409 1791
f4dd75d9 1792 if (t - s > 0) {
d9095cec 1793 const STRLEN len = t - s;
3df32bda 1794
d36ee5be 1795 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1796 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1797 * to *{"::_<newfilename"} */
44867030
NC
1798 /* However, the long form of evals is only turned on by the
1799 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1800 GV * const cfgv = CopFILEGV(PL_curcop);
1801 if (cfgv) {
38bd7ad8
FC
1802 char smallbuf[128];
1803 STRLEN tmplen2 = len;
44867030 1804 char *tmpbuf2;
449dd039 1805 GV *gv2;
44867030
NC
1806
1807 if (tmplen2 + 2 <= sizeof smallbuf)
1808 tmpbuf2 = smallbuf;
1809 else
1810 Newx(tmpbuf2, tmplen2 + 2, char);
1811
38bd7ad8
FC
1812 tmpbuf2[0] = '_';
1813 tmpbuf2[1] = '<';
44867030
NC
1814
1815 memcpy(tmpbuf2 + 2, s, tmplen2);
1816 tmplen2 += 2;
1817
8a5ee598 1818 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1819 if (!isGV(gv2)) {
8a5ee598 1820 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1821 /* adjust ${"::_<newfilename"} to store the new file name */
1822 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1823 /* The line number may differ. If that is the case,
1824 alias the saved lines that are in the array.
1825 Otherwise alias the whole array. */
1826 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1827 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1828 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1829 }
38bd7ad8
FC
1830 else if (GvAV(cfgv)) {
1831 AV * const av = GvAV(cfgv);
515c395b
TC
1832 const line_t start = CopLINE(PL_curcop)+1;
1833 SSize_t items = AvFILLp(av) - start;
8818d409
FC
1834 if (items > 0) {
1835 AV * const av2 = GvAVn(gv2);
1836 SV **svp = AvARRAY(av) + start;
515c395b
TC
1837 Size_t l = line_num+1;
1838 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1839 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
8818d409
FC
1840 }
1841 }
e5527e4b 1842 }
44867030
NC
1843
1844 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1845 }
e66cf94c 1846 }
05ec9bb3 1847 CopFILE_free(PL_curcop);
449dd039 1848 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1849 }
8818d409 1850 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1851}
1852
80a702cd 1853STATIC void
15f169a1 1854S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1855{
1856 AV *av = CopFILEAVx(PL_curcop);
1857 if (av) {
7f1c3e8c
FC
1858 SV * sv;
1859 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1860 else {
1861 sv = *av_fetch(av, 0, 1);
1862 SvUPGRADE(sv, SVt_PVMG);
1863 }
847cc851 1864 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1865 if (orig_sv)
7f1c3e8c 1866 sv_catsv(sv, orig_sv);
5fa550fb 1867 else
7f1c3e8c
FC
1868 sv_catpvn(sv, buf, len);
1869 if (!SvIOK(sv)) {
1870 (void)SvIOK_on(sv);
1871 SvIV_set(sv, 0);
1872 }
1873 if (PL_parser->preambling == NOLINE)
1874 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1875 }
1876}
1877
ffb4593c 1878/*
8c6b0c7d 1879 * skipspace
ffb4593c
NT
1880 * Called to gobble the appropriate amount and type of whitespace.
1881 * Skips comments as well.
71fff7cb 1882 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1883 *
1884 * peekspace
1885 * Same thing, but look ahead without incrementing line numbers or
1886 * adjusting PL_linestart.
ffb4593c
NT
1887 */
1888
3218e223 1889#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1890#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1891
76e3520e 1892STATIC char *
21791330 1893S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1894{
21791330 1895 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1896 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1897 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1898 s++;
f0e67a1d
Z
1899 } else {
1900 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1901 PL_bufptr = s;
21791330 1902 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1903 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1904 LEX_NO_NEXT_CHUNK : 0));
3280af22 1905 s = PL_bufptr;
f0e67a1d
Z
1906 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1907 if (PL_linestart > PL_bufptr)
1908 PL_bufptr = PL_linestart;
1909 return s;
463ee0b2 1910 }
5db06880 1911 return s;
a687059c 1912}
378cc40b 1913
ffb4593c
NT
1914/*
1915 * S_check_uni
1916 * Check the unary operators to ensure there's no ambiguity in how they're
1917 * used. An ambiguous piece of code would be:
1918 * rand + 5
1919 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1920 * the +5 is its argument.
1921 */
1922
76e3520e 1923STATIC void
cea2e8a9 1924S_check_uni(pTHX)
ba106d47 1925{
d4c19fe8 1926 const char *s;
2f3197b3 1927
3280af22 1928 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1929 return;
3280af22
NIS
1930 while (isSPACE(*PL_last_uni))
1931 PL_last_uni++;
c35e046a 1932 s = PL_last_uni;
fac0f7a3 1933 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1934 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1935 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1936 return;
6136c704 1937
9b387841 1938 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1939 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1940 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1941}
1942
ffb4593c
NT
1943/*
1944 * LOP : macro to build a list operator. Its behaviour has been replaced
1945 * with a subroutine, S_lop() for which LOP is just another name.
1946 */
1947
a0d0e21e
LW
1948#define LOP(f,x) return lop(f,x,s)
1949
ffb4593c
NT
1950/*
1951 * S_lop
1952 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1953 * - if we have a next token, then it's a list operator (no parens) for
1954 * which the next token has already been parsed; e.g.,
1955 * sort foo @args
1956 * sort foo (@args)
ffb4593c
NT
1957 * - if the next thing is an opening paren, then it's a function
1958 * - else it's a list operator
1959 */
1960
76e3520e 1961STATIC I32
11288bb3 1962S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1963{
7918f24d
NC
1964 PERL_ARGS_ASSERT_LOP;
1965
6154021b 1966 pl_yylval.ival = f;
35c8bce7 1967 CLINE;
3280af22
NIS
1968 PL_bufptr = s;
1969 PL_last_lop = PL_oldbufptr;
eb160463 1970 PL_last_lop_op = (OPCODE)f;
3280af22 1971 if (PL_nexttoke)
78cdf107 1972 goto lstop;
19f1898a 1973 PL_expect = x;
79072805 1974 if (*s == '(')
bbf60fe6 1975 return REPORT(FUNC);
294a536f 1976 s = skipspace(s);
79072805 1977 if (*s == '(')
bbf60fe6 1978 return REPORT(FUNC);
78cdf107
Z
1979 else {
1980 lstop:
1981 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1982 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1983 return REPORT(LSTOP);
78cdf107 1984 }
79072805
LW
1985}
1986
ffb4593c
NT
1987/*
1988 * S_force_next
9cbb5ea2 1989 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1990 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1991 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1992 * will need to set PL_nextval[] and possibly PL_expect to ensure
1993 * the lexer handles the token correctly.
ffb4593c
NT
1994 */
1995
4e553d73 1996STATIC void
cea2e8a9 1997S_force_next(pTHX_ I32 type)
79072805 1998{
704d4215
GG
1999#ifdef DEBUGGING
2000 if (DEBUG_T_TEST) {
2001 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2002 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2003 }
2004#endif
1f7c3e7c 2005 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2006 PL_nexttype[PL_nexttoke] = type;
2007 PL_nexttoke++;
79072805
LW
2008}
2009
89f35911
FC
2010/*
2011 * S_postderef
2012 *
2013 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2014 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2015 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2016 * only the first, leaving yylex to find the next.
89f35911
FC
2017 */
2018
2019static int
ff25e5db 2020S_postderef(pTHX_ int const funny, char const next)
89f35911 2021{
ff25e5db 2022 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
2023 if (next == '*') {
2024 PL_expect = XOPERATOR;
cc624add 2025 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 2026 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 2027 PL_lex_state = LEX_INTERPEND;
c3492809
FC
2028 if ('@' == funny)
2029 force_next(POSTJOIN);
cc624add 2030 }
89f35911
FC
2031 force_next(next);
2032 PL_bufptr+=2;
2033 }
2034 else {
760ca746
FC
2035 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2036 && !PL_lex_brackets)
2037 PL_lex_dojoin = 2;
89f35911
FC
2038 PL_expect = XOPERATOR;
2039 PL_bufptr++;
2040 }
2041 return funny;
2042}
2043
28ac2b49
Z
2044void
2045Perl_yyunlex(pTHX)
2046{
a7aaec61
Z
2047 int yyc = PL_parser->yychar;
2048 if (yyc != YYEMPTY) {
2049 if (yyc) {
a7aaec61
Z
2050 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2051 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2052 PL_lex_allbrackets--;
a7aaec61 2053 PL_lex_brackets--;
78cdf107
Z
2054 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2055 } else if (yyc == '('/*)*/) {
2056 PL_lex_allbrackets--;
2057 yyc |= (2<<24);
a7aaec61
Z
2058 }
2059 force_next(yyc);
2060 }
28ac2b49
Z
2061 PL_parser->yychar = YYEMPTY;
2062 }
2063}
2064
d0a148a6 2065STATIC SV *
15f169a1 2066S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2067{
740cce10 2068 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2069 ! IN_BYTES
2070 && UTF
53463649 2071 && len != 0
6d492fac 2072 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2073 return sv;
2074}
2075
ffb4593c
NT
2076/*
2077 * S_force_word
2078 * When the lexer knows the next thing is a word (for instance, it has
2079 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2080 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2081 * lookahead.
ffb4593c
NT
2082 *
2083 * Arguments:
b1b65b59 2084 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2085 * int token : PL_next* will be this type of bare word
2086 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2087 * int check_keyword : if true, Perl checks to make sure the word isn't
2088 * a keyword (do this if the word is a label, e.g. goto FOO)
2089 * int allow_pack : if true, : characters will also be allowed (require,
2090 * use, etc. do this)
ffb4593c
NT
2091 */
2092
76e3520e 2093STATIC char *
345b3785 2094S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2095{
eb578fdb 2096 char *s;
463ee0b2 2097 STRLEN len;
4e553d73 2098
7918f24d
NC
2099 PERL_ARGS_ASSERT_FORCE_WORD;
2100
294a536f 2101 start = skipspace(start);
463ee0b2 2102 s = start;
fac0f7a3 2103 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2104 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2105 {
3280af22 2106 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2107 if (check_keyword) {
2108 char *s2 = PL_tokenbuf;
487e470d 2109 STRLEN len2 = len;
de627158
KW
2110 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2111 s2 += sizeof("CORE::") - 1;
2112 len2 -= sizeof("CORE::") - 1;
2113 }
487e470d 2114 if (keyword(s2, len2, 0))
463ee0b2 2115 return start;
01b5ef50 2116 }
463ee0b2 2117 if (token == METHOD) {
294a536f 2118 s = skipspace(s);
463ee0b2 2119 if (*s == '(')
3280af22 2120 PL_expect = XTERM;
463ee0b2 2121 else {
3280af22 2122 PL_expect = XOPERATOR;
463ee0b2 2123 }
79072805 2124 }
9ded7720 2125 NEXTVAL_NEXTTOKE.opval
275103cd 2126 = newSVOP(OP_CONST,0,
d0a148a6 2127 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2128 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2129 force_next(token);
2130 }
2131 return s;
2132}
2133
ffb4593c
NT
2134/*
2135 * S_force_ident
9cbb5ea2 2136 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2137 * text only contains the "foo" portion. The first argument is a pointer
2138 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2139 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2140 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2141 */
2142
76e3520e 2143STATIC void
5aaab254 2144S_force_ident(pTHX_ const char *s, int kind)
79072805 2145{
7918f24d
NC
2146 PERL_ARGS_ASSERT_FORCE_IDENT;
2147
c9b48522
DD
2148 if (s[0]) {
2149 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2150 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2151 UTF ? SVf_UTF8 : 0));
9ded7720 2152 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2153 force_next(BAREWORD);
748a9306 2154 if (kind) {
11343788 2155 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2156 /* XXX see note in pp_entereval() for why we forgo typo
2157 warnings if the symbol must be introduced in an eval.
2158 GSAR 96-10-12 */
90e5519e 2159 gv_fetchpvn_flags(s, len,
4bff32c5 2160 (PL_in_eval ? GV_ADDMULTI
728847b1 2161 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2162 kind == '$' ? SVt_PV :
2163 kind == '@' ? SVt_PVAV :
2164 kind == '%' ? SVt_PVHV :
a0d0e21e 2165 SVt_PVGV
90e5519e 2166 );
748a9306 2167 }
79072805
LW
2168 }
2169}
2170
3f33d153
FC
2171static void
2172S_force_ident_maybe_lex(pTHX_ char pit)
2173{
3f33d153
FC
2174 NEXTVAL_NEXTTOKE.ival = pit;
2175 force_next('p');
2176}
2177
1571675a
GS
2178NV
2179Perl_str_to_version(pTHX_ SV *sv)
2180{
2181 NV retval = 0.0;
2182 NV nshift = 1.0;
2183 STRLEN len;
cfd0369c 2184 const char *start = SvPV_const(sv,len);
9d4ba2ae 2185 const char * const end = start + len;
8298454c 2186 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2187
2188 PERL_ARGS_ASSERT_STR_TO_VERSION;
2189
1571675a 2190 while (start < end) {
ba210ebe 2191 STRLEN skip;
1571675a
GS
2192 UV n;
2193 if (utf)
9041c2e3 2194 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2195 else {
2196 n = *(U8*)start;
2197 skip = 1;
2198 }
2199 retval += ((NV)n)/nshift;
2200 start += skip;
2201 nshift *= 1000;
2202 }
2203 return retval;
2204}
2205
4e553d73 2206/*
ffb4593c
NT
2207 * S_force_version
2208 * Forces the next token to be a version number.
e759cc13
RGS
2209 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2210 * and if "guessing" is TRUE, then no new token is created (and the caller
2211 * must use an alternative parsing method).
ffb4593c
NT
2212 */
2213
76e3520e 2214STATIC char *
e759cc13 2215S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2216{
5f66b61c 2217 OP *version = NULL;
44dcb63b 2218 char *d;
89bfa8cd 2219
7918f24d
NC
2220 PERL_ARGS_ASSERT_FORCE_VERSION;
2221
294a536f 2222 s = skipspace(s);
89bfa8cd 2223
44dcb63b 2224 d = s;
dd629d5b 2225 if (*d == 'v')
44dcb63b 2226 d++;
44dcb63b 2227 if (isDIGIT(*d)) {
e759cc13
RGS
2228 while (isDIGIT(*d) || *d == '_' || *d == '.')
2229 d++;
4e4da3ac 2230 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2231 SV *ver;
6154021b
RGS
2232 s = scan_num(s, &pl_yylval);
2233 version = pl_yylval.opval;
dd629d5b
GS
2234 ver = cSVOPx(version)->op_sv;
2235 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2236 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2237 SvNV_set(ver, str_to_version(ver));
1571675a 2238 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2239 }
89bfa8cd 2240 }
5db06880 2241 else if (guessing) {
e759cc13 2242 return s;
5db06880 2243 }
89bfa8cd
PP
2244 }
2245
2246 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2247 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2248 force_next(BAREWORD);
89bfa8cd 2249
e759cc13 2250 return s;
89bfa8cd
PP
2251}
2252
ffb4593c 2253/*
91152fc1
DG
2254 * S_force_strict_version
2255 * Forces the next token to be a version number using strict syntax rules.
2256 */
2257
2258STATIC char *
2259S_force_strict_version(pTHX_ char *s)
2260{
91152fc1 2261 OP *version = NULL;
91152fc1
DG
2262 const char *errstr = NULL;
2263
2264 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2265
2266 while (isSPACE(*s)) /* leading whitespace */
2267 s++;
2268
2269 if (is_STRICT_VERSION(s,&errstr)) {
2270 SV *ver = newSV(0);
2271 s = (char *)scan_version(s, ver, 0);
2272 version = newSVOP(OP_CONST, 0, ver);
2273 }
407f8cf2
KW
2274 else if ((*s != ';' && *s != '{' && *s != '}' )
2275 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2276 {
91152fc1
DG
2277 PL_bufptr = s;
2278 if (errstr)
2279 yyerror(errstr); /* version required */
2280 return s;
2281 }
2282
91152fc1 2283 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2284 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2285 force_next(BAREWORD);
91152fc1
DG
2286
2287 return s;
2288}
2289
2290/*
ffb4593c 2291 * S_tokeq
ef3ff34d
KW
2292 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2293 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2294 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2295 */
2296
76e3520e 2297STATIC SV *
cea2e8a9 2298S_tokeq(pTHX_ SV *sv)
79072805 2299{
eb578fdb
KW
2300 char *s;
2301 char *send;
2302 char *d;
b3ac6de7 2303 SV *pv = sv;
79072805 2304
7918f24d
NC
2305 PERL_ARGS_ASSERT_TOKEQ;
2306
279b35ad
FC
2307 assert (SvPOK(sv));
2308 assert (SvLEN(sv));
2309 assert (!SvIsCOW(sv));
307ed071 2310 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2311 goto finish;
279b35ad
FC
2312 s = SvPVX(sv);
2313 send = SvEND(sv);
dcb21ed6
NC
2314 /* This is relying on the SV being "well formed" with a trailing '\0' */
2315 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2316 s++;
2317 if (s == send)
b3ac6de7 2318 goto finish;
79072805 2319 d = s;
be4731d2 2320 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2321 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2322 SVs_TEMP | SvUTF8(sv));
be4731d2 2323 }
79072805
LW
2324 while (s < send) {
2325 if (*s == '\\') {
a0d0e21e 2326 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2327 s++; /* all that, just for this */
2328 }
2329 *d++ = *s++;
2330 }
2331 *d = '\0';
95a20fc0 2332 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2333 finish:
3280af22 2334 if ( PL_hints & HINT_NEW_STRING )
164e423c 2335 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
79072805
LW
2336 return sv;
2337}
2338
ffb4593c
NT
2339/*
2340 * Now come three functions related to double-quote context,
2341 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2342 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2343 * interact with PL_lex_state, and create fake ( ... ) argument lists
2344 * to handle functions and concatenation.
ecd24171
DM
2345 * For example,
2346 * "foo\lbar"
2347 * is tokenised as
2348 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2349 */
2350
2351/*
2352 * S_sublex_start
6154021b 2353 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2354 *
2355 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2356 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2357 *
16d1d8bd 2358 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2359 *
2360 * Everything else becomes a FUNC.
2361 *
16d1d8bd
LM
2362 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2363 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2364 * call to S_sublex_push().
2365 */
2366
76e3520e 2367STATIC I32
cea2e8a9 2368S_sublex_start(pTHX)
79072805 2369{
eb578fdb 2370 const I32 op_type = pl_yylval.ival;
79072805
LW
2371
2372 if (op_type == OP_NULL) {
6154021b 2373 pl_yylval.opval = PL_lex_op;
5f66b61c 2374 PL_lex_op = NULL;
79072805
LW
2375 return THING;
2376 }
466112bb 2377 if (op_type == OP_CONST) {
67c71cbb
FC
2378 SV *sv = PL_lex_stuff;
2379 PL_lex_stuff = NULL;
2380 sv = tokeq(sv);
b3ac6de7
IZ
2381
2382 if (SvTYPE(sv) == SVt_PVIV) {
2383 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2384 STRLEN len;
96a5add6 2385 const char * const p = SvPV_const(sv, len);
740cce10 2386 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2387 SvREFCNT_dec(sv);
2388 sv = nsv;
4e553d73 2389 }
275103cd 2390 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2391 return THING;
2392 }
2393
7ef70b3d
FC
2394 PL_parser->lex_super_state = PL_lex_state;
2395 PL_parser->lex_sub_inwhat = (U16)op_type;
2396 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2397 PL_parser->sub_no_recover = FALSE;
2398 PL_parser->sub_error_count = PL_error_count;
3280af22 2399 PL_lex_state = LEX_INTERPPUSH;
55497cff 2400
3280af22
NIS
2401 PL_expect = XTERM;
2402 if (PL_lex_op) {
6154021b 2403 pl_yylval.opval = PL_lex_op;
5f66b61c 2404 PL_lex_op = NULL;
55497cff
PP
2405 return PMFUNC;
2406 }
2407 else
2408 return FUNC;
2409}
2410
ffb4593c
NT
2411/*
2412 * S_sublex_push
2413 * Create a new scope to save the lexing state. The scope will be
2414 * ended in S_sublex_done. Returns a '(', starting the function arguments
2415 * to the uc, lc, etc. found before.
2416 * Sets PL_lex_state to LEX_INTERPCONCAT.
2417 */
2418
76e3520e 2419STATIC I32
cea2e8a9 2420S_sublex_push(pTHX)
55497cff 2421{
78a635de 2422 LEXSHARED *shared;
801d32ac 2423 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2424 ENTER;
55497cff 2425
7ef70b3d 2426 PL_lex_state = PL_parser->lex_super_state;
cc624add 2427 SAVEI8(PL_lex_dojoin);
3280af22 2428 SAVEI32(PL_lex_brackets);
78cdf107 2429 SAVEI32(PL_lex_allbrackets);
b27dce25 2430 SAVEI32(PL_lex_formbrack);
78cdf107 2431 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2432 SAVEI32(PL_lex_casemods);
2433 SAVEI32(PL_lex_starts);
651b5b28 2434 SAVEI8(PL_lex_state);
7cc34111 2435 SAVESPTR(PL_lex_repl);
7766f137 2436 SAVEVPTR(PL_lex_inpat);
98246f1e 2437 SAVEI16(PL_lex_inwhat);
ffdb8b16 2438 if (is_heredoc)
b42366d4 2439 {
ffdb8b16 2440 SAVECOPLINE(PL_curcop);
b42366d4 2441 SAVEI32(PL_multi_end);
851b527a
FC
2442 SAVEI32(PL_parser->herelines);
2443 PL_parser->herelines = 0;
b42366d4 2444 }
2ca4363d 2445 SAVEIV(PL_multi_close);
3280af22 2446 SAVEPPTR(PL_bufptr);
8452ff4b 2447 SAVEPPTR(PL_bufend);
3280af22
NIS
2448 SAVEPPTR(PL_oldbufptr);
2449 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2450 SAVEPPTR(PL_last_lop);
2451 SAVEPPTR(PL_last_uni);
3280af22
NIS
2452 SAVEPPTR(PL_linestart);
2453 SAVESPTR(PL_linestr);
8edd5f42
RGS
2454 SAVEGENERICPV(PL_lex_brackstack);
2455 SAVEGENERICPV(PL_lex_casestack);
78a635de 2456 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2457 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2458 SAVEI32(PL_copline);
3280af22 2459
99bd9d90 2460 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2461 scopes to find the body of the here-doc. So we put PL_linestr and
2462 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2463 */
60f40a38
FC
2464 PL_parser->lex_shared->ls_linestr = PL_linestr;
2465 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2466
3280af22 2467 PL_linestr = PL_lex_stuff;
7ef70b3d 2468 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2469 PL_lex_stuff = NULL;
7ef70b3d 2470 PL_parser->lex_sub_repl = NULL;
3280af22 2471
eabab8bc
FC
2472 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2473 set for an inner quote-like operator and then an error causes scope-
2474 popping. We must not have a PL_lex_stuff value left dangling, as
2475 that breaks assumptions elsewhere. See bug #123617. */
2476 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2477 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2478
9cbb5ea2
GS
2479 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2480 = SvPVX(PL_linestr);
3280af22 2481 PL_bufend += SvCUR(PL_linestr);
bd61b366 2482 PL_last_lop = PL_last_uni = NULL;
3280af22 2483 SAVEFREESV(PL_linestr);
4dc843bc 2484 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2485
2486 PL_lex_dojoin = FALSE;
b27dce25 2487 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2488 PL_lex_allbrackets = 0;
2489 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2490 Newx(PL_lex_brackstack, 120, char);
2491 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2492 PL_lex_casemods = 0;
2493 *PL_lex_casestack = '\0';
2494 PL_lex_starts = 0;
2495 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2496 if (is_heredoc)
6ddcf93b 2497 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2498 PL_copline = NOLINE;
caae0700 2499
78a635de
FC
2500 Newxz(shared, 1, LEXSHARED);
2501 shared->ls_prev = PL_parser->lex_shared;
2502 PL_parser->lex_shared = shared;
3280af22 2503
7ef70b3d 2504 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2505 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2506 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2507 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2508 else
5f66b61c 2509 PL_lex_inpat = NULL;
79072805 2510
3a54fd60
DM
2511 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2512 PL_in_eval &= ~EVAL_RE_REPARSING;
2513
55497cff 2514 return '(';
79072805
LW
2515}
2516
ffb4593c
NT
2517/*
2518 * S_sublex_done
2519 * Restores lexer state after a S_sublex_push.
2520 */
2521
76e3520e 2522STATIC I32
cea2e8a9 2523S_sublex_done(pTHX)
79072805 2524{
3280af22 2525 if (!PL_lex_starts++) {
396482e1 2526 SV * const sv = newSVpvs("");
9aa983d2
JH
2527 if (SvUTF8(PL_linestr))
2528 SvUTF8_on(sv);
3280af22 2529 PL_expect = XOPERATOR;
275103cd 2530 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2531 return THING;
2532 }
2533
3280af22
NIS
2534 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2535 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2536 return yylex();
79072805
LW
2537 }
2538
ffb4593c 2539 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2540 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2541 if (PL_lex_repl) {
2542 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2543 PL_linestr = PL_lex_repl;
2544 PL_lex_inpat = 0;
2545 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2546 PL_bufend += SvCUR(PL_linestr);
bd61b366 2547 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2548 PL_lex_dojoin = FALSE;
2549 PL_lex_brackets = 0;
78cdf107
Z
2550 PL_lex_allbrackets = 0;
2551 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2552 PL_lex_casemods = 0;
2553 *PL_lex_casestack = '\0';
2554 PL_lex_starts = 0;
25da4f38 2555 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2556 PL_lex_state = LEX_INTERPNORMAL;
2557 PL_lex_starts++;
e9fa98b2
HS
2558 /* we don't clear PL_lex_repl here, so that we can check later
2559 whether this is an evalled subst; that means we rely on the
2560 logic to ensure sublex_done() is called again only via the
2561 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2562 }
e9fa98b2 2563 else {
3280af22 2564 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2565 PL_lex_repl = NULL;
e9fa98b2 2566 }
ffdb8b16
FC
2567 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2568 CopLINE(PL_curcop) +=
9420b268 2569 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2570 + PL_parser->herelines;
2571 PL_parser->herelines = 0;
ffdb8b16 2572 }
9b6b7be8 2573 return '/';
ffed7fef
LW
2574 }
2575 else {
b42366d4 2576 const line_t l = CopLINE(PL_curcop);
f46d017c 2577 LEAVE;
bb4e4c38 2578 if (PL_parser->sub_error_count != PL_error_count) {
bb4e4c38 2579 if (PL_parser->sub_no_recover) {
ad1ecdf7 2580 yyquit();
bb4e4c38
TC
2581 NOT_REACHED;
2582 }
2583 }
b42366d4 2584 if (PL_multi_close == '<')
851b527a 2585 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2586 PL_bufend = SvPVX(PL_linestr);
2587 PL_bufend += SvCUR(PL_linestr);
2588 PL_expect = XOPERATOR;
79072805 2589 return ')';
ffed7fef
LW
2590 }
2591}
2592
7a7d14f3 2593STATIC SV*
2c43c309
KW
2594S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2595{
2596 /* This justs wraps get_and_check_backslash_N_name() to output any error
2597 * message it returns. */
2598
2599 const char * error_msg = NULL;
25c7fb78 2600 SV * result;
2c43c309
KW
2601
2602 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2603
25c7fb78
KW
2604 /* charnames doesn't work well if there have been errors found */
2605 if (PL_error_count > 0) {
2606 return NULL;
2607 }
2608
2609 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2610
2c43c309
KW
2611 if (error_msg) {
2612 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2613 }
2614
2615 return result;
2616}
2617
25c7fb78
KW
2618SV*
2619Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2620 const char* const e,
2621 const bool is_utf8,
2622 const char ** error_msg)
6f613c73 2623{
140b12ad
KW
2624 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2625 * interior, hence to the "}". Finds what the name resolves to, returning
25c7fb78
KW
2626 * an SV* containing it; NULL if no valid one found.
2627 *
2628 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2629 * doesn't have to be. */
6f613c73 2630
25c7fb78 2631 SV* res;
0c415a79
KW
2632 HV * table;
2633 SV **cvp;
2634 SV *cv;
2635 SV *rv;
2636 HV *stash;
0c415a79 2637 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
25c7fb78 2638 dVAR;
0c415a79 2639
6f613c73
KW
2640 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2641
25c7fb78
KW
2642 assert(e >= s);
2643 assert(s > (char *) 3);
2644
2645 res = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
2646
d8d26cac 2647 if (!SvCUR(res)) {
ebcc725e 2648 SvREFCNT_dec_NN(res);
be332ba0 2649 /* diag_listed_as: Unknown charname '%s' */
2c43c309 2650 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
be332ba0 2651 return NULL;
d8d26cac 2652 }
b7e6151c 2653
107160e2
KW
2654 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2655 /* include the <}> */
2c43c309 2656 e - backslash_ptr + 1, error_msg);
6f613c73 2657 if (! SvPOK(res)) {
b6407c49 2658 SvREFCNT_dec_NN(res);
6f613c73
KW
2659 return NULL;
2660 }
2661
0c415a79
KW
2662 /* See if the charnames handler is the Perl core's, and if so, we can skip
2663 * the validation needed for a user-supplied one, as Perl's does its own
2664 * validation. */
2665 table = GvHV(PL_hintgv); /* ^H */
2666 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2667 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2668 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2669 {
2670 const char * const name = HvNAME(stash);
b59bf0b2 2671 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2672 return res;
2673 }
2674 }
2675
bde9e88d
KW
2676 /* Here, it isn't Perl's charname handler. We can't rely on a
2677 * user-supplied handler to validate the input name. For non-ut8 input,
2678 * look to see that the first character is legal. Then loop through the
2679 * rest checking that each is a continuation */
6f613c73 2680
36897d64
KW
2681 /* This code makes the reasonable assumption that the only Latin1-range
2682 * characters that begin a character name alias are alphabetic, otherwise
2683 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2684
25c7fb78 2685 if (! is_utf8) {
bde9e88d 2686 if (! isALPHAU(*s)) {
b6ba1137
KW
2687 goto bad_charname;
2688 }
bde9e88d
KW
2689 s++;
2690 while (s < e) {
2691 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2692 goto bad_charname;
2693 }
2d8eb851
KW
2694 if (*s == ' ' && *(s-1) == ' ') {
2695 goto multi_spaces;
bd299e29 2696 }
bde9e88d 2697 s++;
b6ba1137
KW
2698 }
2699 }
2700 else {
bde9e88d
KW
2701 /* Similarly for utf8. For invariants can check directly; for other
2702 * Latin1, can calculate their code point and check; otherwise use a
2703 * swash */
2704 if (UTF8_IS_INVARIANT(*s)) {
2705 if (! isALPHAU(*s)) {
140b12ad
KW
2706 goto bad_charname;
2707 }
bde9e88d
KW
2708 s++;
2709 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2710 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2711 goto bad_charname;
6f613c73 2712 }
bde9e88d 2713 s += 2;
6f613c73 2714 }
bde9e88d 2715 else {
f1bcae08
KW
2716 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2717 utf8_to_uvchr_buf((U8 *) s,
2718 (U8 *) e,
2719 NULL)))
2720 {
bde9e88d
KW
2721 goto bad_charname;
2722 }
2723 s += UTF8SKIP(s);
2724 }
2725
2726 while (s < e) {
2727 if (UTF8_IS_INVARIANT(*s)) {
2728 if (! isCHARNAME_CONT(*s)) {
2729 goto bad_charname;
2730 }
2d8eb851
KW
2731 if (*s == ' ' && *(s-1) == ' ') {
2732 goto multi_spaces;
bd299e29 2733 }
bde9e88d
KW
2734 s++;
2735 }
2736 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2737 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2738 {
2739 goto bad_charname;
2740 }
2741 s += 2;
2742 }
2743 else {
f1bcae08
KW
2744 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2745 utf8_to_uvchr_buf((U8 *) s,
2746 (U8 *) e,
2747 NULL)))
2748 {
bde9e88d
KW
2749 goto bad_charname;
2750 }
2751 s += UTF8SKIP(s);
6f613c73
KW
2752 }
2753 }
2d8eb851
KW
2754 }
2755 if (*(s-1) == ' ') {
8d9d0498
FC
2756 /* diag_listed_as: charnames alias definitions may not contain
2757 trailing white-space; marked by <-- HERE in %s
2758 */
2c43c309 2759 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2760 "charnames alias definitions may not contain trailing "
2761 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2762 (int)(s - backslash_ptr + 1), backslash_ptr,
2c43c309 2763 (int)(e - s + 1), s + 1);
2d8eb851 2764 return NULL;
6f613c73
KW
2765 }
2766
870fe793 2767 if (SvUTF8(res)) { /* Don't accept malformed charname value */
bde9e88d
KW
2768 const U8* first_bad_char_loc;
2769 STRLEN len;
2770 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2771 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2772 &first_bad_char_loc)))
2773 {
75219bac
KW
2774 _force_out_malformed_utf8_message(first_bad_char_loc,
2775 (U8 *) PL_parser->bufend,
2776 0,
2777 0 /* 0 means don't die */ );
8d9d0498
FC
2778 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2779 immediately after '%s' */
2c43c309 2780 *error_msg = Perl_form(aTHX_
bde9e88d
KW
2781 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2782 (int) (e - backslash_ptr + 1), backslash_ptr,
2c43c309 2783 (int) ((char *) first_bad_char_loc - str), str);
bde9e88d
KW
2784 return NULL;
2785 }
2786 }
140b12ad 2787
bde9e88d 2788 return res;
140b12ad 2789
bde9e88d 2790 bad_charname: {
bde9e88d
KW
2791
2792 /* The final %.*s makes sure that should the trailing NUL be missing
2793 * that this print won't run off the end of the string */
8d9d0498
FC
2794 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2795 in \N{%s} */
2c43c309 2796 *error_msg = Perl_form(aTHX_
bde9e88d 2797 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851 2798 (int)(s - backslash_ptr + 1), backslash_ptr,
2c43c309 2799 (int)(e - s + 1), s + 1);
bde9e88d
KW
2800 return NULL;
2801 }
2d8eb851
KW
2802
2803 multi_spaces:
8d9d0498
FC
2804 /* diag_listed_as: charnames alias definitions may not contain a
2805 sequence of multiple spaces; marked by <-- HERE
2806 in %s */
2c43c309 2807 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2808 "charnames alias definitions may not contain a sequence of "
2809 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2810 (int)(s - backslash_ptr + 1), backslash_ptr,
2c43c309 2811 (int)(e - s + 1), s + 1);
2d8eb851 2812 return NULL;
6f613c73
KW
2813}
2814
02aa26ce
NT
2815/*
2816 scan_const
2817
9da1dd8f
DM
2818 Extracts the next constant part of a pattern, double-quoted string,
2819 or transliteration. This is terrifying code.
2820
2821 For example, in parsing the double-quoted string "ab\x63$d", it would
2822 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2823
94def140 2824 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2825 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2826 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2827
94def140
ST
2828 Returns a pointer to the character scanned up to. If this is
2829 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2830 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2831 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2832 by looking at the next characters herself.
2833
02aa26ce 2834 In patterns:
9da1dd8f 2835 expand:
537124e4
KW
2836 \N{FOO} => \N{U+hex_for_character_FOO}
2837 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2838
2839 pass through:
2840 all other \-char, including \N and \N{ apart from \N{ABC}
2841
2842 stops on:
2843 @ and $ where it appears to be a var, but not for $ as tail anchor
2844 \l \L \u \U \Q \E
2845 (?{ or (??{
2846
02aa26ce
NT
2847 In transliterations:
2848 characters are VERY literal, except for - not at the start or end
66b09263
KW
2849 of the string, which indicates a range. However some backslash sequences
2850 are recognized: \r, \n, and the like
2851 \007 \o{}, \x{}, \N{}
2852 If all elements in the transliteration are below 256,
94def140
ST
2853 scan_const expands the range to the full set of intermediate
2854 characters. If the range is in utf8, the hyphen is replaced with
2855 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2856
2857 In double-quoted strings:
2858 backslashes:
66b09263 2859 all those recognized in transliterations
94def140 2860 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2861 case and quoting: \U \Q \E
2862 stops on @ and $
2863
2864 scan_const does *not* construct ops to handle interpolated strings.
2865 It stops processing as soon as it finds an embedded $ or @ variable
2866 and leaves it to the caller to work out what's going on.
2867
94def140
ST
2868 embedded arrays (whether in pattern or not) could be:
2869 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2870
2871 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2872
2873 $ in pattern could be $foo or could be tail anchor. Assumption:
2874 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2875 followed by one of "()| \r\n\t"
02aa26ce 2876
9da1dd8f 2877 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2878
2879 The structure of the code is
2880 while (there's a character to process) {
94def140
ST
2881 handle transliteration ranges
2882 skip regexp comments /(?#comment)/ and codes /(?{code})/
2883 skip #-initiated comments in //x patterns
2884 check for embedded arrays
02aa26ce
NT
2885 check for embedded scalars
2886 if (backslash) {
94def140 2887 deprecate \1 in substitution replacements
02aa26ce
NT
2888 handle string-changing backslashes \l \U \Q \E, etc.
2889 switch (what was escaped) {
94def140 2890 handle \- in a transliteration (becomes a literal -)
ff3f963a 2891 if a pattern and not \N{, go treat as regular character
94def140
ST
2892 handle \132 (octal characters)
2893 handle \x15 and \x{1234} (hex characters)
ff3f963a 2894 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2895 handle \cV (control characters)
2896 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2897 } (end switch)
77a135fe 2898 continue
02aa26ce 2899 } (end if backslash)
77a135fe 2900 handle regular character
02aa26ce 2901 } (end while character to read)
02cd137d 2902
02aa26ce
NT
2903*/
2904
76e3520e 2905STATIC char *
cea2e8a9 2906S_scan_const(pTHX_ char *start)
79072805 2907{
eb578fdb 2908 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2909 SV *sv = newSV(send - start); /* sv for the constant. See note below
2910 on sizing. */
eb578fdb
KW
2911 char *s = start; /* start of the constant */
2912 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2913 bool dorange = FALSE; /* are we in a translit range? */
2914 bool didrange = FALSE; /* did we just finish a range? */
2915 bool in_charclass = FALSE; /* within /[...]/ */
2916 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2917 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2918 UTF8? But, this can show as true
2919 when the source isn't utf8, as for
2920 example when it is entirely composed
2921 of hex constants */
af9be36c
KW
2922 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2923 number of characters found so far
2924 that will expand (into 2 bytes)
2925 should we have to convert to
2926 UTF-8) */
6f613c73 2927 SV *res; /* result from charnames */
c89db733
JH
2928 STRLEN offset_to_max = 0; /* The offset in the output to where the range
2929 high-end character is temporarily placed */
77a135fe 2930
fe2ba0a2
KW
2931 /* Does something require special handling in tr/// ? This avoids extra
2932 * work in a less likely case. As such, khw didn't feel it was worth
2933 * adding any branches to the more mainline code to handle this, which
2934 * means that this doesn't get set in some circumstances when things like
2935 * \x{100} get expanded out. As a result there needs to be extra testing
2936 * done in the tr code */
2937 bool has_above_latin1 = FALSE;
2938
77a135fe
KW
2939 /* Note on sizing: The scanned constant is placed into sv, which is
2940 * initialized by newSV() assuming one byte of output for every byte of
2941 * input. This routine expects newSV() to allocate an extra byte for a
2942 * trailing NUL, which this routine will append if it gets to the end of
2943 * the input. There may be more bytes of input than output (eg., \N{LATIN
2944 * CAPITAL LETTER A}), or more output than input if the constant ends up
2945 * recoded to utf8, but each time a construct is found that might increase
2946 * the needed size, SvGROW() is called. Its size parameter each time is
2947 * based on the best guess estimate at the time, namely the length used so
2948 * far, plus the length the current construct will occupy, plus room for
caae0700 2949 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 2950
c3320c2a
KW
2951 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2952 before set */
4c3a8340 2953#ifdef EBCDIC
f4240379
KW
2954 int backslash_N = 0; /* ? was the character from \N{} */
2955 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2956 platform-specific like \x65 */
4c3a8340 2957#endif
012bcf8d 2958
7918f24d
NC
2959 PERL_ARGS_ASSERT_SCAN_CONST;
2960
bb16bae8 2961 assert(PL_lex_inwhat != OP_TRANSR);
7ef70b3d 2962 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2b9d42f0 2963 /* If we are doing a trans and we know we want UTF8 set expectation */
7ef70b3d
FC
2964 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2965 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2b9d42f0
NIS
2966 }
2967
b899e89d
FC
2968 /* Protect sv from errors and fatal warnings. */
2969 ENTER_with_name("scan_const");
2970 SAVEFREESV(sv);
2b9d42f0 2971
f4240379
KW
2972 while (s < send
2973 || dorange /* Handle tr/// range at right edge of input */
2974 ) {
ff3f963a 2975
02aa26ce 2976 /* get transliterations out of the way (they're most literal) */
3280af22 2977 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2978
f4240379
KW
2979 /* But there isn't any special handling necessary unless there is a
2980 * range, so for most cases we just drop down and handle the value
2981 * as any other. There are two exceptions.
2982 *
02cd137d
KW
2983 * 1. A hyphen indicates that we are actually going to have a
2984 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
2985 * down to handle what should be the end range value.
2986 * 2. After we've handled that value, the next time through, that
2987 * flag is set and we fix up the range.
2988 *
2989 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
2990 * order to make the transliteration a simple table look-up.
2991 * Ranges that extend above Latin1 have to be done differently, so
2992 * there is no advantage to expanding them here, so they are
2993 * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte
2994 * signifies a hyphen without any possible ambiguity. On EBCDIC
2995 * machines, if the range is expressed as Unicode, the Latin1
2996 * portion is expanded out even if the range extends above
2997 * Latin1. This is because each code point in it has to be
2998 * processed here individually to get its native translation */
f4240379
KW
2999
3000 if (! dorange) {
3001
02cd137d
KW
3002 /* Here, we don't think we're in a range. If the new character
3003 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
3004 * either edge to indicate a range, or if we haven't output any
3005 * characters yet then it's a regular character. */
3006 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv)) {
f4240379
KW
3007
3008 /* A regular character. Process like any other, but first
3009 * clear any flags */
3010 didrange = FALSE;
3011 dorange = FALSE;
e294cc5d 3012#ifdef EBCDIC
f4240379
KW
3013 non_portable_endpoint = 0;
3014 backslash_N = 0;
e294cc5d 3015#endif
02cd137d
KW
3016 /* The tests here for being above Latin1 and similar ones
3017 * in the following 'else' suffice to find all such
3018 * occurences in the constant, except those added by a
fe2ba0a2 3019 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3020 * set 'has_above_latin1' as appropriate */
188d22cf
KW
3021 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3022 has_above_latin1 = TRUE;
3023 }
3024
f4240379
KW
3025 /* Drops down to generic code to process current byte */
3026 }
02cd137d 3027 else { /* Is a '-' in the context where it means a range */
f4240379 3028 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3029 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3030 " operator");
f4240379 3031 }
e294cc5d 3032
f4240379 3033 dorange = TRUE;
2b9d42f0 3034
02cd137d 3035 s++; /* Skip past the hyphen */
f4240379
KW
3036
3037 /* d now points to where the end-range character will be
3038 * placed. Save it so won't have to go finding it later,
3039 * and drop down to get that character. (Actually we
3040 * instead save the offset, to handle the case where a
3041 * realloc in the meantime could change the actual
3042 * pointer). We'll finish processing the range the next
3043 * time through the loop */
3044 offset_to_max = d - SvPVX_const(sv);
188d22cf
KW
3045
3046 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3047 has_above_latin1 = TRUE;
3048 }
02cd137d
KW
3049
3050 /* Drops down to generic code to process current byte */
f4240379
KW
3051 }
3052 } /* End of not a range */
3053 else {
3054 /* Here we have parsed a range. Now must handle it. At this
3055 * point:
3056 * 'sv' is a SV* that contains the output string we are
3057 * constructing. The final two characters in that string
3058 * are the range start and range end, in order.
3059 * 'd' points to just beyond the range end in the 'sv' string,
3060 * where we would next place something
3061 * 'offset_to_max' is the offset in 'sv' at which the character
02cd137d 3062 * (the range's maximum end point) before 'd' begins.
f4240379 3063 */
8efef67c 3064 char * max_ptr = SvPVX(sv) + offset_to_max;
5ca37d54 3065 char * min_ptr;
f4240379
KW
3066 IV range_min;
3067 IV range_max; /* last character in range */
f4240379 3068 STRLEN grow;
5ca37d54
KW
3069 Size_t offset_to_min = 0;
3070 Size_t extras = 0;
11327fa1 3071#ifdef EBCDIC
f4240379
KW
3072 bool convert_unicode;
3073 IV real_range_max = 0;
e294cc5d 3074#endif
02cd137d 3075 /* Get the code point values of the range ends. */
e294cc5d 3076 if (has_utf8) {
f4240379
KW
3077 /* We know the utf8 is valid, because we just constructed
3078 * it ourselves in previous loop iterations */
3079 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3080 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3081 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3082
3083 /* This compensates for not all code setting
3084 * 'has_above_latin1', so that we don't skip stuff that
3085 * should be executed */
3086 if (range_max > 255) {
3087 has_above_latin1 = TRUE;
3088 }
e294cc5d 3089 }
f4240379
KW
3090 else {
3091 min_ptr = max_ptr - 1;
3092 range_min = * (U8*) min_ptr;
3093 range_max = * (U8*) max_ptr;
3094 }
3095
8efef67c
KW
3096 /* If the range is just a single code point, like tr/a-a/.../,
3097 * that code point is already in the output, twice. We can
3098 * just back up over the second instance and avoid all the rest
3099 * of the work. But if it is a variant character, it's been
218304f9
KW
3100 * counted twice, so decrement. (This unlikely scenario is
3101 * special cased, like the one for a range of 2 code points
3102 * below, only because the main-line code below needs a range
3103 * of 3 or more to work without special casing. Might as well
3104 * get it out of the way now.) */
8efef67c
KW
3105 if (UNLIKELY(range_max == range_min)) {
3106 d = max_ptr;
3107 if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3108 utf8_variant_count--;
3109 }
3110 goto range_done;
3111 }
3112
e294cc5d 3113#ifdef EBCDIC
f4240379
KW
3114 /* On EBCDIC platforms, we may have to deal with portable
3115 * ranges. These happen if at least one range endpoint is a
3116 * Unicode value (\N{...}), or if the range is a subset of
3117 * [A-Z] or [a-z], and both ends are literal characters,
3118 * like 'A', and not like \x{C1} */
87dd6ea7 3119 convert_unicode =
02cd137d
KW
3120 cBOOL(backslash_N) /* \N{} forces Unicode,
3121 hence portable range */
3122 || ( ! non_portable_endpoint
3123 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3124 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3125 if (convert_unicode) {
f4240379
KW
3126
3127 /* Special handling is needed for these portable ranges.
02cd137d
KW
3128 * They are defined to be in Unicode terms, which includes
3129 * all the Unicode code points between the end points.
f4240379
KW
3130 * Convert to Unicode to get the Unicode range. Later we
3131 * will convert each code point in the range back to
3132 * native. */
3133 range_min = NATIVE_TO_UNI(range_min);
3134 range_max = NATIVE_TO_UNI(range_max);
3135 }
e294cc5d 3136#endif
8ada0baa 3137
f4240379 3138 if (range_min > range_max) {
11327fa1 3139#ifdef EBCDIC
f4240379
KW
3140 if (convert_unicode) {
3141 /* Need to convert back to native for meaningful
3142 * messages for this platform */
3143 range_min = UNI_TO_NATIVE(range_min);
3144 range_max = UNI_TO_NATIVE(range_max);
3145 }
11327fa1 3146#endif
f4240379
KW
3147 /* Use the characters themselves for the error message if
3148 * ASCII printables; otherwise some visible representation
3149 * of them */
3150 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3151 Perl_croak(aTHX_
3152 "Invalid range \"%c-%c\" in transliteration operator",
3153 (char)range_min, (char)range_max);
3154 }
11327fa1 3155#ifdef EBCDIC
f4240379 3156 else if (convert_unicode) {
02cd137d 3157 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3158 Perl_croak(aTHX_
02cd137d
KW
3159 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3160 UVXf "}\" in transliteration operator",
3161 range_min, range_max);
f4240379 3162 }
11327fa1 3163#endif
f4240379 3164 else {
02cd137d 3165 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3166 Perl_croak(aTHX_
02cd137d
KW
3167 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3168 " in transliteration operator",
3169 range_min, range_max);
f4240379 3170 }
c2e66d9e
GS
3171 }
3172
aca41667
KW
3173 /* If the range is exactly two code points long, they are
3174 * already both in the output */
3175 if (UNLIKELY(range_min + 1 == range_max)) {
3176 goto range_done;
3177 }
3178
3179 /* Here the range contains at least 3 code points */
3180
f4240379
KW
3181 if (has_utf8) {
3182
188d22cf
KW
3183 /* If everything in the transliteration is below 256, we
3184 * can avoid special handling later. A translation table
02cd137d
KW
3185 * for each of those bytes is created by op.c. So we
3186 * expand out all ranges to their constituent code points.
3187 * But if we've encountered something above 255, the
3188 * expanding won't help, so skip doing that. But if it's
3189 * EBCDIC, we may have to look at each character below 256
3190 * if we have to convert to/from Unicode values */
188d22cf 3191 if ( has_above_latin1
c7f1f016 3192#ifdef EBCDIC
f4240379 3193 && (range_min > 255 || ! convert_unicode)
8ada0baa 3194#endif
f4240379
KW
3195 ) {
3196 /* Move the high character one byte to the right; then
3197 * insert between it and the range begin, an illegal
3198 * byte which serves to indicate this is a range (using
02cd137d 3199 * a '-' would be ambiguous). */
f4240379
KW
3200 char *e = d++;
3201 while (e-- > max_ptr) {
3202 *(e + 1) = *e;
e294cc5d 3203 }
f4240379
KW
3204 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3205 goto range_done;
3206 }
3207
3208 /* Here, we're going to expand out the range. For EBCDIC
3209 * the range can extend above 255 (not so in ASCII), so
3210 * for EBCDIC, split it into the parts above and below
3211 * 255/256 */
e294cc5d 3212#ifdef EBCDIC
f4240379
KW
3213 if (range_max > 255) {
3214 real_range_max = range_max;
3215 range_max = 255;
3216 }
e294cc5d 3217#endif
f4240379 3218 }
02aa26ce 3219
f4240379 3220 /* Here we need to expand out the string to contain each
5ca37d54
KW
3221 * character in the range. Grow the output to handle this.
3222 * For non-UTF8, we need a byte for each code point in the
3223 * range, minus the three that we've already allocated for: the
3224 * hyphen, the min, and the max. For UTF-8, we need this
3225 * plus an extra byte for each code point that occupies two
3226 * bytes (is variant) when in UTF-8 (except we've already
3227 * allocated for the end points, including if they are
3228 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3229 * platforms, it's easy to calculate a precise number. To
3230 * start, we count the variants in the range, which we need
3231 * elsewhere in this function anyway. (For the case where it
3232 * isn't easy to calculate, 'extras' has been initialized to 0,
3233 * and the calculation is done in a loop further down.) */
3234#ifdef EBCDIC
3235 if (convert_unicode)
3236#endif
3237 {
3238 /* This is executed unconditionally on ASCII, and for
3239 * Unicode ranges on EBCDIC. Under these conditions, all
3240 * code points above a certain value are variant; and none
3241 * under that value are. We just need to find out how much
3242 * of the range is above that value. We don't count the
3243 * end points here, as they will already have been counted
3244 * as they were parsed. */
3245 if (range_min >= UTF_CONTINUATION_MARK) {
3246
3247 /* The whole range is made up of variants */
3248 extras = (range_max - 1) - (range_min + 1) + 1;
3249 }
3250 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3251
5ca37d54
KW
3252 /* Only the higher portion of the range is variants */
3253 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3254 }
f4240379 3255
5ca37d54
KW
3256 utf8_variant_count += extras;
3257 }
3258
3259 /* The base growth is the number of code points in the range,
3260 * not including the endpoints, which have already been sized
3261 * for (and output). We don't subtract for the hyphen, as it
3262 * has been parsed but not output, and the SvGROW below is
3263 * based only on what's been output plus what's left to parse.
3264 * */
3265 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3266
5ca37d54 3267 if (has_utf8) {
4c3a8340 3268#ifdef EBCDIC
5ca37d54
KW
3269 /* In some cases in EBCDIC, we haven't yet calculated a
3270 * precise amount needed for the UTF-8 variants. Just
3271 * assume the worst case, that everything will expand by a
3272 * byte */
3273 if (! convert_unicode) {
f4240379
KW
3274 grow *= 2;
3275 }
5ca37d54 3276 else
4c3a8340 3277#endif
5ca37d54
KW
3278 {
3279 /* Otherwise we know exactly how many variants there
3280 * are in the range. */
3281 grow += extras;
3282 }
f4240379
KW
3283 }
3284
5ca37d54
KW
3285 /* Grow, but position the output to overwrite the range min end
3286 * point, because in some cases we overwrite that */
3287 SvCUR_set(sv, d - SvPVX_const(sv));
3288 offset_to_min = min_ptr - SvPVX_const(sv);
3289
3290 /* See Note on sizing above. */
3291 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3292 + (send - s)
3293 + grow
3294 + 1 /* Trailing NUL */ );
f4240379 3295
5ca37d54 3296 /* Now, we can expand out the range. */
11327fa1 3297#ifdef EBCDIC
f4240379 3298 if (convert_unicode) {
5ca37d54 3299 SSize_t i;
02aa26ce 3300
f4240379
KW
3301 /* Recall that the min and max are now in Unicode terms, so
3302 * we have to convert each character to its native
3303 * equivalent */
3304 if (has_utf8) {
3305 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3306 append_utf8_from_native_byte(
3307 LATIN1_TO_NATIVE((U8) i),
3308 (U8 **) &d);
f4240379
KW
3309 }
3310 }
3311 else {
3312 for (i = range_min; i <= range_max; i++) {
81324705 3313 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3314 }
3315 }
01ec43d0 3316 }
11327fa1
AL
3317 else
3318#endif
3319 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3320 {
f4240379
KW
3321 /* Here, no conversions are necessary, which means that the
3322 * first character in the range is already in 'd' and
3323 * valid, so we can skip overwriting it */
3324 if (has_utf8) {
19742f39 3325 SSize_t i;
f4240379
KW
3326 d += UTF8SKIP(d);
3327 for (i = range_min + 1; i <= range_max; i++) {
81324705 3328 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3329 }
3330 }
3331 else {
19742f39 3332 SSize_t i;
f4240379 3333 d++;
5ca37d54
KW
3334 assert(range_min + 1 <= range_max);
3335 for (i = range_min + 1; i < range_max; i++) {
3336#ifdef EBCDIC
3337 /* In this case on EBCDIC, we haven't calculated
3338 * the variants. Do it here, as we go along */
3339 if (! UVCHR_IS_INVARIANT(i)) {
3340 utf8_variant_count++;
3341 }
3342#endif
f4240379
KW
3343 *d++ = (char)i;
3344 }
5ca37d54
KW
3345
3346 /* The range_max is done outside the loop so as to
3347 * avoid having to special case not incrementing
3348 * 'utf8_variant_count' on EBCDIC (it's already been
3349 * counted when originally parsed) */
3350 *d++ = (char) range_max;
f4240379 3351 }
a0ed51b3 3352 }
02aa26ce 3353
11327fa1 3354#ifdef EBCDIC
02cd137d
KW
3355 /* If the original range extended above 255, add in that
3356 * portion. */
f4240379
KW
3357 if (real_range_max) {
3358 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3359 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3360 if (real_range_max > 0x100) {
3361 if (real_range_max > 0x101) {
3362 *d++ = (char) ILLEGAL_UTF8_BYTE;
3363 }
f4240379 3364 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3365 }
f4240379 3366 }
11327fa1 3367#endif
02aa26ce 3368
f4240379
KW
3369 range_done:
3370 /* mark the range as done, and continue */
3371 didrange = TRUE;
3372 dorange = FALSE;
3373#ifdef EBCDIC
3374 non_portable_endpoint = 0;
3375 backslash_N = 0;
3376#endif
3377 continue;
3378 } /* End of is a range */
3379 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3380 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3381 char *s1 = s-1;
3382 int esc = 0;
3383 while (s1 >= start && *s1-- == '\\')
3384 esc = !esc;
3385 if (!esc)
3386 in_charclass = TRUE;
3387 }
1e02a175 3388 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3389 char *s1 = s-1;
3390 int esc = 0;
3391 while (s1 >= start && *s1-- == '\\')
3392 esc = !esc;
3393 if (!esc)
3394 in_charclass = FALSE;
3395 }
02cd137d
KW
3396 /* skip for regexp comments /(?#comment)/, except for the last
3397 * char, which will be done separately. Stop on (?{..}) and
3398 * friends */
c30fc27b 3399 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3400 if (s[2] == '#') {
e994fd66 3401 while (s+1 < send && *s != ')')
5ff03569 3402 *d++ = *s++;
155aba94 3403 }
407f8cf2
KW
3404 else if (!PL_lex_casemods
3405 && ( s[2] == '{' /* This should match regcomp.c */
3406 || (s[2] == '?' && s[3] == '{')))
155aba94 3407 {
9da1dd8f 3408 break;
cc6b7395 3409 }
748a9306 3410 }
02cd137d 3411 /* likewise skip #-initiated comments in //x patterns */
407f8cf2
KW
3412 else if (*s == '#'
3413 && PL_lex_inpat
3414 && !in_charclass
3415 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3416 {
8faf4f30 3417 while (s < send && *s != '\n')
5ff03569 3418 *d++ = *s++;
748a9306 3419 }
02cd137d 3420 /* no further processing of single-quoted regex */
9da1dd8f
DM
3421 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3422 goto default_action;
3423
02cd137d
KW
3424 /* check for embedded arrays
3425 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3426 */
1749ea0d 3427 else if (*s == '@' && s[1]) {
fac0f7a3
KW
3428 if (UTF
3429 ? isIDFIRST_utf8_safe(s+1, send)
3430 : isWORDCHAR_A(s[1]))
3431 {
1749ea0d 3432 break;
fac0f7a3 3433 }
1749ea0d
ST
3434 if (strchr(":'{$", s[1]))
3435 break;
3436 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3437 break; /* in regexp, neither @+ nor @- are interpolated */
3438 }
02cd137d
KW
3439 /* check for embedded scalars. only stop if we're sure it's a
3440 * variable. */
79072805 3441 else if (*s == '$') {
3280af22 3442 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3443 break;
77772344 3444 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3445 if (s[1] == '\\') {
3446 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3447 "Possible unintended interpolation of $\\ in regex");
77772344 3448 }
79072805 3449 break; /* in regexp, $ might be tail anchor */
77772344 3450 }
79072805 3451 }
02aa26ce 3452
2b9d42f0
NIS
3453 /* End of else if chain - OP_TRANS rejoin rest */
3454
8faf4f30
HS
3455 if (UNLIKELY(s >= send)) {
3456 assert(s == send);
3457 break;
3458 }
3459
02aa26ce 3460 /* backslashes */
79072805 3461 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3462 char* e; /* Can be used for ending '}', etc. */
3463
79072805 3464 s++;
02aa26ce 3465
7d0fc23c
KW
3466 /* warn on \1 - \9 in substitution replacements, but note that \11
3467 * is an octal; and \19 is \1 followed by '9' */
407f8cf2
KW
3468 if (PL_lex_inwhat == OP_SUBST
3469 && !PL_lex_inpat
3470 && isDIGIT(*s)
3471 && *s != '0'
3472 && !isDIGIT(s[1]))
79072805 3473 {
c782d7ee 3474 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3475 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3476 *--s = '$';
3477 break;
3478 }
02aa26ce
NT
3479
3480 /* string-change backslash escapes */
838f2281 3481 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3482 --s;
3483 break;
3484 }
ff3f963a
KW
3485 /* In a pattern, process \N, but skip any other backslash escapes.
3486 * This is because we don't want to translate an escape sequence
3487 * into a meta symbol and have the regex compiler use the meta
3488 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3489 * in spite of this, we do have to process \N here while the proper
3490 * charnames handler is in scope. See bugs #56444 and #62056.
85fba779 3491 *
ff3f963a
KW
3492 * There is a complication because \N in a pattern may also stand
3493 * for 'match a non-nl', and not mean a charname, in which case its
3494 * processing should be deferred to the regex compiler. To be a
3495 * charname it must be followed immediately by a '{', and not look
3496 * like \N followed by a curly quantifier, i.e., not something like
3497 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3498 * quantifier */
3499 else if (PL_lex_inpat
3500 && (*s != 'N'
3501 || s[1] != '{'
412f55bb 3502 || regcurly(s + 1)))
ff3f963a 3503 {
4d73d076 3504 *d++ = '\\';
cc74c5bd
ST
3505 goto default_action;
3506 }
02aa26ce 3507
79072805 3508 switch (*s) {
79072805 3509 default:
11b8faa4 3510 {
15861f94 3511 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3512 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3513 "Unrecognized escape \\%c passed through",
3514 *s);
11b8faa4 3515 /* default action is to copy the quoted character */
f9a63242 3516 goto default_action;
11b8faa4 3517 }
02aa26ce 3518
632403cc 3519 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3520 case '0': case '1': case '2': case '3':
3521 case '4': case '5': case '6': case '7':
ba210ebe 3522 {
5e0a247b 3523 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3524 STRLEN len = 3;
06972766 3525 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3526 s += len;
5e0a247b
KW
3527 if (len < 3 && s < send && isDIGIT(*s)
3528 && ckWARN(WARN_MISC))
3529 {
3530 Perl_warner(aTHX_ packWARN(WARN_MISC),
3531 "%s", form_short_octal_warning(s, len));
3532 }
ba210ebe 3533 }
012bcf8d 3534 goto NUM_ESCAPE_INSERT;
02aa26ce 3535
f0a2b745
KW
3536 /* eg. \o{24} indicates the octal constant \024 */
3537 case 'o':
3538 {
454155d9 3539 const char* error;
f0a2b745 3540</