This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
detect sub attributes following a signature
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f7e03a10 41#include "dquote_inline.h"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
8903cb82 97
9ded7720 98# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 99
a7aaec61
Z
100#define XENUMMASK 0x3f
101#define XFAKEEOF 0x40
102#define XFAKEBRACK 0x80
9059aa12 103
39e02b42 104#ifdef USE_UTF8_SCRIPTS
b3041197 105# define UTF cBOOL(!IN_BYTES)
2b9d42f0 106#else
b3041197 107# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 108#endif
a0ed51b3 109
b1fc3636
CJ
110/* The maximum number of characters preceding the unrecognized one to display */
111#define UNRECOGNIZED_PRECEDE_COUNT 10
112
61f0cdd9 113/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
114 * 1999-02-27 mjd-perl-patch@plover.com */
115#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
116
14bd96d0 117#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 118
9ff909cf
JH
119#define HEXFP_PEEK(s) \
120 (((s[0] == '.') && \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
123
ffb4593c
NT
124/* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
79072805 126 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
127 *
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
79072805
LW
130 */
131
fb73857a
PP
132/* #define LEX_NOTPARSING 11 is done in perl.h. */
133
b6007c36
DM
134#define LEX_NORMAL 10 /* normal code (ie not within "...") */
135#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138#define LEX_INTERPSTART 6 /* expecting the start of a $var */
139
140 /* at end of code, eg "$x" followed by: */
141#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
143
144#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146#define LEX_INTERPCONST 2 /* NOT USED */
147#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 148
79072805 149
bbf60fe6 150#ifdef DEBUGGING
27da23d5 151static const char* const lex_state_names[] = {
bbf60fe6
DM
152 "KNOWNEXT",
153 "FORMLINE",
154 "INTERPCONST",
155 "INTERPCONCAT",
156 "INTERPENDMAYBE",
157 "INTERPEND",
158 "INTERPSTART",
159 "INTERPPUSH",
160 "INTERPCASEMOD",
161 "INTERPNORMAL",
162 "NORMAL"
163};
164#endif
165
79072805 166#include "keywords.h"
fe14fcc3 167
ffb4593c
NT
168/* CLINE is a macro that ensures PL_copline has a sane value */
169
57843af0 170#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 171
ffb4593c
NT
172/*
173 * Convenience functions to return different tokens and prime the
9cbb5ea2 174 * lexer for the next token. They all take an argument.
ffb4593c
NT
175 *
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
89f35911 183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
7eb971ee 187 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 188 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
189 * BOop : bitwise or or xor
190 * BAop : bitwise and
8823cb89 191 * BCop : bitwise complement
ffb4593c
NT
192 * SHop : shift operator
193 * PWop : power operator
9cbb5ea2 194 * PMop : pattern-matching operator
ffb4593c 195 * Aop : addition-level operator
e4916dd1 196 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
e5edeb50 199 * Rop : relational operator <= != gt
ffb4593c
NT
200 *
201 * Also see LOP and lop() below.
202 */
203
998054bd 204#ifdef DEBUGGING /* Serve -DT. */
704d4215 205# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 206#else
bbf60fe6 207# define REPORT(retval) (retval)
998054bd
SC
208#endif
209
bbf60fe6
DM
210#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 212#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
213#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 217#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 218#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 219 pl_yylval.ival=f, \
a49203fd 220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 221 REPORT((int)LOOPEX))
6154021b
RGS
222#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 224#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 225#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
226#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
228#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
229 REPORT('~')
b1764551
FC
230#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 232#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 233#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 234#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 235#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
6154021b
RGS
236#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 238
a687059c
LW
239/* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
a687059c 243 */
d68ce4ac 244#define UNI3(f,x,have_x) { \
6154021b 245 pl_yylval.ival = f; \
d68ce4ac 246 if (have_x) PL_expect = x; \
376fcdbf
AL
247 PL_bufptr = s; \
248 PL_last_uni = PL_oldbufptr; \
0af40c75 249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
250 if (*s == '(') \
251 return REPORT( (int)FUNC1 ); \
294a536f 252 s = skipspace(s); \
376fcdbf
AL
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
254 }
d68ce4ac
FC
255#define UNI(f) UNI3(f,XTERM,1)
256#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
257#define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
259 OPERATOR(f); \
260 }
a687059c 261
d68ce4ac 262#define UNIBRACK(f) UNI3(f,0,0)
79072805 263
9f68db38 264/* grandfather return to old style */
78cdf107
Z
265#define OLDLOP(f) \
266 do { \
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
270 PL_expect = XTERM; \
271 PL_bufptr = s; \
272 return (int)LSTOP; \
273 } while(0)
79072805 274
83944c01
FC
275#define COPLINE_INC_WITH_HERELINES \
276 STMT_START { \
277 CopLINE_inc(PL_curcop); \
851b527a
FC
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
83944c01 281 } STMT_END
ffdb8b16
FC
282/* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284#define COPLINE_SET_FROM_MULTI_END \
285 STMT_START { \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
851b527a 288 PL_parser->herelines = 0; \
ffdb8b16 289 } STMT_END
83944c01
FC
290
291
8fa7f367
JH
292#ifdef DEBUGGING
293
6154021b 294/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
295enum token_type {
296 TOKENTYPE_NONE,
297 TOKENTYPE_IVAL,
6154021b 298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 299 TOKENTYPE_PVAL,
aeaef349 300 TOKENTYPE_OPVAL
bbf60fe6
DM
301};
302
6d4a66ac
NC
303static struct debug_tokens {
304 const int token;
305 enum token_type type;
306 const char *name;
307} const debug_tokens[] =
9041c2e3 308{
bbf60fe6
DM
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 313 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
314 { ARROW, TOKENTYPE_NONE, "ARROW" },
315 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
316 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
317 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
318 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
319 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 320 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
321 { DO, TOKENTYPE_NONE, "DO" },
322 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
323 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
324 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
325 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
326 { ELSE, TOKENTYPE_NONE, "ELSE" },
327 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
328 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
329 { FOR, TOKENTYPE_IVAL, "FOR" },
330 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
331 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
332 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
333 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
334 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 335 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
336 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
337 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
338 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 339 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
340 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
341 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 342 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
343 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
344 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
345 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
346 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
347 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
348 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
349 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
350 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
351 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
352 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
353 { OROP, TOKENTYPE_IVAL, "OROP" },
354 { OROR, TOKENTYPE_NONE, "OROR" },
355 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
356 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
357 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 358 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 359 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
360 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
361 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
362 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
363 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
364 { PREINC, TOKENTYPE_NONE, "PREINC" },
365 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 366 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
367 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
368 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 369 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 370 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 371 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6
DM
372 { SUB, TOKENTYPE_NONE, "SUB" },
373 { THING, TOKENTYPE_OPVAL, "THING" },
374 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
375 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
376 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
377 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
378 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
379 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 380 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 381 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 382 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 383 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 384 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
385};
386
6154021b 387/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 388
bbf60fe6 389STATIC int
704d4215 390S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 391{
7918f24d
NC
392 PERL_ARGS_ASSERT_TOKEREPORT;
393
bbf60fe6 394 if (DEBUG_T_TEST) {
bd61b366 395 const char *name = NULL;
bbf60fe6 396 enum token_type type = TOKENTYPE_NONE;
f54cb97a 397 const struct debug_tokens *p;
396482e1 398 SV* const report = newSVpvs("<== ");
bbf60fe6 399
f54cb97a 400 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
401 if (p->token == (int)rv) {
402 name = p->name;
403 type = p->type;
404 break;
405 }
406 }
407 if (name)
54667de8 408 Perl_sv_catpv(aTHX_ report, name);
239f83d5 409 else if (isGRAPH(rv))
4ebc7986 410 {
bbf60fe6 411 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
412 if ((char)rv == 'p')
413 sv_catpvs(report, " (pending identifier)");
414 }
bbf60fe6 415 else if (!rv)
396482e1 416 sv_catpvs(report, "EOF");
bbf60fe6 417 else
147e3846 418 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
419 switch (type) {
420 case TOKENTYPE_NONE:
bbf60fe6
DM
421 break;
422 case TOKENTYPE_IVAL:
147e3846 423 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
424 break;
425 case TOKENTYPE_OPNUM:
426 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 427 PL_op_name[lvalp->ival]);
bbf60fe6
DM
428 break;
429 case TOKENTYPE_PVAL:
704d4215 430 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
431 break;
432 case TOKENTYPE_OPVAL:
704d4215 433 if (lvalp->opval) {
401441c0 434 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
435 PL_op_name[lvalp->opval->op_type]);
436 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 437 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 438 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
439 }
440
441 }
401441c0 442 else
396482e1 443 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
444 break;
445 }
b6007c36 446 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
447 };
448 return (int)rv;
998054bd
SC
449}
450
b6007c36
DM
451
452/* print the buffer with suitable escapes */
453
454STATIC void
15f169a1 455S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 456{
396482e1 457 SV* const tmp = newSVpvs("");
7918f24d
NC
458
459 PERL_ARGS_ASSERT_PRINTBUF;
460
7347ee54 461 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 462 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 463 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
464 SvREFCNT_dec(tmp);
465}
466
8fa7f367
JH
467#endif
468
ffb4593c
NT
469/*
470 * S_ao
471 *
f393a21a
FC
472 * This subroutine looks for an '=' next to the operator that has just been
473 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
474 */
475
76e3520e 476STATIC int
cea2e8a9 477S_ao(pTHX_ int toketype)
a0d0e21e 478{
3280af22
NIS
479 if (*PL_bufptr == '=') {
480 PL_bufptr++;
a0d0e21e 481 if (toketype == ANDAND)
6154021b 482 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 483 else if (toketype == OROR)
6154021b 484 pl_yylval.ival = OP_ORASSIGN;
c963b151 485 else if (toketype == DORDOR)
6154021b 486 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
487 toketype = ASSIGNOP;
488 }
b1764551 489 return REPORT(toketype);
a0d0e21e
LW
490}
491
ffb4593c
NT
492/*
493 * S_no_op
494 * When Perl expects an operator and finds something else, no_op
495 * prints the warning. It always prints "<something> found where
496 * operator expected. It prints "Missing semicolon on previous line?"
497 * if the surprise occurs at the start of the line. "do you need to
498 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
499 * where the compiler doesn't know if foo is a method call or a function.
500 * It prints "Missing operator before end of line" if there's nothing
501 * after the missing operator, or "... before <...>" if there is something
502 * after the missing operator.
488bc579
FC
503 *
504 * PL_bufptr is expected to point to the start of the thing that was found,
505 * and s after the next token or partial token.
ffb4593c
NT
506 */
507
76e3520e 508STATIC void
15f169a1 509S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 510{
9d4ba2ae
AL
511 char * const oldbp = PL_bufptr;
512 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 513
7918f24d
NC
514 PERL_ARGS_ASSERT_NO_OP;
515
1189a94a
GS
516 if (!s)
517 s = oldbp;
07c798fb 518 else
1189a94a 519 PL_bufptr = s;
734ab321 520 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
521 if (ckWARN_d(WARN_SYNTAX)) {
522 if (is_first)
523 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
524 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
525 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
526 PL_bufend,
527 UTF))
528 {
f54cb97a 529 const char *t;
fac0f7a3
KW
530 for (t = PL_oldoldbufptr;
531 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
532 t += UTF ? UTF8SKIP(t) : 1)
533 {
c35e046a 534 NOOP;
fac0f7a3 535 }
56da5a46
RGS
536 if (t < PL_bufptr && isSPACE(*t))
537 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 538 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 539 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
540 }
541 else {
542 assert(s >= oldbp);
543 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 544 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 545 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 546 }
07c798fb 547 }
3280af22 548 PL_bufptr = oldbp;
8990e307
LW
549}
550
ffb4593c
NT
551/*
552 * S_missingterm
553 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 554 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
555 * If we're in a delimited string and the delimiter is a control
556 * character, it's reformatted into a two-char sequence like ^C.
557 * This is fatal.
558 */
559
76e3520e 560STATIC void
1b8d3e0e 561S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 562{
e487ff5e 563 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 564 char q;
cb650135
FC
565 bool uni = FALSE;
566 SV *sv;
8990e307 567 if (s) {
f88b23d7 568 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
569 if (nl) {
570 *nl = '\0';
571 len = nl - s;
572 }
7f7f7d08 573 uni = UTF;
8990e307 574 }
cb650135 575 else if (PL_multi_close < 32) {
8990e307 576 *tmpbuf = '^';
585ec06d 577 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
578 tmpbuf[2] = '\0';
579 s = tmpbuf;
1b8d3e0e 580 len = 2;
8990e307
LW
581 }
582 else {
cb650135
FC
583 if (LIKELY(PL_multi_close < 256)) {
584 *tmpbuf = (char)PL_multi_close;
585 tmpbuf[1] = '\0';
1b8d3e0e 586 len = 1;
cb650135
FC
587 }
588 else {
1b8d3e0e
LM
589 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
590 *end = '\0';
591 len = end - tmpbuf;
cb650135 592 uni = TRUE;
cb650135 593 }
8990e307
LW
594 s = tmpbuf;
595 }
9f8d27b7 596 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 597 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
598 if (uni)
599 SvUTF8_on(sv);
1b8d3e0e
LM
600 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
601 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 602}
79072805 603
dd0ac2b9
FC
604#include "feature.h"
605
0d863452 606/*
0d863452
RH
607 * Check whether the named feature is enabled.
608 */
26ea9e12 609bool
3fff3427 610Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 611{
4a731d7b 612 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
613
614 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
615
616 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 617
26ea9e12
NC
618 if (namelen > MAX_FEATURE_LEN)
619 return FALSE;
3fff3427 620 memcpy(&he_name[8], name, namelen);
7d69d4a6 621
c8ca97b0
NC
622 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
623 REFCOUNTED_HE_EXISTS));
0d863452
RH
624}
625
ffb4593c 626/*
9cbb5ea2
GS
627 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
628 * utf16-to-utf8-reversed.
ffb4593c
NT
629 */
630
c39cd008
GS
631#ifdef PERL_CR_FILTER
632static void
633strip_return(SV *sv)
634{
eb578fdb
KW
635 const char *s = SvPVX_const(sv);
636 const char * const e = s + SvCUR(sv);
7918f24d
NC
637
638 PERL_ARGS_ASSERT_STRIP_RETURN;
639
c39cd008
GS
640 /* outer loop optimized to do nothing if there are no CR-LFs */
641 while (s < e) {
642 if (*s++ == '\r' && *s == '\n') {
643 /* hit a CR-LF, need to copy the rest */
eb578fdb 644 char *d = s - 1;
c39cd008
GS
645 *d++ = *s++;
646 while (s < e) {
647 if (*s == '\r' && s[1] == '\n')
648 s++;
649 *d++ = *s++;
650 }
651 SvCUR(sv) -= s - d;
652 return;
653 }
654 }
655}
a868473f 656
76e3520e 657STATIC I32
c39cd008 658S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 659{
f54cb97a 660 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
661 if (count > 0 && !maxlen)
662 strip_return(sv);
663 return count;
a868473f
NIS
664}
665#endif
666
ffb4593c 667/*
8eaa0acf
Z
668=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
669
670Creates and initialises a new lexer/parser state object, supplying
671a context in which to lex and parse from a new source of Perl code.
672A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 673is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
674will be destroyed and the former value of L</PL_parser> will be restored.
675Nothing else need be done to clean up the parsing context.
676
2d7f6611 677The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 678non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
679A copy of the string is made, so subsequent modification of C<line>
680does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 681from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
682code in C<line> comes first and must consist of complete lines of input,
683and C<rsfp> supplies the remainder of the source.
8eaa0acf 684
2d7f6611 685The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 686used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
687
688=cut
689*/
ffb4593c 690
27fcb6ee 691/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
692 can share filters with the current parser.
693 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
694 caller, hence isn't owned by the parser, so shouldn't be closed on parser
695 destruction. This is used to handle the case of defaulting to reading the
696 script from the standard input because no filename was given on the command
697 line (without getting confused by situation where STDIN has been closed, so
698 the script handle is opened on fd 0) */
27fcb6ee 699
a0d0e21e 700void
8eaa0acf 701Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 702{
6ef55633 703 const char *s = NULL;
5486870f 704 yy_parser *parser, *oparser;
90b58c70 705
60d63348 706 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 707 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
708
709 /* create and initialise a parser */
710
199e78b7 711 Newxz(parser, 1, yy_parser);
5486870f 712 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
713 PL_parser = parser;
714
28ac2b49 715 parser->stack = NULL;
df13534a 716 parser->stack_max1 = NULL;
28ac2b49 717 parser->ps = NULL;
acdf0a21 718
e3abe207
DM
719 /* on scope exit, free this parser and restore any outer one */
720 SAVEPARSER(parser);
7c4baf47 721 parser->saved_curcop = PL_curcop;
e3abe207 722
acdf0a21 723 /* initialise lexer state */
8990e307 724
fb205e7a 725 parser->nexttoke = 0;
ca4cfd28 726 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 727 parser->copline = parser->preambling = NOLINE;
5afb0a62 728 parser->lex_state = LEX_NORMAL;
c2598295 729 parser->expect = XSTATE;
2f9285f8 730 parser->rsfp = rsfp;
efa571ab 731 parser->recheck_utf8_validity = FALSE;
27fcb6ee
FC
732 parser->rsfp_filters =
733 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
734 ? NULL
735 : MUTABLE_AV(SvREFCNT_inc(
736 oparser->rsfp_filters
737 ? oparser->rsfp_filters
738 : (oparser->rsfp_filters = newAV())
739 ));
2f9285f8 740
199e78b7
DM
741 Newx(parser->lex_brackstack, 120, char);
742 Newx(parser->lex_casestack, 12, char);
743 *parser->lex_casestack = '\0';
d794b522 744 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 745
10efb74f 746 if (line) {
0528fd32 747 STRLEN len;
a6909bd2
KW
748 const U8* first_bad_char_loc;
749
10efb74f 750 s = SvPV_const(line, len);
90b58c70 751
0aab20f2
KW
752 if ( SvUTF8(line)
753 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
754 SvCUR(line),
755 &first_bad_char_loc)))
90b58c70
KW
756 {
757 _force_out_malformed_utf8_message(first_bad_char_loc,
758 (U8 *) s + SvCUR(line),
759 0,
760 1 /* 1 means die */ );
761 NOT_REACHED; /* NOTREACHED */
762 }
763
0abcdfa4
FC
764 parser->linestr = flags & LEX_START_COPIED
765 ? SvREFCNT_inc_simple_NN(line)
766 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
767 if (!rsfp)
768 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 769 } else {
bf1b738b 770 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 771 }
218304f9 772
f06b5848
DM
773 parser->oldoldbufptr =
774 parser->oldbufptr =
775 parser->bufptr =
776 parser->linestart = SvPVX(parser->linestr);
777 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
778 parser->last_lop = parser->last_uni = NULL;
b54f893d 779
6d59e610 780 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
781 |LEX_DONT_CLOSE_RSFP));
782 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
783 |LEX_DONT_CLOSE_RSFP));
737c24fc 784
60d63348 785 parser->in_pod = parser->filtered = 0;
79072805 786}
a687059c 787
e3abe207
DM
788
789/* delete a parser object */
790
791void
792Perl_parser_free(pTHX_ const yy_parser *parser)
793{
7918f24d
NC
794 PERL_ARGS_ASSERT_PARSER_FREE;
795
7c4baf47 796 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
797 SvREFCNT_dec(parser->linestr);
798
87606032 799 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 800 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
801 else if (parser->rsfp && (!parser->old_parser
802 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 803 PerlIO_close(parser->rsfp);
5486870f 804 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 805 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 806 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
807
808 Safefree(parser->lex_brackstack);
809 Safefree(parser->lex_casestack);
810 Safefree(parser->lex_shared);
811 PL_parser = parser->old_parser;
812 Safefree(parser);
813}
814
815void
816Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
817{
3ac7ff8f 818 I32 nexttoke = parser->nexttoke;
3ac7ff8f 819 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 820 while (nexttoke--) {
3ac7ff8f
FC
821 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
822 && parser->nextval[nexttoke].opval
823 && parser->nextval[nexttoke].opval->op_slabbed
824 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 825 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
826 parser->nextval[nexttoke].opval = NULL;
827 }
3ce3dcd9 828 }
e3abe207
DM
829}
830
831
ffb4593c 832/*
f0e67a1d
Z
833=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
834
835Buffer scalar containing the chunk currently under consideration of the
836text currently being lexed. This is always a plain string scalar (for
837which C<SvPOK> is true). It is not intended to be used as a scalar by
838normal scalar means; instead refer to the buffer directly by the pointer
839variables described below.
840
841The lexer maintains various C<char*> pointers to things in the
842C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
843reallocated, all of these pointers must be updated. Don't attempt to
844do this manually, but rather use L</lex_grow_linestr> if you need to
845reallocate the buffer.
846
847The content of the text chunk in the buffer is commonly exactly one
848complete line of input, up to and including a newline terminator,
849but there are situations where it is otherwise. The octets of the
850buffer may be intended to be interpreted as either UTF-8 or Latin-1.
851The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
852flag on this scalar, which may disagree with it.
853
854For direct examination of the buffer, the variable
855L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
856lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
857of these pointers is usually preferable to examination of the scalar
858through normal scalar means.
859
860=for apidoc AmxU|char *|PL_parser-E<gt>bufend
861
862Direct pointer to the end of the chunk of text currently being lexed, the
863end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 864+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
865always located at the end of the buffer, and does not count as part of
866the buffer's contents.
867
868=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
869
870Points to the current position of lexing inside the lexer buffer.
871Characters around this point may be freely examined, within
872the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
873L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
874interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
875
876Lexing code (whether in the Perl core or not) moves this pointer past
877the characters that it consumes. It is also expected to perform some
878bookkeeping whenever a newline character is consumed. This movement
879can be more conveniently performed by the function L</lex_read_to>,
880which handles newlines appropriately.
881
882Interpretation of the buffer's octets can be abstracted out by
883using the slightly higher-level functions L</lex_peek_unichar> and
884L</lex_read_unichar>.
885
886=for apidoc AmxU|char *|PL_parser-E<gt>linestart
887
888Points to the start of the current line inside the lexer buffer.
889This is useful for indicating at which column an error occurred, and
890not much else. This must be updated by any lexing code that consumes
891a newline; the function L</lex_read_to> handles this detail.
892
893=cut
894*/
895
896/*
897=for apidoc Amx|bool|lex_bufutf8
898
899Indicates whether the octets in the lexer buffer
900(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
901of Unicode characters. If not, they should be interpreted as Latin-1
902characters. This is analogous to the C<SvUTF8> flag for scalars.
903
904In UTF-8 mode, it is not guaranteed that the lexer buffer actually
905contains valid UTF-8. Lexing code must be robust in the face of invalid
906encoding.
907
908The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
909is significant, but not the whole story regarding the input character
910encoding. Normally, when a file is being read, the scalar contains octets
911and its C<SvUTF8> flag is off, but the octets should be interpreted as
912UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
913however, the scalar may have the C<SvUTF8> flag on, and in this case its
914octets should be interpreted as UTF-8 unless the C<use bytes> pragma
915is in effect. This logic may change in the future; use this function
916instead of implementing the logic yourself.
917
918=cut
919*/
920
921bool
922Perl_lex_bufutf8(pTHX)
923{
924 return UTF;
925}
926
927/*
928=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
929
930Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 931at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
932pointer to the reallocated buffer. This is necessary before making
933any direct modification of the buffer that would increase its length.
934L</lex_stuff_pvn> provides a more convenient way to insert text into
935the buffer.
936
937Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
938this function updates all of the lexer's variables that point directly
939into the buffer.
940
941=cut
942*/
943
944char *
945Perl_lex_grow_linestr(pTHX_ STRLEN len)
946{
947 SV *linestr;
948 char *buf;
949 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 950 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
951 bool current;
952
f0e67a1d
Z
953 linestr = PL_parser->linestr;
954 buf = SvPVX(linestr);
955 if (len <= SvLEN(linestr))
956 return buf;
98d5e3ef
DM
957
958 /* Is the lex_shared linestr SV the same as the current linestr SV?
959 * Only in this case does re_eval_start need adjusting, since it
960 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
961 current = ( !PL_parser->lex_shared->ls_linestr
962 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 963
f0e67a1d
Z
964 bufend_pos = PL_parser->bufend - buf;
965 bufptr_pos = PL_parser->bufptr - buf;
966 oldbufptr_pos = PL_parser->oldbufptr - buf;
967 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
968 linestart_pos = PL_parser->linestart - buf;
969 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
970 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 971 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 972 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 973
f0e67a1d 974 buf = sv_grow(linestr, len);
c7641931 975
f0e67a1d
Z
976 PL_parser->bufend = buf + bufend_pos;
977 PL_parser->bufptr = buf + bufptr_pos;
978 PL_parser->oldbufptr = buf + oldbufptr_pos;
979 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
980 PL_parser->linestart = buf + linestart_pos;
981 if (PL_parser->last_uni)
982 PL_parser->last_uni = buf + last_uni_pos;
983 if (PL_parser->last_lop)
984 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 985 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 986 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
987 return buf;
988}
989
990/*
83aa740e 991=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
992
993Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
994immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
995reallocating the buffer if necessary. This means that lexing code that
996runs later will see the characters as if they had appeared in the input.
997It is not recommended to do this as part of normal parsing, and most
998uses of this facility run the risk of the inserted characters being
999interpreted in an unintended manner.
1000
2d7f6611
KW
1001The string to be inserted is represented by C<len> octets starting
1002at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1003according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1004The characters are recoded for the lexer buffer, according to how the
1005buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1006to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1007function is more convenient.
1008
1009=cut
1010*/
1011
1012void
83aa740e 1013Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 1014{
749123ff 1015 dVAR;
f0e67a1d
Z
1016 char *bufptr;
1017 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1018 if (flags & ~(LEX_STUFF_UTF8))
1019 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1020 if (UTF) {
1021 if (flags & LEX_STUFF_UTF8) {
1022 goto plain_copy;
1023 } else {
54d004e8 1024 STRLEN highhalf = 0; /* Count of variants */
83aa740e 1025 const char *p, *e = pv+len;
54d004e8
KW
1026 for (p = pv; p != e; p++) {
1027 if (! UTF8_IS_INVARIANT(*p)) {
1028 highhalf++;
1029 }
1030 }
f0e67a1d
Z
1031 if (!highhalf)
1032 goto plain_copy;
1033 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1034 bufptr = PL_parser->bufptr;
1035 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1036 SvCUR_set(PL_parser->linestr,
1037 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1038 PL_parser->bufend += len+highhalf;
1039 for (p = pv; p != e; p++) {
27d0a47c 1040 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1041 }
1042 }
1043 } else {
1044 if (flags & LEX_STUFF_UTF8) {
1045 STRLEN highhalf = 0;
83aa740e 1046 const char *p, *e = pv+len;
f0e67a1d
Z
1047 for (p = pv; p != e; p++) {
1048 U8 c = (U8)*p;
54d004e8 1049 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1050 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1051 "non-Latin-1 character into Latin-1 input");
54d004e8 1052 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1053 p++;
1054 highhalf++;
f88c6466 1055 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1056 }
1057 if (!highhalf)
1058 goto plain_copy;
1059 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1060 bufptr = PL_parser->bufptr;
1061 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1062 SvCUR_set(PL_parser->linestr,
1063 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1064 PL_parser->bufend += len-highhalf;
54d004e8
KW
1065 p = pv;
1066 while (p < e) {
1067 if (UTF8_IS_INVARIANT(*p)) {
1068 *bufptr++ = *p;
1069 p++;
f0e67a1d 1070 }
54d004e8
KW
1071 else {
1072 assert(p < e -1 );
a62b247b 1073 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1074 p += 2;
1075 }
f0e67a1d
Z
1076 }
1077 } else {
54d004e8 1078 plain_copy:
f0e67a1d
Z
1079 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1080 bufptr = PL_parser->bufptr;
1081 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1082 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1083 PL_parser->bufend += len;
1084 Copy(pv, bufptr, len, char);
1085 }
1086 }
1087}
1088
1089/*
9dcc53ea
Z
1090=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1091
1092Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1093immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1094reallocating the buffer if necessary. This means that lexing code that
1095runs later will see the characters as if they had appeared in the input.
1096It is not recommended to do this as part of normal parsing, and most
1097uses of this facility run the risk of the inserted characters being
1098interpreted in an unintended manner.
1099
2d7f6611 1100The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1101and continuing to the first nul. These octets are interpreted as either
1102UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1103in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1104to how the buffer is currently being interpreted (L</lex_bufutf8>).
1105If it is not convenient to nul-terminate a string to be inserted, the
1106L</lex_stuff_pvn> function is more appropriate.
1107
1108=cut
1109*/
1110
1111void
1112Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1113{
1114 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1115 lex_stuff_pvn(pv, strlen(pv), flags);
1116}
1117
1118/*
f0e67a1d
Z
1119=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1120
1121Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1122immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1123reallocating the buffer if necessary. This means that lexing code that
1124runs later will see the characters as if they had appeared in the input.
1125It is not recommended to do this as part of normal parsing, and most
1126uses of this facility run the risk of the inserted characters being
1127interpreted in an unintended manner.
1128
2d7f6611 1129The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1130are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1131being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1132not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1133need to construct a scalar.
1134
1135=cut
1136*/
1137
1138void
1139Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1140{
1141 char *pv;
1142 STRLEN len;
1143 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1144 if (flags)
1145 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1146 pv = SvPV(sv, len);
1147 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1148}
1149
1150/*
1151=for apidoc Amx|void|lex_unstuff|char *ptr
1152
1153Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1154C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1155This hides the discarded text from any lexing code that runs later,
1156as if the text had never appeared.
1157
1158This is not the normal way to consume lexed text. For that, use
1159L</lex_read_to>.
1160
1161=cut
1162*/
1163
1164void
1165Perl_lex_unstuff(pTHX_ char *ptr)
1166{
1167 char *buf, *bufend;
1168 STRLEN unstuff_len;
1169 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1170 buf = PL_parser->bufptr;
1171 if (ptr < buf)
1172 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1173 if (ptr == buf)
1174 return;
1175 bufend = PL_parser->bufend;
1176 if (ptr > bufend)
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1178 unstuff_len = ptr - buf;
1179 Move(ptr, buf, bufend+1-ptr, char);
1180 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1181 PL_parser->bufend = bufend - unstuff_len;
1182}
1183
1184/*
1185=for apidoc Amx|void|lex_read_to|char *ptr
1186
1187Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1188to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1189performing the correct bookkeeping whenever a newline character is passed.
1190This is the normal way to consume lexed text.
1191
1192Interpretation of the buffer's octets can be abstracted out by
1193using the slightly higher-level functions L</lex_peek_unichar> and
1194L</lex_read_unichar>.
1195
1196=cut
1197*/
1198
1199void
1200Perl_lex_read_to(pTHX_ char *ptr)
1201{
1202 char *s;
1203 PERL_ARGS_ASSERT_LEX_READ_TO;
1204 s = PL_parser->bufptr;
1205 if (ptr < s || ptr > PL_parser->bufend)
1206 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1207 for (; s != ptr; s++)
1208 if (*s == '\n') {
83944c01 1209 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1210 PL_parser->linestart = s+1;
1211 }
1212 PL_parser->bufptr = ptr;
1213}
1214
1215/*
1216=for apidoc Amx|void|lex_discard_to|char *ptr
1217
1218Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1219up to C<ptr>. The remaining content of the buffer will be moved, and
1220all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1221be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1222it is not permitted to discard text that has yet to be lexed.
1223
1224Normally it is not necessarily to do this directly, because it suffices to
1225use the implicit discarding behaviour of L</lex_next_chunk> and things
1226based on it. However, if a token stretches across multiple lines,
1f317c95 1227and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1228that purpose, then after completion of the token it would be wise to
1229explicitly discard the now-unneeded earlier lines, to avoid future
1230multi-line tokens growing the buffer without bound.
1231
1232=cut
1233*/
1234
1235void
1236Perl_lex_discard_to(pTHX_ char *ptr)
1237{
1238 char *buf;
1239 STRLEN discard_len;
1240 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1241 buf = SvPVX(PL_parser->linestr);
1242 if (ptr < buf)
1243 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1244 if (ptr == buf)
1245 return;
1246 if (ptr > PL_parser->bufptr)
1247 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1248 discard_len = ptr - buf;
1249 if (PL_parser->oldbufptr < ptr)
1250 PL_parser->oldbufptr = ptr;
1251 if (PL_parser->oldoldbufptr < ptr)
1252 PL_parser->oldoldbufptr = ptr;
1253 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1254 PL_parser->last_uni = NULL;
1255 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1256 PL_parser->last_lop = NULL;
1257 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1258 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1259 PL_parser->bufend -= discard_len;
1260 PL_parser->bufptr -= discard_len;
1261 PL_parser->oldbufptr -= discard_len;
1262 PL_parser->oldoldbufptr -= discard_len;
1263 if (PL_parser->last_uni)
1264 PL_parser->last_uni -= discard_len;
1265 if (PL_parser->last_lop)
1266 PL_parser->last_lop -= discard_len;
1267}
1268
efa571ab
KW
1269void
1270Perl_notify_parser_that_changed_to_utf8(pTHX)
1271{
1272 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1273 * off to on. At compile time, this has the effect of entering a 'use
1274 * utf8' section. This means that any input was not previously checked for
1275 * UTF-8 (because it was off), but now we do need to check it, or our
1276 * assumptions about the input being sane could be wrong, and we could
1277 * segfault. This routine just sets a flag so that the next time we look
1278 * at the input we do the well-formed UTF-8 check. If we aren't in the
1279 * proper phase, there may not be a parser object, but if there is, setting
1280 * the flag is harmless */
1281
1282 if (PL_parser) {
1283 PL_parser->recheck_utf8_validity = TRUE;
1284 }
1285}
1286
f0e67a1d
Z
1287/*
1288=for apidoc Amx|bool|lex_next_chunk|U32 flags
1289
1290Reads in the next chunk of text to be lexed, appending it to
1291L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1292looked to the end of the current chunk and wants to know more. It is
1293usual, but not necessary, for lexing to have consumed the entirety of
1294the current chunk at this time.
1295
1296If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1297chunk (i.e., the current chunk has been entirely consumed), normally the
1298current chunk will be discarded at the same time that the new chunk is
c5608a1f 1299read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1300will not be discarded. If the current chunk has not been entirely
1301consumed, then it will not be discarded regardless of the flag.
1302
1303Returns true if some new text was added to the buffer, or false if the
1304buffer has reached the end of the input text.
1305
1306=cut
1307*/
1308
1309#define LEX_FAKE_EOF 0x80000000
e47d32dc 1310#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1311
1312bool
1313Perl_lex_next_chunk(pTHX_ U32 flags)
1314{
1315 SV *linestr;
1316 char *buf;
1317 STRLEN old_bufend_pos, new_bufend_pos;
1318 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1319 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1320 bool got_some_for_debugger = 0;
f0e67a1d 1321 bool got_some;
6cdc5cd8 1322
112d1284 1323 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1324 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1325 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1326 return FALSE;
f0e67a1d
Z
1327 linestr = PL_parser->linestr;
1328 buf = SvPVX(linestr);
407f8cf2
KW
1329 if (!(flags & LEX_KEEP_PREVIOUS)
1330 && PL_parser->bufptr == PL_parser->bufend)
1331 {
f0e67a1d
Z
1332 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1333 linestart_pos = 0;
1334 if (PL_parser->last_uni != PL_parser->bufend)
1335 PL_parser->last_uni = NULL;
1336 if (PL_parser->last_lop != PL_parser->bufend)
1337 PL_parser->last_lop = NULL;
1338 last_uni_pos = last_lop_pos = 0;
1339 *buf = 0;
1340 SvCUR(linestr) = 0;
1341 } else {
1342 old_bufend_pos = PL_parser->bufend - buf;
1343 bufptr_pos = PL_parser->bufptr - buf;
1344 oldbufptr_pos = PL_parser->oldbufptr - buf;
1345 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1346 linestart_pos = PL_parser->linestart - buf;
1347 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1348 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1349 }
1350 if (flags & LEX_FAKE_EOF) {
1351 goto eof;
60d63348 1352 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1353 got_some = 0;
1354 } else if (filter_gets(linestr, old_bufend_pos)) {
1355 got_some = 1;
17cc9359 1356 got_some_for_debugger = 1;
112d1284
FC
1357 } else if (flags & LEX_NO_TERM) {
1358 got_some = 0;
f0e67a1d 1359 } else {
580561a3 1360 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1361 SvPVCLEAR(linestr);
f0e67a1d
Z
1362 eof:
1363 /* End of real input. Close filehandle (unless it was STDIN),
1364 * then add implicit termination.
1365 */
87606032 1366 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1367 PerlIO_clearerr(PL_parser->rsfp);
1368 else if (PL_parser->rsfp)
1369 (void)PerlIO_close(PL_parser->rsfp);
1370 PL_parser->rsfp = NULL;
60d63348 1371 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1372 if (!PL_in_eval && PL_minus_p) {
1373 sv_catpvs(linestr,
1374 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1375 PL_minus_n = PL_minus_p = 0;
1376 } else if (!PL_in_eval && PL_minus_n) {
1377 sv_catpvs(linestr, /*{*/";}");
1378 PL_minus_n = 0;
1379 } else
1380 sv_catpvs(linestr, ";");
1381 got_some = 1;
1382 }
1383 buf = SvPVX(linestr);
1384 new_bufend_pos = SvCUR(linestr);
1385 PL_parser->bufend = buf + new_bufend_pos;
1386 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1387
07337b95
KW
1388 if (UTF) {
1389 const U8* first_bad_char_loc;
1390 if (UNLIKELY(! is_utf8_string_loc(
1391 (U8 *) PL_parser->bufptr,
1392 PL_parser->bufend - PL_parser->bufptr,
1393 &first_bad_char_loc)))
1394 {
1395 _force_out_malformed_utf8_message(first_bad_char_loc,
1396 (U8 *) PL_parser->bufend,
1397 0,
1398 1 /* 1 means die */ );
1399 NOT_REACHED; /* NOTREACHED */
1400 }
6cdc5cd8
KW
1401 }
1402
f0e67a1d
Z
1403 PL_parser->oldbufptr = buf + oldbufptr_pos;
1404 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1405 PL_parser->linestart = buf + linestart_pos;
1406 if (PL_parser->last_uni)
1407 PL_parser->last_uni = buf + last_uni_pos;
1408 if (PL_parser->last_lop)
1409 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1410 if (PL_parser->preambling != NOLINE) {
1411 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1412 PL_parser->preambling = NOLINE;
1413 }
407f8cf2
KW
1414 if ( got_some_for_debugger
1415 && PERLDB_LINE_OR_SAVESRC
1416 && PL_curstash != PL_debstash)
1417 {
f0e67a1d
Z
1418 /* debugger active and we're not compiling the debugger code,
1419 * so store the line into the debugger's array of lines
1420 */
1421 update_debugger_info(NULL, buf+old_bufend_pos,
1422 new_bufend_pos-old_bufend_pos);
1423 }
1424 return got_some;
1425}
1426
1427/*
1428=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1429
1430Looks ahead one (Unicode) character in the text currently being lexed.
1431Returns the codepoint (unsigned integer value) of the next character,
1432or -1 if lexing has reached the end of the input text. To consume the
1433peeked character, use L</lex_read_unichar>.
1434
1435If the next character is in (or extends into) the next chunk of input
1436text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1437discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1438bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1439
1440If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1441is encountered, an exception is generated.
1442
1443=cut
1444*/
1445
1446I32
1447Perl_lex_peek_unichar(pTHX_ U32 flags)
1448{
749123ff 1449 dVAR;
f0e67a1d
Z
1450 char *s, *bufend;
1451 if (flags & ~(LEX_KEEP_PREVIOUS))
1452 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1453 s = PL_parser->bufptr;
1454 bufend = PL_parser->bufend;
1455 if (UTF) {
1456 U8 head;
1457 I32 unichar;
1458 STRLEN len, retlen;
1459 if (s == bufend) {
1460 if (!lex_next_chunk(flags))
1461 return -1;
1462 s = PL_parser->bufptr;
1463 bufend = PL_parser->bufend;
1464 }
1465 head = (U8)*s;
54d004e8 1466 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1467 return head;
54d004e8
KW
1468 if (UTF8_IS_START(head)) {
1469 len = UTF8SKIP(&head);
f0e67a1d
Z
1470 while ((STRLEN)(bufend-s) < len) {
1471 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1472 break;
1473 s = PL_parser->bufptr;
1474 bufend = PL_parser->bufend;
1475 }
1476 }
c80e42f3 1477 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1478 if (retlen == (STRLEN)-1) {
75219bac
KW
1479 _force_out_malformed_utf8_message((U8 *) s,
1480 (U8 *) bufend,
1481 0,
1482 1 /* 1 means die */ );
1483 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1484 }
1485 return unichar;
1486 } else {
1487 if (s == bufend) {
1488 if (!lex_next_chunk(flags))
1489 return -1;
1490 s = PL_parser->bufptr;
1491 }
1492 return (U8)*s;
1493 }
1494}
1495
1496/*
1497=for apidoc Amx|I32|lex_read_unichar|U32 flags
1498
1499Reads the next (Unicode) character in the text currently being lexed.
1500Returns the codepoint (unsigned integer value) of the character read,
1501and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1502if lexing has reached the end of the input text. To non-destructively
1503examine the next character, use L</lex_peek_unichar> instead.
1504
1505If the next character is in (or extends into) the next chunk of input
1506text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1507discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1508bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1509
1510If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1511is encountered, an exception is generated.
1512
1513=cut
1514*/
1515
1516I32
1517Perl_lex_read_unichar(pTHX_ U32 flags)
1518{
1519 I32 c;
1520 if (flags & ~(LEX_KEEP_PREVIOUS))
1521 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1522 c = lex_peek_unichar(flags);
1523 if (c != -1) {
1524 if (c == '\n')
83944c01 1525 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1526 if (UTF)
1527 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1528 else
1529 ++(PL_parser->bufptr);
f0e67a1d
Z
1530 }
1531 return c;
1532}
1533
1534/*
1535=for apidoc Amx|void|lex_read_space|U32 flags
1536
1537Reads optional spaces, in Perl style, in the text currently being
1538lexed. The spaces may include ordinary whitespace characters and
1539Perl-style comments. C<#line> directives are processed if encountered.
1540L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1541at a non-space character (or the end of the input text).
1542
1543If spaces extend into the next chunk of input text, the next chunk will
1544be read in. Normally the current chunk will be discarded at the same
c5608a1f 1545time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1546chunk will not be discarded.
1547
1548=cut
1549*/
1550
21791330 1551#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1552#define LEX_NO_NEXT_CHUNK 0x80000000
1553
f0e67a1d
Z
1554void
1555Perl_lex_read_space(pTHX_ U32 flags)
1556{
1557 char *s, *bufend;
21791330 1558 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1559 bool need_incline = 0;
21791330 1560 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1561 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1562 s = PL_parser->bufptr;
1563 bufend = PL_parser->bufend;
1564 while (1) {
1565 char c = *s;
1566 if (c == '#') {
1567 do {
1568 c = *++s;
1569 } while (!(c == '\n' || (c == 0 && s == bufend)));
1570 } else if (c == '\n') {
1571 s++;
21791330
FC
1572 if (can_incline) {
1573 PL_parser->linestart = s;
1574 if (s == bufend)
1575 need_incline = 1;
1576 else
c6875f94 1577 incline(s, bufend);
21791330 1578 }
f0e67a1d
Z
1579 } else if (isSPACE(c)) {
1580 s++;
1581 } else if (c == 0 && s == bufend) {
1582 bool got_more;
65c68e17 1583 line_t l;
f0998909
Z
1584 if (flags & LEX_NO_NEXT_CHUNK)
1585 break;
f0e67a1d 1586 PL_parser->bufptr = s;
65c68e17 1587 l = CopLINE(PL_curcop);
851b527a 1588 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1589 got_more = lex_next_chunk(flags);
65c68e17 1590 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1591 s = PL_parser->bufptr;
1592 bufend = PL_parser->bufend;
1593 if (!got_more)
1594 break;
21791330 1595 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1596 incline(s, bufend);
f0e67a1d
Z
1597 need_incline = 0;
1598 }
3c47da3c
FC
1599 } else if (!c) {
1600 s++;
f0e67a1d
Z
1601 } else {
1602 break;
1603 }
1604 }
f0e67a1d
Z
1605 PL_parser->bufptr = s;
1606}
1607
1608/*
fe788d6b
PM
1609
1610=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1611
1612This function performs syntax checking on a prototype, C<proto>.
1613If C<warn> is true, any illegal characters or mismatched brackets
1614will trigger illegalproto warnings, declaring that they were
1615detected in the prototype for C<name>.
1616
1617The return value is C<true> if this is a valid prototype, and
1618C<false> if it is not, regardless of whether C<warn> was C<true> or
1619C<false>.
1620
1621Note that C<NULL> is a valid C<proto> and will always return C<true>.
1622
1623=cut
1624
1625 */
1626
1627bool
5783dc51 1628Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1629{
1630 STRLEN len, origlen;
11327fa1 1631 char *p;
fe788d6b
PM
1632 bool bad_proto = FALSE;
1633 bool in_brackets = FALSE;
1634 bool after_slash = FALSE;
1635 char greedy_proto = ' ';
1636 bool proto_after_greedy_proto = FALSE;
1637 bool must_be_last = FALSE;
1638 bool underscore = FALSE;
f791a21a 1639 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1640
1641 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1642
1643 if (!proto)
1644 return TRUE;
1645
11327fa1 1646 p = SvPV(proto, len);
fe788d6b
PM
1647 origlen = len;
1648 for (; len--; p++) {
1649 if (!isSPACE(*p)) {
1650 if (must_be_last)
1651 proto_after_greedy_proto = TRUE;
f791a21a
PM
1652 if (underscore) {
1653 if (!strchr(";@%", *p))
1654 bad_proto_after_underscore = TRUE;
1655 underscore = FALSE;
1656 }
fe788d6b
PM
1657 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1658 bad_proto = TRUE;
1659 }
1660 else {
fe788d6b
PM
1661 if (*p == '[')
1662 in_brackets = TRUE;
1663 else if (*p == ']')
1664 in_brackets = FALSE;
407f8cf2
KW
1665 else if ((*p == '@' || *p == '%')
1666 && !after_slash
1667 && !in_brackets )
1668 {
fe788d6b
PM
1669 must_be_last = TRUE;
1670 greedy_proto = *p;
1671 }
1672 else if (*p == '_')
f791a21a 1673 underscore = TRUE;
fe788d6b
PM
1674 }
1675 if (*p == '\\')
1676 after_slash = TRUE;
1677 else
1678 after_slash = FALSE;
1679 }
1680 }
1681
1682 if (warn) {
b54d603d 1683 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1684 p -= origlen;
b54d603d
PM
1685 p = SvUTF8(proto)
1686 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1687 origlen, UNI_DISPLAY_ISPRINT)
1688 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1689
cbf83791
FC
1690 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1691 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1692 sv_catpvs(name2, "::");
1693 sv_catsv(name2, (SV *)name);
1694 name = name2;
1695 }
1696
fe788d6b
PM
1697 if (proto_after_greedy_proto)
1698 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1699 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1700 greedy_proto, SVfARG(name), p);
50278ed0
PM
1701 if (in_brackets)
1702 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1703 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1704 SVfARG(name), p);
b54d603d 1705 if (bad_proto)
fe788d6b 1706 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1707 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1708 SVfARG(name), p);
1709 if (bad_proto_after_underscore)
1710 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1711 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1712 SVfARG(name), p);
fe788d6b
PM
1713 }
1714
1715 return (! (proto_after_greedy_proto || bad_proto) );
1716}
1717
1718/*
ffb4593c
NT
1719 * S_incline
1720 * This subroutine has nothing to do with tilting, whether at windmills
1721 * or pinball tables. Its name is short for "increment line". It
57843af0 1722 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1723 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1724 * # line 500 "foo.pm"
1725 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1726 */
1727
76e3520e 1728STATIC void
c6875f94 1729S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1730{
d9095cec
NC
1731 const char *t;
1732 const char *n;
1733 const char *e;
8818d409 1734 line_t line_num;
22ff3130 1735 UV uv;
463ee0b2 1736
7918f24d
NC
1737 PERL_ARGS_ASSERT_INCLINE;
1738
d77eff5d
KW
1739 assert(end >= s);
1740
83944c01 1741 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1742 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1743 && s+1 == PL_bufend && *s == ';') {
1744 /* fake newline in string eval */
1745 CopLINE_dec(PL_curcop);
1746 return;
1747 }
463ee0b2
LW
1748 if (*s++ != '#')
1749 return;
d4c19fe8
AL
1750 while (SPACE_OR_TAB(*s))
1751 s++;
d77eff5d
KW
1752 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1753 s += sizeof("line") - 1;
73659bf1
GS
1754 else
1755 return;
084592ab 1756 if (SPACE_OR_TAB(*s))
73659bf1 1757 s++;
4e553d73 1758 else
73659bf1 1759 return;
d4c19fe8
AL
1760 while (SPACE_OR_TAB(*s))
1761 s++;
463ee0b2
LW
1762 if (!isDIGIT(*s))
1763 return;
d4c19fe8 1764
463ee0b2
LW
1765 n = s;
1766 while (isDIGIT(*s))
1767 s++;
07714eb4 1768 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1769 return;
bf4acbe4 1770 while (SPACE_OR_TAB(*s))
463ee0b2 1771 s++;
85bb8b90 1772 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1773 s++;
73659bf1
GS
1774 e = t + 1;
1775 }
463ee0b2 1776 else {
c35e046a 1777 t = s;
1bb1a3d6 1778 while (*t && !isSPACE(*t))
c35e046a 1779 t++;
73659bf1 1780 e = t;
463ee0b2 1781 }
bf4acbe4 1782 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1783 e++;
1784 if (*e != '\n' && *e != '\0')
1785 return; /* false alarm */
1786
22ff3130
HS
1787 if (!grok_atoUV(n, &uv, &e))
1788 return;
1789 line_num = ((line_t)uv) - 1;
8818d409 1790
f4dd75d9 1791 if (t - s > 0) {
d9095cec 1792 const STRLEN len = t - s;
3df32bda 1793
d36ee5be 1794 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1795 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1796 * to *{"::_<newfilename"} */
44867030
NC
1797 /* However, the long form of evals is only turned on by the
1798 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1799 GV * const cfgv = CopFILEGV(PL_curcop);
1800 if (cfgv) {
38bd7ad8
FC
1801 char smallbuf[128];
1802 STRLEN tmplen2 = len;
44867030 1803 char *tmpbuf2;
449dd039 1804 GV *gv2;
44867030
NC
1805
1806 if (tmplen2 + 2 <= sizeof smallbuf)
1807 tmpbuf2 = smallbuf;
1808 else
1809 Newx(tmpbuf2, tmplen2 + 2, char);
1810
38bd7ad8
FC
1811 tmpbuf2[0] = '_';
1812 tmpbuf2[1] = '<';
44867030
NC
1813
1814 memcpy(tmpbuf2 + 2, s, tmplen2);
1815 tmplen2 += 2;
1816
8a5ee598 1817 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1818 if (!isGV(gv2)) {
8a5ee598 1819 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1820 /* adjust ${"::_<newfilename"} to store the new file name */
1821 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1822 /* The line number may differ. If that is the case,
1823 alias the saved lines that are in the array.
1824 Otherwise alias the whole array. */
1825 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1826 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1827 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1828 }
38bd7ad8
FC
1829 else if (GvAV(cfgv)) {
1830 AV * const av = GvAV(cfgv);
8818d409
FC
1831 const I32 start = CopLINE(PL_curcop)+1;
1832 I32 items = AvFILLp(av) - start;
1833 if (items > 0) {
1834 AV * const av2 = GvAVn(gv2);
1835 SV **svp = AvARRAY(av) + start;
1836 I32 l = (I32)line_num+1;
1837 while (items--)
1838 av_store(av2, l++, SvREFCNT_inc(*svp++));
1839 }
1840 }
e5527e4b 1841 }
44867030
NC
1842
1843 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1844 }
e66cf94c 1845 }
05ec9bb3 1846 CopFILE_free(PL_curcop);
449dd039 1847 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1848 }
8818d409 1849 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1850}
1851
80a702cd 1852STATIC void
15f169a1 1853S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1854{
1855 AV *av = CopFILEAVx(PL_curcop);
1856 if (av) {
7f1c3e8c
FC
1857 SV * sv;
1858 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1859 else {
1860 sv = *av_fetch(av, 0, 1);
1861 SvUPGRADE(sv, SVt_PVMG);
1862 }
847cc851 1863 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1864 if (orig_sv)
7f1c3e8c 1865 sv_catsv(sv, orig_sv);
5fa550fb 1866 else
7f1c3e8c
FC
1867 sv_catpvn(sv, buf, len);
1868 if (!SvIOK(sv)) {
1869 (void)SvIOK_on(sv);
1870 SvIV_set(sv, 0);
1871 }
1872 if (PL_parser->preambling == NOLINE)
1873 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1874 }
1875}
1876
ffb4593c 1877/*
8c6b0c7d 1878 * skipspace
ffb4593c
NT
1879 * Called to gobble the appropriate amount and type of whitespace.
1880 * Skips comments as well.
71fff7cb 1881 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1882 *
1883 * peekspace
1884 * Same thing, but look ahead without incrementing line numbers or
1885 * adjusting PL_linestart.
ffb4593c
NT
1886 */
1887
3218e223 1888#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1889#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1890
76e3520e 1891STATIC char *
21791330 1892S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1893{
21791330 1894 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1895 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1896 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1897 s++;
f0e67a1d
Z
1898 } else {
1899 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1900 PL_bufptr = s;
21791330 1901 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1902 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1903 LEX_NO_NEXT_CHUNK : 0));
3280af22 1904 s = PL_bufptr;
f0e67a1d
Z
1905 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1906 if (PL_linestart > PL_bufptr)
1907 PL_bufptr = PL_linestart;
1908 return s;
463ee0b2 1909 }
5db06880 1910 return s;
a687059c 1911}
378cc40b 1912
ffb4593c
NT
1913/*
1914 * S_check_uni
1915 * Check the unary operators to ensure there's no ambiguity in how they're
1916 * used. An ambiguous piece of code would be:
1917 * rand + 5
1918 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1919 * the +5 is its argument.
1920 */
1921
76e3520e 1922STATIC void
cea2e8a9 1923S_check_uni(pTHX)
ba106d47 1924{
d4c19fe8 1925 const char *s;
2f3197b3 1926
3280af22 1927 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1928 return;
3280af22
NIS
1929 while (isSPACE(*PL_last_uni))
1930 PL_last_uni++;
c35e046a 1931 s = PL_last_uni;
fac0f7a3 1932 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1933 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1934 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1935 return;
6136c704 1936
9b387841 1937 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1938 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1939 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1940}
1941
ffb4593c
NT
1942/*
1943 * LOP : macro to build a list operator. Its behaviour has been replaced
1944 * with a subroutine, S_lop() for which LOP is just another name.
1945 */
1946
a0d0e21e
LW
1947#define LOP(f,x) return lop(f,x,s)
1948
ffb4593c
NT
1949/*
1950 * S_lop
1951 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1952 * - if we have a next token, then it's a list operator (no parens) for
1953 * which the next token has already been parsed; e.g.,
1954 * sort foo @args
1955 * sort foo (@args)
ffb4593c
NT
1956 * - if the next thing is an opening paren, then it's a function
1957 * - else it's a list operator
1958 */
1959
76e3520e 1960STATIC I32
11288bb3 1961S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1962{
7918f24d
NC
1963 PERL_ARGS_ASSERT_LOP;
1964
6154021b 1965 pl_yylval.ival = f;
35c8bce7 1966 CLINE;
3280af22
NIS
1967 PL_bufptr = s;
1968 PL_last_lop = PL_oldbufptr;
eb160463 1969 PL_last_lop_op = (OPCODE)f;
3280af22 1970 if (PL_nexttoke)
78cdf107 1971 goto lstop;
19f1898a 1972 PL_expect = x;
79072805 1973 if (*s == '(')
bbf60fe6 1974 return REPORT(FUNC);
294a536f 1975 s = skipspace(s);
79072805 1976 if (*s == '(')
bbf60fe6 1977 return REPORT(FUNC);
78cdf107
Z
1978 else {
1979 lstop:
1980 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1981 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1982 return REPORT(LSTOP);
78cdf107 1983 }
79072805
LW
1984}
1985
ffb4593c
NT
1986/*
1987 * S_force_next
9cbb5ea2 1988 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1989 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1990 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1991 * will need to set PL_nextval[] and possibly PL_expect to ensure
1992 * the lexer handles the token correctly.
ffb4593c
NT
1993 */
1994
4e553d73 1995STATIC void
cea2e8a9 1996S_force_next(pTHX_ I32 type)
79072805 1997{
704d4215
GG
1998#ifdef DEBUGGING
1999 if (DEBUG_T_TEST) {
2000 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2001 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2002 }
2003#endif
1f7c3e7c 2004 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2005 PL_nexttype[PL_nexttoke] = type;
2006 PL_nexttoke++;
79072805
LW
2007}
2008
89f35911
FC
2009/*
2010 * S_postderef
2011 *
2012 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2013 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2014 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2015 * only the first, leaving yylex to find the next.
89f35911
FC
2016 */
2017
2018static int
ff25e5db 2019S_postderef(pTHX_ int const funny, char const next)
89f35911 2020{
ff25e5db 2021 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
2022 if (next == '*') {
2023 PL_expect = XOPERATOR;
cc624add 2024 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 2025 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 2026 PL_lex_state = LEX_INTERPEND;
c3492809
FC
2027 if ('@' == funny)
2028 force_next(POSTJOIN);
cc624add 2029 }
89f35911
FC
2030 force_next(next);
2031 PL_bufptr+=2;
2032 }
2033 else {
760ca746
FC
2034 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2035 && !PL_lex_brackets)
2036 PL_lex_dojoin = 2;
89f35911
FC
2037 PL_expect = XOPERATOR;
2038 PL_bufptr++;
2039 }
2040 return funny;
2041}
2042
28ac2b49
Z
2043void
2044Perl_yyunlex(pTHX)
2045{
a7aaec61
Z
2046 int yyc = PL_parser->yychar;
2047 if (yyc != YYEMPTY) {
2048 if (yyc) {
a7aaec61
Z
2049 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2050 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2051 PL_lex_allbrackets--;
a7aaec61 2052 PL_lex_brackets--;
78cdf107
Z
2053 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2054 } else if (yyc == '('/*)*/) {
2055 PL_lex_allbrackets--;
2056 yyc |= (2<<24);
a7aaec61
Z
2057 }
2058 force_next(yyc);
2059 }
28ac2b49
Z
2060 PL_parser->yychar = YYEMPTY;
2061 }
2062}
2063
d0a148a6 2064STATIC SV *
15f169a1 2065S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2066{
740cce10 2067 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2068 ! IN_BYTES
2069 && UTF
2070 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2071 return sv;
2072}
2073
ffb4593c
NT
2074/*
2075 * S_force_word
2076 * When the lexer knows the next thing is a word (for instance, it has
2077 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2078 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2079 * lookahead.
ffb4593c
NT
2080 *
2081 * Arguments:
b1b65b59 2082 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2083 * int token : PL_next* will be this type of bare word
2084 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2085 * int check_keyword : if true, Perl checks to make sure the word isn't
2086 * a keyword (do this if the word is a label, e.g. goto FOO)
2087 * int allow_pack : if true, : characters will also be allowed (require,
2088 * use, etc. do this)
ffb4593c
NT
2089 */
2090
76e3520e 2091STATIC char *
345b3785 2092S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2093{
eb578fdb 2094 char *s;
463ee0b2 2095 STRLEN len;
4e553d73 2096
7918f24d
NC
2097 PERL_ARGS_ASSERT_FORCE_WORD;
2098
294a536f 2099 start = skipspace(start);
463ee0b2 2100 s = start;
fac0f7a3 2101 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2102 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2103 {
3280af22 2104 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2105 if (check_keyword) {
2106 char *s2 = PL_tokenbuf;
487e470d 2107 STRLEN len2 = len;
de627158
KW
2108 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2109 s2 += sizeof("CORE::") - 1;
2110 len2 -= sizeof("CORE::") - 1;
2111 }
487e470d 2112 if (keyword(s2, len2, 0))
463ee0b2 2113 return start;
01b5ef50 2114 }
463ee0b2 2115 if (token == METHOD) {
294a536f 2116 s = skipspace(s);
463ee0b2 2117 if (*s == '(')
3280af22 2118 PL_expect = XTERM;
463ee0b2 2119 else {
3280af22 2120 PL_expect = XOPERATOR;
463ee0b2 2121 }
79072805 2122 }
9ded7720 2123 NEXTVAL_NEXTTOKE.opval
275103cd 2124 = newSVOP(OP_CONST,0,
d0a148a6 2125 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2126 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2127 force_next(token);
2128 }
2129 return s;
2130}
2131
ffb4593c
NT
2132/*
2133 * S_force_ident
9cbb5ea2 2134 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2135 * text only contains the "foo" portion. The first argument is a pointer
2136 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2137 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2138 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2139 */
2140
76e3520e 2141STATIC void
5aaab254 2142S_force_ident(pTHX_ const char *s, int kind)
79072805 2143{
7918f24d
NC
2144 PERL_ARGS_ASSERT_FORCE_IDENT;
2145
c9b48522
DD
2146 if (s[0]) {
2147 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2148 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2149 UTF ? SVf_UTF8 : 0));
9ded7720 2150 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2151 force_next(BAREWORD);
748a9306 2152 if (kind) {
11343788 2153 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2154 /* XXX see note in pp_entereval() for why we forgo typo
2155 warnings if the symbol must be introduced in an eval.
2156 GSAR 96-10-12 */
90e5519e 2157 gv_fetchpvn_flags(s, len,
4bff32c5 2158 (PL_in_eval ? GV_ADDMULTI
728847b1 2159 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2160 kind == '$' ? SVt_PV :
2161 kind == '@' ? SVt_PVAV :
2162 kind == '%' ? SVt_PVHV :
a0d0e21e 2163 SVt_PVGV
90e5519e 2164 );
748a9306 2165 }
79072805
LW
2166 }
2167}
2168
3f33d153
FC
2169static void
2170S_force_ident_maybe_lex(pTHX_ char pit)
2171{
3f33d153
FC
2172 NEXTVAL_NEXTTOKE.ival = pit;
2173 force_next('p');
2174}
2175
1571675a
GS
2176NV
2177Perl_str_to_version(pTHX_ SV *sv)
2178{
2179 NV retval = 0.0;
2180 NV nshift = 1.0;
2181 STRLEN len;
cfd0369c 2182 const char *start = SvPV_const(sv,len);
9d4ba2ae 2183 const char * const end = start + len;
8298454c 2184 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2185
2186 PERL_ARGS_ASSERT_STR_TO_VERSION;
2187
1571675a 2188 while (start < end) {
ba210ebe 2189 STRLEN skip;
1571675a
GS
2190 UV n;
2191 if (utf)
9041c2e3 2192 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2193 else {
2194 n = *(U8*)start;
2195 skip = 1;
2196 }
2197 retval += ((NV)n)/nshift;
2198 start += skip;
2199 nshift *= 1000;
2200 }
2201 return retval;
2202}
2203
4e553d73 2204/*
ffb4593c
NT
2205 * S_force_version
2206 * Forces the next token to be a version number.
e759cc13
RGS
2207 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2208 * and if "guessing" is TRUE, then no new token is created (and the caller
2209 * must use an alternative parsing method).
ffb4593c
NT
2210 */
2211
76e3520e 2212STATIC char *
e759cc13 2213S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2214{
5f66b61c 2215 OP *version = NULL;
44dcb63b 2216 char *d;
89bfa8cd 2217
7918f24d
NC
2218 PERL_ARGS_ASSERT_FORCE_VERSION;
2219
294a536f 2220 s = skipspace(s);
89bfa8cd 2221
44dcb63b 2222 d = s;
dd629d5b 2223 if (*d == 'v')
44dcb63b 2224 d++;
44dcb63b 2225 if (isDIGIT(*d)) {
e759cc13
RGS
2226 while (isDIGIT(*d) || *d == '_' || *d == '.')
2227 d++;
4e4da3ac 2228 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2229 SV *ver;
6154021b
RGS
2230 s = scan_num(s, &pl_yylval);
2231 version = pl_yylval.opval;
dd629d5b
GS
2232 ver = cSVOPx(version)->op_sv;
2233 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2234 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2235 SvNV_set(ver, str_to_version(ver));
1571675a 2236 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2237 }
89bfa8cd 2238 }
5db06880 2239 else if (guessing) {
e759cc13 2240 return s;
5db06880 2241 }
89bfa8cd
PP
2242 }
2243
2244 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2245 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2246 force_next(BAREWORD);
89bfa8cd 2247
e759cc13 2248 return s;
89bfa8cd
PP
2249}
2250
ffb4593c 2251/*
91152fc1
DG
2252 * S_force_strict_version
2253 * Forces the next token to be a version number using strict syntax rules.
2254 */
2255
2256STATIC char *
2257S_force_strict_version(pTHX_ char *s)
2258{
91152fc1 2259 OP *version = NULL;
91152fc1
DG
2260 const char *errstr = NULL;
2261
2262 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2263
2264 while (isSPACE(*s)) /* leading whitespace */
2265 s++;
2266
2267 if (is_STRICT_VERSION(s,&errstr)) {
2268 SV *ver = newSV(0);
2269 s = (char *)scan_version(s, ver, 0);
2270 version = newSVOP(OP_CONST, 0, ver);
2271 }
407f8cf2
KW
2272 else if ((*s != ';' && *s != '{' && *s != '}' )
2273 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2274 {
91152fc1
DG
2275 PL_bufptr = s;
2276 if (errstr)
2277 yyerror(errstr); /* version required */
2278 return s;
2279 }
2280
91152fc1 2281 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2282 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2283 force_next(BAREWORD);
91152fc1
DG
2284
2285 return s;
2286}
2287
2288/*
ffb4593c 2289 * S_tokeq
ef3ff34d
KW
2290 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2291 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2292 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2293 */
2294
76e3520e 2295STATIC SV *
cea2e8a9 2296S_tokeq(pTHX_ SV *sv)
79072805 2297{
eb578fdb
KW
2298 char *s;
2299 char *send;
2300 char *d;
b3ac6de7 2301 SV *pv = sv;
79072805 2302
7918f24d
NC
2303 PERL_ARGS_ASSERT_TOKEQ;
2304
279b35ad
FC
2305 assert (SvPOK(sv));
2306 assert (SvLEN(sv));
2307 assert (!SvIsCOW(sv));
307ed071 2308 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2309 goto finish;
279b35ad
FC
2310 s = SvPVX(sv);
2311 send = SvEND(sv);
dcb21ed6
NC
2312 /* This is relying on the SV being "well formed" with a trailing '\0' */
2313 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2314 s++;
2315 if (s == send)
b3ac6de7 2316 goto finish;
79072805 2317 d = s;
be4731d2 2318 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2319 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2320 SVs_TEMP | SvUTF8(sv));
be4731d2 2321 }
79072805
LW
2322 while (s < send) {
2323 if (*s == '\\') {
a0d0e21e 2324 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2325 s++; /* all that, just for this */
2326 }
2327 *d++ = *s++;
2328 }
2329 *d = '\0';
95a20fc0 2330 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2331 finish:
3280af22 2332 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2333 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2334 return sv;
2335}
2336
ffb4593c
NT
2337/*
2338 * Now come three functions related to double-quote context,
2339 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2340 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2341 * interact with PL_lex_state, and create fake ( ... ) argument lists
2342 * to handle functions and concatenation.
ecd24171
DM
2343 * For example,
2344 * "foo\lbar"
2345 * is tokenised as
2346 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2347 */
2348
2349/*
2350 * S_sublex_start
6154021b 2351 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2352 *
2353 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2354 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2355 *
16d1d8bd 2356 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2357 *
2358 * Everything else becomes a FUNC.
2359 *
16d1d8bd
LM
2360 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2361 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2362 * call to S_sublex_push().
2363 */
2364
76e3520e 2365STATIC I32
cea2e8a9 2366S_sublex_start(pTHX)
79072805 2367{
eb578fdb 2368 const I32 op_type = pl_yylval.ival;
79072805
LW
2369
2370 if (op_type == OP_NULL) {
6154021b 2371 pl_yylval.opval = PL_lex_op;
5f66b61c 2372 PL_lex_op = NULL;
79072805
LW
2373 return THING;
2374 }
466112bb 2375 if (op_type == OP_CONST) {
67c71cbb
FC
2376 SV *sv = PL_lex_stuff;
2377 PL_lex_stuff = NULL;
2378 sv = tokeq(sv);
b3ac6de7
IZ
2379
2380 if (SvTYPE(sv) == SVt_PVIV) {
2381 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2382 STRLEN len;
96a5add6 2383 const char * const p = SvPV_const(sv, len);
740cce10 2384 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2385 SvREFCNT_dec(sv);
2386 sv = nsv;
4e553d73 2387 }
275103cd 2388 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2389 return THING;
2390 }
2391
7ef70b3d
FC
2392 PL_parser->lex_super_state = PL_lex_state;
2393 PL_parser->lex_sub_inwhat = (U16)op_type;
2394 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2395 PL_parser->sub_no_recover = FALSE;
2396 PL_parser->sub_error_count = PL_error_count;
3280af22 2397 PL_lex_state = LEX_INTERPPUSH;
55497cff 2398
3280af22
NIS
2399 PL_expect = XTERM;
2400 if (PL_lex_op) {
6154021b 2401 pl_yylval.opval = PL_lex_op;
5f66b61c 2402 PL_lex_op = NULL;
55497cff
PP
2403 return PMFUNC;
2404 }
2405 else
2406 return FUNC;
2407}
2408
ffb4593c
NT
2409/*
2410 * S_sublex_push
2411 * Create a new scope to save the lexing state. The scope will be
2412 * ended in S_sublex_done. Returns a '(', starting the function arguments
2413 * to the uc, lc, etc. found before.
2414 * Sets PL_lex_state to LEX_INTERPCONCAT.
2415 */
2416
76e3520e 2417STATIC I32
cea2e8a9 2418S_sublex_push(pTHX)
55497cff 2419{
78a635de 2420 LEXSHARED *shared;
801d32ac 2421 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2422 ENTER;
55497cff 2423
7ef70b3d 2424 PL_lex_state = PL_parser->lex_super_state;
cc624add 2425 SAVEI8(PL_lex_dojoin);
3280af22 2426 SAVEI32(PL_lex_brackets);
78cdf107 2427 SAVEI32(PL_lex_allbrackets);
b27dce25 2428 SAVEI32(PL_lex_formbrack);
78cdf107 2429 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2430 SAVEI32(PL_lex_casemods);
2431 SAVEI32(PL_lex_starts);
651b5b28 2432 SAVEI8(PL_lex_state);
7cc34111 2433 SAVESPTR(PL_lex_repl);
7766f137 2434 SAVEVPTR(PL_lex_inpat);
98246f1e 2435 SAVEI16(PL_lex_inwhat);
ffdb8b16 2436 if (is_heredoc)
b42366d4 2437 {
ffdb8b16 2438 SAVECOPLINE(PL_curcop);
b42366d4 2439 SAVEI32(PL_multi_end);
851b527a
FC
2440 SAVEI32(PL_parser->herelines);
2441 PL_parser->herelines = 0;
b42366d4 2442 }
2ca4363d 2443 SAVEIV(PL_multi_close);
3280af22 2444 SAVEPPTR(PL_bufptr);
8452ff4b 2445 SAVEPPTR(PL_bufend);
3280af22
NIS
2446 SAVEPPTR(PL_oldbufptr);
2447 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2448 SAVEPPTR(PL_last_lop);
2449 SAVEPPTR(PL_last_uni);
3280af22
NIS
2450 SAVEPPTR(PL_linestart);
2451 SAVESPTR(PL_linestr);
8edd5f42
RGS
2452 SAVEGENERICPV(PL_lex_brackstack);
2453 SAVEGENERICPV(PL_lex_casestack);
78a635de 2454 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2455 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2456 SAVEI32(PL_copline);
3280af22 2457
99bd9d90 2458 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2459 scopes to find the body of the here-doc. So we put PL_linestr and
2460 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2461 */
60f40a38
FC
2462 PL_parser->lex_shared->ls_linestr = PL_linestr;
2463 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2464
3280af22 2465 PL_linestr = PL_lex_stuff;
7ef70b3d 2466 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2467 PL_lex_stuff = NULL;
7ef70b3d 2468 PL_parser->lex_sub_repl = NULL;
3280af22 2469
eabab8bc
FC
2470 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2471 set for an inner quote-like operator and then an error causes scope-
2472 popping. We must not have a PL_lex_stuff value left dangling, as
2473 that breaks assumptions elsewhere. See bug #123617. */
2474 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2475 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2476
9cbb5ea2
GS
2477 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2478 = SvPVX(PL_linestr);
3280af22 2479 PL_bufend += SvCUR(PL_linestr);
bd61b366 2480 PL_last_lop = PL_last_uni = NULL;
3280af22 2481 SAVEFREESV(PL_linestr);
4dc843bc 2482 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2483
2484 PL_lex_dojoin = FALSE;
b27dce25 2485 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2486 PL_lex_allbrackets = 0;
2487 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2488 Newx(PL_lex_brackstack, 120, char);
2489 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2490 PL_lex_casemods = 0;
2491 *PL_lex_casestack = '\0';
2492 PL_lex_starts = 0;
2493 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2494 if (is_heredoc)
6ddcf93b 2495 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2496 PL_copline = NOLINE;
caae0700 2497
78a635de
FC
2498 Newxz(shared, 1, LEXSHARED);
2499 shared->ls_prev = PL_parser->lex_shared;
2500 PL_parser->lex_shared = shared;
3280af22 2501
7ef70b3d 2502 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2503 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2504 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2505 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2506 else
5f66b61c 2507 PL_lex_inpat = NULL;
79072805 2508
3a54fd60
DM
2509 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2510 PL_in_eval &= ~EVAL_RE_REPARSING;
2511
55497cff 2512 return '(';
79072805
LW
2513}
2514
ffb4593c
NT
2515/*
2516 * S_sublex_done
2517 * Restores lexer state after a S_sublex_push.
2518 */
2519
76e3520e 2520STATIC I32
cea2e8a9 2521S_sublex_done(pTHX)
79072805 2522{
3280af22 2523 if (!PL_lex_starts++) {
396482e1 2524 SV * const sv = newSVpvs("");
9aa983d2
JH
2525 if (SvUTF8(PL_linestr))
2526 SvUTF8_on(sv);
3280af22 2527 PL_expect = XOPERATOR;
275103cd 2528 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2529 return THING;
2530 }
2531
3280af22
NIS
2532 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2533 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2534 return yylex();
79072805
LW
2535 }
2536
ffb4593c 2537 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2538 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2539 if (PL_lex_repl) {
2540 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2541 PL_linestr = PL_lex_repl;
2542 PL_lex_inpat = 0;
2543 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2544 PL_bufend += SvCUR(PL_linestr);
bd61b366 2545 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2546 PL_lex_dojoin = FALSE;
2547 PL_lex_brackets = 0;
78cdf107
Z
2548 PL_lex_allbrackets = 0;
2549 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2550 PL_lex_casemods = 0;
2551 *PL_lex_casestack = '\0';
2552 PL_lex_starts = 0;
25da4f38 2553 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2554 PL_lex_state = LEX_INTERPNORMAL;
2555 PL_lex_starts++;
e9fa98b2
HS
2556 /* we don't clear PL_lex_repl here, so that we can check later
2557 whether this is an evalled subst; that means we rely on the
2558 logic to ensure sublex_done() is called again only via the
2559 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2560 }
e9fa98b2 2561 else {
3280af22 2562 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2563 PL_lex_repl = NULL;
e9fa98b2 2564 }
ffdb8b16
FC
2565 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2566 CopLINE(PL_curcop) +=
9420b268 2567 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2568 + PL_parser->herelines;
2569 PL_parser->herelines = 0;
ffdb8b16 2570 }
9b6b7be8 2571 return '/';
ffed7fef
LW
2572 }
2573 else {
b42366d4 2574 const line_t l = CopLINE(PL_curcop);
f46d017c 2575 LEAVE;
bb4e4c38
TC
2576 if (PL_parser->sub_error_count != PL_error_count) {
2577 const char * const name = OutCopFILE(PL_curcop);
2578 if (PL_parser->sub_no_recover) {
2579 const char * msg = "";
2580 if (PL_in_eval) {
2581 SV *errsv = ERRSV;
2582 if (SvCUR(ERRSV)) {
2583 msg = Perl_form(aTHX_ "%" SVf, SVfARG(errsv));
2584 }
2585 }
2586 abort_execution(msg, name);
2587 NOT_REACHED;
2588 }
2589 }
b42366d4 2590 if (PL_multi_close == '<')
851b527a 2591 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2592 PL_bufend = SvPVX(PL_linestr);
2593 PL_bufend += SvCUR(PL_linestr);
2594 PL_expect = XOPERATOR;
79072805 2595 return ')';
ffed7fef
LW
2596 }
2597}
2598
7a7d14f3 2599STATIC SV*
6f613c73
KW
2600S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2601{
140b12ad
KW
2602 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2603 * interior, hence to the "}". Finds what the name resolves to, returning
2604 * an SV* containing it; NULL if no valid one found */
2605
dd2b1b72 2606 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2607
0c415a79
KW
2608 HV * table;
2609 SV **cvp;
2610 SV *cv;
2611 SV *rv;
2612 HV *stash;
0c415a79
KW
2613 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2614
6f613c73
KW
2615 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2616
d8d26cac 2617 if (!SvCUR(res)) {
ebcc725e 2618 SvREFCNT_dec_NN(res);
be332ba0
KW
2619 /* diag_listed_as: Unknown charname '%s' */
2620 yyerror("Unknown charname ''");
2621 return NULL;
d8d26cac 2622 }
b7e6151c 2623
107160e2
KW
2624 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2625 /* include the <}> */
2626 e - backslash_ptr + 1);
6f613c73 2627 if (! SvPOK(res)) {
b6407c49 2628 SvREFCNT_dec_NN(res);
6f613c73
KW
2629 return NULL;
2630 }
2631
0c415a79
KW
2632 /* See if the charnames handler is the Perl core's, and if so, we can skip
2633 * the validation needed for a user-supplied one, as Perl's does its own
2634 * validation. */
2635 table = GvHV(PL_hintgv); /* ^H */
2636 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2637 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2638 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2639 {
2640 const char * const name = HvNAME(stash);
b59bf0b2 2641 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2642 return res;
2643 }
2644 }
2645
bde9e88d
KW
2646 /* Here, it isn't Perl's charname handler. We can't rely on a
2647 * user-supplied handler to validate the input name. For non-ut8 input,
2648 * look to see that the first character is legal. Then loop through the
2649 * rest checking that each is a continuation */
6f613c73 2650
36897d64
KW
2651 /* This code makes the reasonable assumption that the only Latin1-range
2652 * characters that begin a character name alias are alphabetic, otherwise
2653 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2654
b6ba1137 2655 if (! UTF) {
bde9e88d 2656 if (! isALPHAU(*s)) {
b6ba1137
KW
2657 goto bad_charname;
2658 }
bde9e88d
KW
2659 s++;
2660 while (s < e) {
2661 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2662 goto bad_charname;
2663 }
2d8eb851
KW
2664 if (*s == ' ' && *(s-1) == ' ') {
2665 goto multi_spaces;
bd299e29 2666 }
bde9e88d 2667 s++;
b6ba1137
KW
2668 }
2669 }
2670 else {
bde9e88d
KW
2671 /* Similarly for utf8. For invariants can check directly; for other
2672 * Latin1, can calculate their code point and check; otherwise use a
2673 * swash */
2674 if (UTF8_IS_INVARIANT(*s)) {
2675 if (! isALPHAU(*s)) {
140b12ad
KW
2676 goto bad_charname;
2677 }
bde9e88d
KW
2678 s++;
2679 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2680 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2681 goto bad_charname;
6f613c73 2682 }
bde9e88d 2683 s += 2;
6f613c73 2684 }
bde9e88d
KW
2685 else {
2686 if (! PL_utf8_charname_begin) {
2687 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2688 PL_utf8_charname_begin = _core_swash_init("utf8",
2689 "_Perl_Charname_Begin",
2690 &PL_sv_undef,
2691 1, 0, NULL, &flags);
2692 }
2693 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2694 goto bad_charname;
2695 }
2696 s += UTF8SKIP(s);
2697 }
2698
2699 while (s < e) {
2700 if (UTF8_IS_INVARIANT(*s)) {
2701 if (! isCHARNAME_CONT(*s)) {
2702 goto bad_charname;
2703 }
2d8eb851
KW
2704 if (*s == ' ' && *(s-1) == ' ') {
2705 goto multi_spaces;
bd299e29 2706 }
bde9e88d
KW
2707 s++;
2708 }
2709 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2710 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2711 {
2712 goto bad_charname;
2713 }
2714 s += 2;
2715 }
2716 else {
2717 if (! PL_utf8_charname_continue) {
2718 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2719 PL_utf8_charname_continue = _core_swash_init("utf8",
2720 "_Perl_Charname_Continue",
2721 &PL_sv_undef,
2722 1, 0, NULL, &flags);
2723 }
2724 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2725 goto bad_charname;
2726 }
2727 s += UTF8SKIP(s);
6f613c73
KW
2728 }
2729 }
2d8eb851
KW
2730 }
2731 if (*(s-1) == ' ') {
8d9d0498
FC
2732 /* diag_listed_as: charnames alias definitions may not contain
2733 trailing white-space; marked by <-- HERE in %s
2734 */
2d8eb851
KW
2735 yyerror_pv(
2736 Perl_form(aTHX_
2737 "charnames alias definitions may not contain trailing "
2738 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2739 (int)(s - backslash_ptr + 1), backslash_ptr,
2740 (int)(e - s + 1), s + 1
2741 ),
2742 UTF ? SVf_UTF8 : 0);
2743 return NULL;
6f613c73
KW
2744 }
2745
94ca1619 2746 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2747 const U8* first_bad_char_loc;
2748 STRLEN len;
2749 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2750 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2751 &first_bad_char_loc)))
2752 {
75219bac
KW
2753 _force_out_malformed_utf8_message(first_bad_char_loc,
2754 (U8 *) PL_parser->bufend,
2755 0,
2756 0 /* 0 means don't die */ );
8d9d0498
FC
2757 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2758 immediately after '%s' */
bde9e88d
KW
2759 yyerror_pv(
2760 Perl_form(aTHX_
2761 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2762 (int) (e - backslash_ptr + 1), backslash_ptr,
2763 (int) ((char *) first_bad_char_loc - str), str
2764 ),
2765 SVf_UTF8);
2766 return NULL;
2767 }
2768 }
140b12ad 2769
bde9e88d 2770 return res;
140b12ad 2771
bde9e88d 2772 bad_charname: {
bde9e88d
KW
2773
2774 /* The final %.*s makes sure that should the trailing NUL be missing
2775 * that this print won't run off the end of the string */
8d9d0498
FC
2776 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2777 in \N{%s} */
bde9e88d
KW
2778 yyerror_pv(
2779 Perl_form(aTHX_
2780 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2781 (int)(s - backslash_ptr + 1), backslash_ptr,
2782 (int)(e - s + 1), s + 1
bde9e88d
KW
2783 ),
2784 UTF ? SVf_UTF8 : 0);
2785 return NULL;
2786 }
2d8eb851
KW
2787
2788 multi_spaces:
8d9d0498
FC
2789 /* diag_listed_as: charnames alias definitions may not contain a
2790 sequence of multiple spaces; marked by <-- HERE
2791 in %s */
2d8eb851
KW
2792 yyerror_pv(
2793 Perl_form(aTHX_
2794 "charnames alias definitions may not contain a sequence of "
2795 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2796 (int)(s - backslash_ptr + 1), backslash_ptr,
2797 (int)(e - s + 1), s + 1
2798 ),
2799 UTF ? SVf_UTF8 : 0);
2800 return NULL;
6f613c73
KW
2801}
2802
02aa26ce
NT
2803/*
2804 scan_const
2805
9da1dd8f
DM
2806 Extracts the next constant part of a pattern, double-quoted string,
2807 or transliteration. This is terrifying code.
2808
2809 For example, in parsing the double-quoted string "ab\x63$d", it would
2810 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2811
94def140 2812 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2813 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2814 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2815
94def140
ST
2816 Returns a pointer to the character scanned up to. If this is
2817 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2818 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2819 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2820 by looking at the next characters herself.
2821
02aa26ce 2822 In patterns:
9da1dd8f 2823 expand:
537124e4
KW
2824 \N{FOO} => \N{U+hex_for_character_FOO}
2825 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2826
2827 pass through:
2828 all other \-char, including \N and \N{ apart from \N{ABC}
2829
2830 stops on:
2831 @ and $ where it appears to be a var, but not for $ as tail anchor
2832 \l \L \u \U \Q \E
2833 (?{ or (??{
2834
02aa26ce
NT
2835 In transliterations:
2836 characters are VERY literal, except for - not at the start or end
66b09263
KW
2837 of the string, which indicates a range. However some backslash sequences
2838 are recognized: \r, \n, and the like
2839 \007 \o{}, \x{}, \N{}
2840 If all elements in the transliteration are below 256,
94def140
ST
2841 scan_const expands the range to the full set of intermediate
2842 characters. If the range is in utf8, the hyphen is replaced with
2843 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2844
2845 In double-quoted strings:
2846 backslashes:
66b09263 2847 all those recognized in transliterations
94def140 2848 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2849 case and quoting: \U \Q \E
2850 stops on @ and $
2851
2852 scan_const does *not* construct ops to handle interpolated strings.
2853 It stops processing as soon as it finds an embedded $ or @ variable
2854 and leaves it to the caller to work out what's going on.
2855
94def140
ST
2856 embedded arrays (whether in pattern or not) could be:
2857 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2858
2859 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2860
2861 $ in pattern could be $foo or could be tail anchor. Assumption:
2862 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2863 followed by one of "()| \r\n\t"
02aa26ce 2864
9da1dd8f 2865 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2866
2867 The structure of the code is
2868 while (there's a character to process) {
94def140
ST
2869 handle transliteration ranges
2870 skip regexp comments /(?#comment)/ and codes /(?{code})/
2871 skip #-initiated comments in //x patterns
2872 check for embedded arrays
02aa26ce
NT
2873 check for embedded scalars
2874 if (backslash) {
94def140 2875 deprecate \1 in substitution replacements
02aa26ce
NT
2876 handle string-changing backslashes \l \U \Q \E, etc.
2877 switch (what was escaped) {
94def140 2878 handle \- in a transliteration (becomes a literal -)
ff3f963a 2879 if a pattern and not \N{, go treat as regular character
94def140
ST
2880 handle \132 (octal characters)
2881 handle \x15 and \x{1234} (hex characters)
ff3f963a 2882 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2883 handle \cV (control characters)
2884 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2885 } (end switch)
77a135fe 2886 continue
02aa26ce 2887 } (end if backslash)
77a135fe 2888 handle regular character
02aa26ce 2889 } (end while character to read)
02cd137d 2890
02aa26ce
NT
2891*/
2892
76e3520e 2893STATIC char *
cea2e8a9 2894S_scan_const(pTHX_ char *start)
79072805 2895{
eb578fdb 2896 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2897 SV *sv = newSV(send - start); /* sv for the constant. See note below
2898 on sizing. */
eb578fdb
KW
2899 char *s = start; /* start of the constant */
2900 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2901 bool dorange = FALSE; /* are we in a translit range? */
2902 bool didrange = FALSE; /* did we just finish a range? */
2903 bool in_charclass = FALSE; /* within /[...]/ */
2904 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2905 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2906 UTF8? But, this can show as true
2907 when the source isn't utf8, as for
2908 example when it is entirely composed
2909 of hex constants */
af9be36c
KW
2910 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
2911 number of characters found so far
2912 that will expand (into 2 bytes)
2913 should we have to convert to
2914 UTF-8) */
6f613c73 2915 SV *res; /* result from charnames */
c89db733
JH
2916 STRLEN offset_to_max = 0; /* The offset in the output to where the range
2917 high-end character is temporarily placed */
77a135fe 2918
fe2ba0a2
KW
2919 /* Does something require special handling in tr/// ? This avoids extra
2920 * work in a less likely case. As such, khw didn't feel it was worth
2921 * adding any branches to the more mainline code to handle this, which
2922 * means that this doesn't get set in some circumstances when things like
2923 * \x{100} get expanded out. As a result there needs to be extra testing
2924 * done in the tr code */
2925 bool has_above_latin1 = FALSE;
2926
77a135fe
KW
2927 /* Note on sizing: The scanned constant is placed into sv, which is
2928 * initialized by newSV() assuming one byte of output for every byte of
2929 * input. This routine expects newSV() to allocate an extra byte for a
2930 * trailing NUL, which this routine will append if it gets to the end of
2931 * the input. There may be more bytes of input than output (eg., \N{LATIN
2932 * CAPITAL LETTER A}), or more output than input if the constant ends up
2933 * recoded to utf8, but each time a construct is found that might increase
2934 * the needed size, SvGROW() is called. Its size parameter each time is
2935 * based on the best guess estimate at the time, namely the length used so
2936 * far, plus the length the current construct will occupy, plus room for
caae0700 2937 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 2938
c3320c2a
KW
2939 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2940 before set */
4c3a8340 2941#ifdef EBCDIC
f4240379
KW
2942 int backslash_N = 0; /* ? was the character from \N{} */
2943 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2944 platform-specific like \x65 */
4c3a8340 2945#endif
012bcf8d 2946
7918f24d
NC
2947 PERL_ARGS_ASSERT_SCAN_CONST;
2948
bb16bae8 2949 assert(PL_lex_inwhat != OP_TRANSR);
7ef70b3d 2950 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2b9d42f0 2951 /* If we are doing a trans and we know we want UTF8 set expectation */
7ef70b3d
FC
2952 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2953 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2b9d42f0
NIS
2954 }
2955
b899e89d
FC
2956 /* Protect sv from errors and fatal warnings. */
2957 ENTER_with_name("scan_const");
2958 SAVEFREESV(sv);
2b9d42f0 2959
f4240379
KW
2960 while (s < send
2961 || dorange /* Handle tr/// range at right edge of input */
2962 ) {
ff3f963a 2963
02aa26ce 2964 /* get transliterations out of the way (they're most literal) */
3280af22 2965 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2966
f4240379
KW
2967 /* But there isn't any special handling necessary unless there is a
2968 * range, so for most cases we just drop down and handle the value
2969 * as any other. There are two exceptions.
2970 *
02cd137d
KW
2971 * 1. A hyphen indicates that we are actually going to have a
2972 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
2973 * down to handle what should be the end range value.
2974 * 2. After we've handled that value, the next time through, that
2975 * flag is set and we fix up the range.
2976 *
2977 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
2978 * order to make the transliteration a simple table look-up.
2979 * Ranges that extend above Latin1 have to be done differently, so
2980 * there is no advantage to expanding them here, so they are
2981 * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte
2982 * signifies a hyphen without any possible ambiguity. On EBCDIC
2983 * machines, if the range is expressed as Unicode, the Latin1
2984 * portion is expanded out even if the range extends above
2985 * Latin1. This is because each code point in it has to be
2986 * processed here individually to get its native translation */
f4240379
KW
2987
2988 if (! dorange) {
2989
02cd137d
KW
2990 /* Here, we don't think we're in a range. If the new character
2991 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
2992 * either edge to indicate a range, or if we haven't output any
2993 * characters yet then it's a regular character. */
2994 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv)) {
f4240379
KW
2995
2996 /* A regular character. Process like any other, but first
2997 * clear any flags */
2998 didrange = FALSE;
2999 dorange = FALSE;
e294cc5d 3000#ifdef EBCDIC
f4240379
KW
3001 non_portable_endpoint = 0;
3002 backslash_N = 0;
e294cc5d 3003#endif
02cd137d
KW
3004 /* The tests here for being above Latin1 and similar ones
3005 * in the following 'else' suffice to find all such
3006 * occurences in the constant, except those added by a
fe2ba0a2 3007 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3008 * set 'has_above_latin1' as appropriate */
188d22cf
KW
3009 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3010 has_above_latin1 = TRUE;
3011 }
3012
f4240379
KW
3013 /* Drops down to generic code to process current byte */
3014 }
02cd137d 3015 else { /* Is a '-' in the context where it means a range */
f4240379 3016 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3017 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3018 " operator");
f4240379 3019 }
e294cc5d 3020
f4240379 3021 dorange = TRUE;
2b9d42f0 3022
02cd137d 3023 s++; /* Skip past the hyphen */
f4240379
KW
3024
3025 /* d now points to where the end-range character will be
3026 * placed. Save it so won't have to go finding it later,
3027 * and drop down to get that character. (Actually we
3028 * instead save the offset, to handle the case where a
3029 * realloc in the meantime could change the actual
3030 * pointer). We'll finish processing the range the next
3031 * time through the loop */
3032 offset_to_max = d - SvPVX_const(sv);
188d22cf
KW
3033
3034 if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3035 has_above_latin1 = TRUE;
3036 }
02cd137d
KW
3037
3038 /* Drops down to generic code to process current byte */
f4240379
KW
3039 }
3040 } /* End of not a range */
3041 else {
3042 /* Here we have parsed a range. Now must handle it. At this
3043 * point:
3044 * 'sv' is a SV* that contains the output string we are
3045 * constructing. The final two characters in that string
3046 * are the range start and range end, in order.
3047 * 'd' points to just beyond the range end in the 'sv' string,
3048 * where we would next place something
3049 * 'offset_to_max' is the offset in 'sv' at which the character
02cd137d 3050 * (the range's maximum end point) before 'd' begins.
f4240379 3051 */
8efef67c 3052 char * max_ptr = SvPVX(sv) + offset_to_max;
5ca37d54 3053 char * min_ptr;
f4240379
KW
3054 IV range_min;
3055 IV range_max; /* last character in range */
f4240379 3056 STRLEN grow;
5ca37d54
KW
3057 Size_t offset_to_min = 0;
3058 Size_t extras = 0;
11327fa1 3059#ifdef EBCDIC
f4240379
KW
3060 bool convert_unicode;
3061 IV real_range_max = 0;
e294cc5d 3062#endif
02cd137d 3063 /* Get the code point values of the range ends. */
e294cc5d 3064 if (has_utf8) {
f4240379
KW
3065 /* We know the utf8 is valid, because we just constructed
3066 * it ourselves in previous loop iterations */
3067 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3068 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3069 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3070
3071 /* This compensates for not all code setting
3072 * 'has_above_latin1', so that we don't skip stuff that
3073 * should be executed */
3074 if (range_max > 255) {
3075 has_above_latin1 = TRUE;
3076 }
e294cc5d 3077 }
f4240379
KW
3078 else {
3079 min_ptr = max_ptr - 1;
3080 range_min = * (U8*) min_ptr;
3081 range_max = * (U8*) max_ptr;
3082 }
3083
8efef67c
KW
3084 /* If the range is just a single code point, like tr/a-a/.../,
3085 * that code point is already in the output, twice. We can
3086 * just back up over the second instance and avoid all the rest
3087 * of the work. But if it is a variant character, it's been
218304f9
KW
3088 * counted twice, so decrement. (This unlikely scenario is
3089 * special cased, like the one for a range of 2 code points
3090 * below, only because the main-line code below needs a range
3091 * of 3 or more to work without special casing. Might as well
3092 * get it out of the way now.) */
8efef67c
KW
3093 if (UNLIKELY(range_max == range_min)) {
3094 d = max_ptr;
3095 if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3096 utf8_variant_count--;
3097 }
3098 goto range_done;
3099 }
3100
e294cc5d 3101#ifdef EBCDIC
f4240379
KW
3102 /* On EBCDIC platforms, we may have to deal with portable
3103 * ranges. These happen if at least one range endpoint is a
3104 * Unicode value (\N{...}), or if the range is a subset of
3105 * [A-Z] or [a-z], and both ends are literal characters,
3106 * like 'A', and not like \x{C1} */
87dd6ea7 3107 convert_unicode =
02cd137d
KW
3108 cBOOL(backslash_N) /* \N{} forces Unicode,
3109 hence portable range */
3110 || ( ! non_portable_endpoint
3111 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3112 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3113 if (convert_unicode) {
f4240379
KW
3114
3115 /* Special handling is needed for these portable ranges.
02cd137d
KW
3116 * They are defined to be in Unicode terms, which includes
3117 * all the Unicode code points between the end points.
f4240379
KW
3118 * Convert to Unicode to get the Unicode range. Later we
3119 * will convert each code point in the range back to
3120 * native. */
3121 range_min = NATIVE_TO_UNI(range_min);
3122 range_max = NATIVE_TO_UNI(range_max);
3123 }
e294cc5d 3124#endif
8ada0baa 3125
f4240379 3126 if (range_min > range_max) {
11327fa1 3127#ifdef EBCDIC
f4240379
KW
3128 if (convert_unicode) {
3129 /* Need to convert back to native for meaningful
3130 * messages for this platform */
3131 range_min = UNI_TO_NATIVE(range_min);
3132 range_max = UNI_TO_NATIVE(range_max);
3133 }
11327fa1 3134#endif
f4240379
KW
3135 /* Use the characters themselves for the error message if
3136 * ASCII printables; otherwise some visible representation
3137 * of them */
3138 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3139 Perl_croak(aTHX_
3140 "Invalid range \"%c-%c\" in transliteration operator",
3141 (char)range_min, (char)range_max);
3142 }
11327fa1 3143#ifdef EBCDIC
f4240379 3144 else if (convert_unicode) {
02cd137d 3145 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3146 Perl_croak(aTHX_
02cd137d
KW
3147 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3148 UVXf "}\" in transliteration operator",
3149 range_min, range_max);
f4240379 3150 }
11327fa1 3151#endif
f4240379 3152 else {
02cd137d 3153 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3154 Perl_croak(aTHX_
02cd137d
KW
3155 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3156 " in transliteration operator",
3157 range_min, range_max);
f4240379 3158 }
c2e66d9e
GS
3159 }
3160
aca41667
KW
3161 /* If the range is exactly two code points long, they are
3162 * already both in the output */
3163 if (UNLIKELY(range_min + 1 == range_max)) {
3164 goto range_done;
3165 }
3166
3167 /* Here the range contains at least 3 code points */
3168
f4240379
KW
3169 if (has_utf8) {
3170
188d22cf
KW
3171 /* If everything in the transliteration is below 256, we
3172 * can avoid special handling later. A translation table
02cd137d
KW
3173 * for each of those bytes is created by op.c. So we
3174 * expand out all ranges to their constituent code points.
3175 * But if we've encountered something above 255, the
3176 * expanding won't help, so skip doing that. But if it's
3177 * EBCDIC, we may have to look at each character below 256
3178 * if we have to convert to/from Unicode values */
188d22cf 3179 if ( has_above_latin1
c7f1f016 3180#ifdef EBCDIC
f4240379 3181 && (range_min > 255 || ! convert_unicode)
8ada0baa 3182#endif
f4240379
KW
3183 ) {
3184 /* Move the high character one byte to the right; then
3185 * insert between it and the range begin, an illegal
3186 * byte which serves to indicate this is a range (using
02cd137d 3187 * a '-' would be ambiguous). */
f4240379
KW
3188 char *e = d++;
3189 while (e-- > max_ptr) {
3190 *(e + 1) = *e;
e294cc5d 3191 }
f4240379
KW
3192 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3193 goto range_done;
3194 }
3195
3196 /* Here, we're going to expand out the range. For EBCDIC
3197 * the range can extend above 255 (not so in ASCII), so
3198 * for EBCDIC, split it into the parts above and below
3199 * 255/256 */
e294cc5d 3200#ifdef EBCDIC
f4240379
KW
3201 if (range_max > 255) {
3202 real_range_max = range_max;
3203 range_max = 255;
3204 }
e294cc5d 3205#endif
f4240379 3206 }
02aa26ce 3207
f4240379 3208 /* Here we need to expand out the string to contain each
5ca37d54
KW
3209 * character in the range. Grow the output to handle this.
3210 * For non-UTF8, we need a byte for each code point in the
3211 * range, minus the three that we've already allocated for: the
3212 * hyphen, the min, and the max. For UTF-8, we need this
3213 * plus an extra byte for each code point that occupies two
3214 * bytes (is variant) when in UTF-8 (except we've already
3215 * allocated for the end points, including if they are
3216 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3217 * platforms, it's easy to calculate a precise number. To
3218 * start, we count the variants in the range, which we need
3219 * elsewhere in this function anyway. (For the case where it
3220 * isn't easy to calculate, 'extras' has been initialized to 0,
3221 * and the calculation is done in a loop further down.) */
3222#ifdef EBCDIC
3223 if (convert_unicode)
3224#endif
3225 {
3226 /* This is executed unconditionally on ASCII, and for
3227 * Unicode ranges on EBCDIC. Under these conditions, all
3228 * code points above a certain value are variant; and none
3229 * under that value are. We just need to find out how much
3230 * of the range is above that value. We don't count the
3231 * end points here, as they will already have been counted
3232 * as they were parsed. */
3233 if (range_min >= UTF_CONTINUATION_MARK) {
3234
3235 /* The whole range is made up of variants */
3236 extras = (range_max - 1) - (range_min + 1) + 1;
3237 }
3238 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3239
5ca37d54
KW
3240 /* Only the higher portion of the range is variants */
3241 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3242 }
f4240379 3243
5ca37d54
KW
3244 utf8_variant_count += extras;
3245 }
3246
3247 /* The base growth is the number of code points in the range,
3248 * not including the endpoints, which have already been sized
3249 * for (and output). We don't subtract for the hyphen, as it
3250 * has been parsed but not output, and the SvGROW below is
3251 * based only on what's been output plus what's left to parse.
3252 * */
3253 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3254
5ca37d54 3255 if (has_utf8) {
4c3a8340 3256#ifdef EBCDIC
5ca37d54
KW
3257 /* In some cases in EBCDIC, we haven't yet calculated a
3258 * precise amount needed for the UTF-8 variants. Just
3259 * assume the worst case, that everything will expand by a
3260 * byte */
3261 if (! convert_unicode) {
f4240379
KW
3262 grow *= 2;
3263 }
5ca37d54 3264 else
4c3a8340 3265#endif
5ca37d54
KW
3266 {
3267 /* Otherwise we know exactly how many variants there
3268 * are in the range. */
3269 grow += extras;
3270 }
f4240379
KW
3271 }
3272
5ca37d54
KW
3273 /* Grow, but position the output to overwrite the range min end
3274 * point, because in some cases we overwrite that */
3275 SvCUR_set(sv, d - SvPVX_const(sv));
3276 offset_to_min = min_ptr - SvPVX_const(sv);
3277
3278 /* See Note on sizing above. */
3279 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3280 + (send - s)
3281 + grow
3282 + 1 /* Trailing NUL */ );
f4240379 3283
5ca37d54 3284 /* Now, we can expand out the range. */
11327fa1 3285#ifdef EBCDIC
f4240379 3286 if (convert_unicode) {
5ca37d54 3287 SSize_t i;
02aa26ce 3288
f4240379
KW
3289 /* Recall that the min and max are now in Unicode terms, so
3290 * we have to convert each character to its native
3291 * equivalent */
3292 if (has_utf8) {
3293 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3294 append_utf8_from_native_byte(
3295 LATIN1_TO_NATIVE((U8) i),
3296 (U8 **) &d);
f4240379
KW
3297 }
3298 }
3299 else {
3300 for (i = range_min; i <= range_max; i++) {
81324705 3301 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3302 }
3303 }
01ec43d0 3304 }
11327fa1
AL
3305 else
3306#endif
3307 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3308 {
f4240379
KW
3309 /* Here, no conversions are necessary, which means that the
3310 * first character in the range is already in 'd' and
3311 * valid, so we can skip overwriting it */
3312 if (has_utf8) {
19742f39 3313 SSize_t i;
f4240379
KW
3314 d += UTF8SKIP(d);
3315 for (i = range_min + 1; i <= range_max; i++) {
81324705 3316 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3317 }
3318 }
3319 else {
19742f39 3320 SSize_t i;
f4240379 3321 d++;
5ca37d54
KW
3322 assert(range_min + 1 <= range_max);
3323 for (i = range_min + 1; i < range_max; i++) {
3324#ifdef EBCDIC
3325 /* In this case on EBCDIC, we haven't calculated
3326 * the variants. Do it here, as we go along */
3327 if (! UVCHR_IS_INVARIANT(i)) {
3328 utf8_variant_count++;
3329 }
3330#endif
f4240379
KW
3331 *d++ = (char)i;
3332 }
5ca37d54
KW
3333
3334 /* The range_max is done outside the loop so as to
3335 * avoid having to special case not incrementing
3336 * 'utf8_variant_count' on EBCDIC (it's already been
3337 * counted when originally parsed) */
3338 *d++ = (char) range_max;
f4240379 3339 }
a0ed51b3 3340 }
02aa26ce 3341
11327fa1 3342#ifdef EBCDIC
02cd137d
KW
3343 /* If the original range extended above 255, add in that
3344 * portion. */
f4240379
KW
3345 if (real_range_max) {
3346 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3347 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3348 if (real_range_max > 0x100) {
3349 if (real_range_max > 0x101) {
3350 *d++ = (char) ILLEGAL_UTF8_BYTE;
3351 }
f4240379 3352 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3353 }
f4240379 3354 }
11327fa1 3355#endif
02aa26ce 3356
f4240379
KW
3357 range_done:
3358 /* mark the range as done, and continue */
3359 didrange = TRUE;
3360 dorange = FALSE;
3361#ifdef EBCDIC
3362 non_portable_endpoint = 0;
3363 backslash_N = 0;
3364#endif
3365 continue;
3366 } /* End of is a range */
3367 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3368 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3369 char *s1 = s-1;
3370 int esc = 0;
3371 while (s1 >= start && *s1-- == '\\')
3372 esc = !esc;
3373 if (!esc)
3374 in_charclass = TRUE;
3375 }
1e02a175 3376 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3377 char *s1 = s-1;
3378 int esc = 0;
3379 while (s1 >= start && *s1-- == '\\')
3380 esc = !esc;
3381 if (!esc)
3382 in_charclass = FALSE;
3383 }
02cd137d
KW
3384 /* skip for regexp comments /(?#comment)/, except for the last
3385 * char, which will be done separately. Stop on (?{..}) and
3386 * friends */
c30fc27b 3387 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3388 if (s[2] == '#') {
e994fd66 3389 while (s+1 < send && *s != ')')
5ff03569 3390 *d++ = *s++;
155aba94 3391 }
407f8cf2
KW
3392 else if (!PL_lex_casemods
3393 && ( s[2] == '{' /* This should match regcomp.c */
3394 || (s[2] == '?' && s[3] == '{')))
155aba94 3395 {
9da1dd8f 3396 break;
cc6b7395 3397 }
748a9306 3398 }
02cd137d 3399 /* likewise skip #-initiated comments in //x patterns */
407f8cf2
KW
3400 else if (*s == '#'
3401 && PL_lex_inpat
3402 && !in_charclass
3403 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3404 {
8faf4f30 3405 while (s < send && *s != '\n')
5ff03569 3406 *d++ = *s++;
748a9306 3407 }
02cd137d 3408 /* no further processing of single-quoted regex */
9da1dd8f
DM
3409 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3410 goto default_action;
3411
02cd137d
KW
3412 /* check for embedded arrays
3413 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3414 */
1749ea0d 3415 else if (*s == '@' && s[1]) {
fac0f7a3
KW
3416 if (UTF
3417 ? isIDFIRST_utf8_safe(s+1, send)
3418 : isWORDCHAR_A(s[1]))
3419 {
1749ea0d 3420 break;
fac0f7a3 3421 }
1749ea0d
ST
3422 if (strchr(":'{$", s[1]))
3423 break;
3424 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3425 break; /* in regexp, neither @+ nor @- are interpolated */
3426 }
02cd137d
KW
3427 /* check for embedded scalars. only stop if we're sure it's a
3428 * variable. */
79072805 3429 else if (*s == '$') {
3280af22 3430 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3431 break;
77772344 3432 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3433 if (s[1] == '\\') {
3434 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3435 "Possible unintended interpolation of $\\ in regex");
77772344 3436 }
79072805 3437 break; /* in regexp, $ might be tail anchor */
77772344 3438 }
79072805 3439 }
02aa26ce 3440
2b9d42f0
NIS
3441 /* End of else if chain - OP_TRANS rejoin rest */
3442
8faf4f30
HS
3443 if (UNLIKELY(s >= send)) {
3444 assert(s == send);
3445 break;
3446 }
3447
02aa26ce 3448 /* backslashes */
79072805 3449 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3450 char* e; /* Can be used for ending '}', etc. */
3451
79072805 3452 s++;
02aa26ce 3453
7d0fc23c
KW
3454 /* warn on \1 - \9 in substitution replacements, but note that \11
3455 * is an octal; and \19 is \1 followed by '9' */
407f8cf2
KW
3456 if (PL_lex_inwhat == OP_SUBST
3457 && !PL_lex_inpat
3458 && isDIGIT(*s)
3459 && *s != '0'
3460 && !isDIGIT(s[1]))
79072805 3461 {
c782d7ee 3462 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3463 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3464 *--s = '$';
3465 break;
3466 }
02aa26ce
NT
3467
3468 /* string-change backslash escapes */
838f2281 3469 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3470 --s;
3471 break;
3472 }
ff3f963a
KW
3473 /* In a pattern, process \N, but skip any other backslash escapes.
3474 * This is because we don't want to translate an escape sequence
3475 * into a meta symbol and have the regex compiler use the meta
3476 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3477 * in spite of this, we do have to process \N here while the proper
3478 * charnames handler is in scope. See bugs #56444 and #62056.
85fba779 3479 *
ff3f963a
KW
3480 * There is a complication because \N in a pattern may also stand
3481 * for 'match a non-nl', and not mean a charname, in which case its
3482 * processing should be deferred to the regex compiler. To be a
3483 * charname it must be followed immediately by a '{', and not look
3484 * like \N followed by a curly quantifier, i.e., not something like
3485 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3486 * quantifier */
3487 else if (PL_lex_inpat
3488 && (*s != 'N'
3489 || s[1] != '{'
412f55bb 3490 || regcurly(s + 1)))
ff3f963a 3491 {
4d73d076 3492 *d++ = '\\';
cc74c5bd
ST
3493 goto default_action;
3494 }
02aa26ce 3495
79072805 3496 switch (*s) {
79072805 3497 default:
11b8faa4 3498 {
15861f94 3499 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3500 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3501 "Unrecognized escape \\%c passed through",
3502 *s);
11b8faa4 3503 /* default action is to copy the quoted character */
f9a63242 3504 goto default_action;
11b8faa4 3505 }
02aa26ce 3506
632403cc 3507 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3508 case '0': case '1': case '2': case '3':
3509 case '4': case '5': case '6': case '7':
ba210ebe 3510 {
5e0a247b 3511 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3512 STRLEN len = 3;
06972766 3513 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3514 s += len;
5e0a247b
KW
3515 if (len < 3 && s < send && isDIGIT(*s)
3516 && ckWARN(WARN_MISC))
3517 {
3518 Perl_warner(aTHX_ packWARN(WARN_MISC),
3519 "%s", form_short_octal_warning(s, len));
3520 }
ba210ebe 3521 }
012bcf8d 3522 goto NUM_ESCAPE_INSERT;
02aa26ce 3523
f0a2b745
KW
3524 /* eg. \o{24} indicates the octal constant \024 */
3525 case 'o':
3526 {
454155d9 3527 const char* error;
f0a2b745 3528
e8278639
KW
3529 bool valid = grok_bslash_o(&s, PL_bufend,
3530 &uv, &error,
80f4111b
KW
3531 TRUE, /* Output warning */
3532 FALSE, /* Not strict */
17896a85
KW
3533 TRUE, /* Output warnings for
3534 non-portables */
80f4111b 3535 UTF);
454155d9 3536 if (! valid) {
f0a2b745 3537 yyerror(error);
3dd4eaeb 3538 uv = 0; /* drop through to ensure range ends are set */
f0a2b745
KW
3539 }
3540 goto NUM_ESCAPE_INSERT;
3541 }
3542
77a135fe 3543 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3544 case 'x':
a0481293 3545 {
a0481293 3546 const char* error;
355860ce 3547
e8278639
KW
3548 bool valid = grok_bslash_x(&s, PL_bufend,
3549 &uv, &error,
80f4111b
KW
3550 TRUE, /* Output warning */
3551 FALSE, /* Not strict */
17896a85
KW
3552 TRUE, /* Output warnings for
3553 non-portables */
80f4111b