This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Remove redundant PL_expect check
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
67#define PL_preambled (PL_parser->preambled)
68#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 69#define PL_linestr (PL_parser->linestr)
c2598295
DM
70#define PL_expect (PL_parser->expect)
71#define PL_copline (PL_parser->copline)
f06b5848
DM
72#define PL_bufptr (PL_parser->bufptr)
73#define PL_oldbufptr (PL_parser->oldbufptr)
74#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
75#define PL_linestart (PL_parser->linestart)
76#define PL_bufend (PL_parser->bufend)
77#define PL_last_uni (PL_parser->last_uni)
78#define PL_last_lop (PL_parser->last_lop)
79#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 80#define PL_lex_state (PL_parser->lex_state)
2f9285f8 81#define PL_rsfp (PL_parser->rsfp)
5486870f 82#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
83#define PL_in_my (PL_parser->in_my)
84#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 85#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 86#define PL_multi_end (PL_parser->multi_end)
13765c85 87#define PL_error_count (PL_parser->error_count)
199e78b7 88
fb205e7a
DM
89# define PL_nexttoke (PL_parser->nexttoke)
90# define PL_nexttype (PL_parser->nexttype)
91# define PL_nextval (PL_parser->nextval)
199e78b7 92
a1894d81 93static const char* const ident_too_long = "Identifier too long";
8903cb82 94
9ded7720 95# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 96
a7aaec61
Z
97#define XENUMMASK 0x3f
98#define XFAKEEOF 0x40
99#define XFAKEBRACK 0x80
9059aa12 100
39e02b42
JH
101#ifdef USE_UTF8_SCRIPTS
102# define UTF (!IN_BYTES)
2b9d42f0 103#else
802a15e9 104# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 105#endif
a0ed51b3 106
b1fc3636
CJ
107/* The maximum number of characters preceding the unrecognized one to display */
108#define UNRECOGNIZED_PRECEDE_COUNT 10
109
61f0cdd9 110/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
111 * 1999-02-27 mjd-perl-patch@plover.com */
112#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
113
14bd96d0 114#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 115
9ff909cf
JH
116#define HEXFP_PEEK(s) \
117 (((s[0] == '.') && \
118 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
119 isALPHA_FOLD_EQ(s[0], 'p'))
120
ffb4593c
NT
121/* LEX_* are values for PL_lex_state, the state of the lexer.
122 * They are arranged oddly so that the guard on the switch statement
79072805 123 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
124 *
125 * These values refer to the various states within a sublex parse,
126 * i.e. within a double quotish string
79072805
LW
127 */
128
fb73857a
PP
129/* #define LEX_NOTPARSING 11 is done in perl.h. */
130
b6007c36
DM
131#define LEX_NORMAL 10 /* normal code (ie not within "...") */
132#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
133#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
134#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
135#define LEX_INTERPSTART 6 /* expecting the start of a $var */
136
137 /* at end of code, eg "$x" followed by: */
138#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
139#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
140
141#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
142 string or after \E, $foo, etc */
143#define LEX_INTERPCONST 2 /* NOT USED */
144#define LEX_FORMLINE 1 /* expecting a format line */
145#define LEX_KNOWNEXT 0 /* next token known; just return it */
146
79072805 147
bbf60fe6 148#ifdef DEBUGGING
27da23d5 149static const char* const lex_state_names[] = {
bbf60fe6
DM
150 "KNOWNEXT",
151 "FORMLINE",
152 "INTERPCONST",
153 "INTERPCONCAT",
154 "INTERPENDMAYBE",
155 "INTERPEND",
156 "INTERPSTART",
157 "INTERPPUSH",
158 "INTERPCASEMOD",
159 "INTERPNORMAL",
160 "NORMAL"
161};
162#endif
163
79072805 164#include "keywords.h"
fe14fcc3 165
ffb4593c
NT
166/* CLINE is a macro that ensures PL_copline has a sane value */
167
57843af0 168#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 169
ffb4593c
NT
170/*
171 * Convenience functions to return different tokens and prime the
9cbb5ea2 172 * lexer for the next token. They all take an argument.
ffb4593c
NT
173 *
174 * TOKEN : generic token (used for '(', DOLSHARP, etc)
175 * OPERATOR : generic operator
176 * AOPERATOR : assignment operator
177 * PREBLOCK : beginning the block after an if, while, foreach, ...
178 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
179 * PREREF : *EXPR where EXPR is not a simple identifier
180 * TERM : expression term
89f35911 181 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
182 * LOOPX : loop exiting command (goto, last, dump, etc)
183 * FTST : file test operator
184 * FUN0 : zero-argument function
7eb971ee 185 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 186 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
187 * BOop : bitwise or or xor
188 * BAop : bitwise and
189 * SHop : shift operator
190 * PWop : power operator
9cbb5ea2 191 * PMop : pattern-matching operator
ffb4593c 192 * Aop : addition-level operator
e4916dd1 193 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
194 * Mop : multiplication-level operator
195 * Eop : equality-testing operator
e5edeb50 196 * Rop : relational operator <= != gt
ffb4593c
NT
197 *
198 * Also see LOP and lop() below.
199 */
200
998054bd 201#ifdef DEBUGGING /* Serve -DT. */
704d4215 202# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 203#else
bbf60fe6 204# define REPORT(retval) (retval)
998054bd
SC
205#endif
206
bbf60fe6
DM
207#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
208#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
209#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
210#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
211#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
212#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
213#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 214#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
a49203fd 215#define LOOPX(f) return (PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
7a61bf3c 216 pl_yylval.ival=f, \
a49203fd 217 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 218 REPORT((int)LOOPEX))
6154021b
RGS
219#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
220#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 221#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
222#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
223#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
224#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
225#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
226#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
227#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
228#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
e4916dd1 229#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
6154021b
RGS
230#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
231#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
232#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 233
a687059c
LW
234/* This bit of chicanery makes a unary function followed by
235 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
236 * The UNIDOR macro is for unary functions that can be followed by the //
237 * operator (such as C<shift // 0>).
a687059c 238 */
d68ce4ac 239#define UNI3(f,x,have_x) { \
6154021b 240 pl_yylval.ival = f; \
d68ce4ac 241 if (have_x) PL_expect = x; \
376fcdbf
AL
242 PL_bufptr = s; \
243 PL_last_uni = PL_oldbufptr; \
244 PL_last_lop_op = f; \
245 if (*s == '(') \
246 return REPORT( (int)FUNC1 ); \
294a536f 247 s = skipspace(s); \
376fcdbf
AL
248 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
249 }
d68ce4ac
FC
250#define UNI(f) UNI3(f,XTERM,1)
251#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
252#define UNIPROTO(f,optional) { \
253 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
254 OPERATOR(f); \
255 }
a687059c 256
d68ce4ac 257#define UNIBRACK(f) UNI3(f,0,0)
79072805 258
9f68db38 259/* grandfather return to old style */
78cdf107
Z
260#define OLDLOP(f) \
261 do { \
262 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
263 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
264 pl_yylval.ival = (f); \
265 PL_expect = XTERM; \
266 PL_bufptr = s; \
267 return (int)LSTOP; \
268 } while(0)
79072805 269
83944c01
FC
270#define COPLINE_INC_WITH_HERELINES \
271 STMT_START { \
272 CopLINE_inc(PL_curcop); \
851b527a
FC
273 if (PL_parser->herelines) \
274 CopLINE(PL_curcop) += PL_parser->herelines, \
275 PL_parser->herelines = 0; \
83944c01 276 } STMT_END
ffdb8b16
FC
277/* Called after scan_str to update CopLINE(PL_curcop), but only when there
278 * is no sublex_push to follow. */
279#define COPLINE_SET_FROM_MULTI_END \
280 STMT_START { \
281 CopLINE_set(PL_curcop, PL_multi_end); \
282 if (PL_multi_end != PL_multi_start) \
851b527a 283 PL_parser->herelines = 0; \
ffdb8b16 284 } STMT_END
83944c01
FC
285
286
8fa7f367
JH
287#ifdef DEBUGGING
288
6154021b 289/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
290enum token_type {
291 TOKENTYPE_NONE,
292 TOKENTYPE_IVAL,
6154021b 293 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 294 TOKENTYPE_PVAL,
aeaef349 295 TOKENTYPE_OPVAL
bbf60fe6
DM
296};
297
6d4a66ac
NC
298static struct debug_tokens {
299 const int token;
300 enum token_type type;
301 const char *name;
302} const debug_tokens[] =
9041c2e3 303{
bbf60fe6
DM
304 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
305 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
306 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
307 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
308 { ARROW, TOKENTYPE_NONE, "ARROW" },
309 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
310 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
311 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
312 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
313 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 314 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
315 { DO, TOKENTYPE_NONE, "DO" },
316 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
317 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
318 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
319 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
320 { ELSE, TOKENTYPE_NONE, "ELSE" },
321 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
322 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
323 { FOR, TOKENTYPE_IVAL, "FOR" },
324 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
325 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
326 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
327 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
328 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 329 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
330 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
331 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
332 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 333 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
334 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
335 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 336 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
337 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
338 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
339 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
340 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
341 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
342 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
343 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
344 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
345 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
346 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
347 { OROP, TOKENTYPE_IVAL, "OROP" },
348 { OROR, TOKENTYPE_NONE, "OROR" },
349 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
350 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
351 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 352 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 353 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
354 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
355 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
356 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
357 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
358 { PREINC, TOKENTYPE_NONE, "PREINC" },
359 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 360 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
361 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
362 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 363 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
364 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
365 { SUB, TOKENTYPE_NONE, "SUB" },
366 { THING, TOKENTYPE_OPVAL, "THING" },
367 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
368 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
369 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
370 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
371 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
372 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 373 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
374 { WHILE, TOKENTYPE_IVAL, "WHILE" },
375 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 376 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 377 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
378};
379
6154021b 380/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 381
bbf60fe6 382STATIC int
704d4215 383S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 384{
7918f24d
NC
385 PERL_ARGS_ASSERT_TOKEREPORT;
386
bbf60fe6 387 if (DEBUG_T_TEST) {
bd61b366 388 const char *name = NULL;
bbf60fe6 389 enum token_type type = TOKENTYPE_NONE;
f54cb97a 390 const struct debug_tokens *p;
396482e1 391 SV* const report = newSVpvs("<== ");
bbf60fe6 392
f54cb97a 393 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
394 if (p->token == (int)rv) {
395 name = p->name;
396 type = p->type;
397 break;
398 }
399 }
400 if (name)
54667de8 401 Perl_sv_catpv(aTHX_ report, name);
74736ae6 402 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 403 {
bbf60fe6 404 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
405 if ((char)rv == 'p')
406 sv_catpvs(report, " (pending identifier)");
407 }
bbf60fe6 408 else if (!rv)
396482e1 409 sv_catpvs(report, "EOF");
bbf60fe6
DM
410 else
411 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
412 switch (type) {
413 case TOKENTYPE_NONE:
bbf60fe6
DM
414 break;
415 case TOKENTYPE_IVAL:
704d4215 416 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
417 break;
418 case TOKENTYPE_OPNUM:
419 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 420 PL_op_name[lvalp->ival]);
bbf60fe6
DM
421 break;
422 case TOKENTYPE_PVAL:
704d4215 423 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
424 break;
425 case TOKENTYPE_OPVAL:
704d4215 426 if (lvalp->opval) {
401441c0 427 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
428 PL_op_name[lvalp->opval->op_type]);
429 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 430 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 431 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
432 }
433
434 }
401441c0 435 else
396482e1 436 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
437 break;
438 }
b6007c36 439 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
440 };
441 return (int)rv;
998054bd
SC
442}
443
b6007c36
DM
444
445/* print the buffer with suitable escapes */
446
447STATIC void
15f169a1 448S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 449{
396482e1 450 SV* const tmp = newSVpvs("");
7918f24d
NC
451
452 PERL_ARGS_ASSERT_PRINTBUF;
453
5d37acd6 454 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 455 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 456 GCC_DIAG_RESTORE;
b6007c36
DM
457 SvREFCNT_dec(tmp);
458}
459
8fa7f367
JH
460#endif
461
8290c323
NC
462static int
463S_deprecate_commaless_var_list(pTHX) {
464 PL_expect = XTERM;
465 deprecate("comma-less variable list");
466 return REPORT(','); /* grandfather non-comma-format format */
467}
468
ffb4593c
NT
469/*
470 * S_ao
471 *
f393a21a
FC
472 * This subroutine looks for an '=' next to the operator that has just been
473 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
474 */
475
76e3520e 476STATIC int
cea2e8a9 477S_ao(pTHX_ int toketype)
a0d0e21e 478{
3280af22
NIS
479 if (*PL_bufptr == '=') {
480 PL_bufptr++;
a0d0e21e 481 if (toketype == ANDAND)
6154021b 482 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 483 else if (toketype == OROR)
6154021b 484 pl_yylval.ival = OP_ORASSIGN;
c963b151 485 else if (toketype == DORDOR)
6154021b 486 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
487 toketype = ASSIGNOP;
488 }
489 return toketype;
490}
491
ffb4593c
NT
492/*
493 * S_no_op
494 * When Perl expects an operator and finds something else, no_op
495 * prints the warning. It always prints "<something> found where
496 * operator expected. It prints "Missing semicolon on previous line?"
497 * if the surprise occurs at the start of the line. "do you need to
498 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
499 * where the compiler doesn't know if foo is a method call or a function.
500 * It prints "Missing operator before end of line" if there's nothing
501 * after the missing operator, or "... before <...>" if there is something
502 * after the missing operator.
503 */
504
76e3520e 505STATIC void
15f169a1 506S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 507{
9d4ba2ae
AL
508 char * const oldbp = PL_bufptr;
509 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 510
7918f24d
NC
511 PERL_ARGS_ASSERT_NO_OP;
512
1189a94a
GS
513 if (!s)
514 s = oldbp;
07c798fb 515 else
1189a94a 516 PL_bufptr = s;
734ab321 517 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
518 if (ckWARN_d(WARN_SYNTAX)) {
519 if (is_first)
520 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
521 "\t(Missing semicolon on previous line?)\n");
522 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 523 const char *t;
8a2bca12 524 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 525 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 526 NOOP;
56da5a46
RGS
527 if (t < PL_bufptr && isSPACE(*t))
528 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
529 "\t(Do you need to predeclare %"UTF8f"?)\n",
530 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
531 }
532 else {
533 assert(s >= oldbp);
534 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
535 "\t(Missing operator before %"UTF8f"?)\n",
536 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 537 }
07c798fb 538 }
3280af22 539 PL_bufptr = oldbp;
8990e307
LW
540}
541
ffb4593c
NT
542/*
543 * S_missingterm
544 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 545 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
546 * If we're in a delimited string and the delimiter is a control
547 * character, it's reformatted into a two-char sequence like ^C.
548 * This is fatal.
549 */
550
76e3520e 551STATIC void
cea2e8a9 552S_missingterm(pTHX_ char *s)
8990e307
LW
553{
554 char tmpbuf[3];
555 char q;
556 if (s) {
9d4ba2ae 557 char * const nl = strrchr(s,'\n');
d2719217 558 if (nl)
8990e307
LW
559 *nl = '\0';
560 }
ca8b19a7 561 else if ((U8) PL_multi_close < 32) {
8990e307 562 *tmpbuf = '^';
585ec06d 563 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
564 tmpbuf[2] = '\0';
565 s = tmpbuf;
566 }
567 else {
eb160463 568 *tmpbuf = (char)PL_multi_close;
8990e307
LW
569 tmpbuf[1] = '\0';
570 s = tmpbuf;
571 }
572 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 573 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 574}
79072805 575
dd0ac2b9
FC
576#include "feature.h"
577
0d863452 578/*
0d863452
RH
579 * Check whether the named feature is enabled.
580 */
26ea9e12 581bool
3fff3427 582Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 583{
4a731d7b 584 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
585
586 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
587
588 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 589
26ea9e12
NC
590 if (namelen > MAX_FEATURE_LEN)
591 return FALSE;
3fff3427 592 memcpy(&he_name[8], name, namelen);
7d69d4a6 593
c8ca97b0
NC
594 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
595 REFCOUNTED_HE_EXISTS));
0d863452
RH
596}
597
ffb4593c 598/*
9cbb5ea2
GS
599 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
600 * utf16-to-utf8-reversed.
ffb4593c
NT
601 */
602
c39cd008
GS
603#ifdef PERL_CR_FILTER
604static void
605strip_return(SV *sv)
606{
eb578fdb
KW
607 const char *s = SvPVX_const(sv);
608 const char * const e = s + SvCUR(sv);
7918f24d
NC
609
610 PERL_ARGS_ASSERT_STRIP_RETURN;
611
c39cd008
GS
612 /* outer loop optimized to do nothing if there are no CR-LFs */
613 while (s < e) {
614 if (*s++ == '\r' && *s == '\n') {
615 /* hit a CR-LF, need to copy the rest */
eb578fdb 616 char *d = s - 1;
c39cd008
GS
617 *d++ = *s++;
618 while (s < e) {
619 if (*s == '\r' && s[1] == '\n')
620 s++;
621 *d++ = *s++;
622 }
623 SvCUR(sv) -= s - d;
624 return;
625 }
626 }
627}
a868473f 628
76e3520e 629STATIC I32
c39cd008 630S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 631{
f54cb97a 632 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
633 if (count > 0 && !maxlen)
634 strip_return(sv);
635 return count;
a868473f
NIS
636}
637#endif
638
ffb4593c 639/*
8eaa0acf
Z
640=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
641
642Creates and initialises a new lexer/parser state object, supplying
643a context in which to lex and parse from a new source of Perl code.
644A pointer to the new state object is placed in L</PL_parser>. An entry
645is made on the save stack so that upon unwinding the new state object
646will be destroyed and the former value of L</PL_parser> will be restored.
647Nothing else need be done to clean up the parsing context.
648
649The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
650non-null, provides a string (in SV form) containing code to be parsed.
651A copy of the string is made, so subsequent modification of I<line>
652does not affect parsing. I<rsfp>, if non-null, provides an input stream
653from which code will be read to be parsed. If both are non-null, the
654code in I<line> comes first and must consist of complete lines of input,
655and I<rsfp> supplies the remainder of the source.
656
e368b3bd
FC
657The I<flags> parameter is reserved for future use. Currently it is only
658used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
659
660=cut
661*/
ffb4593c 662
27fcb6ee 663/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
664 can share filters with the current parser.
665 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
666 caller, hence isn't owned by the parser, so shouldn't be closed on parser
667 destruction. This is used to handle the case of defaulting to reading the
668 script from the standard input because no filename was given on the command
669 line (without getting confused by situation where STDIN has been closed, so
670 the script handle is opened on fd 0) */
27fcb6ee 671
a0d0e21e 672void
8eaa0acf 673Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 674{
6ef55633 675 const char *s = NULL;
5486870f 676 yy_parser *parser, *oparser;
60d63348 677 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 678 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
679
680 /* create and initialise a parser */
681
199e78b7 682 Newxz(parser, 1, yy_parser);
5486870f 683 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
684 PL_parser = parser;
685
28ac2b49
Z
686 parser->stack = NULL;
687 parser->ps = NULL;
688 parser->stack_size = 0;
acdf0a21 689
e3abe207
DM
690 /* on scope exit, free this parser and restore any outer one */
691 SAVEPARSER(parser);
7c4baf47 692 parser->saved_curcop = PL_curcop;
e3abe207 693
acdf0a21 694 /* initialise lexer state */
8990e307 695
fb205e7a 696 parser->nexttoke = 0;
ca4cfd28 697 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 698 parser->copline = parser->preambling = NOLINE;
5afb0a62 699 parser->lex_state = LEX_NORMAL;
c2598295 700 parser->expect = XSTATE;
2f9285f8 701 parser->rsfp = rsfp;
27fcb6ee
FC
702 parser->rsfp_filters =
703 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
704 ? NULL
705 : MUTABLE_AV(SvREFCNT_inc(
706 oparser->rsfp_filters
707 ? oparser->rsfp_filters
708 : (oparser->rsfp_filters = newAV())
709 ));
2f9285f8 710
199e78b7
DM
711 Newx(parser->lex_brackstack, 120, char);
712 Newx(parser->lex_casestack, 12, char);
713 *parser->lex_casestack = '\0';
d794b522 714 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 715
10efb74f 716 if (line) {
0528fd32 717 STRLEN len;
10efb74f 718 s = SvPV_const(line, len);
0abcdfa4
FC
719 parser->linestr = flags & LEX_START_COPIED
720 ? SvREFCNT_inc_simple_NN(line)
721 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 722 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 723 } else {
bf1b738b 724 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 725 }
f06b5848
DM
726 parser->oldoldbufptr =
727 parser->oldbufptr =
728 parser->bufptr =
729 parser->linestart = SvPVX(parser->linestr);
730 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
731 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
732
733 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
734 |LEX_DONT_CLOSE_RSFP));
735 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
736 |LEX_DONT_CLOSE_RSFP));
737c24fc 737
60d63348 738 parser->in_pod = parser->filtered = 0;
79072805 739}
a687059c 740
e3abe207
DM
741
742/* delete a parser object */
743
744void
745Perl_parser_free(pTHX_ const yy_parser *parser)
746{
7918f24d
NC
747 PERL_ARGS_ASSERT_PARSER_FREE;
748
7c4baf47 749 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
750 SvREFCNT_dec(parser->linestr);
751
87606032 752 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 753 PerlIO_clearerr(parser->rsfp);
799361c3
SH
754 else if (parser->rsfp && (!parser->old_parser ||
755 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 756 PerlIO_close(parser->rsfp);
5486870f 757 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
758 SvREFCNT_dec(parser->lex_stuff);
759 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
760
761 Safefree(parser->lex_brackstack);
762 Safefree(parser->lex_casestack);
763 Safefree(parser->lex_shared);
764 PL_parser = parser->old_parser;
765 Safefree(parser);
766}
767
768void
769Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
770{
3ac7ff8f 771 I32 nexttoke = parser->nexttoke;
3ac7ff8f 772 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 773 while (nexttoke--) {
3ac7ff8f
FC
774 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
775 && parser->nextval[nexttoke].opval
776 && parser->nextval[nexttoke].opval->op_slabbed
777 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 778 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
779 parser->nextval[nexttoke].opval = NULL;
780 }
3ce3dcd9 781 }
e3abe207
DM
782}
783
784
ffb4593c 785/*
f0e67a1d
Z
786=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
787
788Buffer scalar containing the chunk currently under consideration of the
789text currently being lexed. This is always a plain string scalar (for
790which C<SvPOK> is true). It is not intended to be used as a scalar by
791normal scalar means; instead refer to the buffer directly by the pointer
792variables described below.
793
794The lexer maintains various C<char*> pointers to things in the
795C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
796reallocated, all of these pointers must be updated. Don't attempt to
797do this manually, but rather use L</lex_grow_linestr> if you need to
798reallocate the buffer.
799
800The content of the text chunk in the buffer is commonly exactly one
801complete line of input, up to and including a newline terminator,
802but there are situations where it is otherwise. The octets of the
803buffer may be intended to be interpreted as either UTF-8 or Latin-1.
804The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
805flag on this scalar, which may disagree with it.
806
807For direct examination of the buffer, the variable
808L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
809lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
810of these pointers is usually preferable to examination of the scalar
811through normal scalar means.
812
813=for apidoc AmxU|char *|PL_parser-E<gt>bufend
814
815Direct pointer to the end of the chunk of text currently being lexed, the
816end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 817+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
818always located at the end of the buffer, and does not count as part of
819the buffer's contents.
820
821=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
822
823Points to the current position of lexing inside the lexer buffer.
824Characters around this point may be freely examined, within
825the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
826L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
827interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
828
829Lexing code (whether in the Perl core or not) moves this pointer past
830the characters that it consumes. It is also expected to perform some
831bookkeeping whenever a newline character is consumed. This movement
832can be more conveniently performed by the function L</lex_read_to>,
833which handles newlines appropriately.
834
835Interpretation of the buffer's octets can be abstracted out by
836using the slightly higher-level functions L</lex_peek_unichar> and
837L</lex_read_unichar>.
838
839=for apidoc AmxU|char *|PL_parser-E<gt>linestart
840
841Points to the start of the current line inside the lexer buffer.
842This is useful for indicating at which column an error occurred, and
843not much else. This must be updated by any lexing code that consumes
844a newline; the function L</lex_read_to> handles this detail.
845
846=cut
847*/
848
849/*
850=for apidoc Amx|bool|lex_bufutf8
851
852Indicates whether the octets in the lexer buffer
853(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
854of Unicode characters. If not, they should be interpreted as Latin-1
855characters. This is analogous to the C<SvUTF8> flag for scalars.
856
857In UTF-8 mode, it is not guaranteed that the lexer buffer actually
858contains valid UTF-8. Lexing code must be robust in the face of invalid
859encoding.
860
861The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
862is significant, but not the whole story regarding the input character
863encoding. Normally, when a file is being read, the scalar contains octets
864and its C<SvUTF8> flag is off, but the octets should be interpreted as
865UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
866however, the scalar may have the C<SvUTF8> flag on, and in this case its
867octets should be interpreted as UTF-8 unless the C<use bytes> pragma
868is in effect. This logic may change in the future; use this function
869instead of implementing the logic yourself.
870
871=cut
872*/
873
874bool
875Perl_lex_bufutf8(pTHX)
876{
877 return UTF;
878}
879
880/*
881=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
882
883Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 884at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
885pointer to the reallocated buffer. This is necessary before making
886any direct modification of the buffer that would increase its length.
887L</lex_stuff_pvn> provides a more convenient way to insert text into
888the buffer.
889
890Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
891this function updates all of the lexer's variables that point directly
892into the buffer.
893
894=cut
895*/
896
897char *
898Perl_lex_grow_linestr(pTHX_ STRLEN len)
899{
900 SV *linestr;
901 char *buf;
902 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 903 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
904 linestr = PL_parser->linestr;
905 buf = SvPVX(linestr);
906 if (len <= SvLEN(linestr))
907 return buf;
908 bufend_pos = PL_parser->bufend - buf;
909 bufptr_pos = PL_parser->bufptr - buf;
910 oldbufptr_pos = PL_parser->oldbufptr - buf;
911 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
912 linestart_pos = PL_parser->linestart - buf;
913 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
914 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
915 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
916 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 917
f0e67a1d 918 buf = sv_grow(linestr, len);
c7641931 919
f0e67a1d
Z
920 PL_parser->bufend = buf + bufend_pos;
921 PL_parser->bufptr = buf + bufptr_pos;
922 PL_parser->oldbufptr = buf + oldbufptr_pos;
923 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
924 PL_parser->linestart = buf + linestart_pos;
925 if (PL_parser->last_uni)
926 PL_parser->last_uni = buf + last_uni_pos;
927 if (PL_parser->last_lop)
928 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
929 if (PL_parser->lex_shared->re_eval_start)
930 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
931 return buf;
932}
933
934/*
83aa740e 935=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
936
937Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
938immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
939reallocating the buffer if necessary. This means that lexing code that
940runs later will see the characters as if they had appeared in the input.
941It is not recommended to do this as part of normal parsing, and most
942uses of this facility run the risk of the inserted characters being
943interpreted in an unintended manner.
944
945The string to be inserted is represented by I<len> octets starting
946at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
947according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
948The characters are recoded for the lexer buffer, according to how the
949buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 950to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
951function is more convenient.
952
953=cut
954*/
955
956void
83aa740e 957Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 958{
749123ff 959 dVAR;
f0e67a1d
Z
960 char *bufptr;
961 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
962 if (flags & ~(LEX_STUFF_UTF8))
963 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
964 if (UTF) {
965 if (flags & LEX_STUFF_UTF8) {
966 goto plain_copy;
967 } else {
54d004e8 968 STRLEN highhalf = 0; /* Count of variants */
83aa740e 969 const char *p, *e = pv+len;
54d004e8
KW
970 for (p = pv; p != e; p++) {
971 if (! UTF8_IS_INVARIANT(*p)) {
972 highhalf++;
973 }
974 }
f0e67a1d
Z
975 if (!highhalf)
976 goto plain_copy;
977 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
978 bufptr = PL_parser->bufptr;
979 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
980 SvCUR_set(PL_parser->linestr,
981 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
982 PL_parser->bufend += len+highhalf;
983 for (p = pv; p != e; p++) {
984 U8 c = (U8)*p;
54d004e8
KW
985 if (! UTF8_IS_INVARIANT(c)) {
986 *bufptr++ = UTF8_TWO_BYTE_HI(c);
987 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
988 } else {
989 *bufptr++ = (char)c;
990 }
991 }
992 }
993 } else {
994 if (flags & LEX_STUFF_UTF8) {
995 STRLEN highhalf = 0;
83aa740e 996 const char *p, *e = pv+len;
f0e67a1d
Z
997 for (p = pv; p != e; p++) {
998 U8 c = (U8)*p;
54d004e8 999 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1000 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1001 "non-Latin-1 character into Latin-1 input");
54d004e8 1002 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1003 p++;
1004 highhalf++;
54d004e8 1005 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1006 /* malformed UTF-8 */
1007 ENTER;
1008 SAVESPTR(PL_warnhook);
1009 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1010 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1011 LEAVE;
1012 }
1013 }
1014 if (!highhalf)
1015 goto plain_copy;
1016 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1017 bufptr = PL_parser->bufptr;
1018 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1019 SvCUR_set(PL_parser->linestr,
1020 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1021 PL_parser->bufend += len-highhalf;
54d004e8
KW
1022 p = pv;
1023 while (p < e) {
1024 if (UTF8_IS_INVARIANT(*p)) {
1025 *bufptr++ = *p;
1026 p++;
f0e67a1d 1027 }
54d004e8
KW
1028 else {
1029 assert(p < e -1 );
94bb8c36 1030 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1031 p += 2;
1032 }
f0e67a1d
Z
1033 }
1034 } else {
54d004e8 1035 plain_copy:
f0e67a1d
Z
1036 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1037 bufptr = PL_parser->bufptr;
1038 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1039 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1040 PL_parser->bufend += len;
1041 Copy(pv, bufptr, len, char);
1042 }
1043 }
1044}
1045
1046/*
9dcc53ea
Z
1047=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1048
1049Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1050immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1051reallocating the buffer if necessary. This means that lexing code that
1052runs later will see the characters as if they had appeared in the input.
1053It is not recommended to do this as part of normal parsing, and most
1054uses of this facility run the risk of the inserted characters being
1055interpreted in an unintended manner.
1056
1057The string to be inserted is represented by octets starting at I<pv>
1058and continuing to the first nul. These octets are interpreted as either
1059UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1060in I<flags>. The characters are recoded for the lexer buffer, according
1061to how the buffer is currently being interpreted (L</lex_bufutf8>).
1062If it is not convenient to nul-terminate a string to be inserted, the
1063L</lex_stuff_pvn> function is more appropriate.
1064
1065=cut
1066*/
1067
1068void
1069Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1070{
1071 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1072 lex_stuff_pvn(pv, strlen(pv), flags);
1073}
1074
1075/*
f0e67a1d
Z
1076=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1077
1078Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1079immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1080reallocating the buffer if necessary. This means that lexing code that
1081runs later will see the characters as if they had appeared in the input.
1082It is not recommended to do this as part of normal parsing, and most
1083uses of this facility run the risk of the inserted characters being
1084interpreted in an unintended manner.
1085
1086The string to be inserted is the string value of I<sv>. The characters
1087are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1088being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1089not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1090need to construct a scalar.
1091
1092=cut
1093*/
1094
1095void
1096Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1097{
1098 char *pv;
1099 STRLEN len;
1100 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1101 if (flags)
1102 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1103 pv = SvPV(sv, len);
1104 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1105}
1106
1107/*
1108=for apidoc Amx|void|lex_unstuff|char *ptr
1109
1110Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1111I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1112This hides the discarded text from any lexing code that runs later,
1113as if the text had never appeared.
1114
1115This is not the normal way to consume lexed text. For that, use
1116L</lex_read_to>.
1117
1118=cut
1119*/
1120
1121void
1122Perl_lex_unstuff(pTHX_ char *ptr)
1123{
1124 char *buf, *bufend;
1125 STRLEN unstuff_len;
1126 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1127 buf = PL_parser->bufptr;
1128 if (ptr < buf)
1129 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1130 if (ptr == buf)
1131 return;
1132 bufend = PL_parser->bufend;
1133 if (ptr > bufend)
1134 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1135 unstuff_len = ptr - buf;
1136 Move(ptr, buf, bufend+1-ptr, char);
1137 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1138 PL_parser->bufend = bufend - unstuff_len;
1139}
1140
1141/*
1142=for apidoc Amx|void|lex_read_to|char *ptr
1143
1144Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1145to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1146performing the correct bookkeeping whenever a newline character is passed.
1147This is the normal way to consume lexed text.
1148
1149Interpretation of the buffer's octets can be abstracted out by
1150using the slightly higher-level functions L</lex_peek_unichar> and
1151L</lex_read_unichar>.
1152
1153=cut
1154*/
1155
1156void
1157Perl_lex_read_to(pTHX_ char *ptr)
1158{
1159 char *s;
1160 PERL_ARGS_ASSERT_LEX_READ_TO;
1161 s = PL_parser->bufptr;
1162 if (ptr < s || ptr > PL_parser->bufend)
1163 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1164 for (; s != ptr; s++)
1165 if (*s == '\n') {
83944c01 1166 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1167 PL_parser->linestart = s+1;
1168 }
1169 PL_parser->bufptr = ptr;
1170}
1171
1172/*
1173=for apidoc Amx|void|lex_discard_to|char *ptr
1174
1175Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1176up to I<ptr>. The remaining content of the buffer will be moved, and
1177all pointers into the buffer updated appropriately. I<ptr> must not
1178be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1179it is not permitted to discard text that has yet to be lexed.
1180
1181Normally it is not necessarily to do this directly, because it suffices to
1182use the implicit discarding behaviour of L</lex_next_chunk> and things
1183based on it. However, if a token stretches across multiple lines,
1f317c95 1184and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1185that purpose, then after completion of the token it would be wise to
1186explicitly discard the now-unneeded earlier lines, to avoid future
1187multi-line tokens growing the buffer without bound.
1188
1189=cut
1190*/
1191
1192void
1193Perl_lex_discard_to(pTHX_ char *ptr)
1194{
1195 char *buf;
1196 STRLEN discard_len;
1197 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1198 buf = SvPVX(PL_parser->linestr);
1199 if (ptr < buf)
1200 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1201 if (ptr == buf)
1202 return;
1203 if (ptr > PL_parser->bufptr)
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1205 discard_len = ptr - buf;
1206 if (PL_parser->oldbufptr < ptr)
1207 PL_parser->oldbufptr = ptr;
1208 if (PL_parser->oldoldbufptr < ptr)
1209 PL_parser->oldoldbufptr = ptr;
1210 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1211 PL_parser->last_uni = NULL;
1212 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1213 PL_parser->last_lop = NULL;
1214 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1215 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1216 PL_parser->bufend -= discard_len;
1217 PL_parser->bufptr -= discard_len;
1218 PL_parser->oldbufptr -= discard_len;
1219 PL_parser->oldoldbufptr -= discard_len;
1220 if (PL_parser->last_uni)
1221 PL_parser->last_uni -= discard_len;
1222 if (PL_parser->last_lop)
1223 PL_parser->last_lop -= discard_len;
1224}
1225
1226/*
1227=for apidoc Amx|bool|lex_next_chunk|U32 flags
1228
1229Reads in the next chunk of text to be lexed, appending it to
1230L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1231looked to the end of the current chunk and wants to know more. It is
1232usual, but not necessary, for lexing to have consumed the entirety of
1233the current chunk at this time.
1234
1235If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1236chunk (i.e., the current chunk has been entirely consumed), normally the
1237current chunk will be discarded at the same time that the new chunk is
1238read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1239will not be discarded. If the current chunk has not been entirely
1240consumed, then it will not be discarded regardless of the flag.
1241
1242Returns true if some new text was added to the buffer, or false if the
1243buffer has reached the end of the input text.
1244
1245=cut
1246*/
1247
1248#define LEX_FAKE_EOF 0x80000000
112d1284 1249#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1250
1251bool
1252Perl_lex_next_chunk(pTHX_ U32 flags)
1253{
1254 SV *linestr;
1255 char *buf;
1256 STRLEN old_bufend_pos, new_bufend_pos;
1257 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1258 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1259 bool got_some_for_debugger = 0;
f0e67a1d 1260 bool got_some;
112d1284 1261 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1262 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1263 linestr = PL_parser->linestr;
1264 buf = SvPVX(linestr);
1265 if (!(flags & LEX_KEEP_PREVIOUS) &&
1266 PL_parser->bufptr == PL_parser->bufend) {
1267 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1268 linestart_pos = 0;
1269 if (PL_parser->last_uni != PL_parser->bufend)
1270 PL_parser->last_uni = NULL;
1271 if (PL_parser->last_lop != PL_parser->bufend)
1272 PL_parser->last_lop = NULL;
1273 last_uni_pos = last_lop_pos = 0;
1274 *buf = 0;
1275 SvCUR(linestr) = 0;
1276 } else {
1277 old_bufend_pos = PL_parser->bufend - buf;
1278 bufptr_pos = PL_parser->bufptr - buf;
1279 oldbufptr_pos = PL_parser->oldbufptr - buf;
1280 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1281 linestart_pos = PL_parser->linestart - buf;
1282 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1283 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1284 }
1285 if (flags & LEX_FAKE_EOF) {
1286 goto eof;
60d63348 1287 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1288 got_some = 0;
1289 } else if (filter_gets(linestr, old_bufend_pos)) {
1290 got_some = 1;
17cc9359 1291 got_some_for_debugger = 1;
112d1284
FC
1292 } else if (flags & LEX_NO_TERM) {
1293 got_some = 0;
f0e67a1d 1294 } else {
580561a3
Z
1295 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1296 sv_setpvs(linestr, "");
f0e67a1d
Z
1297 eof:
1298 /* End of real input. Close filehandle (unless it was STDIN),
1299 * then add implicit termination.
1300 */
87606032 1301 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1302 PerlIO_clearerr(PL_parser->rsfp);
1303 else if (PL_parser->rsfp)
1304 (void)PerlIO_close(PL_parser->rsfp);
1305 PL_parser->rsfp = NULL;
60d63348 1306 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1307 if (!PL_in_eval && PL_minus_p) {
1308 sv_catpvs(linestr,
1309 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1310 PL_minus_n = PL_minus_p = 0;
1311 } else if (!PL_in_eval && PL_minus_n) {
1312 sv_catpvs(linestr, /*{*/";}");
1313 PL_minus_n = 0;
1314 } else
1315 sv_catpvs(linestr, ";");
1316 got_some = 1;
1317 }
1318 buf = SvPVX(linestr);
1319 new_bufend_pos = SvCUR(linestr);
1320 PL_parser->bufend = buf + new_bufend_pos;
1321 PL_parser->bufptr = buf + bufptr_pos;
1322 PL_parser->oldbufptr = buf + oldbufptr_pos;
1323 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1324 PL_parser->linestart = buf + linestart_pos;
1325 if (PL_parser->last_uni)
1326 PL_parser->last_uni = buf + last_uni_pos;
1327 if (PL_parser->last_lop)
1328 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1329 if (PL_parser->preambling != NOLINE) {
1330 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1331 PL_parser->preambling = NOLINE;
1332 }
17cc9359 1333 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1334 PL_curstash != PL_debstash) {
1335 /* debugger active and we're not compiling the debugger code,
1336 * so store the line into the debugger's array of lines
1337 */
1338 update_debugger_info(NULL, buf+old_bufend_pos,
1339 new_bufend_pos-old_bufend_pos);
1340 }
1341 return got_some;
1342}
1343
1344/*
1345=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1346
1347Looks ahead one (Unicode) character in the text currently being lexed.
1348Returns the codepoint (unsigned integer value) of the next character,
1349or -1 if lexing has reached the end of the input text. To consume the
1350peeked character, use L</lex_read_unichar>.
1351
1352If the next character is in (or extends into) the next chunk of input
1353text, the next chunk will be read in. Normally the current chunk will be
1354discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1355then the current chunk will not be discarded.
1356
1357If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1358is encountered, an exception is generated.
1359
1360=cut
1361*/
1362
1363I32
1364Perl_lex_peek_unichar(pTHX_ U32 flags)
1365{
749123ff 1366 dVAR;
f0e67a1d
Z
1367 char *s, *bufend;
1368 if (flags & ~(LEX_KEEP_PREVIOUS))
1369 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1370 s = PL_parser->bufptr;
1371 bufend = PL_parser->bufend;
1372 if (UTF) {
1373 U8 head;
1374 I32 unichar;
1375 STRLEN len, retlen;
1376 if (s == bufend) {
1377 if (!lex_next_chunk(flags))
1378 return -1;
1379 s = PL_parser->bufptr;
1380 bufend = PL_parser->bufend;
1381 }
1382 head = (U8)*s;
54d004e8 1383 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1384 return head;
54d004e8
KW
1385 if (UTF8_IS_START(head)) {
1386 len = UTF8SKIP(&head);
f0e67a1d
Z
1387 while ((STRLEN)(bufend-s) < len) {
1388 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1389 break;
1390 s = PL_parser->bufptr;
1391 bufend = PL_parser->bufend;
1392 }
1393 }
c80e42f3 1394 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1395 if (retlen == (STRLEN)-1) {
1396 /* malformed UTF-8 */
1397 ENTER;
1398 SAVESPTR(PL_warnhook);
1399 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1400 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1401 LEAVE;
1402 }
1403 return unichar;
1404 } else {
1405 if (s == bufend) {
1406 if (!lex_next_chunk(flags))
1407 return -1;
1408 s = PL_parser->bufptr;
1409 }
1410 return (U8)*s;
1411 }
1412}
1413
1414/*
1415=for apidoc Amx|I32|lex_read_unichar|U32 flags
1416
1417Reads the next (Unicode) character in the text currently being lexed.
1418Returns the codepoint (unsigned integer value) of the character read,
1419and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1420if lexing has reached the end of the input text. To non-destructively
1421examine the next character, use L</lex_peek_unichar> instead.
1422
1423If the next character is in (or extends into) the next chunk of input
1424text, the next chunk will be read in. Normally the current chunk will be
1425discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1426then the current chunk will not be discarded.
1427
1428If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1429is encountered, an exception is generated.
1430
1431=cut
1432*/
1433
1434I32
1435Perl_lex_read_unichar(pTHX_ U32 flags)
1436{
1437 I32 c;
1438 if (flags & ~(LEX_KEEP_PREVIOUS))
1439 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1440 c = lex_peek_unichar(flags);
1441 if (c != -1) {
1442 if (c == '\n')
83944c01 1443 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1444 if (UTF)
1445 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1446 else
1447 ++(PL_parser->bufptr);
f0e67a1d
Z
1448 }
1449 return c;
1450}
1451
1452/*
1453=for apidoc Amx|void|lex_read_space|U32 flags
1454
1455Reads optional spaces, in Perl style, in the text currently being
1456lexed. The spaces may include ordinary whitespace characters and
1457Perl-style comments. C<#line> directives are processed if encountered.
1458L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1459at a non-space character (or the end of the input text).
1460
1461If spaces extend into the next chunk of input text, the next chunk will
1462be read in. Normally the current chunk will be discarded at the same
1463time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1464chunk will not be discarded.
1465
1466=cut
1467*/
1468
21791330 1469#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1470#define LEX_NO_NEXT_CHUNK 0x80000000
1471
f0e67a1d
Z
1472void
1473Perl_lex_read_space(pTHX_ U32 flags)
1474{
1475 char *s, *bufend;
21791330 1476 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1477 bool need_incline = 0;
21791330 1478 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1479 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1480 s = PL_parser->bufptr;
1481 bufend = PL_parser->bufend;
1482 while (1) {
1483 char c = *s;
1484 if (c == '#') {
1485 do {
1486 c = *++s;
1487 } while (!(c == '\n' || (c == 0 && s == bufend)));
1488 } else if (c == '\n') {
1489 s++;
21791330
FC
1490 if (can_incline) {
1491 PL_parser->linestart = s;
1492 if (s == bufend)
1493 need_incline = 1;
1494 else
1495 incline(s);
1496 }
f0e67a1d
Z
1497 } else if (isSPACE(c)) {
1498 s++;
1499 } else if (c == 0 && s == bufend) {
1500 bool got_more;
65c68e17 1501 line_t l;
f0998909
Z
1502 if (flags & LEX_NO_NEXT_CHUNK)
1503 break;
f0e67a1d 1504 PL_parser->bufptr = s;
65c68e17 1505 l = CopLINE(PL_curcop);
851b527a 1506 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1507 got_more = lex_next_chunk(flags);
65c68e17 1508 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1509 s = PL_parser->bufptr;
1510 bufend = PL_parser->bufend;
1511 if (!got_more)
1512 break;
21791330 1513 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1514 incline(s);
1515 need_incline = 0;
1516 }
1517 } else {
1518 break;
1519 }
1520 }
f0e67a1d
Z
1521 PL_parser->bufptr = s;
1522}
1523
1524/*
fe788d6b
PM
1525
1526=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1527
1528This function performs syntax checking on a prototype, C<proto>.
1529If C<warn> is true, any illegal characters or mismatched brackets
1530will trigger illegalproto warnings, declaring that they were
1531detected in the prototype for C<name>.
1532
1533The return value is C<true> if this is a valid prototype, and
1534C<false> if it is not, regardless of whether C<warn> was C<true> or
1535C<false>.
1536
1537Note that C<NULL> is a valid C<proto> and will always return C<true>.
1538
1539=cut
1540
1541 */
1542
1543bool
1544Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1545{
1546 STRLEN len, origlen;
1547 char *p = proto ? SvPV(proto, len) : NULL;
1548 bool bad_proto = FALSE;
1549 bool in_brackets = FALSE;
1550 bool after_slash = FALSE;
1551 char greedy_proto = ' ';
1552 bool proto_after_greedy_proto = FALSE;
1553 bool must_be_last = FALSE;
1554 bool underscore = FALSE;
f791a21a 1555 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1556
1557 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1558
1559 if (!proto)
1560 return TRUE;
1561
1562 origlen = len;
1563 for (; len--; p++) {
1564 if (!isSPACE(*p)) {
1565 if (must_be_last)
1566 proto_after_greedy_proto = TRUE;
f791a21a
PM
1567 if (underscore) {
1568 if (!strchr(";@%", *p))
1569 bad_proto_after_underscore = TRUE;
1570 underscore = FALSE;
1571 }
fe788d6b
PM
1572 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1573 bad_proto = TRUE;
1574 }
1575 else {
fe788d6b
PM
1576 if (*p == '[')
1577 in_brackets = TRUE;
1578 else if (*p == ']')
1579 in_brackets = FALSE;
1580 else if ((*p == '@' || *p == '%') &&
1581 !after_slash &&
1582 !in_brackets ) {
1583 must_be_last = TRUE;
1584 greedy_proto = *p;
1585 }
1586 else if (*p == '_')
f791a21a 1587 underscore = TRUE;
fe788d6b
PM
1588 }
1589 if (*p == '\\')
1590 after_slash = TRUE;
1591 else
1592 after_slash = FALSE;
1593 }
1594 }
1595
1596 if (warn) {
b54d603d 1597 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1598 p -= origlen;
b54d603d
PM
1599 p = SvUTF8(proto)
1600 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1601 origlen, UNI_DISPLAY_ISPRINT)
1602 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1603
fe788d6b
PM
1604 if (proto_after_greedy_proto)
1605 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1606 "Prototype after '%c' for %"SVf" : %s",
1607 greedy_proto, SVfARG(name), p);
50278ed0
PM
1608 if (in_brackets)
1609 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1610 "Missing ']' in prototype for %"SVf" : %s",
1611 SVfARG(name), p);
b54d603d 1612 if (bad_proto)
fe788d6b 1613 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1614 "Illegal character in prototype for %"SVf" : %s",
1615 SVfARG(name), p);
1616 if (bad_proto_after_underscore)
1617 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1618 "Illegal character after '_' in prototype for %"SVf" : %s",
1619 SVfARG(name), p);
fe788d6b
PM
1620 }
1621
1622 return (! (proto_after_greedy_proto || bad_proto) );
1623}
1624
1625/*
ffb4593c
NT
1626 * S_incline
1627 * This subroutine has nothing to do with tilting, whether at windmills
1628 * or pinball tables. Its name is short for "increment line". It
57843af0 1629 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1630 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1631 * # line 500 "foo.pm"
1632 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1633 */
1634
76e3520e 1635STATIC void
d9095cec 1636S_incline(pTHX_ const char *s)
463ee0b2 1637{
d9095cec
NC
1638 const char *t;
1639 const char *n;
1640 const char *e;
8818d409 1641 line_t line_num;
463ee0b2 1642
7918f24d
NC
1643 PERL_ARGS_ASSERT_INCLINE;
1644
83944c01 1645 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1646 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1647 && s+1 == PL_bufend && *s == ';') {
1648 /* fake newline in string eval */
1649 CopLINE_dec(PL_curcop);
1650 return;
1651 }
463ee0b2
LW
1652 if (*s++ != '#')
1653 return;
d4c19fe8
AL
1654 while (SPACE_OR_TAB(*s))
1655 s++;
73659bf1
GS
1656 if (strnEQ(s, "line", 4))
1657 s += 4;
1658 else
1659 return;
084592ab 1660 if (SPACE_OR_TAB(*s))
73659bf1 1661 s++;
4e553d73 1662 else
73659bf1 1663 return;
d4c19fe8
AL
1664 while (SPACE_OR_TAB(*s))
1665 s++;
463ee0b2
LW
1666 if (!isDIGIT(*s))
1667 return;
d4c19fe8 1668
463ee0b2
LW
1669 n = s;
1670 while (isDIGIT(*s))
1671 s++;
07714eb4 1672 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1673 return;
bf4acbe4 1674 while (SPACE_OR_TAB(*s))
463ee0b2 1675 s++;
73659bf1 1676 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1677 s++;
73659bf1
GS
1678 e = t + 1;
1679 }
463ee0b2 1680 else {
c35e046a
AL
1681 t = s;
1682 while (!isSPACE(*t))
1683 t++;
73659bf1 1684 e = t;
463ee0b2 1685 }
bf4acbe4 1686 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1687 e++;
1688 if (*e != '\n' && *e != '\0')
1689 return; /* false alarm */
1690
96e440d2 1691 line_num = grok_atou(n, &e) - 1;
8818d409 1692
f4dd75d9 1693 if (t - s > 0) {
d9095cec 1694 const STRLEN len = t - s;
3df32bda 1695
d36ee5be 1696 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1697 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1698 * to *{"::_<newfilename"} */
44867030
NC
1699 /* However, the long form of evals is only turned on by the
1700 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1701 GV * const cfgv = CopFILEGV(PL_curcop);
1702 if (cfgv) {
38bd7ad8
FC
1703 char smallbuf[128];
1704 STRLEN tmplen2 = len;
44867030 1705 char *tmpbuf2;
449dd039 1706 GV *gv2;
44867030
NC
1707
1708 if (tmplen2 + 2 <= sizeof smallbuf)
1709 tmpbuf2 = smallbuf;
1710 else
1711 Newx(tmpbuf2, tmplen2 + 2, char);
1712
38bd7ad8
FC
1713 tmpbuf2[0] = '_';
1714 tmpbuf2[1] = '<';
44867030
NC
1715
1716 memcpy(tmpbuf2 + 2, s, tmplen2);
1717 tmplen2 += 2;
1718
8a5ee598 1719 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1720 if (!isGV(gv2)) {
8a5ee598 1721 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1722 /* adjust ${"::_<newfilename"} to store the new file name */
1723 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1724 /* The line number may differ. If that is the case,
1725 alias the saved lines that are in the array.
1726 Otherwise alias the whole array. */
1727 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1728 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1729 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1730 }
38bd7ad8
FC
1731 else if (GvAV(cfgv)) {
1732 AV * const av = GvAV(cfgv);
8818d409
FC
1733 const I32 start = CopLINE(PL_curcop)+1;
1734 I32 items = AvFILLp(av) - start;
1735 if (items > 0) {
1736 AV * const av2 = GvAVn(gv2);
1737 SV **svp = AvARRAY(av) + start;
1738 I32 l = (I32)line_num+1;
1739 while (items--)
1740 av_store(av2, l++, SvREFCNT_inc(*svp++));
1741 }
1742 }
e5527e4b 1743 }
44867030
NC
1744
1745 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1746 }
e66cf94c 1747 }
05ec9bb3 1748 CopFILE_free(PL_curcop);
449dd039 1749 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1750 }
8818d409 1751 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1752}
1753
21791330
FC
1754#define skipspace(s) skipspace_flags(s, 0)
1755
29595ff2 1756
80a702cd 1757STATIC void
15f169a1 1758S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1759{
1760 AV *av = CopFILEAVx(PL_curcop);
1761 if (av) {
7f1c3e8c
FC
1762 SV * sv;
1763 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1764 else {
1765 sv = *av_fetch(av, 0, 1);
1766 SvUPGRADE(sv, SVt_PVMG);
1767 }
1768 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1769 if (orig_sv)
7f1c3e8c 1770 sv_catsv(sv, orig_sv);
5fa550fb 1771 else
7f1c3e8c
FC
1772 sv_catpvn(sv, buf, len);
1773 if (!SvIOK(sv)) {
1774 (void)SvIOK_on(sv);
1775 SvIV_set(sv, 0);
1776 }
1777 if (PL_parser->preambling == NOLINE)
1778 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1779 }
1780}
1781
ffb4593c
NT
1782/*
1783 * S_skipspace
1784 * Called to gobble the appropriate amount and type of whitespace.
1785 * Skips comments as well.
1786 */
1787
76e3520e 1788STATIC char *
21791330 1789S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1790{
21791330 1791 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1792 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1793 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1794 s++;
f0e67a1d
Z
1795 } else {
1796 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1797 PL_bufptr = s;
21791330 1798 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1799 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1800 LEX_NO_NEXT_CHUNK : 0));
3280af22 1801 s = PL_bufptr;
f0e67a1d
Z
1802 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1803 if (PL_linestart > PL_bufptr)
1804 PL_bufptr = PL_linestart;
1805 return s;
463ee0b2 1806 }
5db06880 1807 return s;
a687059c 1808}
378cc40b 1809
ffb4593c
NT
1810/*
1811 * S_check_uni
1812 * Check the unary operators to ensure there's no ambiguity in how they're
1813 * used. An ambiguous piece of code would be:
1814 * rand + 5
1815 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1816 * the +5 is its argument.
1817 */
1818
76e3520e 1819STATIC void
cea2e8a9 1820S_check_uni(pTHX)
ba106d47 1821{
d4c19fe8
AL
1822 const char *s;
1823 const char *t;
2f3197b3 1824
3280af22 1825 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1826 return;
3280af22
NIS
1827 while (isSPACE(*PL_last_uni))
1828 PL_last_uni++;
c35e046a 1829 s = PL_last_uni;
8a2bca12 1830 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1831 s++;
3280af22 1832 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1833 return;
6136c704 1834
9b387841
NC
1835 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1836 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1837 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1838}
1839
ffb4593c
NT
1840/*
1841 * LOP : macro to build a list operator. Its behaviour has been replaced
1842 * with a subroutine, S_lop() for which LOP is just another name.
1843 */
1844
a0d0e21e
LW
1845#define LOP(f,x) return lop(f,x,s)
1846
ffb4593c
NT
1847/*
1848 * S_lop
1849 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1850 * - if we have a next token, then it's a list operator (no parens) for
1851 * which the next token has already been parsed; e.g.,
1852 * sort foo @args
1853 * sort foo (@args)
ffb4593c
NT
1854 * - if the next thing is an opening paren, then it's a function
1855 * - else it's a list operator
1856 */
1857
76e3520e 1858STATIC I32
a0be28da 1859S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1860{
7918f24d
NC
1861 PERL_ARGS_ASSERT_LOP;
1862
6154021b 1863 pl_yylval.ival = f;
35c8bce7 1864 CLINE;
3280af22
NIS
1865 PL_bufptr = s;
1866 PL_last_lop = PL_oldbufptr;
eb160463 1867 PL_last_lop_op = (OPCODE)f;
3280af22 1868 if (PL_nexttoke)
78cdf107 1869 goto lstop;
19f1898a 1870 PL_expect = x;
79072805 1871 if (*s == '(')
bbf60fe6 1872 return REPORT(FUNC);
294a536f 1873 s = skipspace(s);
79072805 1874 if (*s == '(')
bbf60fe6 1875 return REPORT(FUNC);
78cdf107
Z
1876 else {
1877 lstop:
1878 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1879 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1880 return REPORT(LSTOP);
78cdf107 1881 }
79072805
LW
1882}
1883
ffb4593c
NT
1884/*
1885 * S_force_next
9cbb5ea2 1886 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1887 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1888 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1889 * will need to set PL_nextval[] and possibly PL_expect to ensure
1890 * the lexer handles the token correctly.
ffb4593c
NT
1891 */
1892
4e553d73 1893STATIC void
cea2e8a9 1894S_force_next(pTHX_ I32 type)
79072805 1895{
704d4215
GG
1896#ifdef DEBUGGING
1897 if (DEBUG_T_TEST) {
1898 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1899 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1900 }
1901#endif
3280af22
NIS
1902 PL_nexttype[PL_nexttoke] = type;
1903 PL_nexttoke++;
1904 if (PL_lex_state != LEX_KNOWNEXT) {
1905 PL_lex_defer = PL_lex_state;
3280af22 1906 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1907 }
1908}
1909
89f35911
FC
1910/*
1911 * S_postderef
1912 *
1913 * This subroutine handles postfix deref syntax after the arrow has already
1914 * been emitted. @* $* etc. are emitted as two separate token right here.
1915 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1916 * only the first, leaving yylex to find the next.
89f35911
FC
1917 */
1918
1919static int
ff25e5db 1920S_postderef(pTHX_ int const funny, char const next)
89f35911 1921{
ff25e5db 1922 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1923 assert(strchr("*[{", next));
1924 if (next == '*') {
1925 PL_expect = XOPERATOR;
cc624add 1926 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1927 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1928 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1929 force_next(POSTJOIN);
1930 }
89f35911
FC
1931 force_next(next);
1932 PL_bufptr+=2;
1933 }
1934 else {
760ca746
FC
1935 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1936 && !PL_lex_brackets)
1937 PL_lex_dojoin = 2;
89f35911
FC
1938 PL_expect = XOPERATOR;
1939 PL_bufptr++;
1940 }
1941 return funny;
1942}
1943
28ac2b49
Z
1944void
1945Perl_yyunlex(pTHX)
1946{
a7aaec61
Z
1947 int yyc = PL_parser->yychar;
1948 if (yyc != YYEMPTY) {
1949 if (yyc) {
a7aaec61
Z
1950 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1951 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1952 PL_lex_allbrackets--;
a7aaec61 1953 PL_lex_brackets--;
78cdf107
Z
1954 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1955 } else if (yyc == '('/*)*/) {
1956 PL_lex_allbrackets--;
1957 yyc |= (2<<24);
a7aaec61
Z
1958 }
1959 force_next(yyc);
1960 }
28ac2b49
Z
1961 PL_parser->yychar = YYEMPTY;
1962 }
1963}
1964
d0a148a6 1965STATIC SV *
15f169a1 1966S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1967{
740cce10 1968 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1969 !IN_BYTES
1970 && UTF
1971 && !is_ascii_string((const U8*)start, len)
740cce10 1972 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1973 return sv;
1974}
1975
ffb4593c
NT
1976/*
1977 * S_force_word
1978 * When the lexer knows the next thing is a word (for instance, it has
1979 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1980 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1981 * lookahead.
ffb4593c
NT
1982 *
1983 * Arguments:
b1b65b59 1984 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1985 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1986 * int check_keyword : if true, Perl checks to make sure the word isn't
1987 * a keyword (do this if the word is a label, e.g. goto FOO)
1988 * int allow_pack : if true, : characters will also be allowed (require,
1989 * use, etc. do this)
9cbb5ea2 1990 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1991 */
1992
76e3520e 1993STATIC char *
345b3785 1994S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 1995{
eb578fdb 1996 char *s;
463ee0b2 1997 STRLEN len;
4e553d73 1998
7918f24d
NC
1999 PERL_ARGS_ASSERT_FORCE_WORD;
2000
294a536f 2001 start = skipspace(start);
463ee0b2 2002 s = start;
7e2040f0 2003 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2004 (allow_pack && *s == ':') )
a0d0e21e 2005 {
3280af22 2006 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2007 if (check_keyword) {
2008 char *s2 = PL_tokenbuf;
2009 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2010 s2 += 6, len -= 6;
2011 if (keyword(s2, len, 0))
463ee0b2 2012 return start;
01b5ef50 2013 }
463ee0b2 2014 if (token == METHOD) {
294a536f 2015 s = skipspace(s);
463ee0b2 2016 if (*s == '(')
3280af22 2017 PL_expect = XTERM;
463ee0b2 2018 else {
3280af22 2019 PL_expect = XOPERATOR;
463ee0b2 2020 }
79072805 2021 }
9ded7720 2022 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2023 = (OP*)newSVOP(OP_CONST,0,
2024 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2025 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2026 force_next(token);
2027 }
2028 return s;
2029}
2030
ffb4593c
NT
2031/*
2032 * S_force_ident
9cbb5ea2 2033 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2034 * text only contains the "foo" portion. The first argument is a pointer
2035 * to the "foo", and the second argument is the type symbol to prefix.
2036 * Forces the next token to be a "WORD".
9cbb5ea2 2037 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2038 */
2039
76e3520e 2040STATIC void
5aaab254 2041S_force_ident(pTHX_ const char *s, int kind)
79072805 2042{
7918f24d
NC
2043 PERL_ARGS_ASSERT_FORCE_IDENT;
2044
c9b48522
DD
2045 if (s[0]) {
2046 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2047 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2048 UTF ? SVf_UTF8 : 0));
9ded7720 2049 NEXTVAL_NEXTTOKE.opval = o;
79072805 2050 force_next(WORD);
748a9306 2051 if (kind) {
11343788 2052 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2053 /* XXX see note in pp_entereval() for why we forgo typo
2054 warnings if the symbol must be introduced in an eval.
2055 GSAR 96-10-12 */
90e5519e 2056 gv_fetchpvn_flags(s, len,
4bff32c5 2057 (PL_in_eval ? GV_ADDMULTI
728847b1 2058 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2059 kind == '$' ? SVt_PV :
2060 kind == '@' ? SVt_PVAV :
2061 kind == '%' ? SVt_PVHV :
a0d0e21e 2062 SVt_PVGV
90e5519e 2063 );
748a9306 2064 }
79072805
LW
2065 }
2066}
2067
3f33d153
FC
2068static void
2069S_force_ident_maybe_lex(pTHX_ char pit)
2070{
3f33d153
FC
2071 NEXTVAL_NEXTTOKE.ival = pit;
2072 force_next('p');
2073}
2074
1571675a
GS
2075NV
2076Perl_str_to_version(pTHX_ SV *sv)
2077{
2078 NV retval = 0.0;
2079 NV nshift = 1.0;
2080 STRLEN len;
cfd0369c 2081 const char *start = SvPV_const(sv,len);
9d4ba2ae 2082 const char * const end = start + len;
504618e9 2083 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2084
2085 PERL_ARGS_ASSERT_STR_TO_VERSION;
2086
1571675a 2087 while (start < end) {
ba210ebe 2088 STRLEN skip;
1571675a
GS
2089 UV n;
2090 if (utf)
9041c2e3 2091 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2092 else {
2093 n = *(U8*)start;
2094 skip = 1;
2095 }
2096 retval += ((NV)n)/nshift;
2097 start += skip;
2098 nshift *= 1000;
2099 }
2100 return retval;
2101}
2102
4e553d73 2103/*
ffb4593c
NT
2104 * S_force_version
2105 * Forces the next token to be a version number.
e759cc13
RGS
2106 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2107 * and if "guessing" is TRUE, then no new token is created (and the caller
2108 * must use an alternative parsing method).
ffb4593c
NT
2109 */
2110
76e3520e 2111STATIC char *
e759cc13 2112S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2113{
5f66b61c 2114 OP *version = NULL;
44dcb63b 2115 char *d;
89bfa8cd 2116
7918f24d
NC
2117 PERL_ARGS_ASSERT_FORCE_VERSION;
2118
294a536f 2119 s = skipspace(s);
89bfa8cd 2120
44dcb63b 2121 d = s;
dd629d5b 2122 if (*d == 'v')
44dcb63b 2123 d++;
44dcb63b 2124 if (isDIGIT(*d)) {
e759cc13
RGS
2125 while (isDIGIT(*d) || *d == '_' || *d == '.')
2126 d++;
4e4da3ac 2127 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2128 SV *ver;
6154021b
RGS
2129 s = scan_num(s, &pl_yylval);
2130 version = pl_yylval.opval;
dd629d5b
GS
2131 ver = cSVOPx(version)->op_sv;
2132 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2133 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2134 SvNV_set(ver, str_to_version(ver));
1571675a 2135 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2136 }
89bfa8cd 2137 }
5db06880 2138 else if (guessing) {
e759cc13 2139 return s;
5db06880 2140 }
89bfa8cd
PP
2141 }
2142
2143 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2144 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2145 force_next(WORD);
89bfa8cd 2146
e759cc13 2147 return s;
89bfa8cd
PP
2148}
2149
ffb4593c 2150/*
91152fc1
DG
2151 * S_force_strict_version
2152 * Forces the next token to be a version number using strict syntax rules.
2153 */
2154
2155STATIC char *
2156S_force_strict_version(pTHX_ char *s)
2157{
91152fc1 2158 OP *version = NULL;
91152fc1
DG
2159 const char *errstr = NULL;
2160
2161 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2162
2163 while (isSPACE(*s)) /* leading whitespace */
2164 s++;
2165
2166 if (is_STRICT_VERSION(s,&errstr)) {
2167 SV *ver = newSV(0);
2168 s = (char *)scan_version(s, ver, 0);
2169 version = newSVOP(OP_CONST, 0, ver);
2170 }
4e4da3ac 2171 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
294a536f 2172 (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2173 {
91152fc1
DG
2174 PL_bufptr = s;
2175 if (errstr)
2176 yyerror(errstr); /* version required */
2177 return s;
2178 }
2179
91152fc1 2180 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2181 NEXTVAL_NEXTTOKE.opval = version;
2182 force_next(WORD);
2183
2184 return s;
2185}
2186
2187/*
ffb4593c
NT
2188 * S_tokeq
2189 * Tokenize a quoted string passed in as an SV. It finds the next
2190 * chunk, up to end of string or a backslash. It may make a new
2191 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2192 * turns \\ into \.
2193 */
2194
76e3520e 2195STATIC SV *
cea2e8a9 2196S_tokeq(pTHX_ SV *sv)
79072805 2197{
eb578fdb
KW
2198 char *s;
2199 char *send;
2200 char *d;
b3ac6de7 2201 SV *pv = sv;
79072805 2202
7918f24d
NC
2203 PERL_ARGS_ASSERT_TOKEQ;
2204
279b35ad
FC
2205 assert (SvPOK(sv));
2206 assert (SvLEN(sv));
2207 assert (!SvIsCOW(sv));
307ed071 2208 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2209 goto finish;
279b35ad
FC
2210 s = SvPVX(sv);
2211 send = SvEND(sv);
dcb21ed6
NC
2212 /* This is relying on the SV being "well formed" with a trailing '\0' */
2213 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2214 s++;
2215 if (s == send)
b3ac6de7 2216 goto finish;
79072805 2217 d = s;
be4731d2 2218 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2219 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2220 SVs_TEMP | SvUTF8(sv));
be4731d2 2221 }
79072805
LW
2222 while (s < send) {
2223 if (*s == '\\') {
a0d0e21e 2224 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2225 s++; /* all that, just for this */
2226 }
2227 *d++ = *s++;
2228 }
2229 *d = '\0';
95a20fc0 2230 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2231 finish:
3280af22 2232 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2233 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2234 return sv;
2235}
2236
ffb4593c
NT
2237/*
2238 * Now come three functions related to double-quote context,
2239 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2240 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2241 * interact with PL_lex_state, and create fake ( ... ) argument lists
2242 * to handle functions and concatenation.
ecd24171
DM
2243 * For example,
2244 * "foo\lbar"
2245 * is tokenised as
2246 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2247 */
2248
2249/*
2250 * S_sublex_start
6154021b 2251 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2252 *
2253 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2254 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2255 *
2256 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2257 *
2258 * Everything else becomes a FUNC.
2259 *
2260 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2261 * had an OP_CONST or OP_READLINE). This just sets us up for a
2262 * call to S_sublex_push().
2263 */
2264
76e3520e 2265STATIC I32
cea2e8a9 2266S_sublex_start(pTHX)
79072805 2267{
eb578fdb 2268 const I32 op_type = pl_yylval.ival;
79072805
LW
2269
2270 if (op_type == OP_NULL) {
6154021b 2271 pl_yylval.opval = PL_lex_op;
5f66b61c 2272 PL_lex_op = NULL;
79072805
LW
2273 return THING;
2274 }
466112bb 2275 if (op_type == OP_CONST) {
3280af22 2276 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2277
2278 if (SvTYPE(sv) == SVt_PVIV) {
2279 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2280 STRLEN len;
96a5add6 2281 const char * const p = SvPV_const(sv, len);
740cce10 2282 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2283 SvREFCNT_dec(sv);
2284 sv = nsv;
4e553d73 2285 }
6154021b 2286 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2287 PL_lex_stuff = NULL;
79072805
LW
2288 return THING;
2289 }
2290
3280af22 2291 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2292 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2293 PL_sublex_info.sub_op = PL_lex_op;
2294 PL_lex_state = LEX_INTERPPUSH;
55497cff 2295
3280af22
NIS
2296 PL_expect = XTERM;
2297 if (PL_lex_op) {
6154021b 2298 pl_yylval.opval = PL_lex_op;
5f66b61c 2299 PL_lex_op = NULL;
55497cff
PP
2300 return PMFUNC;
2301 }
2302 else
2303 return FUNC;
2304}
2305
ffb4593c
NT
2306/*
2307 * S_sublex_push
2308 * Create a new scope to save the lexing state. The scope will be
2309 * ended in S_sublex_done. Returns a '(', starting the function arguments
2310 * to the uc, lc, etc. found before.
2311 * Sets PL_lex_state to LEX_INTERPCONCAT.
2312 */
2313
76e3520e 2314STATIC I32
cea2e8a9 2315S_sublex_push(pTHX)
55497cff 2316{
78a635de 2317 LEXSHARED *shared;
801d32ac 2318 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2319 ENTER;
55497cff 2320
3280af22 2321 PL_lex_state = PL_sublex_info.super_state;
cc624add 2322 SAVEI8(PL_lex_dojoin);
3280af22 2323 SAVEI32(PL_lex_brackets);
78cdf107 2324 SAVEI32(PL_lex_allbrackets);
b27dce25 2325 SAVEI32(PL_lex_formbrack);
78cdf107 2326 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2327 SAVEI32(PL_lex_casemods);
2328 SAVEI32(PL_lex_starts);
651b5b28 2329 SAVEI8(PL_lex_state);
7cc34111 2330 SAVESPTR(PL_lex_repl);
7766f137 2331 SAVEVPTR(PL_lex_inpat);
98246f1e 2332 SAVEI16(PL_lex_inwhat);
ffdb8b16 2333 if (is_heredoc)
b42366d4 2334 {
ffdb8b16 2335 SAVECOPLINE(PL_curcop);
b42366d4 2336 SAVEI32(PL_multi_end);
851b527a
FC
2337 SAVEI32(PL_parser->herelines);
2338 PL_parser->herelines = 0;
b42366d4
FC
2339 }
2340 SAVEI8(PL_multi_close);
3280af22 2341 SAVEPPTR(PL_bufptr);
8452ff4b 2342 SAVEPPTR(PL_bufend);
3280af22
NIS
2343 SAVEPPTR(PL_oldbufptr);
2344 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2345 SAVEPPTR(PL_last_lop);
2346 SAVEPPTR(PL_last_uni);
3280af22
NIS
2347 SAVEPPTR(PL_linestart);
2348 SAVESPTR(PL_linestr);
8edd5f42
RGS
2349 SAVEGENERICPV(PL_lex_brackstack);
2350 SAVEGENERICPV(PL_lex_casestack);
78a635de 2351 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2352 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2353 SAVEI32(PL_copline);
3280af22 2354
99bd9d90 2355 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2356 scopes to find the body of the here-doc. So we put PL_linestr and
2357 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2358 */
60f40a38
FC
2359 PL_parser->lex_shared->ls_linestr = PL_linestr;
2360 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2361
3280af22 2362 PL_linestr = PL_lex_stuff;
7cc34111 2363 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2364 PL_lex_stuff = NULL;
7cc34111 2365 PL_sublex_info.repl = NULL;
3280af22 2366
9cbb5ea2
GS
2367 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2368 = SvPVX(PL_linestr);
3280af22 2369 PL_bufend += SvCUR(PL_linestr);
bd61b366 2370 PL_last_lop = PL_last_uni = NULL;
3280af22 2371 SAVEFREESV(PL_linestr);
4dc843bc 2372 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2373
2374 PL_lex_dojoin = FALSE;
b27dce25 2375 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2376 PL_lex_allbrackets = 0;
2377 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2378 Newx(PL_lex_brackstack, 120, char);
2379 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2380 PL_lex_casemods = 0;
2381 *PL_lex_casestack = '\0';
2382 PL_lex_starts = 0;
2383 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2384 if (is_heredoc)
6ddcf93b 2385 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2386 PL_copline = NOLINE;
78a635de
FC
2387
2388 Newxz(shared, 1, LEXSHARED);
2389 shared->ls_prev = PL_parser->lex_shared;
2390 PL_parser->lex_shared = shared;
3280af22
NIS
2391
2392 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2393 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2394 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2395 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2396 else
5f66b61c 2397 PL_lex_inpat = NULL;
79072805 2398
3a54fd60
DM
2399 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2400 PL_in_eval &= ~EVAL_RE_REPARSING;
2401
55497cff 2402 return '(';
79072805
LW
2403}
2404
ffb4593c
NT
2405/*
2406 * S_sublex_done
2407 * Restores lexer state after a S_sublex_push.
2408 */
2409
76e3520e 2410STATIC I32
cea2e8a9 2411S_sublex_done(pTHX)
79072805 2412{
3280af22 2413 if (!PL_lex_starts++) {
396482e1 2414 SV * const sv = newSVpvs("");
9aa983d2
JH
2415 if (SvUTF8(PL_linestr))
2416 SvUTF8_on(sv);
3280af22 2417 PL_expect = XOPERATOR;
6154021b 2418 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2419 return THING;
2420 }
2421
3280af22
NIS
2422 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2423 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2424 return yylex();
79072805
LW
2425 }
2426
ffb4593c 2427 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2428 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2429 if (PL_lex_repl) {
2430 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2431 PL_linestr = PL_lex_repl;
2432 PL_lex_inpat = 0;
2433 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2434 PL_bufend += SvCUR(PL_linestr);
bd61b366 2435 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2436 PL_lex_dojoin = FALSE;
2437 PL_lex_brackets = 0;
78cdf107
Z
2438 PL_lex_allbrackets = 0;
2439 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2440 PL_lex_casemods = 0;
2441 *PL_lex_casestack = '\0';
2442 PL_lex_starts = 0;
25da4f38 2443 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2444 PL_lex_state = LEX_INTERPNORMAL;
2445 PL_lex_starts++;
e9fa98b2
HS
2446 /* we don't clear PL_lex_repl here, so that we can check later
2447 whether this is an evalled subst; that means we rely on the
2448 logic to ensure sublex_done() is called again only via the
2449 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2450 }
e9fa98b2 2451 else {
3280af22 2452 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2453 PL_lex_repl = NULL;
e9fa98b2 2454 }
ffdb8b16
FC
2455 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2456 CopLINE(PL_curcop) +=
2457 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2458 + PL_parser->herelines;
2459 PL_parser->herelines = 0;
ffdb8b16 2460 }
79072805 2461 return ',';
ffed7fef
LW
2462 }
2463 else {
b42366d4 2464 const line_t l = CopLINE(PL_curcop);
f46d017c 2465 LEAVE;
b42366d4 2466 if (PL_multi_close == '<')
851b527a 2467 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2468 PL_bufend = SvPVX(PL_linestr);
2469 PL_bufend += SvCUR(PL_linestr);
2470 PL_expect = XOPERATOR;
09bef843 2471 PL_sublex_info.sub_inwhat = 0;
79072805 2472 return ')';
ffed7fef
LW
2473 }
2474}
2475
6f613c73
KW
2476PERL_STATIC_INLINE SV*
2477S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2478{
140b12ad
KW
2479 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2480 * interior, hence to the "}". Finds what the name resolves to, returning
2481 * an SV* containing it; NULL if no valid one found */
2482
dd2b1b72 2483 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2484
0c415a79
KW
2485 HV * table;
2486 SV **cvp;
2487 SV *cv;
2488 SV *rv;
2489 HV *stash;
2490 const U8* first_bad_char_loc;
2491 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2492
6f613c73
KW
2493 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2494
107160e2
KW
2495 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2496 e - backslash_ptr,
2497 &first_bad_char_loc))
2498 {
2499 /* If warnings are on, this will print a more detailed analysis of what
2500 * is wrong than the error message below */
c80e42f3 2501 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2502 e - ((char *) first_bad_char_loc),
2503 NULL, 0);
2504
2505 /* We deliberately don't try to print the malformed character, which
2506 * might not print very well; it also may be just the first of many
2507 * malformations, so don't print what comes after it */
2508 yyerror(Perl_form(aTHX_
2509 "Malformed UTF-8 character immediately after '%.*s'",
2510 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2511 return NULL;
2512 }
2513
2514 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2515 /* include the <}> */
2516 e - backslash_ptr + 1);
6f613c73 2517 if (! SvPOK(res)) {
b6407c49 2518 SvREFCNT_dec_NN(res);
6f613c73
KW
2519 return NULL;
2520 }
2521
0c415a79
KW
2522 /* See if the charnames handler is the Perl core's, and if so, we can skip
2523 * the validation needed for a user-supplied one, as Perl's does its own
2524 * validation. */
2525 table = GvHV(PL_hintgv); /* ^H */
2526 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2527 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2528 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2529 {
2530 const char * const name = HvNAME(stash);
6a642c21
FC
2531 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2532 && strEQ(name, "_charnames")) {
0c415a79
KW
2533 return res;
2534 }
2535 }
2536
bde9e88d
KW
2537 /* Here, it isn't Perl's charname handler. We can't rely on a
2538 * user-supplied handler to validate the input name. For non-ut8 input,
2539 * look to see that the first character is legal. Then loop through the
2540 * rest checking that each is a continuation */
6f613c73 2541
36897d64
KW
2542 /* This code makes the reasonable assumption that the only Latin1-range
2543 * characters that begin a character name alias are alphabetic, otherwise
2544 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2545
b6ba1137 2546 if (! UTF) {
bde9e88d 2547 if (! isALPHAU(*s)) {
b6ba1137
KW
2548 goto bad_charname;
2549 }
bde9e88d
KW
2550 s++;
2551 while (s < e) {
2552 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2553 goto bad_charname;
2554 }
2d8eb851
KW
2555 if (*s == ' ' && *(s-1) == ' ') {
2556 goto multi_spaces;
bd299e29 2557 }
df758df2
KW
2558 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2559 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2560 "NO-BREAK SPACE in a charnames "
2561 "alias definition is deprecated");
2562 }
bde9e88d 2563 s++;
b6ba1137
KW
2564 }
2565 }
2566 else {
bde9e88d
KW
2567 /* Similarly for utf8. For invariants can check directly; for other
2568 * Latin1, can calculate their code point and check; otherwise use a
2569 * swash */
2570 if (UTF8_IS_INVARIANT(*s)) {
2571 if (! isALPHAU(*s)) {
140b12ad
KW
2572 goto bad_charname;
2573 }
bde9e88d
KW
2574 s++;
2575 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2576 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2577 goto bad_charname;
6f613c73 2578 }
bde9e88d 2579 s += 2;
6f613c73 2580 }
bde9e88d
KW
2581 else {
2582 if (! PL_utf8_charname_begin) {
2583 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2584 PL_utf8_charname_begin = _core_swash_init("utf8",
2585 "_Perl_Charname_Begin",
2586 &PL_sv_undef,
2587 1, 0, NULL, &flags);
2588 }
2589 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2590 goto bad_charname;
2591 }
2592 s += UTF8SKIP(s);
2593 }
2594
2595 while (s < e) {
2596 if (UTF8_IS_INVARIANT(*s)) {
2597 if (! isCHARNAME_CONT(*s)) {
2598 goto bad_charname;
2599 }
2d8eb851
KW
2600 if (*s == ' ' && *(s-1) == ' ') {
2601 goto multi_spaces;
bd299e29 2602 }
bde9e88d
KW
2603 s++;
2604 }
2605 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2606 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2607 {
2608 goto bad_charname;
2609 }
df758df2
KW
2610 if (*s == *NBSP_UTF8
2611 && *(s+1) == *(NBSP_UTF8+1)
2612 && ckWARN_d(WARN_DEPRECATED))
2613 {
2614 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2615 "NO-BREAK SPACE in a charnames "
2616 "alias definition is deprecated");
2617 }
bde9e88d
KW
2618 s += 2;
2619 }
2620 else {
2621 if (! PL_utf8_charname_continue) {
2622 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2623 PL_utf8_charname_continue = _core_swash_init("utf8",
2624 "_Perl_Charname_Continue",
2625 &PL_sv_undef,
2626 1, 0, NULL, &flags);
2627 }
2628 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2629 goto bad_charname;
2630 }
2631 s += UTF8SKIP(s);
6f613c73
KW
2632 }
2633 }
2d8eb851
KW
2634 }
2635 if (*(s-1) == ' ') {
2636 yyerror_pv(
2637 Perl_form(aTHX_
2638 "charnames alias definitions may not contain trailing "
2639 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2640 (int)(s - backslash_ptr + 1), backslash_ptr,
2641 (int)(e - s + 1), s + 1
2642 ),
2643 UTF ? SVf_UTF8 : 0);
2644 return NULL;
6f613c73
KW
2645 }
2646
94ca1619 2647 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2648 const U8* first_bad_char_loc;
2649 STRLEN len;
2650 const char* const str = SvPV_const(res, len);
2651 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2652 /* If warnings are on, this will print a more detailed analysis of
2653 * what is wrong than the error message below */
c80e42f3 2654 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2655 (char *) first_bad_char_loc - str,
2656 NULL, 0);
2657
2658 /* We deliberately don't try to print the malformed character,
2659 * which might not print very well; it also may be just the first
2660 * of many malformations, so don't print what comes after it */
2661 yyerror_pv(
2662 Perl_form(aTHX_
2663 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2664 (int) (e - backslash_ptr + 1), backslash_ptr,
2665 (int) ((char *) first_bad_char_loc - str), str
2666 ),
2667 SVf_UTF8);
2668 return NULL;
2669 }
2670 }
140b12ad 2671
bde9e88d 2672 return res;
140b12ad 2673
bde9e88d 2674 bad_charname: {
bde9e88d
KW
2675
2676 /* The final %.*s makes sure that should the trailing NUL be missing
2677 * that this print won't run off the end of the string */
2678 yyerror_pv(
2679 Perl_form(aTHX_
2680 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2681 (int)(s - backslash_ptr + 1), backslash_ptr,
2682 (int)(e - s + 1), s + 1
bde9e88d
KW
2683 ),
2684 UTF ? SVf_UTF8 : 0);
2685 return NULL;
2686 }
2d8eb851
KW
2687
2688 multi_spaces:
2689 yyerror_pv(
2690 Perl_form(aTHX_
2691 "charnames alias definitions may not contain a sequence of "
2692 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2693 (int)(s - backslash_ptr + 1), backslash_ptr,
2694 (int)(e - s + 1), s + 1
2695 ),
2696 UTF ? SVf_UTF8 : 0);
2697 return NULL;
6f613c73
KW
2698}
2699
02aa26ce
NT
2700/*
2701 scan_const
2702
9da1dd8f
DM
2703 Extracts the next constant part of a pattern, double-quoted string,
2704 or transliteration. This is terrifying code.
2705
2706 For example, in parsing the double-quoted string "ab\x63$d", it would
2707 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2708
94def140 2709 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2710 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2711 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2712
94def140
ST
2713 Returns a pointer to the character scanned up to. If this is
2714 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2715 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2716 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2717 by looking at the next characters herself.
2718
02aa26ce 2719 In patterns:
9da1dd8f 2720 expand:
537124e4
KW
2721 \N{FOO} => \N{U+hex_for_character_FOO}
2722 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2723
2724 pass through:
2725 all other \-char, including \N and \N{ apart from \N{ABC}
2726
2727 stops on:
2728 @ and $ where it appears to be a var, but not for $ as tail anchor
2729 \l \L \u \U \Q \E
2730 (?{ or (??{
2731
02aa26ce
NT
2732
2733 In transliterations:
2734 characters are VERY literal, except for - not at the start or end
94def140
ST
2735 of the string, which indicates a range. If the range is in bytes,
2736 scan_const expands the range to the full set of intermediate
2737 characters. If the range is in utf8, the hyphen is replaced with
2738 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2739
2740 In double-quoted strings:
2741 backslashes:
2742 double-quoted style: \r and \n
ff3f963a 2743 constants: \x31, etc.
94def140 2744 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2745 case and quoting: \U \Q \E
2746 stops on @ and $
2747
2748 scan_const does *not* construct ops to handle interpolated strings.
2749 It stops processing as soon as it finds an embedded $ or @ variable
2750 and leaves it to the caller to work out what's going on.
2751
94def140
ST
2752 embedded arrays (whether in pattern or not) could be:
2753 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2754
2755 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2756
2757 $ in pattern could be $foo or could be tail anchor. Assumption:
2758 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2759 followed by one of "()| \r\n\t"
02aa26ce 2760
9da1dd8f 2761 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2762
2763 The structure of the code is
2764 while (there's a character to process) {
94def140
ST
2765 handle transliteration ranges
2766 skip regexp comments /(?#comment)/ and codes /(?{code})/
2767 skip #-initiated comments in //x patterns
2768 check for embedded arrays
02aa26ce
NT
2769 check for embedded scalars
2770 if (backslash) {
94def140 2771 deprecate \1 in substitution replacements
02aa26ce
NT
2772 handle string-changing backslashes \l \U \Q \E, etc.
2773 switch (what was escaped) {
94def140 2774 handle \- in a transliteration (becomes a literal -)
ff3f963a 2775 if a pattern and not \N{, go treat as regular character
94def140
ST
2776 handle \132 (octal characters)
2777 handle \x15 and \x{1234} (hex characters)
ff3f963a 2778 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2779 handle \cV (control characters)
2780 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2781 } (end switch)
77a135fe 2782 continue
02aa26ce 2783 } (end if backslash)
77a135fe 2784 handle regular character
02aa26ce 2785 } (end while character to read)
4e553d73 2786
02aa26ce
NT
2787*/
2788
76e3520e 2789STATIC char *
cea2e8a9 2790S_scan_const(pTHX_ char *start)
79072805 2791{
eb578fdb 2792 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2793 SV *sv = newSV(send - start); /* sv for the constant. See note below
2794 on sizing. */
eb578fdb
KW
2795 char *s = start; /* start of the constant */
2796 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2797 bool dorange = FALSE; /* are we in a translit range? */
2798 bool didrange = FALSE; /* did we just finish a range? */
2799 bool in_charclass = FALSE; /* within /[...]/ */
2800 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2801 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2802 UTF8? But, this can show as true
2803 when the source isn't utf8, as for
2804 example when it is entirely composed
2805 of hex constants */
6f613c73 2806 SV *res; /* result from charnames */
77a135fe
KW
2807
2808 /* Note on sizing: The scanned constant is placed into sv, which is
2809 * initialized by newSV() assuming one byte of output for every byte of
2810 * input. This routine expects newSV() to allocate an extra byte for a
2811 * trailing NUL, which this routine will append if it gets to the end of
2812 * the input. There may be more bytes of input than output (eg., \N{LATIN
2813 * CAPITAL LETTER A}), or more output than input if the constant ends up
2814 * recoded to utf8, but each time a construct is found that might increase
2815 * the needed size, SvGROW() is called. Its size parameter each time is
2816 * based on the best guess estimate at the time, namely the length used so
2817 * far, plus the length the current construct will occupy, plus room for
2818 * the trailing NUL, plus one byte for every input byte still unscanned */
2819
c3320c2a
KW
2820 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2821 before set */
4c3a8340
ST
2822#ifdef EBCDIC
2823 UV literal_endpoint = 0;
e294cc5d 2824 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2825#endif
012bcf8d 2826
7918f24d
NC
2827 PERL_ARGS_ASSERT_SCAN_CONST;
2828
bb16bae8 2829 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2830 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2831 /* If we are doing a trans and we know we want UTF8 set expectation */
2832 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2833 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2834 }
2835
b899e89d
FC
2836 /* Protect sv from errors and fatal warnings. */
2837 ENTER_with_name("scan_const");
2838 SAVEFREESV(sv);
2b9d42f0 2839
79072805 2840 while (s < send || dorange) {
ff3f963a 2841
02aa26ce 2842 /* get transliterations out of the way (they're most literal) */
3280af22 2843 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2844 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2845 if (dorange) {
1ba5c669
JH
2846 I32 i; /* current expanded character */
2847 I32 min; /* first character in range */
2848 I32 max; /* last character in range */
02aa26ce 2849
e294cc5d
JH
2850#ifdef EBCDIC
2851 UV uvmax = 0;
2852#endif
2853
2854 if (has_utf8
2855#ifdef EBCDIC
2856 && !native_range
2857#endif
1953db30 2858 ) {
9d4ba2ae 2859 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2860 char *e = d++;
2861 while (e-- > c)
2862 *(e + 1) = *e;
e7214ce8 2863 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2864 /* mark the range as done, and continue */
2865 dorange = FALSE;
2866 didrange = TRUE;
2867 continue;
2868 }
2b9d42f0 2869
95a20fc0 2870 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2871#ifdef EBCDIC
2872 SvGROW(sv,
dc023dbb
KW
2873 SvLEN(sv) + ((has_utf8)
2874 ? (512 - UTF_CONTINUATION_MARK
2875 + UNISKIP(0x100))
e294cc5d
JH
2876 : 256));
2877 /* How many two-byte within 0..255: 128 in UTF-8,
2878 * 96 in UTF-8-mod. */
2879#else
9cbb5ea2 2880 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2881#endif
9cbb5ea2 2882 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2883#ifdef EBCDIC
2884 if (has_utf8) {
2885 int j;
2886 for (j = 0; j <= 1; j++) {
2887 char * const c = (char*)utf8_hop((U8*)d, -1);
2888 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2889 if (j)
2890 min = (U8)uv;
2891 else if (uv < 256)
2892 max = (U8)uv;
2893 else {
2894 max = (U8)0xff; /* only to \xff */
2895 uvmax = uv; /* \x{100} to uvmax */
2896 }
2897 d = c; /* eat endpoint chars */
2898 }
2899 }
2900 else {
2901#endif
2902 d -= 2; /* eat the first char and the - */
2903 min = (U8)*d; /* first char in range */
2904 max = (U8)d[1]; /* last char in range */
2905#ifdef EBCDIC
2906 }
2907#endif
8ada0baa 2908
c2e66d9e 2909 if (min > max) {
01ec43d0 2910 Perl_croak(aTHX_
d1573ac7 2911 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2912 (char)min, (char)max);
c2e66d9e
GS
2913 }
2914
c7f1f016 2915#ifdef EBCDIC
dc023dbb
KW
2916 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2917 * any subsets of these ranges into individual characters */
4c3a8340 2918 if (literal_endpoint == 2 &&
c286a9a6 2919 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2920 (isUPPER_A(min) && isUPPER_A(max))))
2921 {
2922 for (i = min; i <= max; i++) {
2923 if (isALPHA_A(i))
2924 *d++ = i;
8ada0baa
JH
2925 }
2926 }
2927 else
2928#endif
2929 for (i = min; i <= max; i++)
e294cc5d
JH
2930#ifdef EBCDIC
2931 if (has_utf8) {
55d09dc8 2932 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2933 }
2934 else
2935#endif
2936 *d++ = (char)i;
2937
2938#ifdef EBCDIC
2939 if (uvmax) {
2940 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2941 if (uvmax > 0x101)
e7214ce8 2942 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2943 if (uvmax > 0x100)
2944 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2945 }
2946#endif
02aa26ce
NT
2947
2948 /* mark the range as done, and continue */
79072805 2949 dorange = FALSE;
01ec43d0 2950 didrange = TRUE;
4c3a8340
ST
2951#ifdef EBCDIC
2952 literal_endpoint = 0;
2953#endif
79072805 2954 continue;
4e553d73 2955 }
02aa26ce
NT
2956
2957 /* range begins (ignore - as first or last char) */
79072805 2958 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2959 if (didrange) {
1fafa243 2960 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2961 }
e294cc5d
JH
2962 if (has_utf8
2963#ifdef EBCDIC
2964 && !native_range
2965#endif
2966 ) {
e7214ce8 2967 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2968 s++;
2969 continue;
2970 }
79072805
LW
2971 dorange = TRUE;
2972 s++;
01ec43d0
GS
2973 }
2974 else {
2975 didrange = FALSE;
4c3a8340
ST
2976#ifdef EBCDIC
2977 literal_endpoint = 0;
e294cc5d 2978 native_range = TRUE;
4c3a8340 2979#endif
01ec43d0 2980 }
79072805 2981 }
02aa26ce
NT
2982
2983 /* if we get here, we're not doing a transliteration */
2984
e4a2df84
DM
2985 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2986 char *s1 = s-1;
2987 int esc = 0;
2988 while (s1 >= start && *s1-- == '\\')
2989 esc = !esc;
2990 if (!esc)
2991 in_charclass = TRUE;
2992 }
2866decb 2993
e4a2df84
DM
2994 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2995 char *s1 = s-1;
2996 int esc = 0;
2997 while (s1 >= start && *s1-- == '\\')
2998 esc = !esc;
2999 if (!esc)
3000 in_charclass = FALSE;
3001 }
2866decb 3002
9da1dd8f
DM
3003 /* skip for regexp comments /(?#comment)/, except for the last
3004 * char, which will be done separately.
3005 * Stop on (?{..}) and friends */
3006
c30fc27b 3007 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3008 if (s[2] == '#') {
e994fd66 3009 while (s+1 < send && *s != ')')
5ff03569 3010 *d++ = *s++;
155aba94 3011 }
c30fc27b 3012 else if (!PL_lex_casemods &&
d3cec5e5
DM
3013 ( s[2] == '{' /* This should match regcomp.c */
3014 || (s[2] == '?' && s[3] == '{')))
155aba94 3015 {
9da1dd8f 3016 break;
cc6b7395 3017 }
748a9306 3018 }
02aa26ce
NT
3019
3020 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3021 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3022 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3023 while (s+1 < send && *s != '\n')
5ff03569 3024 *d++ = *s++;
748a9306 3025 }
02aa26ce 3026
9da1dd8f
DM
3027 /* no further processing of single-quoted regex */
3028 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3029 goto default_action;
3030
5d1d4326 3031 /* check for embedded arrays
da6eedaa 3032 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3033 */
1749ea0d 3034 else if (*s == '@' && s[1]) {
8a2bca12 3035 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3036 break;
3037 if (strchr(":'{$", s[1]))
3038 break;
3039 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3040 break; /* in regexp, neither @+ nor @- are interpolated */
3041 }
02aa26ce
NT
3042
3043 /* check for embedded scalars. only stop if we're sure it's a
3044 variable.
3045 */
79072805 3046 else if (*s == '$') {
3280af22 3047 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3048 break;
77772344 3049 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3050 if (s[1] == '\\') {
3051 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3052 "Possible unintended interpolation of $\\ in regex");
77772344 3053 }
79072805 3054 break; /* in regexp, $ might be tail anchor */
77772344 3055 }
79072805 3056 }
02aa26ce 3057
2b9d42f0
NIS
3058 /* End of else if chain - OP_TRANS rejoin rest */
3059
02aa26ce 3060 /* backslashes */
79072805 3061 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3062 char* e; /* Can be used for ending '}', etc. */
3063
79072805 3064 s++;
02aa26ce 3065
7d0fc23c
KW
3066 /* warn on \1 - \9 in substitution replacements, but note that \11
3067 * is an octal; and \19 is \1 followed by '9' */
3280af22 3068 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3069 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3070 {
c782d7ee 3071 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3072 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3073 *--s = '$';
3074 break;
3075 }
02aa26ce
NT
3076
3077 /* string-change backslash escapes */
838f2281 3078 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3079 --s;
3080 break;
3081 }
ff3f963a
KW
3082 /* In a pattern, process \N, but skip any other backslash escapes.
3083 * This is because we don't want to translate an escape sequence
3084 * into a meta symbol and have the regex compiler use the meta
3085 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3086 * in spite of this, we do have to process \N here while the proper
3087 * charnames handler is in scope. See bugs #56444 and #62056.
3088 * There is a complication because \N in a pattern may also stand
3089 * for 'match a non-nl', and not mean a charname, in which case its
3090 * processing should be deferred to the regex compiler. To be a
3091 * charname it must be followed immediately by a '{', and not look
3092 * like \N followed by a curly quantifier, i.e., not something like
3093 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3094 * quantifier */
3095 else if (PL_lex_inpat
3096 && (*s != 'N'
3097 || s[1] != '{'
412f55bb 3098 || regcurly(s + 1)))
ff3f963a 3099 {
4d73d076 3100 *d++ = '\\';
cc74c5bd
ST
3101 goto default_action;
3102 }
02aa26ce 3103
79072805 3104 switch (*s) {
02aa26ce
NT
3105
3106 /* quoted - in transliterations */
79072805 3107 case '-':
3280af22 3108 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3109 *d++ = *s++;
3110 continue;
3111 }
924ba076 3112 /* FALLTHROUGH */
79072805 3113 default:
11b8faa4 3114 {
15861f94 3115 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3116 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3117 "Unrecognized escape \\%c passed through",
3118 *s);
11b8faa4 3119 /* default action is to copy the quoted character */
f9a63242 3120 goto default_action;
11b8faa4 3121 }
02aa26ce 3122
632403cc 3123 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3124 case '0': case '1': case '2': case '3':
3125 case '4': case '5': case '6': case '7':
ba210ebe 3126 {
5e0a247b 3127 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3128 STRLEN len = 3;
06972766 3129 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3130 s += len;
5e0a247b
KW
3131 if (len < 3 && s < send && isDIGIT(*s)
3132 && ckWARN(WARN_MISC))
3133 {
3134 Perl_warner(aTHX_ packWARN(WARN_MISC),
3135 "%s", form_short_octal_warning(s, len));
3136 }
ba210ebe 3137 }
012bcf8d 3138 goto NUM_ESCAPE_INSERT;
02aa26ce 3139
f0a2b745
KW
3140 /* eg. \o{24} indicates the octal constant \024 */
3141 case 'o':
3142 {
454155d9 3143 const char* error;
f0a2b745 3144
00ce5563 3145 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3146 TRUE, /* Output warning */
3147 FALSE, /* Not strict */
17896a85
KW
3148 TRUE, /* Output warnings for
3149 non-portables */
80f4111b 3150 UTF);
454155d9 3151 if (! valid) {
f0a2b745
KW
3152 yyerror(error);
3153 continue;
3154 }
3155 goto NUM_ESCAPE_INSERT;
3156 }
3157
77a135fe 3158 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3159 case 'x':
a0481293 3160 {
a0481293 3161 const char* error;
355860ce 3162
00ce5563 3163 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3164 TRUE, /* Output warning */
3165 FALSE, /* Not strict */
17896a85
KW
3166 TRUE, /* Output warnings for
3167 non-portables */
80f4111b 3168 UTF);
a0481293
KW
3169 if (! valid) {
3170 yyerror(error);
355860ce 3171 continue;
ba210ebe 3172 }
012bcf8d
GS
3173 }
3174
3175 NUM_ESCAPE_INSERT:
ff3f963a
KW
3176 /* Insert oct or hex escaped character. There will always be
3177 * enough room in sv since such escapes will be longer than any
3178 * UTF-8 sequence they can end up as, except if they force us
3179 * to recode the rest of the string into utf8 */
ba7cea30 3180
06972766 3181 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3182 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3183 if (!has_utf8 && uv > 255) {
77a135fe
KW
3184 /* Might need to recode whatever we have accumulated so
3185 * far if it contains any chars variant in utf8 or
3186 * utf-ebcdic. */
3187
3188 SvCUR_set(sv, d - SvPVX_const(sv));
3189 SvPOK_on(sv);
3190 *d = '\0';
77a135fe 3191 /* See Note on sizing above. */
7bf79863
KW
3192 sv_utf8_upgrade_flags_grow(sv,
3193 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3194 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3195 d = SvPVX(sv) + SvCUR(sv);
3196 has_utf8 = TRUE;
012bcf8d
GS
3197 }
3198
77a135fe 3199 if (has_utf8) {
c80e42f3 3200 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3201 if (PL_lex_inwhat == OP_TRANS &&
3202 PL_sublex_info.sub_op) {
3203 PL_sublex_info.sub_op->op_private |=
3204 (PL_lex_repl ? OPpTRANS_FROM_UTF
3205 : OPpTRANS_TO_UTF);
f9a63242 3206 }
e294cc5d
JH
3207#ifdef EBCDIC
3208 if (uv > 255 && !dorange)
3209 native_range = FALSE;
3210#endif
012bcf8d 3211 }
a0ed51b3 3212 else {
012bcf8d 3213 *d++ = (char)uv;
a0ed51b3 3214 }
012bcf8d
GS
3215 }
3216 else {
c4d5f83a 3217 *d++ = (char) uv;
a0ed51b3 3218 }
79072805 3219 continue;
02aa26ce 3220
4a2d328f 3221 case 'N':
ff3f963a
KW
3222 /* In a non-pattern \N must be a named character, like \N{LATIN
3223 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3224 * mean to match a non-newline. For non-patterns, named
3225 * characters are converted to their string equivalents. In
3226 * patterns, named characters are not converted to their
3227 * ultimate forms for the same reasons that other escapes
3228 * aren't. Instead, they are converted to the \N{U+...} form
3229 * to get the value from the charnames that is in effect right
3230 * now, while preserving the fact that it was a named character
3231 * so that the regex compiler knows this */
3232
ff3f963a
KW
3233 /* The structure of this section of code (besides checking for
3234 * errors and upgrading to utf8) is:
3235 * Further disambiguate between the two meanings of \N, and if
3236 * not a charname, go process it elsewhere
0a96133f
KW
3237 * If of form \N{U+...}, pass it through if a pattern;
3238 * otherwise convert to utf8
3239 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3240 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3241
3242 /* Here, s points to the 'N'; the test below is guaranteed to
3243 * succeed if we are being called on a pattern as we already
3244 * know from a test above that the next character is a '{'.
3245 * On a non-pattern \N must mean 'named sequence, which
3246 * requires braces */
3247 s++;
3248 if (*s != '{') {
3249 yyerror("Missing braces on \\N{}");
3250 continue;
3251 }
3252 s++;
3253
0a96133f 3254 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3255 if (! (e = strchr(s, '}'))) {
3256 if (! PL_lex_inpat) {
5777a3f7 3257 yyerror("Missing right brace on \\N{}");
0a96133f 3258 } else {
4407f1b8 3259 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3260 }
0a96133f 3261 continue;
ff3f963a 3262 }
cddc7ef4 3263
ff3f963a 3264 /* Here it looks like a named character */
cddc7ef4 3265
ff3f963a
KW
3266 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3267 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3268 | PERL_SCAN_DISALLOW_PREFIX;
3269 STRLEN len;
3270
3271 /* For \N{U+...}, the '...' is a unicode value even on
3272 * EBCDIC machines */
3273 s += 2; /* Skip to next char after the 'U+' */
3274 len = e - s;
3275 uv = grok_hex(s, &len, &flags, NULL);
3276 if (len == 0 || len != (STRLEN)(e - s)) {
3277 yyerror("Invalid hexadecimal number in \\N{U+...}");
3278 s = e + 1;
3279 continue;
3280 }
3281
3282 if (PL_lex_inpat) {
3283
e2a7e165
KW
3284 /* On non-EBCDIC platforms, pass through to the regex
3285 * compiler unchanged. The reason we evaluated the
3286 * number above is to make sure there wasn't a syntax
3287 * error. But on EBCDIC we convert to native so
3288 * downstream code can continue to assume it's native
3289 */
ff3f963a 3290 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3291#ifdef EBCDIC
3292 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3293 and the \0 */
3294 "\\N{U+%X}",
3295 (unsigned int) UNI_TO_NATIVE(uv));
3296#else
ff3f963a
KW
3297 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3298 d += e - s + 1;
e2a7e165 3299#endif
ff3f963a
KW
3300 }
3301 else { /* Not a pattern: convert the hex to string */
3302
3303 /* If destination is not in utf8, unconditionally
3304 * recode it to be so. This is because \N{} implies
3305 * Unicode semantics, and scalars have to be in utf8
3306 * to guarantee those semantics */
3307 if (! has_utf8) {
3308 SvCUR_set(sv, d - SvPVX_const(sv));
3309 SvPOK_on(sv);
3310 *d = '\0';
3311 /* See Note on sizing above. */
3312 sv_utf8_upgrade_flags_grow(
3313 sv,
3314 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3315 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3316 d = SvPVX(sv) + SvCUR(sv);
3317 has_utf8 = TRUE;
3318 }
3319
a46469e6 3320 /* Add the (Unicode) code point to the output. */
ff3f963a 3321 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3322 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3323 }
a46469e6
KW
3324 else {
3325 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3326 }
ff3f963a
KW
3327 }
3328 }
6f613c73
KW
3329 else /* Here is \N{NAME} but not \N{U+...}. */
3330 if ((res = get_and_check_backslash_N_name(s, e)))
3331 {
3332 STRLEN len;
3333 const char *str = SvPV_const(res, len);
3334 if (PL_lex_inpat) {
ff3f963a
KW
3335
3336 if (! len) { /* The name resolved to an empty string */
3337 Copy("\\N{}", d, 4, char);
3338 d += 4;
3339 }
3340 else {
3341 /* In order to not lose information for the regex
3342 * compiler, pass the result in the specially made
3343 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3344 * the code points in hex of each character
3345 * returned by charnames */
3346
3347 const char *str_end = str + len;
3b721c4f 3348 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3349
3350 if (! SvUTF8(res)) {
3351 /* For the non-UTF-8 case, we can determine the
3352 * exact length needed without having to parse
3353 * through the string. Each character takes up
3354 * 2 hex digits plus either a trailing dot or
3355 * the "}" */
3356 d = off + SvGROW(sv, off
3357 + 3 * len
3358 + 6 /* For the "\N{U+", and
3359 trailing NUL */
3360 + (STRLEN)(send - e));
3361 Copy("\\N{U+", d, 5, char);
3362 d += 5;
3363 while (str < str_end) {
3364 char hex_string[4];
e8549682 3365 int len =
51f14a05
JH
3366 my_snprintf(hex_string,
3367 sizeof(hex_string),
e8549682
JH
3368 "%02X.", (U8) *str);
3369 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3370 Copy(hex_string, d, 3, char);
3371 d += 3;
3372 str++;
3373 }
3374 d--; /* We will overwrite below the final
3375 dot with a right brace */
3376 }
3377 else {
1953db30
KW
3378 STRLEN char_length; /* cur char's byte length */
3379
3380 /* and the number of bytes after this is
3381 * translated into hex digits */
3382 STRLEN output_length;
3383
3384 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3385 * for max('U+', '.'); and 1 for NUL */
3386 char hex_string[2 * UTF8_MAXBYTES + 5];
3387
3388 /* Get the first character of the result. */
a46469e6 3389 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3390 len,
3391 &char_length,
3392 UTF8_ALLOW_ANYUV);
3393 /* Convert first code point to hex, including
a46469e6 3394 * the boiler plate before it. */
1953db30
KW
3395 output_length =
3396 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3397 "\\N{U+%X",
3398 (unsigned int) uv);
1953db30
KW
3399
3400 /* Make sure there is enough space to hold it */
3401 d = off + SvGROW(sv, off
3402 + output_length
3403 + (STRLEN)(send - e)
3404 + 2); /* '}' + NUL */
3405 /* And output it */
3406 Copy(hex_string, d, output_length, char);
3407 d += output_length;
3408
3409 /* For each subsequent character, append dot and
3410 * its ordinal in hex */
3411 while ((str += char_length) < str_end) {
3412 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3413 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3414 str_end - str,
3415 &char_length,
3416 UTF8_ALLOW_ANYUV);
3417 output_length =
3418 my_snprintf(hex_string,
a46469e6
KW
3419 sizeof(hex_string),
3420 ".%X",
3421 (unsigned int) uv);
1953db30
KW
3422
3423 d = off + SvGROW(sv, off
3424 + output_length
3425 + (STRLEN)(send - e)
3426 + 2); /* '}' + NUL */
3427 Copy(hex_string, d, output_length, char);
3428 d += output_length;
3429 }
94ca1619 3430 }
ff3f963a
KW
3431
3432 *d++ = '}'; /* Done. Add the trailing brace */
3433 }
3434 }
3435 else { /* Here, not in a pattern. Convert the name to a
3436 * string. */
3437
3438 /* If destination is not in utf8, unconditionally
3439 * recode it to be so. This is because \N{} implies
3440 * Unicode semantics, and scalars have to be in utf8
3441 * to guarantee those semantics */
3442 if (! has_utf8) {
3443 SvCUR_set(sv, d - SvPVX_const(sv));
3444 SvPOK_on(sv);
3445 *d = '\0';
3446 /* See Note on sizing above. */
3447 sv_utf8_upgrade_flags_grow(sv,
3448 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3449 len + (STRLEN)(send - s) + 1);
3450 d = SvPVX(sv) + SvCUR(sv);
3451 has_utf8 = TRUE;
3452 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3453
3454 /* See Note on sizing above. (NOTE: SvCUR() is not
3455 * set correctly here). */
3456 const STRLEN off = d - SvPVX_const(sv);
3457 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3458 }
7fc82458
KW
3459 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3460 sv_utf8_upgrade(res);
3461 str = SvPV_const(res, len);
3462 }
ff3f963a
KW
3463 Copy(str, d, len, char);
3464 d += len;
423cee85 3465 }
6f613c73 3466
423cee85 3467 SvREFCNT_dec(res);
cb233ae3 3468
cb233ae3 3469 } /* End \N{NAME} */
ff3f963a
KW
3470#ifdef EBCDIC
3471 if (!dorange)
3472 native_range = FALSE; /* \N{} is defined to be Unicode */
3473#endif
3474 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3475 continue;
3476
02aa26ce 3477 /* \c is a control character */
79072805
LW
3478 case 'c':
3479 s++;
961ce445 3480 if (s < send) {
421e43ba 3481 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3482 }
961ce445
RGS
3483 else {
3484 yyerror("Missing control char name in \\c");
3485 }
79072805 3486 continue;
02aa26ce
NT
3487
3488 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3489 case 'b':
4d73d076 3490 *d++ = '\b';
79072805
LW
3491 break;
3492 case 'n':
4d73d076 3493 *d++ = '\n';
79072805
LW
3494 break;
3495 case 'r':
4d73d076 3496 *d++ = '\r';
79072805
LW
3497 break;
3498 case 'f':
4d73d076 3499 *d++ = '\f';
79072805
LW
3500 break;
3501 case 't':
4d73d076 3502 *d++ = '\t';
79072805 3503 break;
34a3fe2a 3504 case 'e':
da0334e8 3505 *d++ = ESC_NATIVE;
34a3fe2a
PP
3506 break;
3507 case 'a':
4d73d076 3508 *d++ = '\a';
79072805 3509 break;
02aa26ce
NT
3510 } /* end switch */
3511
79072805
LW
3512 s++;
3513 continue;
02aa26ce 3514 } /* end if (backslash) */
4c3a8340
ST
3515#ifdef EBCDIC
3516 else
3517 literal_endpoint++;
3518#endif
02aa26ce 3519
f9a63242 3520 default_action:
77a135fe
KW
3521 /* If we started with encoded form, or already know we want it,
3522 then encode the next character */
6f2d5cbc 3523 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3524 STRLEN len = 1;
77a135fe
KW
3525
3526
3527 /* One might think that it is wasted effort in the case of the
3528 * source being utf8 (this_utf8 == TRUE) to take the next character
3529 * in the source, convert it to an unsigned value, and then convert
3530 * it back again. But the source has not been validated here. The
3531 * routine that does the conversion checks for errors like
3532 * malformed utf8 */
3533
233ca360
KW
3534 const UV nextuv = (this_utf8)
3535 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3536 : (UV) ((U8) *s);
5aaebcb3 3537 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3538 if (!has_utf8) {
3539 SvCUR_set(sv, d - SvPVX_const(sv));
3540 SvPOK_on(sv);
3541 *d = '\0';
77a135fe 3542 /* See Note on sizing above. */
7bf79863
KW
3543 sv_utf8_upgrade_flags_grow(sv,
3544 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3545 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3546 d = SvPVX(sv) + SvCUR(sv);
3547 has_utf8 = TRUE;
3548 } else if (need > len) {
3549 /* encoded value larger than old, may need extra space (NOTE:
3550 * SvCUR() is not set correctly here). See Note on sizing
3551 * above. */
9d4ba2ae 3552 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3553 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3554 }
77a135fe
KW
3555 s += len;
3556
5f66b61c 3557 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3558#ifdef EBCDIC
3559 if (uv > 255 && !dorange)
3560 native_range = FALSE;
3561#endif
2b9d42f0
NIS
3562 }
3563 else {
5ff03569 3564 *d++ = *s++;
2b9d42f0 3565 }
02aa26ce
NT
3566 } /* while loop to process each character */
3567
3568 /* terminate the string and set up the sv */
79072805 3569 *d = '\0';
95a20fc0 3570 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3571 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3572 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3573 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3574
79072805 3575 SvPOK_on(sv);
9f4817db 3576 if (PL_encoding && !has_utf8) {
d0063567
DK
3577 sv_recode_to_utf8(sv, PL_encoding);
3578 if (SvUTF8(sv))
3579 has_utf8 = TRUE;
9f4817db 3580 }
2b9d42f0 3581 if (has_utf8) {
7e2040f0 3582 SvUTF8_on(sv);
2b9d42f0 3583 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3584 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3585 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3586 }
3587 }
79072805 3588
02aa26ce 3589 /* shrink the sv if we allocated more than we used */
79072805 3590 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3591 SvPV_shrink_to_cur(sv);
79072805 3592 }
02aa26ce 3593
6154021b 3594 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3595 if (s > start) {
3596 char *s2 = start;
ffdb8b16
FC
3597 for (; s2 < s; s2++) {
3598 if (*s2 == '\n')
3599 COPLINE_INC_WITH_HERELINES;
3600 }
b899e89d 3601 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3602 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3603 && ! PL_parser->lex_re_reparsing)
3604 {
eb0d8d16
NC
3605 const char *const key = PL_lex_inpat ? "qr" : "q";
3606 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3607 const char *type;
3608 STRLEN typelen;
3609
3610 if (PL_lex_inwhat == OP_TRANS) {
3611 type = "tr";
3612 typelen = 2;
3613 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3614 type = "s";
3615 typelen = 1;
9da1dd8f
DM
3616 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3617 type = "q";
3618 typelen = 1;
eb0d8d16
NC
3619 } else {
3620 type = "qq";
3621 typelen = 2;
3622 }
3623
3624 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3625 type, typelen);
3626 }
6154021b 3627 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3628 }
3629 LEAVE_with_name("scan_const");
79072805
LW
3630 return s;
3631}
3632
ffb4593c
NT
3633/* S_intuit_more
3634 * Returns TRUE if there's more to the expression (e.g., a subscript),
3635 * FALSE otherwise.
ffb4593c
NT
3636 *
3637 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3638 *
3639 * ->[ and ->{ return TRUE
ff25e5db 3640 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3641 * { and [ outside a pattern are always subscripts, so return TRUE
3642 * if we're outside a pattern and it's not { or [, then return FALSE
3643 * if we're in a pattern and the first char is a {
3644 * {4,5} (any digits around the comma) returns FALSE
3645 * if we're in a pattern and the first char is a [
3646 * [] returns FALSE
3647 * [SOMETHING] has a funky algorithm to decide whether it's a
3648 * character class or not. It has to deal with things like
3649 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3650 * anything else returns TRUE
3651 */
3652
9cbb5ea2
GS
3653/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3654
76e3520e 3655STATIC int