This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: White-space alignment, add comment
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
57#define PL_lex_expect (PL_parser->lex_expect)
58#define PL_lex_formbrack (PL_parser->lex_formbrack)
59#define PL_lex_inpat (PL_parser->lex_inpat)
60#define PL_lex_inwhat (PL_parser->lex_inwhat)
61#define PL_lex_op (PL_parser->lex_op)
62#define PL_lex_repl (PL_parser->lex_repl)
63#define PL_lex_starts (PL_parser->lex_starts)
64#define PL_lex_stuff (PL_parser->lex_stuff)
65#define PL_multi_start (PL_parser->multi_start)
66#define PL_multi_open (PL_parser->multi_open)
67#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7 89
fb205e7a
DM
90# define PL_nexttoke (PL_parser->nexttoke)
91# define PL_nexttype (PL_parser->nexttype)
92# define PL_nextval (PL_parser->nextval)
199e78b7 93
a1894d81 94static const char* const ident_too_long = "Identifier too long";
8903cb82 95
9ded7720 96# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 97
a7aaec61
Z
98#define XENUMMASK 0x3f
99#define XFAKEEOF 0x40
100#define XFAKEBRACK 0x80
9059aa12 101
39e02b42
JH
102#ifdef USE_UTF8_SCRIPTS
103# define UTF (!IN_BYTES)
2b9d42f0 104#else
802a15e9 105# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 106#endif
a0ed51b3 107
b1fc3636
CJ
108/* The maximum number of characters preceding the unrecognized one to display */
109#define UNRECOGNIZED_PRECEDE_COUNT 10
110
61f0cdd9 111/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
112 * 1999-02-27 mjd-perl-patch@plover.com */
113#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
114
14bd96d0 115#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 116
ffb4593c
NT
117/* LEX_* are values for PL_lex_state, the state of the lexer.
118 * They are arranged oddly so that the guard on the switch statement
79072805 119 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
120 *
121 * These values refer to the various states within a sublex parse,
122 * i.e. within a double quotish string
79072805
LW
123 */
124
fb73857a
PP
125/* #define LEX_NOTPARSING 11 is done in perl.h. */
126
b6007c36
DM
127#define LEX_NORMAL 10 /* normal code (ie not within "...") */
128#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
129#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
130#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
131#define LEX_INTERPSTART 6 /* expecting the start of a $var */
132
133 /* at end of code, eg "$x" followed by: */
134#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
135#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
136
137#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
138 string or after \E, $foo, etc */
139#define LEX_INTERPCONST 2 /* NOT USED */
140#define LEX_FORMLINE 1 /* expecting a format line */
141#define LEX_KNOWNEXT 0 /* next token known; just return it */
142
79072805 143
bbf60fe6 144#ifdef DEBUGGING
27da23d5 145static const char* const lex_state_names[] = {
bbf60fe6
DM
146 "KNOWNEXT",
147 "FORMLINE",
148 "INTERPCONST",
149 "INTERPCONCAT",
150 "INTERPENDMAYBE",
151 "INTERPEND",
152 "INTERPSTART",
153 "INTERPPUSH",
154 "INTERPCASEMOD",
155 "INTERPNORMAL",
156 "NORMAL"
157};
158#endif
159
79072805 160#include "keywords.h"
fe14fcc3 161
ffb4593c
NT
162/* CLINE is a macro that ensures PL_copline has a sane value */
163
57843af0 164#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 165
29595ff2
NC
166# define SKIPSPACE0(s) skipspace(s)
167# define SKIPSPACE1(s) skipspace(s)
168# define SKIPSPACE2(s,tsv) skipspace(s)
169# define PEEKSPACE(s) skipspace(s)
29595ff2 170
ffb4593c
NT
171/*
172 * Convenience functions to return different tokens and prime the
9cbb5ea2 173 * lexer for the next token. They all take an argument.
ffb4593c
NT
174 *
175 * TOKEN : generic token (used for '(', DOLSHARP, etc)
176 * OPERATOR : generic operator
177 * AOPERATOR : assignment operator
178 * PREBLOCK : beginning the block after an if, while, foreach, ...
179 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
180 * PREREF : *EXPR where EXPR is not a simple identifier
181 * TERM : expression term
89f35911 182 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
183 * LOOPX : loop exiting command (goto, last, dump, etc)
184 * FTST : file test operator
185 * FUN0 : zero-argument function
7eb971ee 186 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 187 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
188 * BOop : bitwise or or xor
189 * BAop : bitwise and
190 * SHop : shift operator
191 * PWop : power operator
9cbb5ea2 192 * PMop : pattern-matching operator
ffb4593c
NT
193 * Aop : addition-level operator
194 * Mop : multiplication-level operator
195 * Eop : equality-testing operator
e5edeb50 196 * Rop : relational operator <= != gt
ffb4593c
NT
197 *
198 * Also see LOP and lop() below.
199 */
200
998054bd 201#ifdef DEBUGGING /* Serve -DT. */
704d4215 202# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 203#else
bbf60fe6 204# define REPORT(retval) (retval)
998054bd
SC
205#endif
206
bbf60fe6
DM
207#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
208#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
209#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
210#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
211#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
212#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
213#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 214#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
6154021b
RGS
215#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
216#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
217#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 218#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
219#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
220#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
221#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
222#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
223#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
224#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
225#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
226#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
227#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
228#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 229
a687059c
LW
230/* This bit of chicanery makes a unary function followed by
231 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
232 * The UNIDOR macro is for unary functions that can be followed by the //
233 * operator (such as C<shift // 0>).
a687059c 234 */
d68ce4ac 235#define UNI3(f,x,have_x) { \
6154021b 236 pl_yylval.ival = f; \
d68ce4ac 237 if (have_x) PL_expect = x; \
376fcdbf
AL
238 PL_bufptr = s; \
239 PL_last_uni = PL_oldbufptr; \
240 PL_last_lop_op = f; \
241 if (*s == '(') \
242 return REPORT( (int)FUNC1 ); \
29595ff2 243 s = PEEKSPACE(s); \
376fcdbf
AL
244 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
245 }
d68ce4ac
FC
246#define UNI(f) UNI3(f,XTERM,1)
247#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
248#define UNIPROTO(f,optional) { \
249 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
250 OPERATOR(f); \
251 }
a687059c 252
d68ce4ac 253#define UNIBRACK(f) UNI3(f,0,0)
79072805 254
9f68db38 255/* grandfather return to old style */
78cdf107
Z
256#define OLDLOP(f) \
257 do { \
258 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
259 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
260 pl_yylval.ival = (f); \
261 PL_expect = XTERM; \
262 PL_bufptr = s; \
263 return (int)LSTOP; \
264 } while(0)
79072805 265
83944c01
FC
266#define COPLINE_INC_WITH_HERELINES \
267 STMT_START { \
268 CopLINE_inc(PL_curcop); \
851b527a
FC
269 if (PL_parser->herelines) \
270 CopLINE(PL_curcop) += PL_parser->herelines, \
271 PL_parser->herelines = 0; \
83944c01 272 } STMT_END
ffdb8b16
FC
273/* Called after scan_str to update CopLINE(PL_curcop), but only when there
274 * is no sublex_push to follow. */
275#define COPLINE_SET_FROM_MULTI_END \
276 STMT_START { \
277 CopLINE_set(PL_curcop, PL_multi_end); \
278 if (PL_multi_end != PL_multi_start) \
851b527a 279 PL_parser->herelines = 0; \
ffdb8b16 280 } STMT_END
83944c01
FC
281
282
8fa7f367
JH
283#ifdef DEBUGGING
284
6154021b 285/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
286enum token_type {
287 TOKENTYPE_NONE,
288 TOKENTYPE_IVAL,
6154021b 289 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 290 TOKENTYPE_PVAL,
aeaef349 291 TOKENTYPE_OPVAL
bbf60fe6
DM
292};
293
6d4a66ac
NC
294static struct debug_tokens {
295 const int token;
296 enum token_type type;
297 const char *name;
298} const debug_tokens[] =
9041c2e3 299{
bbf60fe6
DM
300 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
301 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
302 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
303 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
304 { ARROW, TOKENTYPE_NONE, "ARROW" },
305 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
306 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
307 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
308 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
309 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 310 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
311 { DO, TOKENTYPE_NONE, "DO" },
312 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
313 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
314 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
315 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
316 { ELSE, TOKENTYPE_NONE, "ELSE" },
317 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
318 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
319 { FOR, TOKENTYPE_IVAL, "FOR" },
320 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
321 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
322 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
323 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
324 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 325 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
326 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
327 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
328 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 329 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
330 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
331 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 332 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
333 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
334 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
335 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
336 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
337 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
338 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
339 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
340 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
341 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
342 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
343 { OROP, TOKENTYPE_IVAL, "OROP" },
344 { OROR, TOKENTYPE_NONE, "OROR" },
345 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
346 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
347 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 348 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 349 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
350 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
351 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
352 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
353 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
354 { PREINC, TOKENTYPE_NONE, "PREINC" },
355 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 356 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
357 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
358 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 359 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
360 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
361 { SUB, TOKENTYPE_NONE, "SUB" },
362 { THING, TOKENTYPE_OPVAL, "THING" },
363 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
364 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
365 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
366 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
367 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
368 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 369 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
370 { WHILE, TOKENTYPE_IVAL, "WHILE" },
371 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 372 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 373 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
374};
375
6154021b 376/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 377
bbf60fe6 378STATIC int
704d4215 379S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 380{
97aff369 381 dVAR;
7918f24d
NC
382
383 PERL_ARGS_ASSERT_TOKEREPORT;
384
bbf60fe6 385 if (DEBUG_T_TEST) {
bd61b366 386 const char *name = NULL;
bbf60fe6 387 enum token_type type = TOKENTYPE_NONE;
f54cb97a 388 const struct debug_tokens *p;
396482e1 389 SV* const report = newSVpvs("<== ");
bbf60fe6 390
f54cb97a 391 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
392 if (p->token == (int)rv) {
393 name = p->name;
394 type = p->type;
395 break;
396 }
397 }
398 if (name)
54667de8 399 Perl_sv_catpv(aTHX_ report, name);
74736ae6 400 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 401 {
bbf60fe6 402 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
403 if ((char)rv == 'p')
404 sv_catpvs(report, " (pending identifier)");
405 }
bbf60fe6 406 else if (!rv)
396482e1 407 sv_catpvs(report, "EOF");
bbf60fe6
DM
408 else
409 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
410 switch (type) {
411 case TOKENTYPE_NONE:
bbf60fe6
DM
412 break;
413 case TOKENTYPE_IVAL:
704d4215 414 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
415 break;
416 case TOKENTYPE_OPNUM:
417 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 418 PL_op_name[lvalp->ival]);
bbf60fe6
DM
419 break;
420 case TOKENTYPE_PVAL:
704d4215 421 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_OPVAL:
704d4215 424 if (lvalp->opval) {
401441c0 425 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
426 PL_op_name[lvalp->opval->op_type]);
427 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 428 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 429 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
430 }
431
432 }
401441c0 433 else
396482e1 434 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
435 break;
436 }
b6007c36 437 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
438 };
439 return (int)rv;
998054bd
SC
440}
441
b6007c36
DM
442
443/* print the buffer with suitable escapes */
444
445STATIC void
15f169a1 446S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 447{
396482e1 448 SV* const tmp = newSVpvs("");
7918f24d
NC
449
450 PERL_ARGS_ASSERT_PRINTBUF;
451
5d37acd6 452 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 453 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 454 GCC_DIAG_RESTORE;
b6007c36
DM
455 SvREFCNT_dec(tmp);
456}
457
8fa7f367
JH
458#endif
459
8290c323
NC
460static int
461S_deprecate_commaless_var_list(pTHX) {
462 PL_expect = XTERM;
463 deprecate("comma-less variable list");
464 return REPORT(','); /* grandfather non-comma-format format */
465}
466
ffb4593c
NT
467/*
468 * S_ao
469 *
c963b151
BD
470 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
471 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
472 */
473
76e3520e 474STATIC int
cea2e8a9 475S_ao(pTHX_ int toketype)
a0d0e21e 476{
3280af22
NIS
477 if (*PL_bufptr == '=') {
478 PL_bufptr++;
a0d0e21e 479 if (toketype == ANDAND)
6154021b 480 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 481 else if (toketype == OROR)
6154021b 482 pl_yylval.ival = OP_ORASSIGN;
c963b151 483 else if (toketype == DORDOR)
6154021b 484 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
485 toketype = ASSIGNOP;
486 }
487 return toketype;
488}
489
ffb4593c
NT
490/*
491 * S_no_op
492 * When Perl expects an operator and finds something else, no_op
493 * prints the warning. It always prints "<something> found where
494 * operator expected. It prints "Missing semicolon on previous line?"
495 * if the surprise occurs at the start of the line. "do you need to
496 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
497 * where the compiler doesn't know if foo is a method call or a function.
498 * It prints "Missing operator before end of line" if there's nothing
499 * after the missing operator, or "... before <...>" if there is something
500 * after the missing operator.
501 */
502
76e3520e 503STATIC void
15f169a1 504S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 505{
9d4ba2ae
AL
506 char * const oldbp = PL_bufptr;
507 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 508
7918f24d
NC
509 PERL_ARGS_ASSERT_NO_OP;
510
1189a94a
GS
511 if (!s)
512 s = oldbp;
07c798fb 513 else
1189a94a 514 PL_bufptr = s;
734ab321 515 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
516 if (ckWARN_d(WARN_SYNTAX)) {
517 if (is_first)
518 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
519 "\t(Missing semicolon on previous line?)\n");
520 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 521 const char *t;
8a2bca12 522 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 523 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 524 NOOP;
56da5a46
RGS
525 if (t < PL_bufptr && isSPACE(*t))
526 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
527 "\t(Do you need to predeclare %"UTF8f"?)\n",
528 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
529 }
530 else {
531 assert(s >= oldbp);
532 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
533 "\t(Missing operator before %"UTF8f"?)\n",
534 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 535 }
07c798fb 536 }
3280af22 537 PL_bufptr = oldbp;
8990e307
LW
538}
539
ffb4593c
NT
540/*
541 * S_missingterm
542 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 543 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
544 * If we're in a delimited string and the delimiter is a control
545 * character, it's reformatted into a two-char sequence like ^C.
546 * This is fatal.
547 */
548
76e3520e 549STATIC void
cea2e8a9 550S_missingterm(pTHX_ char *s)
8990e307
LW
551{
552 char tmpbuf[3];
553 char q;
554 if (s) {
9d4ba2ae 555 char * const nl = strrchr(s,'\n');
d2719217 556 if (nl)
8990e307
LW
557 *nl = '\0';
558 }
ca8b19a7 559 else if ((U8) PL_multi_close < 32) {
8990e307 560 *tmpbuf = '^';
585ec06d 561 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
562 tmpbuf[2] = '\0';
563 s = tmpbuf;
564 }
565 else {
eb160463 566 *tmpbuf = (char)PL_multi_close;
8990e307
LW
567 tmpbuf[1] = '\0';
568 s = tmpbuf;
569 }
570 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 571 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 572}
79072805 573
dd0ac2b9
FC
574#include "feature.h"
575
0d863452 576/*
0d863452
RH
577 * Check whether the named feature is enabled.
578 */
26ea9e12 579bool
3fff3427 580Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 581{
4a731d7b 582 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
583
584 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
585
586 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 587
26ea9e12
NC
588 if (namelen > MAX_FEATURE_LEN)
589 return FALSE;
3fff3427 590 memcpy(&he_name[8], name, namelen);
7d69d4a6 591
c8ca97b0
NC
592 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
593 REFCOUNTED_HE_EXISTS));
0d863452
RH
594}
595
ffb4593c 596/*
9cbb5ea2
GS
597 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
598 * utf16-to-utf8-reversed.
ffb4593c
NT
599 */
600
c39cd008
GS
601#ifdef PERL_CR_FILTER
602static void
603strip_return(SV *sv)
604{
eb578fdb
KW
605 const char *s = SvPVX_const(sv);
606 const char * const e = s + SvCUR(sv);
7918f24d
NC
607
608 PERL_ARGS_ASSERT_STRIP_RETURN;
609
c39cd008
GS
610 /* outer loop optimized to do nothing if there are no CR-LFs */
611 while (s < e) {
612 if (*s++ == '\r' && *s == '\n') {
613 /* hit a CR-LF, need to copy the rest */
eb578fdb 614 char *d = s - 1;
c39cd008
GS
615 *d++ = *s++;
616 while (s < e) {
617 if (*s == '\r' && s[1] == '\n')
618 s++;
619 *d++ = *s++;
620 }
621 SvCUR(sv) -= s - d;
622 return;
623 }
624 }
625}
a868473f 626
76e3520e 627STATIC I32
c39cd008 628S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 629{
f54cb97a 630 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
631 if (count > 0 && !maxlen)
632 strip_return(sv);
633 return count;
a868473f
NIS
634}
635#endif
636
ffb4593c 637/*
8eaa0acf
Z
638=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
639
640Creates and initialises a new lexer/parser state object, supplying
641a context in which to lex and parse from a new source of Perl code.
642A pointer to the new state object is placed in L</PL_parser>. An entry
643is made on the save stack so that upon unwinding the new state object
644will be destroyed and the former value of L</PL_parser> will be restored.
645Nothing else need be done to clean up the parsing context.
646
647The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
648non-null, provides a string (in SV form) containing code to be parsed.
649A copy of the string is made, so subsequent modification of I<line>
650does not affect parsing. I<rsfp>, if non-null, provides an input stream
651from which code will be read to be parsed. If both are non-null, the
652code in I<line> comes first and must consist of complete lines of input,
653and I<rsfp> supplies the remainder of the source.
654
e368b3bd
FC
655The I<flags> parameter is reserved for future use. Currently it is only
656used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
657
658=cut
659*/
ffb4593c 660
27fcb6ee 661/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
662 can share filters with the current parser.
663 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
664 caller, hence isn't owned by the parser, so shouldn't be closed on parser
665 destruction. This is used to handle the case of defaulting to reading the
666 script from the standard input because no filename was given on the command
667 line (without getting confused by situation where STDIN has been closed, so
668 the script handle is opened on fd 0) */
27fcb6ee 669
a0d0e21e 670void
8eaa0acf 671Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 672{
6ef55633 673 const char *s = NULL;
5486870f 674 yy_parser *parser, *oparser;
60d63348 675 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 676 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
677
678 /* create and initialise a parser */
679
199e78b7 680 Newxz(parser, 1, yy_parser);
5486870f 681 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
682 PL_parser = parser;
683
28ac2b49
Z
684 parser->stack = NULL;
685 parser->ps = NULL;
686 parser->stack_size = 0;
acdf0a21 687
e3abe207
DM
688 /* on scope exit, free this parser and restore any outer one */
689 SAVEPARSER(parser);
7c4baf47 690 parser->saved_curcop = PL_curcop;
e3abe207 691
acdf0a21 692 /* initialise lexer state */
8990e307 693
fb205e7a 694 parser->nexttoke = 0;
ca4cfd28 695 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 696 parser->copline = parser->preambling = NOLINE;
5afb0a62 697 parser->lex_state = LEX_NORMAL;
c2598295 698 parser->expect = XSTATE;
2f9285f8 699 parser->rsfp = rsfp;
27fcb6ee
FC
700 parser->rsfp_filters =
701 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
702 ? NULL
703 : MUTABLE_AV(SvREFCNT_inc(
704 oparser->rsfp_filters
705 ? oparser->rsfp_filters
706 : (oparser->rsfp_filters = newAV())
707 ));
2f9285f8 708
199e78b7
DM
709 Newx(parser->lex_brackstack, 120, char);
710 Newx(parser->lex_casestack, 12, char);
711 *parser->lex_casestack = '\0';
d794b522 712 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 713
10efb74f 714 if (line) {
0528fd32 715 STRLEN len;
10efb74f 716 s = SvPV_const(line, len);
0abcdfa4
FC
717 parser->linestr = flags & LEX_START_COPIED
718 ? SvREFCNT_inc_simple_NN(line)
719 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 720 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 721 } else {
bf1b738b 722 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 723 }
f06b5848
DM
724 parser->oldoldbufptr =
725 parser->oldbufptr =
726 parser->bufptr =
727 parser->linestart = SvPVX(parser->linestr);
728 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
729 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
730
731 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
732 |LEX_DONT_CLOSE_RSFP));
733 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
734 |LEX_DONT_CLOSE_RSFP));
737c24fc 735
60d63348 736 parser->in_pod = parser->filtered = 0;
79072805 737}
a687059c 738
e3abe207
DM
739
740/* delete a parser object */
741
742void
743Perl_parser_free(pTHX_ const yy_parser *parser)
744{
7918f24d
NC
745 PERL_ARGS_ASSERT_PARSER_FREE;
746
7c4baf47 747 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
748 SvREFCNT_dec(parser->linestr);
749
87606032 750 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 751 PerlIO_clearerr(parser->rsfp);
799361c3
SH
752 else if (parser->rsfp && (!parser->old_parser ||
753 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 754 PerlIO_close(parser->rsfp);
5486870f 755 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
756 SvREFCNT_dec(parser->lex_stuff);
757 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
758
759 Safefree(parser->lex_brackstack);
760 Safefree(parser->lex_casestack);
761 Safefree(parser->lex_shared);
762 PL_parser = parser->old_parser;
763 Safefree(parser);
764}
765
766void
767Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
768{
3ac7ff8f 769 I32 nexttoke = parser->nexttoke;
3ac7ff8f 770 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 771 while (nexttoke--) {
3ac7ff8f
FC
772 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
773 && parser->nextval[nexttoke].opval
774 && parser->nextval[nexttoke].opval->op_slabbed
775 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 776 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
777 parser->nextval[nexttoke].opval = NULL;
778 }
3ce3dcd9 779 }
e3abe207
DM
780}
781
782
ffb4593c 783/*
f0e67a1d
Z
784=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
785
786Buffer scalar containing the chunk currently under consideration of the
787text currently being lexed. This is always a plain string scalar (for
788which C<SvPOK> is true). It is not intended to be used as a scalar by
789normal scalar means; instead refer to the buffer directly by the pointer
790variables described below.
791
792The lexer maintains various C<char*> pointers to things in the
793C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
794reallocated, all of these pointers must be updated. Don't attempt to
795do this manually, but rather use L</lex_grow_linestr> if you need to
796reallocate the buffer.
797
798The content of the text chunk in the buffer is commonly exactly one
799complete line of input, up to and including a newline terminator,
800but there are situations where it is otherwise. The octets of the
801buffer may be intended to be interpreted as either UTF-8 or Latin-1.
802The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
803flag on this scalar, which may disagree with it.
804
805For direct examination of the buffer, the variable
806L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
807lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
808of these pointers is usually preferable to examination of the scalar
809through normal scalar means.
810
811=for apidoc AmxU|char *|PL_parser-E<gt>bufend
812
813Direct pointer to the end of the chunk of text currently being lexed, the
814end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 815+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
816always located at the end of the buffer, and does not count as part of
817the buffer's contents.
818
819=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
820
821Points to the current position of lexing inside the lexer buffer.
822Characters around this point may be freely examined, within
823the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
824L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
825interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
826
827Lexing code (whether in the Perl core or not) moves this pointer past
828the characters that it consumes. It is also expected to perform some
829bookkeeping whenever a newline character is consumed. This movement
830can be more conveniently performed by the function L</lex_read_to>,
831which handles newlines appropriately.
832
833Interpretation of the buffer's octets can be abstracted out by
834using the slightly higher-level functions L</lex_peek_unichar> and
835L</lex_read_unichar>.
836
837=for apidoc AmxU|char *|PL_parser-E<gt>linestart
838
839Points to the start of the current line inside the lexer buffer.
840This is useful for indicating at which column an error occurred, and
841not much else. This must be updated by any lexing code that consumes
842a newline; the function L</lex_read_to> handles this detail.
843
844=cut
845*/
846
847/*
848=for apidoc Amx|bool|lex_bufutf8
849
850Indicates whether the octets in the lexer buffer
851(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
852of Unicode characters. If not, they should be interpreted as Latin-1
853characters. This is analogous to the C<SvUTF8> flag for scalars.
854
855In UTF-8 mode, it is not guaranteed that the lexer buffer actually
856contains valid UTF-8. Lexing code must be robust in the face of invalid
857encoding.
858
859The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
860is significant, but not the whole story regarding the input character
861encoding. Normally, when a file is being read, the scalar contains octets
862and its C<SvUTF8> flag is off, but the octets should be interpreted as
863UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
864however, the scalar may have the C<SvUTF8> flag on, and in this case its
865octets should be interpreted as UTF-8 unless the C<use bytes> pragma
866is in effect. This logic may change in the future; use this function
867instead of implementing the logic yourself.
868
869=cut
870*/
871
872bool
873Perl_lex_bufutf8(pTHX)
874{
875 return UTF;
876}
877
878/*
879=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
880
881Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 882at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
883pointer to the reallocated buffer. This is necessary before making
884any direct modification of the buffer that would increase its length.
885L</lex_stuff_pvn> provides a more convenient way to insert text into
886the buffer.
887
888Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
889this function updates all of the lexer's variables that point directly
890into the buffer.
891
892=cut
893*/
894
895char *
896Perl_lex_grow_linestr(pTHX_ STRLEN len)
897{
898 SV *linestr;
899 char *buf;
900 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 901 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
902 linestr = PL_parser->linestr;
903 buf = SvPVX(linestr);
904 if (len <= SvLEN(linestr))
905 return buf;
906 bufend_pos = PL_parser->bufend - buf;
907 bufptr_pos = PL_parser->bufptr - buf;
908 oldbufptr_pos = PL_parser->oldbufptr - buf;
909 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
910 linestart_pos = PL_parser->linestart - buf;
911 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
912 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
913 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
914 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 915
f0e67a1d 916 buf = sv_grow(linestr, len);
c7641931 917
f0e67a1d
Z
918 PL_parser->bufend = buf + bufend_pos;
919 PL_parser->bufptr = buf + bufptr_pos;
920 PL_parser->oldbufptr = buf + oldbufptr_pos;
921 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
922 PL_parser->linestart = buf + linestart_pos;
923 if (PL_parser->last_uni)
924 PL_parser->last_uni = buf + last_uni_pos;
925 if (PL_parser->last_lop)
926 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
927 if (PL_parser->lex_shared->re_eval_start)
928 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
929 return buf;
930}
931
932/*
83aa740e 933=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
934
935Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
936immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
937reallocating the buffer if necessary. This means that lexing code that
938runs later will see the characters as if they had appeared in the input.
939It is not recommended to do this as part of normal parsing, and most
940uses of this facility run the risk of the inserted characters being
941interpreted in an unintended manner.
942
943The string to be inserted is represented by I<len> octets starting
944at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
945according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
946The characters are recoded for the lexer buffer, according to how the
947buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 948to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
949function is more convenient.
950
951=cut
952*/
953
954void
83aa740e 955Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 956{
749123ff 957 dVAR;
f0e67a1d
Z
958 char *bufptr;
959 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
960 if (flags & ~(LEX_STUFF_UTF8))
961 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
962 if (UTF) {
963 if (flags & LEX_STUFF_UTF8) {
964 goto plain_copy;
965 } else {
54d004e8 966 STRLEN highhalf = 0; /* Count of variants */
83aa740e 967 const char *p, *e = pv+len;
54d004e8
KW
968 for (p = pv; p != e; p++) {
969 if (! UTF8_IS_INVARIANT(*p)) {
970 highhalf++;
971 }
972 }
f0e67a1d
Z
973 if (!highhalf)
974 goto plain_copy;
975 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
976 bufptr = PL_parser->bufptr;
977 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
978 SvCUR_set(PL_parser->linestr,
979 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
980 PL_parser->bufend += len+highhalf;
981 for (p = pv; p != e; p++) {
982 U8 c = (U8)*p;
54d004e8
KW
983 if (! UTF8_IS_INVARIANT(c)) {
984 *bufptr++ = UTF8_TWO_BYTE_HI(c);
985 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
986 } else {
987 *bufptr++ = (char)c;
988 }
989 }
990 }
991 } else {
992 if (flags & LEX_STUFF_UTF8) {
993 STRLEN highhalf = 0;
83aa740e 994 const char *p, *e = pv+len;
f0e67a1d
Z
995 for (p = pv; p != e; p++) {
996 U8 c = (U8)*p;
54d004e8 997 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
998 Perl_croak(aTHX_ "Lexing code attempted to stuff "
999 "non-Latin-1 character into Latin-1 input");
54d004e8 1000 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1001 p++;
1002 highhalf++;
54d004e8 1003 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1004 /* malformed UTF-8 */
1005 ENTER;
1006 SAVESPTR(PL_warnhook);
1007 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1008 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1009 LEAVE;
1010 }
1011 }
1012 if (!highhalf)
1013 goto plain_copy;
1014 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1015 bufptr = PL_parser->bufptr;
1016 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1017 SvCUR_set(PL_parser->linestr,
1018 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1019 PL_parser->bufend += len-highhalf;
54d004e8
KW
1020 p = pv;
1021 while (p < e) {
1022 if (UTF8_IS_INVARIANT(*p)) {
1023 *bufptr++ = *p;
1024 p++;
f0e67a1d 1025 }
54d004e8
KW
1026 else {
1027 assert(p < e -1 );
94bb8c36 1028 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1029 p += 2;
1030 }
f0e67a1d
Z
1031 }
1032 } else {
54d004e8 1033 plain_copy:
f0e67a1d
Z
1034 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1035 bufptr = PL_parser->bufptr;
1036 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1037 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1038 PL_parser->bufend += len;
1039 Copy(pv, bufptr, len, char);
1040 }
1041 }
1042}
1043
1044/*
9dcc53ea
Z
1045=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1046
1047Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1048immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1049reallocating the buffer if necessary. This means that lexing code that
1050runs later will see the characters as if they had appeared in the input.
1051It is not recommended to do this as part of normal parsing, and most
1052uses of this facility run the risk of the inserted characters being
1053interpreted in an unintended manner.
1054
1055The string to be inserted is represented by octets starting at I<pv>
1056and continuing to the first nul. These octets are interpreted as either
1057UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1058in I<flags>. The characters are recoded for the lexer buffer, according
1059to how the buffer is currently being interpreted (L</lex_bufutf8>).
1060If it is not convenient to nul-terminate a string to be inserted, the
1061L</lex_stuff_pvn> function is more appropriate.
1062
1063=cut
1064*/
1065
1066void
1067Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1068{
1069 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1070 lex_stuff_pvn(pv, strlen(pv), flags);
1071}
1072
1073/*
f0e67a1d
Z
1074=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1075
1076Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1077immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1078reallocating the buffer if necessary. This means that lexing code that
1079runs later will see the characters as if they had appeared in the input.
1080It is not recommended to do this as part of normal parsing, and most
1081uses of this facility run the risk of the inserted characters being
1082interpreted in an unintended manner.
1083
1084The string to be inserted is the string value of I<sv>. The characters
1085are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1086being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1087not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1088need to construct a scalar.
1089
1090=cut
1091*/
1092
1093void
1094Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1095{
1096 char *pv;
1097 STRLEN len;
1098 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1099 if (flags)
1100 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1101 pv = SvPV(sv, len);
1102 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1103}
1104
1105/*
1106=for apidoc Amx|void|lex_unstuff|char *ptr
1107
1108Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1109I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1110This hides the discarded text from any lexing code that runs later,
1111as if the text had never appeared.
1112
1113This is not the normal way to consume lexed text. For that, use
1114L</lex_read_to>.
1115
1116=cut
1117*/
1118
1119void
1120Perl_lex_unstuff(pTHX_ char *ptr)
1121{
1122 char *buf, *bufend;
1123 STRLEN unstuff_len;
1124 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1125 buf = PL_parser->bufptr;
1126 if (ptr < buf)
1127 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1128 if (ptr == buf)
1129 return;
1130 bufend = PL_parser->bufend;
1131 if (ptr > bufend)
1132 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1133 unstuff_len = ptr - buf;
1134 Move(ptr, buf, bufend+1-ptr, char);
1135 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1136 PL_parser->bufend = bufend - unstuff_len;
1137}
1138
1139/*
1140=for apidoc Amx|void|lex_read_to|char *ptr
1141
1142Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1143to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1144performing the correct bookkeeping whenever a newline character is passed.
1145This is the normal way to consume lexed text.
1146
1147Interpretation of the buffer's octets can be abstracted out by
1148using the slightly higher-level functions L</lex_peek_unichar> and
1149L</lex_read_unichar>.
1150
1151=cut
1152*/
1153
1154void
1155Perl_lex_read_to(pTHX_ char *ptr)
1156{
1157 char *s;
1158 PERL_ARGS_ASSERT_LEX_READ_TO;
1159 s = PL_parser->bufptr;
1160 if (ptr < s || ptr > PL_parser->bufend)
1161 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1162 for (; s != ptr; s++)
1163 if (*s == '\n') {
83944c01 1164 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1165 PL_parser->linestart = s+1;
1166 }
1167 PL_parser->bufptr = ptr;
1168}
1169
1170/*
1171=for apidoc Amx|void|lex_discard_to|char *ptr
1172
1173Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1174up to I<ptr>. The remaining content of the buffer will be moved, and
1175all pointers into the buffer updated appropriately. I<ptr> must not
1176be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1177it is not permitted to discard text that has yet to be lexed.
1178
1179Normally it is not necessarily to do this directly, because it suffices to
1180use the implicit discarding behaviour of L</lex_next_chunk> and things
1181based on it. However, if a token stretches across multiple lines,
1f317c95 1182and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1183that purpose, then after completion of the token it would be wise to
1184explicitly discard the now-unneeded earlier lines, to avoid future
1185multi-line tokens growing the buffer without bound.
1186
1187=cut
1188*/
1189
1190void
1191Perl_lex_discard_to(pTHX_ char *ptr)
1192{
1193 char *buf;
1194 STRLEN discard_len;
1195 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1196 buf = SvPVX(PL_parser->linestr);
1197 if (ptr < buf)
1198 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1199 if (ptr == buf)
1200 return;
1201 if (ptr > PL_parser->bufptr)
1202 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1203 discard_len = ptr - buf;
1204 if (PL_parser->oldbufptr < ptr)
1205 PL_parser->oldbufptr = ptr;
1206 if (PL_parser->oldoldbufptr < ptr)
1207 PL_parser->oldoldbufptr = ptr;
1208 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1209 PL_parser->last_uni = NULL;
1210 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1211 PL_parser->last_lop = NULL;
1212 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1213 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1214 PL_parser->bufend -= discard_len;
1215 PL_parser->bufptr -= discard_len;
1216 PL_parser->oldbufptr -= discard_len;
1217 PL_parser->oldoldbufptr -= discard_len;
1218 if (PL_parser->last_uni)
1219 PL_parser->last_uni -= discard_len;
1220 if (PL_parser->last_lop)
1221 PL_parser->last_lop -= discard_len;
1222}
1223
1224/*
1225=for apidoc Amx|bool|lex_next_chunk|U32 flags
1226
1227Reads in the next chunk of text to be lexed, appending it to
1228L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1229looked to the end of the current chunk and wants to know more. It is
1230usual, but not necessary, for lexing to have consumed the entirety of
1231the current chunk at this time.
1232
1233If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1234chunk (i.e., the current chunk has been entirely consumed), normally the
1235current chunk will be discarded at the same time that the new chunk is
1236read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1237will not be discarded. If the current chunk has not been entirely
1238consumed, then it will not be discarded regardless of the flag.
1239
1240Returns true if some new text was added to the buffer, or false if the
1241buffer has reached the end of the input text.
1242
1243=cut
1244*/
1245
1246#define LEX_FAKE_EOF 0x80000000
112d1284 1247#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1248
1249bool
1250Perl_lex_next_chunk(pTHX_ U32 flags)
1251{
1252 SV *linestr;
1253 char *buf;
1254 STRLEN old_bufend_pos, new_bufend_pos;
1255 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1256 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1257 bool got_some_for_debugger = 0;
f0e67a1d 1258 bool got_some;
112d1284 1259 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1260 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1261 linestr = PL_parser->linestr;
1262 buf = SvPVX(linestr);
1263 if (!(flags & LEX_KEEP_PREVIOUS) &&
1264 PL_parser->bufptr == PL_parser->bufend) {
1265 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1266 linestart_pos = 0;
1267 if (PL_parser->last_uni != PL_parser->bufend)
1268 PL_parser->last_uni = NULL;
1269 if (PL_parser->last_lop != PL_parser->bufend)
1270 PL_parser->last_lop = NULL;
1271 last_uni_pos = last_lop_pos = 0;
1272 *buf = 0;
1273 SvCUR(linestr) = 0;
1274 } else {
1275 old_bufend_pos = PL_parser->bufend - buf;
1276 bufptr_pos = PL_parser->bufptr - buf;
1277 oldbufptr_pos = PL_parser->oldbufptr - buf;
1278 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1279 linestart_pos = PL_parser->linestart - buf;
1280 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1281 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1282 }
1283 if (flags & LEX_FAKE_EOF) {
1284 goto eof;
60d63348 1285 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1286 got_some = 0;
1287 } else if (filter_gets(linestr, old_bufend_pos)) {
1288 got_some = 1;
17cc9359 1289 got_some_for_debugger = 1;
112d1284
FC
1290 } else if (flags & LEX_NO_TERM) {
1291 got_some = 0;
f0e67a1d 1292 } else {
580561a3
Z
1293 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1294 sv_setpvs(linestr, "");
f0e67a1d
Z
1295 eof:
1296 /* End of real input. Close filehandle (unless it was STDIN),
1297 * then add implicit termination.
1298 */
87606032 1299 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1300 PerlIO_clearerr(PL_parser->rsfp);
1301 else if (PL_parser->rsfp)
1302 (void)PerlIO_close(PL_parser->rsfp);
1303 PL_parser->rsfp = NULL;
60d63348 1304 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1305 if (!PL_in_eval && PL_minus_p) {
1306 sv_catpvs(linestr,
1307 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1308 PL_minus_n = PL_minus_p = 0;
1309 } else if (!PL_in_eval && PL_minus_n) {
1310 sv_catpvs(linestr, /*{*/";}");
1311 PL_minus_n = 0;
1312 } else
1313 sv_catpvs(linestr, ";");
1314 got_some = 1;
1315 }
1316 buf = SvPVX(linestr);
1317 new_bufend_pos = SvCUR(linestr);
1318 PL_parser->bufend = buf + new_bufend_pos;
1319 PL_parser->bufptr = buf + bufptr_pos;
1320 PL_parser->oldbufptr = buf + oldbufptr_pos;
1321 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1322 PL_parser->linestart = buf + linestart_pos;
1323 if (PL_parser->last_uni)
1324 PL_parser->last_uni = buf + last_uni_pos;
1325 if (PL_parser->last_lop)
1326 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1327 if (PL_parser->preambling != NOLINE) {
1328 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1329 PL_parser->preambling = NOLINE;
1330 }
17cc9359 1331 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1332 PL_curstash != PL_debstash) {
1333 /* debugger active and we're not compiling the debugger code,
1334 * so store the line into the debugger's array of lines
1335 */
1336 update_debugger_info(NULL, buf+old_bufend_pos,
1337 new_bufend_pos-old_bufend_pos);
1338 }
1339 return got_some;
1340}
1341
1342/*
1343=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1344
1345Looks ahead one (Unicode) character in the text currently being lexed.
1346Returns the codepoint (unsigned integer value) of the next character,
1347or -1 if lexing has reached the end of the input text. To consume the
1348peeked character, use L</lex_read_unichar>.
1349
1350If the next character is in (or extends into) the next chunk of input
1351text, the next chunk will be read in. Normally the current chunk will be
1352discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1353then the current chunk will not be discarded.
1354
1355If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1356is encountered, an exception is generated.
1357
1358=cut
1359*/
1360
1361I32
1362Perl_lex_peek_unichar(pTHX_ U32 flags)
1363{
749123ff 1364 dVAR;
f0e67a1d
Z
1365 char *s, *bufend;
1366 if (flags & ~(LEX_KEEP_PREVIOUS))
1367 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1368 s = PL_parser->bufptr;
1369 bufend = PL_parser->bufend;
1370 if (UTF) {
1371 U8 head;
1372 I32 unichar;
1373 STRLEN len, retlen;
1374 if (s == bufend) {
1375 if (!lex_next_chunk(flags))
1376 return -1;
1377 s = PL_parser->bufptr;
1378 bufend = PL_parser->bufend;
1379 }
1380 head = (U8)*s;
54d004e8 1381 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1382 return head;
54d004e8
KW
1383 if (UTF8_IS_START(head)) {
1384 len = UTF8SKIP(&head);
f0e67a1d
Z
1385 while ((STRLEN)(bufend-s) < len) {
1386 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1387 break;
1388 s = PL_parser->bufptr;
1389 bufend = PL_parser->bufend;
1390 }
1391 }
c80e42f3 1392 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1393 if (retlen == (STRLEN)-1) {
1394 /* malformed UTF-8 */
1395 ENTER;
1396 SAVESPTR(PL_warnhook);
1397 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1398 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1399 LEAVE;
1400 }
1401 return unichar;
1402 } else {
1403 if (s == bufend) {
1404 if (!lex_next_chunk(flags))
1405 return -1;
1406 s = PL_parser->bufptr;
1407 }
1408 return (U8)*s;
1409 }
1410}
1411
1412/*
1413=for apidoc Amx|I32|lex_read_unichar|U32 flags
1414
1415Reads the next (Unicode) character in the text currently being lexed.
1416Returns the codepoint (unsigned integer value) of the character read,
1417and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1418if lexing has reached the end of the input text. To non-destructively
1419examine the next character, use L</lex_peek_unichar> instead.
1420
1421If the next character is in (or extends into) the next chunk of input
1422text, the next chunk will be read in. Normally the current chunk will be
1423discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1424then the current chunk will not be discarded.
1425
1426If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1427is encountered, an exception is generated.
1428
1429=cut
1430*/
1431
1432I32
1433Perl_lex_read_unichar(pTHX_ U32 flags)
1434{
1435 I32 c;
1436 if (flags & ~(LEX_KEEP_PREVIOUS))
1437 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1438 c = lex_peek_unichar(flags);
1439 if (c != -1) {
1440 if (c == '\n')
83944c01 1441 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1442 if (UTF)
1443 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1444 else
1445 ++(PL_parser->bufptr);
f0e67a1d
Z
1446 }
1447 return c;
1448}
1449
1450/*
1451=for apidoc Amx|void|lex_read_space|U32 flags
1452
1453Reads optional spaces, in Perl style, in the text currently being
1454lexed. The spaces may include ordinary whitespace characters and
1455Perl-style comments. C<#line> directives are processed if encountered.
1456L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1457at a non-space character (or the end of the input text).
1458
1459If spaces extend into the next chunk of input text, the next chunk will
1460be read in. Normally the current chunk will be discarded at the same
1461time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1462chunk will not be discarded.
1463
1464=cut
1465*/
1466
21791330 1467#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1468#define LEX_NO_NEXT_CHUNK 0x80000000
1469
f0e67a1d
Z
1470void
1471Perl_lex_read_space(pTHX_ U32 flags)
1472{
1473 char *s, *bufend;
21791330 1474 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1475 bool need_incline = 0;
21791330 1476 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1477 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1478 s = PL_parser->bufptr;
1479 bufend = PL_parser->bufend;
1480 while (1) {
1481 char c = *s;
1482 if (c == '#') {
1483 do {
1484 c = *++s;
1485 } while (!(c == '\n' || (c == 0 && s == bufend)));
1486 } else if (c == '\n') {
1487 s++;
21791330
FC
1488 if (can_incline) {
1489 PL_parser->linestart = s;
1490 if (s == bufend)
1491 need_incline = 1;
1492 else
1493 incline(s);
1494 }
f0e67a1d
Z
1495 } else if (isSPACE(c)) {
1496 s++;
1497 } else if (c == 0 && s == bufend) {
1498 bool got_more;
65c68e17 1499 line_t l;
f0998909
Z
1500 if (flags & LEX_NO_NEXT_CHUNK)
1501 break;
f0e67a1d 1502 PL_parser->bufptr = s;
65c68e17 1503 l = CopLINE(PL_curcop);
851b527a 1504 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1505 got_more = lex_next_chunk(flags);
65c68e17 1506 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1507 s = PL_parser->bufptr;
1508 bufend = PL_parser->bufend;
1509 if (!got_more)
1510 break;
21791330 1511 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1512 incline(s);
1513 need_incline = 0;
1514 }
1515 } else {
1516 break;
1517 }
1518 }
f0e67a1d
Z
1519 PL_parser->bufptr = s;
1520}
1521
1522/*
fe788d6b
PM
1523
1524=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1525
1526This function performs syntax checking on a prototype, C<proto>.
1527If C<warn> is true, any illegal characters or mismatched brackets
1528will trigger illegalproto warnings, declaring that they were
1529detected in the prototype for C<name>.
1530
1531The return value is C<true> if this is a valid prototype, and
1532C<false> if it is not, regardless of whether C<warn> was C<true> or
1533C<false>.
1534
1535Note that C<NULL> is a valid C<proto> and will always return C<true>.
1536
1537=cut
1538
1539 */
1540
1541bool
1542Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1543{
1544 STRLEN len, origlen;
1545 char *p = proto ? SvPV(proto, len) : NULL;
1546 bool bad_proto = FALSE;
1547 bool in_brackets = FALSE;
1548 bool after_slash = FALSE;
1549 char greedy_proto = ' ';
1550 bool proto_after_greedy_proto = FALSE;
1551 bool must_be_last = FALSE;
1552 bool underscore = FALSE;
f791a21a 1553 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1554
1555 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1556
1557 if (!proto)
1558 return TRUE;
1559
1560 origlen = len;
1561 for (; len--; p++) {
1562 if (!isSPACE(*p)) {
1563 if (must_be_last)
1564 proto_after_greedy_proto = TRUE;
f791a21a
PM
1565 if (underscore) {
1566 if (!strchr(";@%", *p))
1567 bad_proto_after_underscore = TRUE;
1568 underscore = FALSE;
1569 }
fe788d6b
PM
1570 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1571 bad_proto = TRUE;
1572 }
1573 else {
fe788d6b
PM
1574 if (*p == '[')
1575 in_brackets = TRUE;
1576 else if (*p == ']')
1577 in_brackets = FALSE;
1578 else if ((*p == '@' || *p == '%') &&
1579 !after_slash &&
1580 !in_brackets ) {
1581 must_be_last = TRUE;
1582 greedy_proto = *p;
1583 }
1584 else if (*p == '_')
f791a21a 1585 underscore = TRUE;
fe788d6b
PM
1586 }
1587 if (*p == '\\')
1588 after_slash = TRUE;
1589 else
1590 after_slash = FALSE;
1591 }
1592 }
1593
1594 if (warn) {
b54d603d 1595 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1596 p -= origlen;
b54d603d
PM
1597 p = SvUTF8(proto)
1598 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1599 origlen, UNI_DISPLAY_ISPRINT)
1600 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1601
fe788d6b
PM
1602 if (proto_after_greedy_proto)
1603 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1604 "Prototype after '%c' for %"SVf" : %s",
1605 greedy_proto, SVfARG(name), p);
50278ed0
PM
1606 if (in_brackets)
1607 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1608 "Missing ']' in prototype for %"SVf" : %s",
1609 SVfARG(name), p);
b54d603d 1610 if (bad_proto)
fe788d6b 1611 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1612 "Illegal character in prototype for %"SVf" : %s",
1613 SVfARG(name), p);
1614 if (bad_proto_after_underscore)
1615 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1616 "Illegal character after '_' in prototype for %"SVf" : %s",
1617 SVfARG(name), p);
fe788d6b
PM
1618 }
1619
1620 return (! (proto_after_greedy_proto || bad_proto) );
1621}
1622
1623/*
ffb4593c
NT
1624 * S_incline
1625 * This subroutine has nothing to do with tilting, whether at windmills
1626 * or pinball tables. Its name is short for "increment line". It
57843af0 1627 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1628 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1629 * # line 500 "foo.pm"
1630 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1631 */
1632
76e3520e 1633STATIC void
d9095cec 1634S_incline(pTHX_ const char *s)
463ee0b2 1635{
d9095cec
NC
1636 const char *t;
1637 const char *n;
1638 const char *e;
8818d409 1639 line_t line_num;
463ee0b2 1640
7918f24d
NC
1641 PERL_ARGS_ASSERT_INCLINE;
1642
83944c01 1643 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1644 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1645 && s+1 == PL_bufend && *s == ';') {
1646 /* fake newline in string eval */
1647 CopLINE_dec(PL_curcop);
1648 return;
1649 }
463ee0b2
LW
1650 if (*s++ != '#')
1651 return;
d4c19fe8
AL
1652 while (SPACE_OR_TAB(*s))
1653 s++;
73659bf1
GS
1654 if (strnEQ(s, "line", 4))
1655 s += 4;
1656 else
1657 return;
084592ab 1658 if (SPACE_OR_TAB(*s))
73659bf1 1659 s++;
4e553d73 1660 else
73659bf1 1661 return;
d4c19fe8
AL
1662 while (SPACE_OR_TAB(*s))
1663 s++;
463ee0b2
LW
1664 if (!isDIGIT(*s))
1665 return;
d4c19fe8 1666
463ee0b2
LW
1667 n = s;
1668 while (isDIGIT(*s))
1669 s++;
07714eb4 1670 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1671 return;
bf4acbe4 1672 while (SPACE_OR_TAB(*s))
463ee0b2 1673 s++;
73659bf1 1674 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1675 s++;
73659bf1
GS
1676 e = t + 1;
1677 }
463ee0b2 1678 else {
c35e046a
AL
1679 t = s;
1680 while (!isSPACE(*t))
1681 t++;
73659bf1 1682 e = t;
463ee0b2 1683 }
bf4acbe4 1684 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1685 e++;
1686 if (*e != '\n' && *e != '\0')
1687 return; /* false alarm */
1688
8818d409
FC
1689 line_num = atoi(n)-1;
1690
f4dd75d9 1691 if (t - s > 0) {
d9095cec 1692 const STRLEN len = t - s;
3df32bda 1693
d36ee5be 1694 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1695 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1696 * to *{"::_<newfilename"} */
44867030
NC
1697 /* However, the long form of evals is only turned on by the
1698 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1699 GV * const cfgv = CopFILEGV(PL_curcop);
1700 if (cfgv) {
38bd7ad8
FC
1701 char smallbuf[128];
1702 STRLEN tmplen2 = len;
44867030 1703 char *tmpbuf2;
449dd039 1704 GV *gv2;
44867030
NC
1705
1706 if (tmplen2 + 2 <= sizeof smallbuf)
1707 tmpbuf2 = smallbuf;
1708 else
1709 Newx(tmpbuf2, tmplen2 + 2, char);
1710
38bd7ad8
FC
1711 tmpbuf2[0] = '_';
1712 tmpbuf2[1] = '<';
44867030
NC
1713
1714 memcpy(tmpbuf2 + 2, s, tmplen2);
1715 tmplen2 += 2;
1716
8a5ee598 1717 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1718 if (!isGV(gv2)) {
8a5ee598 1719 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1720 /* adjust ${"::_<newfilename"} to store the new file name */
1721 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1722 /* The line number may differ. If that is the case,
1723 alias the saved lines that are in the array.
1724 Otherwise alias the whole array. */
1725 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1726 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1727 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1728 }
38bd7ad8
FC
1729 else if (GvAV(cfgv)) {
1730 AV * const av = GvAV(cfgv);
8818d409
FC
1731 const I32 start = CopLINE(PL_curcop)+1;
1732 I32 items = AvFILLp(av) - start;
1733 if (items > 0) {
1734 AV * const av2 = GvAVn(gv2);
1735 SV **svp = AvARRAY(av) + start;
1736 I32 l = (I32)line_num+1;
1737 while (items--)
1738 av_store(av2, l++, SvREFCNT_inc(*svp++));
1739 }
1740 }
e5527e4b 1741 }
44867030
NC
1742
1743 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1744 }
e66cf94c 1745 }
05ec9bb3 1746 CopFILE_free(PL_curcop);
449dd039 1747 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1748 }
8818d409 1749 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1750}
1751
21791330
FC
1752#define skipspace(s) skipspace_flags(s, 0)
1753
29595ff2 1754
80a702cd 1755STATIC void
15f169a1 1756S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1757{
1758 AV *av = CopFILEAVx(PL_curcop);
1759 if (av) {
7f1c3e8c
FC
1760 SV * sv;
1761 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1762 else {
1763 sv = *av_fetch(av, 0, 1);
1764 SvUPGRADE(sv, SVt_PVMG);
1765 }
1766 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1767 if (orig_sv)
7f1c3e8c 1768 sv_catsv(sv, orig_sv);
5fa550fb 1769 else
7f1c3e8c
FC
1770 sv_catpvn(sv, buf, len);
1771 if (!SvIOK(sv)) {
1772 (void)SvIOK_on(sv);
1773 SvIV_set(sv, 0);
1774 }
1775 if (PL_parser->preambling == NOLINE)
1776 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1777 }
1778}
1779
ffb4593c
NT
1780/*
1781 * S_skipspace
1782 * Called to gobble the appropriate amount and type of whitespace.
1783 * Skips comments as well.
1784 */
1785
76e3520e 1786STATIC char *
21791330 1787S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1788{
21791330 1789 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1790 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1791 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1792 s++;
f0e67a1d
Z
1793 } else {
1794 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1795 PL_bufptr = s;
21791330 1796 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1797 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1798 LEX_NO_NEXT_CHUNK : 0));
3280af22 1799 s = PL_bufptr;
f0e67a1d
Z
1800 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1801 if (PL_linestart > PL_bufptr)
1802 PL_bufptr = PL_linestart;
1803 return s;
463ee0b2 1804 }
5db06880 1805 return s;
a687059c 1806}
378cc40b 1807
ffb4593c
NT
1808/*
1809 * S_check_uni
1810 * Check the unary operators to ensure there's no ambiguity in how they're
1811 * used. An ambiguous piece of code would be:
1812 * rand + 5
1813 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1814 * the +5 is its argument.
1815 */
1816
76e3520e 1817STATIC void
cea2e8a9 1818S_check_uni(pTHX)
ba106d47 1819{
d4c19fe8
AL
1820 const char *s;
1821 const char *t;
2f3197b3 1822
3280af22 1823 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1824 return;
3280af22
NIS
1825 while (isSPACE(*PL_last_uni))
1826 PL_last_uni++;
c35e046a 1827 s = PL_last_uni;
8a2bca12 1828 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1829 s++;
3280af22 1830 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1831 return;
6136c704 1832
9b387841
NC
1833 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1834 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1835 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1836}
1837
ffb4593c
NT
1838/*
1839 * LOP : macro to build a list operator. Its behaviour has been replaced
1840 * with a subroutine, S_lop() for which LOP is just another name.
1841 */
1842
a0d0e21e
LW
1843#define LOP(f,x) return lop(f,x,s)
1844
ffb4593c
NT
1845/*
1846 * S_lop
1847 * Build a list operator (or something that might be one). The rules:
1848 * - if we have a next token, then it's a list operator [why?]
1849 * - if the next thing is an opening paren, then it's a function
1850 * - else it's a list operator
1851 */
1852
76e3520e 1853STATIC I32
a0be28da 1854S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1855{
7918f24d
NC
1856 PERL_ARGS_ASSERT_LOP;
1857
6154021b 1858 pl_yylval.ival = f;
35c8bce7 1859 CLINE;
3280af22
NIS
1860 PL_expect = x;
1861 PL_bufptr = s;
1862 PL_last_lop = PL_oldbufptr;
eb160463 1863 PL_last_lop_op = (OPCODE)f;
3280af22 1864 if (PL_nexttoke)
78cdf107 1865 goto lstop;
79072805 1866 if (*s == '(')
bbf60fe6 1867 return REPORT(FUNC);
29595ff2 1868 s = PEEKSPACE(s);
79072805 1869 if (*s == '(')
bbf60fe6 1870 return REPORT(FUNC);
78cdf107
Z
1871 else {
1872 lstop:
1873 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1874 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1875 return REPORT(LSTOP);
78cdf107 1876 }
79072805
LW
1877}
1878
ffb4593c
NT
1879/*
1880 * S_force_next
9cbb5ea2 1881 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1882 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1883 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1884 * will need to set PL_nextval[] and possibly PL_expect to ensure
1885 * the lexer handles the token correctly.
ffb4593c
NT
1886 */
1887
4e553d73 1888STATIC void
cea2e8a9 1889S_force_next(pTHX_ I32 type)
79072805 1890{
704d4215
GG
1891#ifdef DEBUGGING
1892 if (DEBUG_T_TEST) {
1893 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1894 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1895 }
1896#endif
3280af22
NIS
1897 PL_nexttype[PL_nexttoke] = type;
1898 PL_nexttoke++;
1899 if (PL_lex_state != LEX_KNOWNEXT) {
1900 PL_lex_defer = PL_lex_state;
1901 PL_lex_expect = PL_expect;
1902 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1903 }
1904}
1905
89f35911
FC
1906/*
1907 * S_postderef
1908 *
1909 * This subroutine handles postfix deref syntax after the arrow has already
1910 * been emitted. @* $* etc. are emitted as two separate token right here.
1911 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1912 * only the first, leaving yylex to find the next.
89f35911
FC
1913 */
1914
1915static int
ff25e5db 1916S_postderef(pTHX_ int const funny, char const next)
89f35911 1917{
ff25e5db 1918 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1919 assert(strchr("*[{", next));
1920 if (next == '*') {
1921 PL_expect = XOPERATOR;
cc624add 1922 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1923 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1924 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1925 force_next(POSTJOIN);
1926 }
89f35911
FC
1927 force_next(next);
1928 PL_bufptr+=2;
1929 }
1930 else {
760ca746
FC
1931 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1932 && !PL_lex_brackets)
1933 PL_lex_dojoin = 2;
89f35911
FC
1934 PL_expect = XOPERATOR;
1935 PL_bufptr++;
1936 }
1937 return funny;
1938}
1939
28ac2b49
Z
1940void
1941Perl_yyunlex(pTHX)
1942{
a7aaec61
Z
1943 int yyc = PL_parser->yychar;
1944 if (yyc != YYEMPTY) {
1945 if (yyc) {
a7aaec61
Z
1946 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1947 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1948 PL_lex_allbrackets--;
a7aaec61 1949 PL_lex_brackets--;
78cdf107
Z
1950 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1951 } else if (yyc == '('/*)*/) {
1952 PL_lex_allbrackets--;
1953 yyc |= (2<<24);
a7aaec61
Z
1954 }
1955 force_next(yyc);
1956 }
28ac2b49
Z
1957 PL_parser->yychar = YYEMPTY;
1958 }
1959}
1960
d0a148a6 1961STATIC SV *
15f169a1 1962S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1963{
740cce10 1964 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1965 !IN_BYTES
1966 && UTF
1967 && !is_ascii_string((const U8*)start, len)
740cce10 1968 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1969 return sv;
1970}
1971
ffb4593c
NT
1972/*
1973 * S_force_word
1974 * When the lexer knows the next thing is a word (for instance, it has
1975 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1976 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1977 * lookahead.
ffb4593c
NT
1978 *
1979 * Arguments:
b1b65b59 1980 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1981 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1982 * int check_keyword : if true, Perl checks to make sure the word isn't
1983 * a keyword (do this if the word is a label, e.g. goto FOO)
1984 * int allow_pack : if true, : characters will also be allowed (require,
1985 * use, etc. do this)
9cbb5ea2 1986 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1987 */
1988
76e3520e 1989STATIC char *
345b3785 1990S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 1991{
eb578fdb 1992 char *s;
463ee0b2 1993 STRLEN len;
4e553d73 1994
7918f24d
NC
1995 PERL_ARGS_ASSERT_FORCE_WORD;
1996
29595ff2 1997 start = SKIPSPACE1(start);
463ee0b2 1998 s = start;
7e2040f0 1999 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2000 (allow_pack && *s == ':') )
a0d0e21e 2001 {
3280af22 2002 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2003 if (check_keyword) {
2004 char *s2 = PL_tokenbuf;
2005 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2006 s2 += 6, len -= 6;
2007 if (keyword(s2, len, 0))
463ee0b2 2008 return start;
01b5ef50 2009 }
463ee0b2 2010 if (token == METHOD) {
29595ff2 2011 s = SKIPSPACE1(s);
463ee0b2 2012 if (*s == '(')
3280af22 2013 PL_expect = XTERM;
463ee0b2 2014 else {
3280af22 2015 PL_expect = XOPERATOR;
463ee0b2 2016 }
79072805 2017 }
9ded7720 2018 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2019 = (OP*)newSVOP(OP_CONST,0,
2020 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2021 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2022 force_next(token);
2023 }
2024 return s;
2025}
2026
ffb4593c
NT
2027/*
2028 * S_force_ident
9cbb5ea2 2029 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2030 * text only contains the "foo" portion. The first argument is a pointer
2031 * to the "foo", and the second argument is the type symbol to prefix.
2032 * Forces the next token to be a "WORD".
9cbb5ea2 2033 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2034 */
2035
76e3520e 2036STATIC void
5aaab254 2037S_force_ident(pTHX_ const char *s, int kind)
79072805 2038{
7918f24d
NC
2039 PERL_ARGS_ASSERT_FORCE_IDENT;
2040
c9b48522
DD
2041 if (s[0]) {
2042 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2043 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2044 UTF ? SVf_UTF8 : 0));
9ded7720 2045 NEXTVAL_NEXTTOKE.opval = o;
79072805 2046 force_next(WORD);
748a9306 2047 if (kind) {
11343788 2048 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2049 /* XXX see note in pp_entereval() for why we forgo typo
2050 warnings if the symbol must be introduced in an eval.
2051 GSAR 96-10-12 */
90e5519e 2052 gv_fetchpvn_flags(s, len,
728847b1
BF
2053 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2054 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2055 kind == '$' ? SVt_PV :
2056 kind == '@' ? SVt_PVAV :
2057 kind == '%' ? SVt_PVHV :
a0d0e21e 2058 SVt_PVGV
90e5519e 2059 );
748a9306 2060 }
79072805
LW
2061 }
2062}
2063
3f33d153
FC
2064static void
2065S_force_ident_maybe_lex(pTHX_ char pit)
2066{
3f33d153
FC
2067 NEXTVAL_NEXTTOKE.ival = pit;
2068 force_next('p');
2069}
2070
1571675a
GS
2071NV
2072Perl_str_to_version(pTHX_ SV *sv)
2073{
2074 NV retval = 0.0;
2075 NV nshift = 1.0;
2076 STRLEN len;
cfd0369c 2077 const char *start = SvPV_const(sv,len);
9d4ba2ae 2078 const char * const end = start + len;
504618e9 2079 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2080
2081 PERL_ARGS_ASSERT_STR_TO_VERSION;
2082
1571675a 2083 while (start < end) {
ba210ebe 2084 STRLEN skip;
1571675a
GS
2085 UV n;
2086 if (utf)
9041c2e3 2087 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2088 else {
2089 n = *(U8*)start;
2090 skip = 1;
2091 }
2092 retval += ((NV)n)/nshift;
2093 start += skip;
2094 nshift *= 1000;
2095 }
2096 return retval;
2097}
2098
4e553d73 2099/*
ffb4593c
NT
2100 * S_force_version
2101 * Forces the next token to be a version number.
e759cc13
RGS
2102 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2103 * and if "guessing" is TRUE, then no new token is created (and the caller
2104 * must use an alternative parsing method).
ffb4593c
NT
2105 */
2106
76e3520e 2107STATIC char *
e759cc13 2108S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2109{
5f66b61c 2110 OP *version = NULL;
44dcb63b 2111 char *d;
89bfa8cd 2112
7918f24d
NC
2113 PERL_ARGS_ASSERT_FORCE_VERSION;
2114
29595ff2 2115 s = SKIPSPACE1(s);
89bfa8cd 2116
44dcb63b 2117 d = s;
dd629d5b 2118 if (*d == 'v')
44dcb63b 2119 d++;
44dcb63b 2120 if (isDIGIT(*d)) {
e759cc13
RGS
2121 while (isDIGIT(*d) || *d == '_' || *d == '.')
2122 d++;
4e4da3ac 2123 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2124 SV *ver;
6154021b
RGS
2125 s = scan_num(s, &pl_yylval);
2126 version = pl_yylval.opval;
dd629d5b
GS
2127 ver = cSVOPx(version)->op_sv;
2128 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2129 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2130 SvNV_set(ver, str_to_version(ver));
1571675a 2131 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2132 }
89bfa8cd 2133 }
5db06880 2134 else if (guessing) {
e759cc13 2135 return s;
5db06880 2136 }
89bfa8cd
PP
2137 }
2138
2139 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2140 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2141 force_next(WORD);
89bfa8cd 2142
e759cc13 2143 return s;
89bfa8cd
PP
2144}
2145
ffb4593c 2146/*
91152fc1
DG
2147 * S_force_strict_version
2148 * Forces the next token to be a version number using strict syntax rules.
2149 */
2150
2151STATIC char *
2152S_force_strict_version(pTHX_ char *s)
2153{
91152fc1 2154 OP *version = NULL;
91152fc1
DG
2155 const char *errstr = NULL;
2156
2157 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2158
2159 while (isSPACE(*s)) /* leading whitespace */
2160 s++;
2161
2162 if (is_STRICT_VERSION(s,&errstr)) {
2163 SV *ver = newSV(0);
2164 s = (char *)scan_version(s, ver, 0);
2165 version = newSVOP(OP_CONST, 0, ver);
2166 }
4e4da3ac
Z
2167 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2168 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2169 {
91152fc1
DG
2170 PL_bufptr = s;
2171 if (errstr)
2172 yyerror(errstr); /* version required */
2173 return s;
2174 }
2175
91152fc1 2176 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2177 NEXTVAL_NEXTTOKE.opval = version;
2178 force_next(WORD);
2179
2180 return s;
2181}
2182
2183/*
ffb4593c
NT
2184 * S_tokeq
2185 * Tokenize a quoted string passed in as an SV. It finds the next
2186 * chunk, up to end of string or a backslash. It may make a new
2187 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2188 * turns \\ into \.
2189 */
2190
76e3520e 2191STATIC SV *
cea2e8a9 2192S_tokeq(pTHX_ SV *sv)
79072805 2193{
eb578fdb
KW
2194 char *s;
2195 char *send;
2196 char *d;
b3ac6de7 2197 SV *pv = sv;
79072805 2198
7918f24d
NC
2199 PERL_ARGS_ASSERT_TOKEQ;
2200
279b35ad
FC
2201 assert (SvPOK(sv));
2202 assert (SvLEN(sv));
2203 assert (!SvIsCOW(sv));
307ed071 2204 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2205 goto finish;
279b35ad
FC
2206 s = SvPVX(sv);
2207 send = SvEND(sv);
dcb21ed6
NC
2208 /* This is relying on the SV being "well formed" with a trailing '\0' */
2209 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2210 s++;
2211 if (s == send)
b3ac6de7 2212 goto finish;
79072805 2213 d = s;
be4731d2 2214 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2215 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2216 SVs_TEMP | SvUTF8(sv));
be4731d2 2217 }
79072805
LW
2218 while (s < send) {
2219 if (*s == '\\') {
a0d0e21e 2220 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2221 s++; /* all that, just for this */
2222 }
2223 *d++ = *s++;
2224 }
2225 *d = '\0';
95a20fc0 2226 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2227 finish:
3280af22 2228 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2229 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2230 return sv;
2231}
2232
ffb4593c
NT
2233/*
2234 * Now come three functions related to double-quote context,
2235 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2236 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2237 * interact with PL_lex_state, and create fake ( ... ) argument lists
2238 * to handle functions and concatenation.
ecd24171
DM
2239 * For example,
2240 * "foo\lbar"
2241 * is tokenised as
2242 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2243 */
2244
2245/*
2246 * S_sublex_start
6154021b 2247 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2248 *
2249 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2250 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2251 *
2252 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2253 *
2254 * Everything else becomes a FUNC.
2255 *
2256 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2257 * had an OP_CONST or OP_READLINE). This just sets us up for a
2258 * call to S_sublex_push().
2259 */
2260
76e3520e 2261STATIC I32
cea2e8a9 2262S_sublex_start(pTHX)
79072805 2263{
eb578fdb 2264 const I32 op_type = pl_yylval.ival;
79072805
LW
2265
2266 if (op_type == OP_NULL) {
6154021b 2267 pl_yylval.opval = PL_lex_op;
5f66b61c 2268 PL_lex_op = NULL;
79072805
LW
2269 return THING;
2270 }
466112bb 2271 if (op_type == OP_CONST) {
3280af22 2272 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2273
2274 if (SvTYPE(sv) == SVt_PVIV) {
2275 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2276 STRLEN len;
96a5add6 2277 const char * const p = SvPV_const(sv, len);
740cce10 2278 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2279 SvREFCNT_dec(sv);
2280 sv = nsv;
4e553d73 2281 }
6154021b 2282 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2283 PL_lex_stuff = NULL;
79072805
LW
2284 return THING;
2285 }
2286
3280af22 2287 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2288 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2289 PL_sublex_info.sub_op = PL_lex_op;
2290 PL_lex_state = LEX_INTERPPUSH;
55497cff 2291
3280af22
NIS
2292 PL_expect = XTERM;
2293 if (PL_lex_op) {
6154021b 2294 pl_yylval.opval = PL_lex_op;
5f66b61c 2295 PL_lex_op = NULL;
55497cff
PP
2296 return PMFUNC;
2297 }
2298 else
2299 return FUNC;
2300}
2301
ffb4593c
NT
2302/*
2303 * S_sublex_push
2304 * Create a new scope to save the lexing state. The scope will be
2305 * ended in S_sublex_done. Returns a '(', starting the function arguments
2306 * to the uc, lc, etc. found before.
2307 * Sets PL_lex_state to LEX_INTERPCONCAT.
2308 */
2309
76e3520e 2310STATIC I32
cea2e8a9 2311S_sublex_push(pTHX)
55497cff 2312{
78a635de 2313 LEXSHARED *shared;
801d32ac 2314 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2315 ENTER;
55497cff 2316
3280af22 2317 PL_lex_state = PL_sublex_info.super_state;
cc624add 2318 SAVEI8(PL_lex_dojoin);
3280af22 2319 SAVEI32(PL_lex_brackets);
78cdf107 2320 SAVEI32(PL_lex_allbrackets);
b27dce25 2321 SAVEI32(PL_lex_formbrack);
78cdf107 2322 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2323 SAVEI32(PL_lex_casemods);
2324 SAVEI32(PL_lex_starts);
651b5b28 2325 SAVEI8(PL_lex_state);
7cc34111 2326 SAVESPTR(PL_lex_repl);
7766f137 2327 SAVEVPTR(PL_lex_inpat);
98246f1e 2328 SAVEI16(PL_lex_inwhat);
ffdb8b16 2329 if (is_heredoc)
b42366d4 2330 {
ffdb8b16 2331 SAVECOPLINE(PL_curcop);
b42366d4 2332 SAVEI32(PL_multi_end);
851b527a
FC
2333 SAVEI32(PL_parser->herelines);
2334 PL_parser->herelines = 0;
b42366d4
FC
2335 }
2336 SAVEI8(PL_multi_close);
3280af22 2337 SAVEPPTR(PL_bufptr);
8452ff4b 2338 SAVEPPTR(PL_bufend);
3280af22
NIS
2339 SAVEPPTR(PL_oldbufptr);
2340 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2341 SAVEPPTR(PL_last_lop);
2342 SAVEPPTR(PL_last_uni);
3280af22
NIS
2343 SAVEPPTR(PL_linestart);
2344 SAVESPTR(PL_linestr);
8edd5f42
RGS
2345 SAVEGENERICPV(PL_lex_brackstack);
2346 SAVEGENERICPV(PL_lex_casestack);
78a635de 2347 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2348 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2349 SAVEI32(PL_copline);
3280af22 2350
99bd9d90 2351 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2352 scopes to find the body of the here-doc. So we put PL_linestr and
2353 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2354 */
60f40a38
FC
2355 PL_parser->lex_shared->ls_linestr = PL_linestr;
2356 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2357
3280af22 2358 PL_linestr = PL_lex_stuff;
7cc34111 2359 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2360 PL_lex_stuff = NULL;
7cc34111 2361 PL_sublex_info.repl = NULL;
3280af22 2362
9cbb5ea2
GS
2363 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2364 = SvPVX(PL_linestr);
3280af22 2365 PL_bufend += SvCUR(PL_linestr);
bd61b366 2366 PL_last_lop = PL_last_uni = NULL;
3280af22 2367 SAVEFREESV(PL_linestr);
4dc843bc 2368 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2369
2370 PL_lex_dojoin = FALSE;
b27dce25 2371 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2372 PL_lex_allbrackets = 0;
2373 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2374 Newx(PL_lex_brackstack, 120, char);
2375 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2376 PL_lex_casemods = 0;
2377 *PL_lex_casestack = '\0';
2378 PL_lex_starts = 0;
2379 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2380 if (is_heredoc)
6ddcf93b 2381 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2382 PL_copline = NOLINE;
78a635de
FC
2383
2384 Newxz(shared, 1, LEXSHARED);
2385 shared->ls_prev = PL_parser->lex_shared;
2386 PL_parser->lex_shared = shared;
3280af22
NIS
2387
2388 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2389 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2390 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2391 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2392 else
5f66b61c 2393 PL_lex_inpat = NULL;
79072805 2394
3a54fd60
DM
2395 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2396 PL_in_eval &= ~EVAL_RE_REPARSING;
2397
55497cff 2398 return '(';
79072805
LW
2399}
2400
ffb4593c
NT
2401/*
2402 * S_sublex_done
2403 * Restores lexer state after a S_sublex_push.
2404 */
2405
76e3520e 2406STATIC I32
cea2e8a9 2407S_sublex_done(pTHX)
79072805 2408{
3280af22 2409 if (!PL_lex_starts++) {
396482e1 2410 SV * const sv = newSVpvs("");
9aa983d2
JH
2411 if (SvUTF8(PL_linestr))
2412 SvUTF8_on(sv);
3280af22 2413 PL_expect = XOPERATOR;
6154021b 2414 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2415 return THING;
2416 }
2417
3280af22
NIS
2418 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2419 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2420 return yylex();
79072805
LW
2421 }
2422
ffb4593c 2423 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2424 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2425 if (PL_lex_repl) {
2426 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2427 PL_linestr = PL_lex_repl;
2428 PL_lex_inpat = 0;
2429 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2430 PL_bufend += SvCUR(PL_linestr);
bd61b366 2431 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2432 PL_lex_dojoin = FALSE;
2433 PL_lex_brackets = 0;
78cdf107
Z
2434 PL_lex_allbrackets = 0;
2435 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2436 PL_lex_casemods = 0;
2437 *PL_lex_casestack = '\0';
2438 PL_lex_starts = 0;
25da4f38 2439 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2440 PL_lex_state = LEX_INTERPNORMAL;
2441 PL_lex_starts++;
e9fa98b2
HS
2442 /* we don't clear PL_lex_repl here, so that we can check later
2443 whether this is an evalled subst; that means we rely on the
2444 logic to ensure sublex_done() is called again only via the
2445 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2446 }
e9fa98b2 2447 else {
3280af22 2448 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2449 PL_lex_repl = NULL;
e9fa98b2 2450 }
ffdb8b16
FC
2451 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2452 CopLINE(PL_curcop) +=
2453 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2454 + PL_parser->herelines;
2455 PL_parser->herelines = 0;
ffdb8b16 2456 }
79072805 2457 return ',';
ffed7fef
LW
2458 }
2459 else {
b42366d4 2460 const line_t l = CopLINE(PL_curcop);
f46d017c 2461 LEAVE;
b42366d4 2462 if (PL_multi_close == '<')
851b527a 2463 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2464 PL_bufend = SvPVX(PL_linestr);
2465 PL_bufend += SvCUR(PL_linestr);
2466 PL_expect = XOPERATOR;
09bef843 2467 PL_sublex_info.sub_inwhat = 0;
79072805 2468 return ')';
ffed7fef
LW
2469 }
2470}
2471
6f613c73
KW
2472PERL_STATIC_INLINE SV*
2473S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2474{
140b12ad
KW
2475 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2476 * interior, hence to the "}". Finds what the name resolves to, returning
2477 * an SV* containing it; NULL if no valid one found */
2478
dd2b1b72 2479 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2480
0c415a79
KW
2481 HV * table;
2482 SV **cvp;
2483 SV *cv;
2484 SV *rv;
2485 HV *stash;
2486 const U8* first_bad_char_loc;
2487 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2488
6f613c73
KW
2489 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2490
107160e2
KW
2491 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2492 e - backslash_ptr,
2493 &first_bad_char_loc))
2494 {
2495 /* If warnings are on, this will print a more detailed analysis of what
2496 * is wrong than the error message below */
c80e42f3 2497 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2498 e - ((char *) first_bad_char_loc),
2499 NULL, 0);
2500
2501 /* We deliberately don't try to print the malformed character, which
2502 * might not print very well; it also may be just the first of many
2503 * malformations, so don't print what comes after it */
2504 yyerror(Perl_form(aTHX_
2505 "Malformed UTF-8 character immediately after '%.*s'",
2506 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2507 return NULL;
2508 }
2509
2510 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2511 /* include the <}> */
2512 e - backslash_ptr + 1);
6f613c73 2513 if (! SvPOK(res)) {
b6407c49 2514 SvREFCNT_dec_NN(res);
6f613c73
KW
2515 return NULL;
2516 }
2517
0c415a79
KW
2518 /* See if the charnames handler is the Perl core's, and if so, we can skip
2519 * the validation needed for a user-supplied one, as Perl's does its own
2520 * validation. */
2521 table = GvHV(PL_hintgv); /* ^H */
2522 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2523 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2524 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2525 {
2526 const char * const name = HvNAME(stash);
6a642c21
FC
2527 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2528 && strEQ(name, "_charnames")) {
0c415a79
KW
2529 return res;
2530 }
2531 }
2532
bde9e88d
KW
2533 /* Here, it isn't Perl's charname handler. We can't rely on a
2534 * user-supplied handler to validate the input name. For non-ut8 input,
2535 * look to see that the first character is legal. Then loop through the
2536 * rest checking that each is a continuation */
6f613c73 2537
36897d64
KW
2538 /* This code makes the reasonable assumption that the only Latin1-range
2539 * characters that begin a character name alias are alphabetic, otherwise
2540 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2541
b6ba1137 2542 if (! UTF) {
bde9e88d 2543 if (! isALPHAU(*s)) {
b6ba1137
KW
2544 goto bad_charname;
2545 }
bde9e88d
KW
2546 s++;
2547 while (s < e) {
2548 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2549 goto bad_charname;
2550 }
2d8eb851
KW
2551 if (*s == ' ' && *(s-1) == ' ') {
2552 goto multi_spaces;
bd299e29 2553 }
df758df2
KW
2554 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2555 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2556 "NO-BREAK SPACE in a charnames "
2557 "alias definition is deprecated");
2558 }
bde9e88d 2559 s++;
b6ba1137
KW
2560 }
2561 }
2562 else {
bde9e88d
KW
2563 /* Similarly for utf8. For invariants can check directly; for other
2564 * Latin1, can calculate their code point and check; otherwise use a
2565 * swash */
2566 if (UTF8_IS_INVARIANT(*s)) {
2567 if (! isALPHAU(*s)) {
140b12ad
KW
2568 goto bad_charname;
2569 }
bde9e88d
KW
2570 s++;
2571 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2572 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2573 goto bad_charname;
6f613c73 2574 }
bde9e88d 2575 s += 2;
6f613c73 2576 }
bde9e88d
KW
2577 else {
2578 if (! PL_utf8_charname_begin) {
2579 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2580 PL_utf8_charname_begin = _core_swash_init("utf8",
2581 "_Perl_Charname_Begin",
2582 &PL_sv_undef,
2583 1, 0, NULL, &flags);
2584 }
2585 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2586 goto bad_charname;
2587 }
2588 s += UTF8SKIP(s);
2589 }
2590
2591 while (s < e) {
2592 if (UTF8_IS_INVARIANT(*s)) {
2593 if (! isCHARNAME_CONT(*s)) {
2594 goto bad_charname;
2595 }
2d8eb851
KW
2596 if (*s == ' ' && *(s-1) == ' ') {
2597 goto multi_spaces;
bd299e29 2598 }
bde9e88d
KW
2599 s++;
2600 }
2601 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2602 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2603 {
2604 goto bad_charname;
2605 }
df758df2
KW
2606 if (*s == *NBSP_UTF8
2607 && *(s+1) == *(NBSP_UTF8+1)
2608 && ckWARN_d(WARN_DEPRECATED))
2609 {
2610 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2611 "NO-BREAK SPACE in a charnames "
2612 "alias definition is deprecated");
2613 }
bde9e88d
KW
2614 s += 2;
2615 }
2616 else {
2617 if (! PL_utf8_charname_continue) {
2618 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2619 PL_utf8_charname_continue = _core_swash_init("utf8",
2620 "_Perl_Charname_Continue",
2621 &PL_sv_undef,
2622 1, 0, NULL, &flags);
2623 }
2624 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2625 goto bad_charname;
2626 }
2627 s += UTF8SKIP(s);
6f613c73
KW
2628 }
2629 }
2d8eb851
KW
2630 }
2631 if (*(s-1) == ' ') {
2632 yyerror_pv(
2633 Perl_form(aTHX_
2634 "charnames alias definitions may not contain trailing "
2635 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2636 (int)(s - backslash_ptr + 1), backslash_ptr,
2637 (int)(e - s + 1), s + 1
2638 ),
2639 UTF ? SVf_UTF8 : 0);
2640 return NULL;
6f613c73
KW
2641 }
2642
94ca1619 2643 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2644 const U8* first_bad_char_loc;
2645 STRLEN len;
2646 const char* const str = SvPV_const(res, len);
2647 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2648 /* If warnings are on, this will print a more detailed analysis of
2649 * what is wrong than the error message below */
c80e42f3 2650 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2651 (char *) first_bad_char_loc - str,
2652 NULL, 0);
2653
2654 /* We deliberately don't try to print the malformed character,
2655 * which might not print very well; it also may be just the first
2656 * of many malformations, so don't print what comes after it */
2657 yyerror_pv(
2658 Perl_form(aTHX_
2659 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2660 (int) (e - backslash_ptr + 1), backslash_ptr,
2661 (int) ((char *) first_bad_char_loc - str), str
2662 ),
2663 SVf_UTF8);
2664 return NULL;
2665 }
2666 }
140b12ad 2667
bde9e88d 2668 return res;
140b12ad 2669
bde9e88d 2670 bad_charname: {
bde9e88d
KW
2671
2672 /* The final %.*s makes sure that should the trailing NUL be missing
2673 * that this print won't run off the end of the string */
2674 yyerror_pv(
2675 Perl_form(aTHX_
2676 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2677 (int)(s - backslash_ptr + 1), backslash_ptr,
2678 (int)(e - s + 1), s + 1
bde9e88d
KW
2679 ),
2680 UTF ? SVf_UTF8 : 0);
2681 return NULL;
2682 }
2d8eb851
KW
2683
2684 multi_spaces:
2685 yyerror_pv(
2686 Perl_form(aTHX_
2687 "charnames alias definitions may not contain a sequence of "
2688 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2689 (int)(s - backslash_ptr + 1), backslash_ptr,
2690 (int)(e - s + 1), s + 1
2691 ),
2692 UTF ? SVf_UTF8 : 0);
2693 return NULL;
6f613c73
KW
2694}
2695
02aa26ce
NT
2696/*
2697 scan_const
2698
9da1dd8f
DM
2699 Extracts the next constant part of a pattern, double-quoted string,
2700 or transliteration. This is terrifying code.
2701
2702 For example, in parsing the double-quoted string "ab\x63$d", it would
2703 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2704
94def140 2705 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2706 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2707 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2708
94def140
ST
2709 Returns a pointer to the character scanned up to. If this is
2710 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2711 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2712 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2713 by looking at the next characters herself.
2714
02aa26ce 2715 In patterns:
9da1dd8f 2716 expand:
537124e4
KW
2717 \N{FOO} => \N{U+hex_for_character_FOO}
2718 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2719
2720 pass through:
2721 all other \-char, including \N and \N{ apart from \N{ABC}
2722
2723 stops on:
2724 @ and $ where it appears to be a var, but not for $ as tail anchor
2725 \l \L \u \U \Q \E
2726 (?{ or (??{
2727
02aa26ce
NT
2728
2729 In transliterations:
2730 characters are VERY literal, except for - not at the start or end
94def140
ST
2731 of the string, which indicates a range. If the range is in bytes,
2732 scan_const expands the range to the full set of intermediate
2733 characters. If the range is in utf8, the hyphen is replaced with
2734 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2735
2736 In double-quoted strings:
2737 backslashes:
2738 double-quoted style: \r and \n
ff3f963a 2739 constants: \x31, etc.
94def140 2740 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2741 case and quoting: \U \Q \E
2742 stops on @ and $
2743
2744 scan_const does *not* construct ops to handle interpolated strings.
2745 It stops processing as soon as it finds an embedded $ or @ variable
2746 and leaves it to the caller to work out what's going on.
2747
94def140
ST
2748 embedded arrays (whether in pattern or not) could be:
2749 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2750
2751 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2752
2753 $ in pattern could be $foo or could be tail anchor. Assumption:
2754 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2755 followed by one of "()| \r\n\t"
02aa26ce 2756
9da1dd8f 2757 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2758
2759 The structure of the code is
2760 while (there's a character to process) {
94def140
ST
2761 handle transliteration ranges
2762 skip regexp comments /(?#comment)/ and codes /(?{code})/
2763 skip #-initiated comments in //x patterns
2764 check for embedded arrays
02aa26ce
NT
2765 check for embedded scalars
2766 if (backslash) {
94def140 2767 deprecate \1 in substitution replacements
02aa26ce
NT
2768 handle string-changing backslashes \l \U \Q \E, etc.
2769 switch (what was escaped) {
94def140 2770 handle \- in a transliteration (becomes a literal -)
ff3f963a 2771 if a pattern and not \N{, go treat as regular character
94def140
ST
2772 handle \132 (octal characters)
2773 handle \x15 and \x{1234} (hex characters)
ff3f963a 2774 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2775 handle \cV (control characters)
2776 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2777 } (end switch)
77a135fe 2778 continue
02aa26ce 2779 } (end if backslash)
77a135fe 2780 handle regular character
02aa26ce 2781 } (end while character to read)
4e553d73 2782
02aa26ce
NT
2783*/
2784
76e3520e 2785STATIC char *
cea2e8a9 2786S_scan_const(pTHX_ char *start)
79072805 2787{
eb578fdb 2788 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2789 SV *sv = newSV(send - start); /* sv for the constant. See note below
2790 on sizing. */
eb578fdb
KW
2791 char *s = start; /* start of the constant */
2792 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2793 bool dorange = FALSE; /* are we in a translit range? */
2794 bool didrange = FALSE; /* did we just finish a range? */
2795 bool in_charclass = FALSE; /* within /[...]/ */
2796 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2797 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2798 UTF8? But, this can show as true
2799 when the source isn't utf8, as for
2800 example when it is entirely composed
2801 of hex constants */
6f613c73 2802 SV *res; /* result from charnames */
77a135fe
KW
2803
2804 /* Note on sizing: The scanned constant is placed into sv, which is
2805 * initialized by newSV() assuming one byte of output for every byte of
2806 * input. This routine expects newSV() to allocate an extra byte for a
2807 * trailing NUL, which this routine will append if it gets to the end of
2808 * the input. There may be more bytes of input than output (eg., \N{LATIN
2809 * CAPITAL LETTER A}), or more output than input if the constant ends up
2810 * recoded to utf8, but each time a construct is found that might increase
2811 * the needed size, SvGROW() is called. Its size parameter each time is
2812 * based on the best guess estimate at the time, namely the length used so
2813 * far, plus the length the current construct will occupy, plus room for
2814 * the trailing NUL, plus one byte for every input byte still unscanned */
2815
c3320c2a
KW
2816 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2817 before set */
4c3a8340
ST
2818#ifdef EBCDIC
2819 UV literal_endpoint = 0;
e294cc5d 2820 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2821#endif
012bcf8d 2822
7918f24d
NC
2823 PERL_ARGS_ASSERT_SCAN_CONST;
2824
bb16bae8 2825 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2826 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2827 /* If we are doing a trans and we know we want UTF8 set expectation */
2828 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2829 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2830 }
2831
b899e89d
FC
2832 /* Protect sv from errors and fatal warnings. */
2833 ENTER_with_name("scan_const");
2834 SAVEFREESV(sv);
2b9d42f0 2835
79072805 2836 while (s < send || dorange) {
ff3f963a 2837
02aa26ce 2838 /* get transliterations out of the way (they're most literal) */
3280af22 2839 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2840 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2841 if (dorange) {
1ba5c669
JH
2842 I32 i; /* current expanded character */
2843 I32 min; /* first character in range */
2844 I32 max; /* last character in range */
02aa26ce 2845
e294cc5d
JH
2846#ifdef EBCDIC
2847 UV uvmax = 0;
2848#endif
2849
2850 if (has_utf8
2851#ifdef EBCDIC
2852 && !native_range
2853#endif
1953db30 2854 ) {
9d4ba2ae 2855 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2856 char *e = d++;
2857 while (e-- > c)
2858 *(e + 1) = *e;
e7214ce8 2859 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2860 /* mark the range as done, and continue */
2861 dorange = FALSE;
2862 didrange = TRUE;
2863 continue;
2864 }
2b9d42f0 2865
95a20fc0 2866 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2867#ifdef EBCDIC
2868 SvGROW(sv,
dc023dbb
KW
2869 SvLEN(sv) + ((has_utf8)
2870 ? (512 - UTF_CONTINUATION_MARK
2871 + UNISKIP(0x100))
e294cc5d
JH
2872 : 256));
2873 /* How many two-byte within 0..255: 128 in UTF-8,
2874 * 96 in UTF-8-mod. */
2875#else
9cbb5ea2 2876 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2877#endif
9cbb5ea2 2878 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2879#ifdef EBCDIC
2880 if (has_utf8) {
2881 int j;
2882 for (j = 0; j <= 1; j++) {
2883 char * const c = (char*)utf8_hop((U8*)d, -1);
2884 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2885 if (j)
2886 min = (U8)uv;
2887 else if (uv < 256)
2888 max = (U8)uv;
2889 else {
2890 max = (U8)0xff; /* only to \xff */
2891 uvmax = uv; /* \x{100} to uvmax */
2892 }
2893 d = c; /* eat endpoint chars */
2894 }
2895 }
2896 else {
2897#endif
2898 d -= 2; /* eat the first char and the - */
2899 min = (U8)*d; /* first char in range */
2900 max = (U8)d[1]; /* last char in range */
2901#ifdef EBCDIC
2902 }
2903#endif
8ada0baa 2904
c2e66d9e 2905 if (min > max) {
01ec43d0 2906 Perl_croak(aTHX_
d1573ac7 2907 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2908 (char)min, (char)max);
c2e66d9e
GS
2909 }
2910
c7f1f016 2911#ifdef EBCDIC
dc023dbb
KW
2912 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2913 * any subsets of these ranges into individual characters */
4c3a8340 2914 if (literal_endpoint == 2 &&
c286a9a6 2915 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2916 (isUPPER_A(min) && isUPPER_A(max))))
2917 {
2918 for (i = min; i <= max; i++) {
2919 if (isALPHA_A(i))
2920 *d++ = i;
8ada0baa
JH
2921 }
2922 }
2923 else
2924#endif
2925 for (i = min; i <= max; i++)
e294cc5d
JH
2926#ifdef EBCDIC
2927 if (has_utf8) {
55d09dc8 2928 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2929 }
2930 else
2931#endif
2932 *d++ = (char)i;
2933
2934#ifdef EBCDIC
2935 if (uvmax) {
2936 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2937 if (uvmax > 0x101)
e7214ce8 2938 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2939 if (uvmax > 0x100)
2940 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2941 }
2942#endif
02aa26ce
NT
2943
2944 /* mark the range as done, and continue */
79072805 2945 dorange = FALSE;
01ec43d0 2946 didrange = TRUE;
4c3a8340
ST
2947#ifdef EBCDIC
2948 literal_endpoint = 0;
2949#endif
79072805 2950 continue;
4e553d73 2951 }
02aa26ce
NT
2952
2953 /* range begins (ignore - as first or last char) */
79072805 2954 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2955 if (didrange) {
1fafa243 2956 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2957 }
e294cc5d
JH
2958 if (has_utf8
2959#ifdef EBCDIC
2960 && !native_range
2961#endif
2962 ) {
e7214ce8 2963 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2964 s++;
2965 continue;
2966 }
79072805
LW
2967 dorange = TRUE;
2968 s++;
01ec43d0
GS
2969 }
2970 else {
2971 didrange = FALSE;
4c3a8340
ST
2972#ifdef EBCDIC
2973 literal_endpoint = 0;
e294cc5d 2974 native_range = TRUE;
4c3a8340 2975#endif
01ec43d0 2976 }
79072805 2977 }
02aa26ce
NT
2978
2979 /* if we get here, we're not doing a transliteration */
2980
e4a2df84
DM
2981 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2982 char *s1 = s-1;
2983 int esc = 0;
2984 while (s1 >= start && *s1-- == '\\')
2985 esc = !esc;
2986 if (!esc)
2987 in_charclass = TRUE;
2988 }
2866decb 2989
e4a2df84
DM
2990 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2991 char *s1 = s-1;
2992 int esc = 0;
2993 while (s1 >= start && *s1-- == '\\')
2994 esc = !esc;
2995 if (!esc)
2996 in_charclass = FALSE;
2997 }
2866decb 2998
9da1dd8f
DM
2999 /* skip for regexp comments /(?#comment)/, except for the last
3000 * char, which will be done separately.
3001 * Stop on (?{..}) and friends */
3002
c30fc27b 3003 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3004 if (s[2] == '#') {
e994fd66 3005 while (s+1 < send && *s != ')')
5ff03569 3006 *d++ = *s++;
155aba94 3007 }
c30fc27b 3008 else if (!PL_lex_casemods &&
d3cec5e5
DM
3009 ( s[2] == '{' /* This should match regcomp.c */
3010 || (s[2] == '?' && s[3] == '{')))
155aba94 3011 {
9da1dd8f 3012 break;
cc6b7395 3013 }
748a9306 3014 }
02aa26ce
NT
3015
3016 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3017 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3018 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3019 while (s+1 < send && *s != '\n')
5ff03569 3020 *d++ = *s++;
748a9306 3021 }
02aa26ce 3022
9da1dd8f
DM
3023 /* no further processing of single-quoted regex */
3024 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3025 goto default_action;
3026
5d1d4326 3027 /* check for embedded arrays
da6eedaa 3028 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3029 */
1749ea0d 3030 else if (*s == '@' && s[1]) {
8a2bca12 3031 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3032 break;
3033 if (strchr(":'{$", s[1]))
3034 break;
3035 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3036 break; /* in regexp, neither @+ nor @- are interpolated */
3037 }
02aa26ce
NT
3038
3039 /* check for embedded scalars. only stop if we're sure it's a
3040 variable.
3041 */
79072805 3042 else if (*s == '$') {
3280af22 3043 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3044 break;
77772344 3045 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3046 if (s[1] == '\\') {
3047 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3048 "Possible unintended interpolation of $\\ in regex");
77772344 3049 }
79072805 3050 break; /* in regexp, $ might be tail anchor */
77772344 3051 }
79072805 3052 }
02aa26ce 3053
2b9d42f0
NIS
3054 /* End of else if chain - OP_TRANS rejoin rest */
3055
02aa26ce 3056 /* backslashes */
79072805 3057 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3058 char* e; /* Can be used for ending '}', etc. */
3059
79072805 3060 s++;
02aa26ce 3061
7d0fc23c
KW
3062 /* warn on \1 - \9 in substitution replacements, but note that \11
3063 * is an octal; and \19 is \1 followed by '9' */
3280af22 3064 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3065 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3066 {
c782d7ee 3067 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3068 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3069 *--s = '$';
3070 break;
3071 }
02aa26ce
NT
3072
3073 /* string-change backslash escapes */
838f2281 3074 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3075 --s;
3076 break;
3077 }
ff3f963a
KW
3078 /* In a pattern, process \N, but skip any other backslash escapes.
3079 * This is because we don't want to translate an escape sequence
3080 * into a meta symbol and have the regex compiler use the meta
3081 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3082 * in spite of this, we do have to process \N here while the proper
3083 * charnames handler is in scope. See bugs #56444 and #62056.
3084 * There is a complication because \N in a pattern may also stand
3085 * for 'match a non-nl', and not mean a charname, in which case its
3086 * processing should be deferred to the regex compiler. To be a
3087 * charname it must be followed immediately by a '{', and not look
3088 * like \N followed by a curly quantifier, i.e., not something like
3089 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3090 * quantifier */
3091 else if (PL_lex_inpat
3092 && (*s != 'N'
3093 || s[1] != '{'
412f55bb 3094 || regcurly(s + 1)))
ff3f963a 3095 {
4d73d076 3096 *d++ = '\\';
cc74c5bd
ST
3097 goto default_action;
3098 }
02aa26ce 3099
79072805 3100 switch (*s) {
02aa26ce
NT
3101
3102 /* quoted - in transliterations */
79072805 3103 case '-':
3280af22 3104 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3105 *d++ = *s++;
3106 continue;
3107 }
924ba076 3108 /* FALLTHROUGH */
79072805 3109 default:
11b8faa4 3110 {
15861f94 3111 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3112 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3113 "Unrecognized escape \\%c passed through",
3114 *s);
11b8faa4 3115 /* default action is to copy the quoted character */
f9a63242 3116 goto default_action;
11b8faa4 3117 }
02aa26ce 3118
632403cc 3119 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3120 case '0': case '1': case '2': case '3':
3121 case '4': case '5': case '6': case '7':
ba210ebe 3122 {
5e0a247b 3123 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3124 STRLEN len = 3;
06972766 3125 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3126 s += len;
5e0a247b
KW
3127 if (len < 3 && s < send && isDIGIT(*s)
3128 && ckWARN(WARN_MISC))
3129 {
3130 Perl_warner(aTHX_ packWARN(WARN_MISC),
3131 "%s", form_short_octal_warning(s, len));
3132 }
ba210ebe 3133 }
012bcf8d 3134 goto NUM_ESCAPE_INSERT;
02aa26ce 3135
f0a2b745
KW
3136 /* eg. \o{24} indicates the octal constant \024 */
3137 case 'o':
3138 {
454155d9 3139 const char* error;
f0a2b745 3140
00ce5563 3141 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3142 TRUE, /* Output warning */
3143 FALSE, /* Not strict */
17896a85
KW
3144 TRUE, /* Output warnings for
3145 non-portables */
80f4111b 3146 UTF);
454155d9 3147 if (! valid) {
f0a2b745
KW
3148 yyerror(error);
3149 continue;
3150 }
3151 goto NUM_ESCAPE_INSERT;
3152 }
3153
77a135fe 3154 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3155 case 'x':
a0481293 3156 {
a0481293 3157 const char* error;
355860ce 3158
00ce5563 3159 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3160 TRUE, /* Output warning */
3161 FALSE, /* Not strict */
17896a85
KW
3162 TRUE, /* Output warnings for
3163 non-portables */
80f4111b 3164 UTF);
a0481293
KW
3165 if (! valid) {
3166 yyerror(error);
355860ce 3167 continue;
ba210ebe 3168 }
012bcf8d
GS
3169 }
3170
3171 NUM_ESCAPE_INSERT:
ff3f963a
KW
3172 /* Insert oct or hex escaped character. There will always be
3173 * enough room in sv since such escapes will be longer than any
3174 * UTF-8 sequence they can end up as, except if they force us
3175 * to recode the rest of the string into utf8 */
ba7cea30 3176
06972766 3177 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3178 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3179 if (!has_utf8 && uv > 255) {
77a135fe
KW
3180 /* Might need to recode whatever we have accumulated so
3181 * far if it contains any chars variant in utf8 or
3182 * utf-ebcdic. */
3183
3184 SvCUR_set(sv, d - SvPVX_const(sv));
3185 SvPOK_on(sv);
3186 *d = '\0';
77a135fe 3187 /* See Note on sizing above. */
7bf79863
KW
3188 sv_utf8_upgrade_flags_grow(sv,
3189 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3190 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3191 d = SvPVX(sv) + SvCUR(sv);
3192 has_utf8 = TRUE;
012bcf8d
GS
3193 }
3194
77a135fe 3195 if (has_utf8) {
c80e42f3 3196 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3197 if (PL_lex_inwhat == OP_TRANS &&
3198 PL_sublex_info.sub_op) {
3199 PL_sublex_info.sub_op->op_private |=
3200 (PL_lex_repl ? OPpTRANS_FROM_UTF
3201 : OPpTRANS_TO_UTF);
f9a63242 3202 }
e294cc5d
JH
3203#ifdef EBCDIC
3204 if (uv > 255 && !dorange)
3205 native_range = FALSE;
3206#endif
012bcf8d 3207 }
a0ed51b3 3208 else {
012bcf8d 3209 *d++ = (char)uv;
a0ed51b3 3210 }
012bcf8d
GS
3211 }
3212 else {
c4d5f83a 3213 *d++ = (char) uv;
a0ed51b3 3214 }
79072805 3215 continue;
02aa26ce 3216
4a2d328f 3217 case 'N':
ff3f963a
KW
3218 /* In a non-pattern \N must be a named character, like \N{LATIN
3219 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3220 * mean to match a non-newline. For non-patterns, named
3221 * characters are converted to their string equivalents. In
3222 * patterns, named characters are not converted to their
3223 * ultimate forms for the same reasons that other escapes
3224 * aren't. Instead, they are converted to the \N{U+...} form
3225 * to get the value from the charnames that is in effect right
3226 * now, while preserving the fact that it was a named character
3227 * so that the regex compiler knows this */
3228
ff3f963a
KW
3229 /* The structure of this section of code (besides checking for
3230 * errors and upgrading to utf8) is:
3231 * Further disambiguate between the two meanings of \N, and if
3232 * not a charname, go process it elsewhere
0a96133f
KW
3233 * If of form \N{U+...}, pass it through if a pattern;
3234 * otherwise convert to utf8
3235 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3236 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3237
3238 /* Here, s points to the 'N'; the test below is guaranteed to
3239 * succeed if we are being called on a pattern as we already
3240 * know from a test above that the next character is a '{'.
3241 * On a non-pattern \N must mean 'named sequence, which
3242 * requires braces */
3243 s++;
3244 if (*s != '{') {
3245 yyerror("Missing braces on \\N{}");
3246 continue;
3247 }
3248 s++;
3249
0a96133f 3250 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3251 if (! (e = strchr(s, '}'))) {
3252 if (! PL_lex_inpat) {
5777a3f7 3253 yyerror("Missing right brace on \\N{}");
0a96133f 3254 } else {
4407f1b8 3255 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3256 }
0a96133f 3257 continue;
ff3f963a 3258 }
cddc7ef4 3259
ff3f963a 3260 /* Here it looks like a named character */
cddc7ef4 3261
ff3f963a
KW
3262 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3263 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3264 | PERL_SCAN_DISALLOW_PREFIX;
3265 STRLEN len;
3266
3267 /* For \N{U+...}, the '...' is a unicode value even on
3268 * EBCDIC machines */
3269 s += 2; /* Skip to next char after the 'U+' */
3270 len = e - s;
3271 uv = grok_hex(s, &len, &flags, NULL);
3272 if (len == 0 || len != (STRLEN)(e - s)) {
3273 yyerror("Invalid hexadecimal number in \\N{U+...}");
3274 s = e + 1;
3275 continue;
3276 }
3277
3278 if (PL_lex_inpat) {
3279
e2a7e165
KW
3280 /* On non-EBCDIC platforms, pass through to the regex
3281 * compiler unchanged. The reason we evaluated the
3282 * number above is to make sure there wasn't a syntax
3283 * error. But on EBCDIC we convert to native so
3284 * downstream code can continue to assume it's native
3285 */
ff3f963a 3286 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3287#ifdef EBCDIC
3288 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3289 and the \0 */
3290 "\\N{U+%X}",
3291 (unsigned int) UNI_TO_NATIVE(uv));
3292#else
ff3f963a
KW
3293 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3294 d += e - s + 1;
e2a7e165 3295#endif
ff3f963a
KW
3296 }
3297 else { /* Not a pattern: convert the hex to string */
3298
3299 /* If destination is not in utf8, unconditionally
3300 * recode it to be so. This is because \N{} implies
3301 * Unicode semantics, and scalars have to be in utf8
3302 * to guarantee those semantics */
3303 if (! has_utf8) {
3304 SvCUR_set(sv, d - SvPVX_const(sv));
3305 SvPOK_on(sv);
3306 *d = '\0';
3307 /* See Note on sizing above. */
3308 sv_utf8_upgrade_flags_grow(
3309 sv,
3310 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3311 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3312 d = SvPVX(sv) + SvCUR(sv);
3313 has_utf8 = TRUE;
3314 }
3315
a46469e6 3316 /* Add the (Unicode) code point to the output. */
ff3f963a 3317 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3318 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3319 }
a46469e6
KW
3320 else {
3321 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3322 }
ff3f963a
KW
3323 }
3324 }
6f613c73
KW
3325 else /* Here is \N{NAME} but not \N{U+...}. */
3326 if ((res = get_and_check_backslash_N_name(s, e)))
3327 {
3328 STRLEN len;
3329 const char *str = SvPV_const(res, len);
3330 if (PL_lex_inpat) {
ff3f963a
KW
3331
3332 if (! len) { /* The name resolved to an empty string */
3333 Copy("\\N{}", d, 4, char);
3334 d += 4;
3335 }
3336 else {
3337 /* In order to not lose information for the regex
3338 * compiler, pass the result in the specially made
3339 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3340 * the code points in hex of each character
3341 * returned by charnames */
3342
3343 const char *str_end = str + len;
3b721c4f 3344 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3345
3346 if (! SvUTF8(res)) {
3347 /* For the non-UTF-8 case, we can determine the
3348 * exact length needed without having to parse
3349 * through the string. Each character takes up
3350 * 2 hex digits plus either a trailing dot or
3351 * the "}" */
3352 d = off + SvGROW(sv, off
3353 + 3 * len
3354 + 6 /* For the "\N{U+", and
3355 trailing NUL */
3356 + (STRLEN)(send - e));
3357 Copy("\\N{U+", d, 5, char);
3358 d += 5;
3359 while (str < str_end) {
3360 char hex_string[4];
e8549682 3361 int len =
51f14a05
JH
3362 my_snprintf(hex_string,
3363 sizeof(hex_string),
e8549682
JH
3364 "%02X.", (U8) *str);
3365 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3366 Copy(hex_string, d, 3, char);
3367 d += 3;
3368 str++;
3369 }
3370 d--; /* We will overwrite below the final
3371 dot with a right brace */
3372 }
3373 else {
1953db30
KW
3374 STRLEN char_length; /* cur char's byte length */
3375
3376 /* and the number of bytes after this is
3377 * translated into hex digits */
3378 STRLEN output_length;
3379
3380 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3381 * for max('U+', '.'); and 1 for NUL */
3382 char hex_string[2 * UTF8_MAXBYTES + 5];
3383
3384 /* Get the first character of the result. */
a46469e6 3385 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3386 len,
3387 &char_length,
3388 UTF8_ALLOW_ANYUV);
3389 /* Convert first code point to hex, including
a46469e6 3390 * the boiler plate before it. */
1953db30
KW
3391 output_length =
3392 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3393 "\\N{U+%X",
3394 (unsigned int) uv);
1953db30
KW
3395
3396 /* Make sure there is enough space to hold it */
3397 d = off + SvGROW(sv, off
3398 + output_length
3399 + (STRLEN)(send - e)
3400 + 2); /* '}' + NUL */
3401 /* And output it */
3402 Copy(hex_string, d, output_length, char);
3403 d += output_length;
3404
3405 /* For each subsequent character, append dot and
3406 * its ordinal in hex */
3407 while ((str += char_length) < str_end) {
3408 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3409 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3410 str_end - str,
3411 &char_length,
3412 UTF8_ALLOW_ANYUV);
3413 output_length =
3414 my_snprintf(hex_string,
a46469e6
KW
3415 sizeof(hex_string),
3416 ".%X",
3417 (unsigned int) uv);
1953db30
KW
3418
3419 d = off + SvGROW(sv, off
3420 + output_length
3421 + (STRLEN)(send - e)
3422 + 2); /* '}' + NUL */
3423 Copy(hex_string, d, output_length, char);
3424 d += output_length;
3425 }
94ca1619 3426 }
ff3f963a
KW
3427
3428 *d++ = '}'; /* Done. Add the trailing brace */
3429 }
3430 }
3431 else { /* Here, not in a pattern. Convert the name to a
3432 * string. */
3433
3434 /* If destination is not in utf8, unconditionally
3435 * recode it to be so. This is because \N{} implies
3436 * Unicode semantics, and scalars have to be in utf8
3437 * to guarantee those semantics */
3438 if (! has_utf8) {
3439 SvCUR_set(sv, d - SvPVX_const(sv));
3440 SvPOK_on(sv);
3441 *d = '\0';
3442 /* See Note on sizing above. */
3443 sv_utf8_upgrade_flags_grow(sv,
3444 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3445 len + (STRLEN)(send - s) + 1);
3446 d = SvPVX(sv) + SvCUR(sv);
3447 has_utf8 = TRUE;
3448 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3449
3450 /* See Note on sizing above. (NOTE: SvCUR() is not
3451 * set correctly here). */
3452 const STRLEN off = d - SvPVX_const(sv);
3453 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3454 }
7fc82458
KW
3455 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3456 sv_utf8_upgrade(res);
3457 str = SvPV_const(res, len);
3458 }
ff3f963a
KW
3459 Copy(str, d, len, char);
3460 d += len;
423cee85 3461 }
6f613c73 3462
423cee85 3463 SvREFCNT_dec(res);
cb233ae3 3464
cb233ae3 3465 } /* End \N{NAME} */
ff3f963a
KW
3466#ifdef EBCDIC
3467 if (!dorange)
3468 native_range = FALSE; /* \N{} is defined to be Unicode */
3469#endif
3470 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3471 continue;
3472
02aa26ce 3473 /* \c is a control character */
79072805
LW
3474 case 'c':
3475 s++;
961ce445 3476 if (s < send) {
421e43ba 3477 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3478 }
961ce445
RGS
3479 else {
3480 yyerror("Missing control char name in \\c");
3481 }
79072805 3482 continue;
02aa26ce
NT
3483
3484 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3485 case 'b':
4d73d076 3486 *d++ = '\b';
79072805
LW
3487 break;
3488 case 'n':
4d73d076 3489 *d++ = '\n';
79072805
LW
3490 break;
3491 case 'r':
4d73d076 3492 *d++ = '\r';
79072805
LW
3493 break;
3494 case 'f':
4d73d076 3495 *d++ = '\f';
79072805
LW
3496 break;
3497 case 't':
4d73d076 3498 *d++ = '\t';
79072805 3499 break;
34a3fe2a 3500 case 'e':
4d73d076 3501 *d++ = ASCII_TO_NATIVE('\033');
34a3fe2a
PP
3502 break;
3503 case 'a':
4d73d076 3504 *d++ = '\a';
79072805 3505 break;
02aa26ce
NT
3506 } /* end switch */
3507
79072805
LW
3508 s++;
3509 continue;
02aa26ce 3510 } /* end if (backslash) */
4c3a8340
ST
3511#ifdef EBCDIC
3512 else
3513 literal_endpoint++;
3514#endif
02aa26ce 3515
f9a63242 3516 default_action:
77a135fe
KW
3517 /* If we started with encoded form, or already know we want it,
3518 then encode the next character */
6f2d5cbc 3519 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3520 STRLEN len = 1;
77a135fe
KW
3521
3522
3523 /* One might think that it is wasted effort in the case of the
3524 * source being utf8 (this_utf8 == TRUE) to take the next character
3525 * in the source, convert it to an unsigned value, and then convert
3526 * it back again. But the source has not been validated here. The
3527 * routine that does the conversion checks for errors like
3528 * malformed utf8 */
3529
233ca360
KW
3530 const UV nextuv = (this_utf8)
3531 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3532 : (UV) ((U8) *s);
5aaebcb3 3533 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3534 if (!has_utf8) {
3535 SvCUR_set(sv, d - SvPVX_const(sv));
3536 SvPOK_on(sv);
3537 *d = '\0';
77a135fe 3538 /* See Note on sizing above. */
7bf79863
KW
3539 sv_utf8_upgrade_flags_grow(sv,
3540 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3541 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3542 d = SvPVX(sv) + SvCUR(sv);
3543 has_utf8 = TRUE;
3544 } else if (need > len) {
3545 /* encoded value larger than old, may need extra space (NOTE:
3546 * SvCUR() is not set correctly here). See Note on sizing
3547 * above. */
9d4ba2ae 3548 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3549 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3550 }
77a135fe
KW
3551 s += len;
3552
5f66b61c 3553 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3554#ifdef EBCDIC
3555 if (uv > 255 && !dorange)
3556 native_range = FALSE;
3557#endif
2b9d42f0
NIS
3558 }
3559 else {
5ff03569 3560 *d++ = *s++;
2b9d42f0 3561 }
02aa26ce
NT
3562 } /* while loop to process each character */
3563
3564 /* terminate the string and set up the sv */
79072805 3565 *d = '\0';
95a20fc0 3566 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3567 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3568 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3569 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3570
79072805 3571 SvPOK_on(sv);
9f4817db 3572 if (PL_encoding && !has_utf8) {
d0063567
DK
3573 sv_recode_to_utf8(sv, PL_encoding);
3574 if (SvUTF8(sv))
3575 has_utf8 = TRUE;
9f4817db 3576 }
2b9d42f0 3577 if (has_utf8) {
7e2040f0 3578 SvUTF8_on(sv);
2b9d42f0 3579 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3580 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3581 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3582 }
3583 }
79072805 3584
02aa26ce 3585 /* shrink the sv if we allocated more than we used */
79072805 3586 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3587 SvPV_shrink_to_cur(sv);
79072805 3588 }
02aa26ce 3589
6154021b 3590 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3591 if (s > start) {
3592 char *s2 = start;
ffdb8b16
FC
3593 for (; s2 < s; s2++) {
3594 if (*s2 == '\n')
3595 COPLINE_INC_WITH_HERELINES;
3596 }
b899e89d 3597 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3598 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3599 && ! PL_parser->lex_re_reparsing)
3600 {
eb0d8d16
NC
3601 const char *const key = PL_lex_inpat ? "qr" : "q";
3602 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3603 const char *type;
3604 STRLEN typelen;
3605
3606 if (PL_lex_inwhat == OP_TRANS) {
3607 type = "tr";
3608 typelen = 2;
3609 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3610 type = "s";
3611 typelen = 1;
9da1dd8f
DM
3612 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3613 type = "q";
3614 typelen = 1;
eb0d8d16
NC
3615 } else {
3616 type = "qq";
3617 typelen = 2;
3618 }
3619
3620 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3621 type, typelen);
3622 }
6154021b 3623 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3624 }
3625 LEAVE_with_name("scan_const");
79072805
LW
3626 return s;
3627}
3628
ffb4593c
NT
3629/* S_intuit_more
3630 * Returns TRUE if there's more to the expression (e.g., a subscript),
3631 * FALSE otherwise.
ffb4593c
NT
3632 *
3633 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3634 *
3635 * ->[ and ->{ return TRUE
ff25e5db 3636 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3637 * { and [ outside a pattern are always subscripts, so return TRUE
3638 * if we're outside a pattern and it's not { or [, then return FALSE
3639 * if we're in a pattern and the first char is a {
3640 * {4,5} (any digits around the comma) returns FALSE
3641 * if we're in a pattern and the first char is a [
3642 * [] returns FALSE
3643 * [SOMETHING] has a funky algorithm to decide whether it's a
3644 * character class or not. It has to deal with things like
3645 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3646 * anything else returns TRUE
3647 */
3648
9cbb5ea2
GS
3649/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3650
76e3520e 3651STATIC int
5aaab254 3652S_intuit_more(pTHX_ char *s)
79072805 3653{
7918f24d
NC
3654 PERL_ARGS_ASSERT_INTUIT_MORE;
3655
3280af22 3656 if (PL_lex_brackets)
79072805
LW
3657 return TRUE;
3658 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3659 return TRUE;
cc624add
FC
3660 if (*s == '-' && s[1] == '>'
3661 && FEATURE_POSTDEREF_QQ_IS_ENABLED
ff25e5db