This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #46947] Parse method-BLOCK arguments as a term
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
57#define PL_lex_expect (PL_parser->lex_expect)
58#define PL_lex_formbrack (PL_parser->lex_formbrack)
59#define PL_lex_inpat (PL_parser->lex_inpat)
60#define PL_lex_inwhat (PL_parser->lex_inwhat)
61#define PL_lex_op (PL_parser->lex_op)
62#define PL_lex_repl (PL_parser->lex_repl)
63#define PL_lex_starts (PL_parser->lex_starts)
64#define PL_lex_stuff (PL_parser->lex_stuff)
65#define PL_multi_start (PL_parser->multi_start)
66#define PL_multi_open (PL_parser->multi_open)
67#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7 89
fb205e7a
DM
90# define PL_nexttoke (PL_parser->nexttoke)
91# define PL_nexttype (PL_parser->nexttype)
92# define PL_nextval (PL_parser->nextval)
199e78b7 93
a1894d81 94static const char* const ident_too_long = "Identifier too long";
8903cb82 95
9ded7720 96# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 97
a7aaec61
Z
98#define XENUMMASK 0x3f
99#define XFAKEEOF 0x40
100#define XFAKEBRACK 0x80
9059aa12 101
39e02b42
JH
102#ifdef USE_UTF8_SCRIPTS
103# define UTF (!IN_BYTES)
2b9d42f0 104#else
802a15e9 105# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 106#endif
a0ed51b3 107
b1fc3636
CJ
108/* The maximum number of characters preceding the unrecognized one to display */
109#define UNRECOGNIZED_PRECEDE_COUNT 10
110
61f0cdd9 111/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
112 * 1999-02-27 mjd-perl-patch@plover.com */
113#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
114
14bd96d0 115#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 116
ffb4593c
NT
117/* LEX_* are values for PL_lex_state, the state of the lexer.
118 * They are arranged oddly so that the guard on the switch statement
79072805 119 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
120 *
121 * These values refer to the various states within a sublex parse,
122 * i.e. within a double quotish string
79072805
LW
123 */
124
fb73857a
PP
125/* #define LEX_NOTPARSING 11 is done in perl.h. */
126
b6007c36
DM
127#define LEX_NORMAL 10 /* normal code (ie not within "...") */
128#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
129#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
130#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
131#define LEX_INTERPSTART 6 /* expecting the start of a $var */
132
133 /* at end of code, eg "$x" followed by: */
134#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
135#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
136
137#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
138 string or after \E, $foo, etc */
139#define LEX_INTERPCONST 2 /* NOT USED */
140#define LEX_FORMLINE 1 /* expecting a format line */
141#define LEX_KNOWNEXT 0 /* next token known; just return it */
142
79072805 143
bbf60fe6 144#ifdef DEBUGGING
27da23d5 145static const char* const lex_state_names[] = {
bbf60fe6
DM
146 "KNOWNEXT",
147 "FORMLINE",
148 "INTERPCONST",
149 "INTERPCONCAT",
150 "INTERPENDMAYBE",
151 "INTERPEND",
152 "INTERPSTART",
153 "INTERPPUSH",
154 "INTERPCASEMOD",
155 "INTERPNORMAL",
156 "NORMAL"
157};
158#endif
159
79072805 160#include "keywords.h"
fe14fcc3 161
ffb4593c
NT
162/* CLINE is a macro that ensures PL_copline has a sane value */
163
57843af0 164#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 165
29595ff2
NC
166# define SKIPSPACE0(s) skipspace(s)
167# define SKIPSPACE1(s) skipspace(s)
168# define SKIPSPACE2(s,tsv) skipspace(s)
169# define PEEKSPACE(s) skipspace(s)
29595ff2 170
ffb4593c
NT
171/*
172 * Convenience functions to return different tokens and prime the
9cbb5ea2 173 * lexer for the next token. They all take an argument.
ffb4593c
NT
174 *
175 * TOKEN : generic token (used for '(', DOLSHARP, etc)
176 * OPERATOR : generic operator
177 * AOPERATOR : assignment operator
178 * PREBLOCK : beginning the block after an if, while, foreach, ...
179 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
180 * PREREF : *EXPR where EXPR is not a simple identifier
181 * TERM : expression term
89f35911 182 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
183 * LOOPX : loop exiting command (goto, last, dump, etc)
184 * FTST : file test operator
185 * FUN0 : zero-argument function
7eb971ee 186 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 187 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
188 * BOop : bitwise or or xor
189 * BAop : bitwise and
190 * SHop : shift operator
191 * PWop : power operator
9cbb5ea2 192 * PMop : pattern-matching operator
ffb4593c
NT
193 * Aop : addition-level operator
194 * Mop : multiplication-level operator
195 * Eop : equality-testing operator
e5edeb50 196 * Rop : relational operator <= != gt
ffb4593c
NT
197 *
198 * Also see LOP and lop() below.
199 */
200
998054bd 201#ifdef DEBUGGING /* Serve -DT. */
704d4215 202# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 203#else
bbf60fe6 204# define REPORT(retval) (retval)
998054bd
SC
205#endif
206
bbf60fe6
DM
207#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
208#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
209#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
210#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
211#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
212#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
213#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 214#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
6154021b
RGS
215#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
216#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
217#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 218#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
219#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
220#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
221#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
222#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
223#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
224#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
225#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
226#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
227#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
228#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 229
a687059c
LW
230/* This bit of chicanery makes a unary function followed by
231 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
232 * The UNIDOR macro is for unary functions that can be followed by the //
233 * operator (such as C<shift // 0>).
a687059c 234 */
d68ce4ac 235#define UNI3(f,x,have_x) { \
6154021b 236 pl_yylval.ival = f; \
d68ce4ac 237 if (have_x) PL_expect = x; \
376fcdbf
AL
238 PL_bufptr = s; \
239 PL_last_uni = PL_oldbufptr; \
240 PL_last_lop_op = f; \
241 if (*s == '(') \
242 return REPORT( (int)FUNC1 ); \
29595ff2 243 s = PEEKSPACE(s); \
376fcdbf
AL
244 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
245 }
d68ce4ac
FC
246#define UNI(f) UNI3(f,XTERM,1)
247#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
248#define UNIPROTO(f,optional) { \
249 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
250 OPERATOR(f); \
251 }
a687059c 252
d68ce4ac 253#define UNIBRACK(f) UNI3(f,0,0)
79072805 254
9f68db38 255/* grandfather return to old style */
78cdf107
Z
256#define OLDLOP(f) \
257 do { \
258 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
259 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
260 pl_yylval.ival = (f); \
261 PL_expect = XTERM; \
262 PL_bufptr = s; \
263 return (int)LSTOP; \
264 } while(0)
79072805 265
83944c01
FC
266#define COPLINE_INC_WITH_HERELINES \
267 STMT_START { \
268 CopLINE_inc(PL_curcop); \
851b527a
FC
269 if (PL_parser->herelines) \
270 CopLINE(PL_curcop) += PL_parser->herelines, \
271 PL_parser->herelines = 0; \
83944c01 272 } STMT_END
ffdb8b16
FC
273/* Called after scan_str to update CopLINE(PL_curcop), but only when there
274 * is no sublex_push to follow. */
275#define COPLINE_SET_FROM_MULTI_END \
276 STMT_START { \
277 CopLINE_set(PL_curcop, PL_multi_end); \
278 if (PL_multi_end != PL_multi_start) \
851b527a 279 PL_parser->herelines = 0; \
ffdb8b16 280 } STMT_END
83944c01
FC
281
282
8fa7f367
JH
283#ifdef DEBUGGING
284
6154021b 285/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
286enum token_type {
287 TOKENTYPE_NONE,
288 TOKENTYPE_IVAL,
6154021b 289 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 290 TOKENTYPE_PVAL,
aeaef349 291 TOKENTYPE_OPVAL
bbf60fe6
DM
292};
293
6d4a66ac
NC
294static struct debug_tokens {
295 const int token;
296 enum token_type type;
297 const char *name;
298} const debug_tokens[] =
9041c2e3 299{
bbf60fe6
DM
300 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
301 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
302 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
303 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
304 { ARROW, TOKENTYPE_NONE, "ARROW" },
305 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
306 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
307 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
308 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
309 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 310 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
311 { DO, TOKENTYPE_NONE, "DO" },
312 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
313 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
314 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
315 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
316 { ELSE, TOKENTYPE_NONE, "ELSE" },
317 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
318 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
319 { FOR, TOKENTYPE_IVAL, "FOR" },
320 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
321 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
322 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
323 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
324 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 325 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
326 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
327 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
328 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 329 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
330 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
331 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 332 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
333 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
334 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
335 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
336 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
337 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
338 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
339 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
340 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
341 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
342 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
343 { OROP, TOKENTYPE_IVAL, "OROP" },
344 { OROR, TOKENTYPE_NONE, "OROR" },
345 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
346 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
347 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 348 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 349 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
350 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
351 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
352 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
353 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
354 { PREINC, TOKENTYPE_NONE, "PREINC" },
355 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 356 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
357 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
358 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 359 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
360 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
361 { SUB, TOKENTYPE_NONE, "SUB" },
362 { THING, TOKENTYPE_OPVAL, "THING" },
363 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
364 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
365 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
366 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
367 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
368 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 369 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
370 { WHILE, TOKENTYPE_IVAL, "WHILE" },
371 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 372 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 373 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
374};
375
6154021b 376/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 377
bbf60fe6 378STATIC int
704d4215 379S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 380{
7918f24d
NC
381 PERL_ARGS_ASSERT_TOKEREPORT;
382
bbf60fe6 383 if (DEBUG_T_TEST) {
bd61b366 384 const char *name = NULL;
bbf60fe6 385 enum token_type type = TOKENTYPE_NONE;
f54cb97a 386 const struct debug_tokens *p;
396482e1 387 SV* const report = newSVpvs("<== ");
bbf60fe6 388
f54cb97a 389 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
390 if (p->token == (int)rv) {
391 name = p->name;
392 type = p->type;
393 break;
394 }
395 }
396 if (name)
54667de8 397 Perl_sv_catpv(aTHX_ report, name);
74736ae6 398 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 399 {
bbf60fe6 400 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
401 if ((char)rv == 'p')
402 sv_catpvs(report, " (pending identifier)");
403 }
bbf60fe6 404 else if (!rv)
396482e1 405 sv_catpvs(report, "EOF");
bbf60fe6
DM
406 else
407 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
408 switch (type) {
409 case TOKENTYPE_NONE:
bbf60fe6
DM
410 break;
411 case TOKENTYPE_IVAL:
704d4215 412 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
413 break;
414 case TOKENTYPE_OPNUM:
415 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 416 PL_op_name[lvalp->ival]);
bbf60fe6
DM
417 break;
418 case TOKENTYPE_PVAL:
704d4215 419 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
420 break;
421 case TOKENTYPE_OPVAL:
704d4215 422 if (lvalp->opval) {
401441c0 423 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
424 PL_op_name[lvalp->opval->op_type]);
425 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 426 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 427 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
428 }
429
430 }
401441c0 431 else
396482e1 432 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
433 break;
434 }
b6007c36 435 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
436 };
437 return (int)rv;
998054bd
SC
438}
439
b6007c36
DM
440
441/* print the buffer with suitable escapes */
442
443STATIC void
15f169a1 444S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 445{
396482e1 446 SV* const tmp = newSVpvs("");
7918f24d
NC
447
448 PERL_ARGS_ASSERT_PRINTBUF;
449
5d37acd6 450 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 451 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 452 GCC_DIAG_RESTORE;
b6007c36
DM
453 SvREFCNT_dec(tmp);
454}
455
8fa7f367
JH
456#endif
457
8290c323
NC
458static int
459S_deprecate_commaless_var_list(pTHX) {
460 PL_expect = XTERM;
461 deprecate("comma-less variable list");
462 return REPORT(','); /* grandfather non-comma-format format */
463}
464
ffb4593c
NT
465/*
466 * S_ao
467 *
c963b151
BD
468 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
469 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
470 */
471
76e3520e 472STATIC int
cea2e8a9 473S_ao(pTHX_ int toketype)
a0d0e21e 474{
3280af22
NIS
475 if (*PL_bufptr == '=') {
476 PL_bufptr++;
a0d0e21e 477 if (toketype == ANDAND)
6154021b 478 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 479 else if (toketype == OROR)
6154021b 480 pl_yylval.ival = OP_ORASSIGN;
c963b151 481 else if (toketype == DORDOR)
6154021b 482 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
483 toketype = ASSIGNOP;
484 }
485 return toketype;
486}
487
ffb4593c
NT
488/*
489 * S_no_op
490 * When Perl expects an operator and finds something else, no_op
491 * prints the warning. It always prints "<something> found where
492 * operator expected. It prints "Missing semicolon on previous line?"
493 * if the surprise occurs at the start of the line. "do you need to
494 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
495 * where the compiler doesn't know if foo is a method call or a function.
496 * It prints "Missing operator before end of line" if there's nothing
497 * after the missing operator, or "... before <...>" if there is something
498 * after the missing operator.
499 */
500
76e3520e 501STATIC void
15f169a1 502S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 503{
9d4ba2ae
AL
504 char * const oldbp = PL_bufptr;
505 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 506
7918f24d
NC
507 PERL_ARGS_ASSERT_NO_OP;
508
1189a94a
GS
509 if (!s)
510 s = oldbp;
07c798fb 511 else
1189a94a 512 PL_bufptr = s;
734ab321 513 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
514 if (ckWARN_d(WARN_SYNTAX)) {
515 if (is_first)
516 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
517 "\t(Missing semicolon on previous line?)\n");
518 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 519 const char *t;
8a2bca12 520 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 521 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 522 NOOP;
56da5a46
RGS
523 if (t < PL_bufptr && isSPACE(*t))
524 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
525 "\t(Do you need to predeclare %"UTF8f"?)\n",
526 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
527 }
528 else {
529 assert(s >= oldbp);
530 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
531 "\t(Missing operator before %"UTF8f"?)\n",
532 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 533 }
07c798fb 534 }
3280af22 535 PL_bufptr = oldbp;
8990e307
LW
536}
537
ffb4593c
NT
538/*
539 * S_missingterm
540 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 541 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
542 * If we're in a delimited string and the delimiter is a control
543 * character, it's reformatted into a two-char sequence like ^C.
544 * This is fatal.
545 */
546
76e3520e 547STATIC void
cea2e8a9 548S_missingterm(pTHX_ char *s)
8990e307
LW
549{
550 char tmpbuf[3];
551 char q;
552 if (s) {
9d4ba2ae 553 char * const nl = strrchr(s,'\n');
d2719217 554 if (nl)
8990e307
LW
555 *nl = '\0';
556 }
ca8b19a7 557 else if ((U8) PL_multi_close < 32) {
8990e307 558 *tmpbuf = '^';
585ec06d 559 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
560 tmpbuf[2] = '\0';
561 s = tmpbuf;
562 }
563 else {
eb160463 564 *tmpbuf = (char)PL_multi_close;
8990e307
LW
565 tmpbuf[1] = '\0';
566 s = tmpbuf;
567 }
568 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 569 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 570}
79072805 571
dd0ac2b9
FC
572#include "feature.h"
573
0d863452 574/*
0d863452
RH
575 * Check whether the named feature is enabled.
576 */
26ea9e12 577bool
3fff3427 578Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 579{
4a731d7b 580 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
581
582 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
583
584 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 585
26ea9e12
NC
586 if (namelen > MAX_FEATURE_LEN)
587 return FALSE;
3fff3427 588 memcpy(&he_name[8], name, namelen);
7d69d4a6 589
c8ca97b0
NC
590 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
591 REFCOUNTED_HE_EXISTS));
0d863452
RH
592}
593
ffb4593c 594/*
9cbb5ea2
GS
595 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
596 * utf16-to-utf8-reversed.
ffb4593c
NT
597 */
598
c39cd008
GS
599#ifdef PERL_CR_FILTER
600static void
601strip_return(SV *sv)
602{
eb578fdb
KW
603 const char *s = SvPVX_const(sv);
604 const char * const e = s + SvCUR(sv);
7918f24d
NC
605
606 PERL_ARGS_ASSERT_STRIP_RETURN;
607
c39cd008
GS
608 /* outer loop optimized to do nothing if there are no CR-LFs */
609 while (s < e) {
610 if (*s++ == '\r' && *s == '\n') {
611 /* hit a CR-LF, need to copy the rest */
eb578fdb 612 char *d = s - 1;
c39cd008
GS
613 *d++ = *s++;
614 while (s < e) {
615 if (*s == '\r' && s[1] == '\n')
616 s++;
617 *d++ = *s++;
618 }
619 SvCUR(sv) -= s - d;
620 return;
621 }
622 }
623}
a868473f 624
76e3520e 625STATIC I32
c39cd008 626S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 627{
f54cb97a 628 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
629 if (count > 0 && !maxlen)
630 strip_return(sv);
631 return count;
a868473f
NIS
632}
633#endif
634
ffb4593c 635/*
8eaa0acf
Z
636=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
637
638Creates and initialises a new lexer/parser state object, supplying
639a context in which to lex and parse from a new source of Perl code.
640A pointer to the new state object is placed in L</PL_parser>. An entry
641is made on the save stack so that upon unwinding the new state object
642will be destroyed and the former value of L</PL_parser> will be restored.
643Nothing else need be done to clean up the parsing context.
644
645The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
646non-null, provides a string (in SV form) containing code to be parsed.
647A copy of the string is made, so subsequent modification of I<line>
648does not affect parsing. I<rsfp>, if non-null, provides an input stream
649from which code will be read to be parsed. If both are non-null, the
650code in I<line> comes first and must consist of complete lines of input,
651and I<rsfp> supplies the remainder of the source.
652
e368b3bd
FC
653The I<flags> parameter is reserved for future use. Currently it is only
654used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
655
656=cut
657*/
ffb4593c 658
27fcb6ee 659/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
660 can share filters with the current parser.
661 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
662 caller, hence isn't owned by the parser, so shouldn't be closed on parser
663 destruction. This is used to handle the case of defaulting to reading the
664 script from the standard input because no filename was given on the command
665 line (without getting confused by situation where STDIN has been closed, so
666 the script handle is opened on fd 0) */
27fcb6ee 667
a0d0e21e 668void
8eaa0acf 669Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 670{
6ef55633 671 const char *s = NULL;
5486870f 672 yy_parser *parser, *oparser;
60d63348 673 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 674 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
675
676 /* create and initialise a parser */
677
199e78b7 678 Newxz(parser, 1, yy_parser);
5486870f 679 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
680 PL_parser = parser;
681
28ac2b49
Z
682 parser->stack = NULL;
683 parser->ps = NULL;
684 parser->stack_size = 0;
acdf0a21 685
e3abe207
DM
686 /* on scope exit, free this parser and restore any outer one */
687 SAVEPARSER(parser);
7c4baf47 688 parser->saved_curcop = PL_curcop;
e3abe207 689
acdf0a21 690 /* initialise lexer state */
8990e307 691
fb205e7a 692 parser->nexttoke = 0;
ca4cfd28 693 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 694 parser->copline = parser->preambling = NOLINE;
5afb0a62 695 parser->lex_state = LEX_NORMAL;
c2598295 696 parser->expect = XSTATE;
2f9285f8 697 parser->rsfp = rsfp;
27fcb6ee
FC
698 parser->rsfp_filters =
699 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
700 ? NULL
701 : MUTABLE_AV(SvREFCNT_inc(
702 oparser->rsfp_filters
703 ? oparser->rsfp_filters
704 : (oparser->rsfp_filters = newAV())
705 ));
2f9285f8 706
199e78b7
DM
707 Newx(parser->lex_brackstack, 120, char);
708 Newx(parser->lex_casestack, 12, char);
709 *parser->lex_casestack = '\0';
d794b522 710 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 711
10efb74f 712 if (line) {
0528fd32 713 STRLEN len;
10efb74f 714 s = SvPV_const(line, len);
0abcdfa4
FC
715 parser->linestr = flags & LEX_START_COPIED
716 ? SvREFCNT_inc_simple_NN(line)
717 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 718 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 719 } else {
bf1b738b 720 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 721 }
f06b5848
DM
722 parser->oldoldbufptr =
723 parser->oldbufptr =
724 parser->bufptr =
725 parser->linestart = SvPVX(parser->linestr);
726 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
727 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
728
729 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
730 |LEX_DONT_CLOSE_RSFP));
731 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
732 |LEX_DONT_CLOSE_RSFP));
737c24fc 733
60d63348 734 parser->in_pod = parser->filtered = 0;
79072805 735}
a687059c 736
e3abe207
DM
737
738/* delete a parser object */
739
740void
741Perl_parser_free(pTHX_ const yy_parser *parser)
742{
7918f24d
NC
743 PERL_ARGS_ASSERT_PARSER_FREE;
744
7c4baf47 745 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
746 SvREFCNT_dec(parser->linestr);
747
87606032 748 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 749 PerlIO_clearerr(parser->rsfp);
799361c3
SH
750 else if (parser->rsfp && (!parser->old_parser ||
751 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 752 PerlIO_close(parser->rsfp);
5486870f 753 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
754 SvREFCNT_dec(parser->lex_stuff);
755 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
756
757 Safefree(parser->lex_brackstack);
758 Safefree(parser->lex_casestack);
759 Safefree(parser->lex_shared);
760 PL_parser = parser->old_parser;
761 Safefree(parser);
762}
763
764void
765Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
766{
3ac7ff8f 767 I32 nexttoke = parser->nexttoke;
3ac7ff8f 768 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 769 while (nexttoke--) {
3ac7ff8f
FC
770 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
771 && parser->nextval[nexttoke].opval
772 && parser->nextval[nexttoke].opval->op_slabbed
773 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 774 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
775 parser->nextval[nexttoke].opval = NULL;
776 }
3ce3dcd9 777 }
e3abe207
DM
778}
779
780
ffb4593c 781/*
f0e67a1d
Z
782=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
783
784Buffer scalar containing the chunk currently under consideration of the
785text currently being lexed. This is always a plain string scalar (for
786which C<SvPOK> is true). It is not intended to be used as a scalar by
787normal scalar means; instead refer to the buffer directly by the pointer
788variables described below.
789
790The lexer maintains various C<char*> pointers to things in the
791C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
792reallocated, all of these pointers must be updated. Don't attempt to
793do this manually, but rather use L</lex_grow_linestr> if you need to
794reallocate the buffer.
795
796The content of the text chunk in the buffer is commonly exactly one
797complete line of input, up to and including a newline terminator,
798but there are situations where it is otherwise. The octets of the
799buffer may be intended to be interpreted as either UTF-8 or Latin-1.
800The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
801flag on this scalar, which may disagree with it.
802
803For direct examination of the buffer, the variable
804L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
805lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
806of these pointers is usually preferable to examination of the scalar
807through normal scalar means.
808
809=for apidoc AmxU|char *|PL_parser-E<gt>bufend
810
811Direct pointer to the end of the chunk of text currently being lexed, the
812end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 813+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
814always located at the end of the buffer, and does not count as part of
815the buffer's contents.
816
817=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
818
819Points to the current position of lexing inside the lexer buffer.
820Characters around this point may be freely examined, within
821the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
822L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
823interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
824
825Lexing code (whether in the Perl core or not) moves this pointer past
826the characters that it consumes. It is also expected to perform some
827bookkeeping whenever a newline character is consumed. This movement
828can be more conveniently performed by the function L</lex_read_to>,
829which handles newlines appropriately.
830
831Interpretation of the buffer's octets can be abstracted out by
832using the slightly higher-level functions L</lex_peek_unichar> and
833L</lex_read_unichar>.
834
835=for apidoc AmxU|char *|PL_parser-E<gt>linestart
836
837Points to the start of the current line inside the lexer buffer.
838This is useful for indicating at which column an error occurred, and
839not much else. This must be updated by any lexing code that consumes
840a newline; the function L</lex_read_to> handles this detail.
841
842=cut
843*/
844
845/*
846=for apidoc Amx|bool|lex_bufutf8
847
848Indicates whether the octets in the lexer buffer
849(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
850of Unicode characters. If not, they should be interpreted as Latin-1
851characters. This is analogous to the C<SvUTF8> flag for scalars.
852
853In UTF-8 mode, it is not guaranteed that the lexer buffer actually
854contains valid UTF-8. Lexing code must be robust in the face of invalid
855encoding.
856
857The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
858is significant, but not the whole story regarding the input character
859encoding. Normally, when a file is being read, the scalar contains octets
860and its C<SvUTF8> flag is off, but the octets should be interpreted as
861UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
862however, the scalar may have the C<SvUTF8> flag on, and in this case its
863octets should be interpreted as UTF-8 unless the C<use bytes> pragma
864is in effect. This logic may change in the future; use this function
865instead of implementing the logic yourself.
866
867=cut
868*/
869
870bool
871Perl_lex_bufutf8(pTHX)
872{
873 return UTF;
874}
875
876/*
877=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
878
879Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 880at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
881pointer to the reallocated buffer. This is necessary before making
882any direct modification of the buffer that would increase its length.
883L</lex_stuff_pvn> provides a more convenient way to insert text into
884the buffer.
885
886Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
887this function updates all of the lexer's variables that point directly
888into the buffer.
889
890=cut
891*/
892
893char *
894Perl_lex_grow_linestr(pTHX_ STRLEN len)
895{
896 SV *linestr;
897 char *buf;
898 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 899 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
900 linestr = PL_parser->linestr;
901 buf = SvPVX(linestr);
902 if (len <= SvLEN(linestr))
903 return buf;
904 bufend_pos = PL_parser->bufend - buf;
905 bufptr_pos = PL_parser->bufptr - buf;
906 oldbufptr_pos = PL_parser->oldbufptr - buf;
907 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
908 linestart_pos = PL_parser->linestart - buf;
909 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
910 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
911 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
912 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 913
f0e67a1d 914 buf = sv_grow(linestr, len);
c7641931 915
f0e67a1d
Z
916 PL_parser->bufend = buf + bufend_pos;
917 PL_parser->bufptr = buf + bufptr_pos;
918 PL_parser->oldbufptr = buf + oldbufptr_pos;
919 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
920 PL_parser->linestart = buf + linestart_pos;
921 if (PL_parser->last_uni)
922 PL_parser->last_uni = buf + last_uni_pos;
923 if (PL_parser->last_lop)
924 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
925 if (PL_parser->lex_shared->re_eval_start)
926 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
927 return buf;
928}
929
930/*
83aa740e 931=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
932
933Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
934immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
935reallocating the buffer if necessary. This means that lexing code that
936runs later will see the characters as if they had appeared in the input.
937It is not recommended to do this as part of normal parsing, and most
938uses of this facility run the risk of the inserted characters being
939interpreted in an unintended manner.
940
941The string to be inserted is represented by I<len> octets starting
942at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
943according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
944The characters are recoded for the lexer buffer, according to how the
945buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 946to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
947function is more convenient.
948
949=cut
950*/
951
952void
83aa740e 953Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 954{
749123ff 955 dVAR;
f0e67a1d
Z
956 char *bufptr;
957 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
958 if (flags & ~(LEX_STUFF_UTF8))
959 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
960 if (UTF) {
961 if (flags & LEX_STUFF_UTF8) {
962 goto plain_copy;
963 } else {
54d004e8 964 STRLEN highhalf = 0; /* Count of variants */
83aa740e 965 const char *p, *e = pv+len;
54d004e8
KW
966 for (p = pv; p != e; p++) {
967 if (! UTF8_IS_INVARIANT(*p)) {
968 highhalf++;
969 }
970 }
f0e67a1d
Z
971 if (!highhalf)
972 goto plain_copy;
973 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
974 bufptr = PL_parser->bufptr;
975 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
976 SvCUR_set(PL_parser->linestr,
977 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
978 PL_parser->bufend += len+highhalf;
979 for (p = pv; p != e; p++) {
980 U8 c = (U8)*p;
54d004e8
KW
981 if (! UTF8_IS_INVARIANT(c)) {
982 *bufptr++ = UTF8_TWO_BYTE_HI(c);
983 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
984 } else {
985 *bufptr++ = (char)c;
986 }
987 }
988 }
989 } else {
990 if (flags & LEX_STUFF_UTF8) {
991 STRLEN highhalf = 0;
83aa740e 992 const char *p, *e = pv+len;
f0e67a1d
Z
993 for (p = pv; p != e; p++) {
994 U8 c = (U8)*p;
54d004e8 995 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
996 Perl_croak(aTHX_ "Lexing code attempted to stuff "
997 "non-Latin-1 character into Latin-1 input");
54d004e8 998 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
999 p++;
1000 highhalf++;
54d004e8 1001 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1002 /* malformed UTF-8 */
1003 ENTER;
1004 SAVESPTR(PL_warnhook);
1005 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1006 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1007 LEAVE;
1008 }
1009 }
1010 if (!highhalf)
1011 goto plain_copy;
1012 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1013 bufptr = PL_parser->bufptr;
1014 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1015 SvCUR_set(PL_parser->linestr,
1016 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1017 PL_parser->bufend += len-highhalf;
54d004e8
KW
1018 p = pv;
1019 while (p < e) {
1020 if (UTF8_IS_INVARIANT(*p)) {
1021 *bufptr++ = *p;
1022 p++;
f0e67a1d 1023 }
54d004e8
KW
1024 else {
1025 assert(p < e -1 );
94bb8c36 1026 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1027 p += 2;
1028 }
f0e67a1d
Z
1029 }
1030 } else {
54d004e8 1031 plain_copy:
f0e67a1d
Z
1032 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1033 bufptr = PL_parser->bufptr;
1034 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1035 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1036 PL_parser->bufend += len;
1037 Copy(pv, bufptr, len, char);
1038 }
1039 }
1040}
1041
1042/*
9dcc53ea
Z
1043=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1044
1045Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1046immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1047reallocating the buffer if necessary. This means that lexing code that
1048runs later will see the characters as if they had appeared in the input.
1049It is not recommended to do this as part of normal parsing, and most
1050uses of this facility run the risk of the inserted characters being
1051interpreted in an unintended manner.
1052
1053The string to be inserted is represented by octets starting at I<pv>
1054and continuing to the first nul. These octets are interpreted as either
1055UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1056in I<flags>. The characters are recoded for the lexer buffer, according
1057to how the buffer is currently being interpreted (L</lex_bufutf8>).
1058If it is not convenient to nul-terminate a string to be inserted, the
1059L</lex_stuff_pvn> function is more appropriate.
1060
1061=cut
1062*/
1063
1064void
1065Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1066{
1067 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1068 lex_stuff_pvn(pv, strlen(pv), flags);
1069}
1070
1071/*
f0e67a1d
Z
1072=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1073
1074Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1075immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1076reallocating the buffer if necessary. This means that lexing code that
1077runs later will see the characters as if they had appeared in the input.
1078It is not recommended to do this as part of normal parsing, and most
1079uses of this facility run the risk of the inserted characters being
1080interpreted in an unintended manner.
1081
1082The string to be inserted is the string value of I<sv>. The characters
1083are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1084being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1085not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1086need to construct a scalar.
1087
1088=cut
1089*/
1090
1091void
1092Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1093{
1094 char *pv;
1095 STRLEN len;
1096 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1097 if (flags)
1098 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1099 pv = SvPV(sv, len);
1100 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1101}
1102
1103/*
1104=for apidoc Amx|void|lex_unstuff|char *ptr
1105
1106Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1107I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1108This hides the discarded text from any lexing code that runs later,
1109as if the text had never appeared.
1110
1111This is not the normal way to consume lexed text. For that, use
1112L</lex_read_to>.
1113
1114=cut
1115*/
1116
1117void
1118Perl_lex_unstuff(pTHX_ char *ptr)
1119{
1120 char *buf, *bufend;
1121 STRLEN unstuff_len;
1122 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1123 buf = PL_parser->bufptr;
1124 if (ptr < buf)
1125 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1126 if (ptr == buf)
1127 return;
1128 bufend = PL_parser->bufend;
1129 if (ptr > bufend)
1130 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1131 unstuff_len = ptr - buf;
1132 Move(ptr, buf, bufend+1-ptr, char);
1133 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1134 PL_parser->bufend = bufend - unstuff_len;
1135}
1136
1137/*
1138=for apidoc Amx|void|lex_read_to|char *ptr
1139
1140Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1141to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1142performing the correct bookkeeping whenever a newline character is passed.
1143This is the normal way to consume lexed text.
1144
1145Interpretation of the buffer's octets can be abstracted out by
1146using the slightly higher-level functions L</lex_peek_unichar> and
1147L</lex_read_unichar>.
1148
1149=cut
1150*/
1151
1152void
1153Perl_lex_read_to(pTHX_ char *ptr)
1154{
1155 char *s;
1156 PERL_ARGS_ASSERT_LEX_READ_TO;
1157 s = PL_parser->bufptr;
1158 if (ptr < s || ptr > PL_parser->bufend)
1159 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1160 for (; s != ptr; s++)
1161 if (*s == '\n') {
83944c01 1162 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1163 PL_parser->linestart = s+1;
1164 }
1165 PL_parser->bufptr = ptr;
1166}
1167
1168/*
1169=for apidoc Amx|void|lex_discard_to|char *ptr
1170
1171Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1172up to I<ptr>. The remaining content of the buffer will be moved, and
1173all pointers into the buffer updated appropriately. I<ptr> must not
1174be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1175it is not permitted to discard text that has yet to be lexed.
1176
1177Normally it is not necessarily to do this directly, because it suffices to
1178use the implicit discarding behaviour of L</lex_next_chunk> and things
1179based on it. However, if a token stretches across multiple lines,
1f317c95 1180and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1181that purpose, then after completion of the token it would be wise to
1182explicitly discard the now-unneeded earlier lines, to avoid future
1183multi-line tokens growing the buffer without bound.
1184
1185=cut
1186*/
1187
1188void
1189Perl_lex_discard_to(pTHX_ char *ptr)
1190{
1191 char *buf;
1192 STRLEN discard_len;
1193 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1194 buf = SvPVX(PL_parser->linestr);
1195 if (ptr < buf)
1196 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1197 if (ptr == buf)
1198 return;
1199 if (ptr > PL_parser->bufptr)
1200 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1201 discard_len = ptr - buf;
1202 if (PL_parser->oldbufptr < ptr)
1203 PL_parser->oldbufptr = ptr;
1204 if (PL_parser->oldoldbufptr < ptr)
1205 PL_parser->oldoldbufptr = ptr;
1206 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1207 PL_parser->last_uni = NULL;
1208 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1209 PL_parser->last_lop = NULL;
1210 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1211 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1212 PL_parser->bufend -= discard_len;
1213 PL_parser->bufptr -= discard_len;
1214 PL_parser->oldbufptr -= discard_len;
1215 PL_parser->oldoldbufptr -= discard_len;
1216 if (PL_parser->last_uni)
1217 PL_parser->last_uni -= discard_len;
1218 if (PL_parser->last_lop)
1219 PL_parser->last_lop -= discard_len;
1220}
1221
1222/*
1223=for apidoc Amx|bool|lex_next_chunk|U32 flags
1224
1225Reads in the next chunk of text to be lexed, appending it to
1226L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1227looked to the end of the current chunk and wants to know more. It is
1228usual, but not necessary, for lexing to have consumed the entirety of
1229the current chunk at this time.
1230
1231If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1232chunk (i.e., the current chunk has been entirely consumed), normally the
1233current chunk will be discarded at the same time that the new chunk is
1234read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1235will not be discarded. If the current chunk has not been entirely
1236consumed, then it will not be discarded regardless of the flag.
1237
1238Returns true if some new text was added to the buffer, or false if the
1239buffer has reached the end of the input text.
1240
1241=cut
1242*/
1243
1244#define LEX_FAKE_EOF 0x80000000
112d1284 1245#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1246
1247bool
1248Perl_lex_next_chunk(pTHX_ U32 flags)
1249{
1250 SV *linestr;
1251 char *buf;
1252 STRLEN old_bufend_pos, new_bufend_pos;
1253 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1254 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1255 bool got_some_for_debugger = 0;
f0e67a1d 1256 bool got_some;
112d1284 1257 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1258 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1259 linestr = PL_parser->linestr;
1260 buf = SvPVX(linestr);
1261 if (!(flags & LEX_KEEP_PREVIOUS) &&
1262 PL_parser->bufptr == PL_parser->bufend) {
1263 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1264 linestart_pos = 0;
1265 if (PL_parser->last_uni != PL_parser->bufend)
1266 PL_parser->last_uni = NULL;
1267 if (PL_parser->last_lop != PL_parser->bufend)
1268 PL_parser->last_lop = NULL;
1269 last_uni_pos = last_lop_pos = 0;
1270 *buf = 0;
1271 SvCUR(linestr) = 0;
1272 } else {
1273 old_bufend_pos = PL_parser->bufend - buf;
1274 bufptr_pos = PL_parser->bufptr - buf;
1275 oldbufptr_pos = PL_parser->oldbufptr - buf;
1276 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1277 linestart_pos = PL_parser->linestart - buf;
1278 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1279 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1280 }
1281 if (flags & LEX_FAKE_EOF) {
1282 goto eof;
60d63348 1283 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1284 got_some = 0;
1285 } else if (filter_gets(linestr, old_bufend_pos)) {
1286 got_some = 1;
17cc9359 1287 got_some_for_debugger = 1;
112d1284
FC
1288 } else if (flags & LEX_NO_TERM) {
1289 got_some = 0;
f0e67a1d 1290 } else {
580561a3
Z
1291 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1292 sv_setpvs(linestr, "");
f0e67a1d
Z
1293 eof:
1294 /* End of real input. Close filehandle (unless it was STDIN),
1295 * then add implicit termination.
1296 */
87606032 1297 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1298 PerlIO_clearerr(PL_parser->rsfp);
1299 else if (PL_parser->rsfp)
1300 (void)PerlIO_close(PL_parser->rsfp);
1301 PL_parser->rsfp = NULL;
60d63348 1302 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1303 if (!PL_in_eval && PL_minus_p) {
1304 sv_catpvs(linestr,
1305 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1306 PL_minus_n = PL_minus_p = 0;
1307 } else if (!PL_in_eval && PL_minus_n) {
1308 sv_catpvs(linestr, /*{*/";}");
1309 PL_minus_n = 0;
1310 } else
1311 sv_catpvs(linestr, ";");
1312 got_some = 1;
1313 }
1314 buf = SvPVX(linestr);
1315 new_bufend_pos = SvCUR(linestr);
1316 PL_parser->bufend = buf + new_bufend_pos;
1317 PL_parser->bufptr = buf + bufptr_pos;
1318 PL_parser->oldbufptr = buf + oldbufptr_pos;
1319 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1320 PL_parser->linestart = buf + linestart_pos;
1321 if (PL_parser->last_uni)
1322 PL_parser->last_uni = buf + last_uni_pos;
1323 if (PL_parser->last_lop)
1324 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1325 if (PL_parser->preambling != NOLINE) {
1326 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1327 PL_parser->preambling = NOLINE;
1328 }
17cc9359 1329 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1330 PL_curstash != PL_debstash) {
1331 /* debugger active and we're not compiling the debugger code,
1332 * so store the line into the debugger's array of lines
1333 */
1334 update_debugger_info(NULL, buf+old_bufend_pos,
1335 new_bufend_pos-old_bufend_pos);
1336 }
1337 return got_some;
1338}
1339
1340/*
1341=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1342
1343Looks ahead one (Unicode) character in the text currently being lexed.
1344Returns the codepoint (unsigned integer value) of the next character,
1345or -1 if lexing has reached the end of the input text. To consume the
1346peeked character, use L</lex_read_unichar>.
1347
1348If the next character is in (or extends into) the next chunk of input
1349text, the next chunk will be read in. Normally the current chunk will be
1350discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1351then the current chunk will not be discarded.
1352
1353If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1354is encountered, an exception is generated.
1355
1356=cut
1357*/
1358
1359I32
1360Perl_lex_peek_unichar(pTHX_ U32 flags)
1361{
749123ff 1362 dVAR;
f0e67a1d
Z
1363 char *s, *bufend;
1364 if (flags & ~(LEX_KEEP_PREVIOUS))
1365 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1366 s = PL_parser->bufptr;
1367 bufend = PL_parser->bufend;
1368 if (UTF) {
1369 U8 head;
1370 I32 unichar;
1371 STRLEN len, retlen;
1372 if (s == bufend) {
1373 if (!lex_next_chunk(flags))
1374 return -1;
1375 s = PL_parser->bufptr;
1376 bufend = PL_parser->bufend;
1377 }
1378 head = (U8)*s;
54d004e8 1379 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1380 return head;
54d004e8
KW
1381 if (UTF8_IS_START(head)) {
1382 len = UTF8SKIP(&head);
f0e67a1d
Z
1383 while ((STRLEN)(bufend-s) < len) {
1384 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1385 break;
1386 s = PL_parser->bufptr;
1387 bufend = PL_parser->bufend;
1388 }
1389 }
c80e42f3 1390 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1391 if (retlen == (STRLEN)-1) {
1392 /* malformed UTF-8 */
1393 ENTER;
1394 SAVESPTR(PL_warnhook);
1395 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1396 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1397 LEAVE;
1398 }
1399 return unichar;
1400 } else {
1401 if (s == bufend) {
1402 if (!lex_next_chunk(flags))
1403 return -1;
1404 s = PL_parser->bufptr;
1405 }
1406 return (U8)*s;
1407 }
1408}
1409
1410/*
1411=for apidoc Amx|I32|lex_read_unichar|U32 flags
1412
1413Reads the next (Unicode) character in the text currently being lexed.
1414Returns the codepoint (unsigned integer value) of the character read,
1415and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1416if lexing has reached the end of the input text. To non-destructively
1417examine the next character, use L</lex_peek_unichar> instead.
1418
1419If the next character is in (or extends into) the next chunk of input
1420text, the next chunk will be read in. Normally the current chunk will be
1421discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1422then the current chunk will not be discarded.
1423
1424If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1425is encountered, an exception is generated.
1426
1427=cut
1428*/
1429
1430I32
1431Perl_lex_read_unichar(pTHX_ U32 flags)
1432{
1433 I32 c;
1434 if (flags & ~(LEX_KEEP_PREVIOUS))
1435 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1436 c = lex_peek_unichar(flags);
1437 if (c != -1) {
1438 if (c == '\n')
83944c01 1439 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1440 if (UTF)
1441 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1442 else
1443 ++(PL_parser->bufptr);
f0e67a1d
Z
1444 }
1445 return c;
1446}
1447
1448/*
1449=for apidoc Amx|void|lex_read_space|U32 flags
1450
1451Reads optional spaces, in Perl style, in the text currently being
1452lexed. The spaces may include ordinary whitespace characters and
1453Perl-style comments. C<#line> directives are processed if encountered.
1454L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1455at a non-space character (or the end of the input text).
1456
1457If spaces extend into the next chunk of input text, the next chunk will
1458be read in. Normally the current chunk will be discarded at the same
1459time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1460chunk will not be discarded.
1461
1462=cut
1463*/
1464
21791330 1465#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1466#define LEX_NO_NEXT_CHUNK 0x80000000
1467
f0e67a1d
Z
1468void
1469Perl_lex_read_space(pTHX_ U32 flags)
1470{
1471 char *s, *bufend;
21791330 1472 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1473 bool need_incline = 0;
21791330 1474 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1475 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1476 s = PL_parser->bufptr;
1477 bufend = PL_parser->bufend;
1478 while (1) {
1479 char c = *s;
1480 if (c == '#') {
1481 do {
1482 c = *++s;
1483 } while (!(c == '\n' || (c == 0 && s == bufend)));
1484 } else if (c == '\n') {
1485 s++;
21791330
FC
1486 if (can_incline) {
1487 PL_parser->linestart = s;
1488 if (s == bufend)
1489 need_incline = 1;
1490 else
1491 incline(s);
1492 }
f0e67a1d
Z
1493 } else if (isSPACE(c)) {
1494 s++;
1495 } else if (c == 0 && s == bufend) {
1496 bool got_more;
65c68e17 1497 line_t l;
f0998909
Z
1498 if (flags & LEX_NO_NEXT_CHUNK)
1499 break;
f0e67a1d 1500 PL_parser->bufptr = s;
65c68e17 1501 l = CopLINE(PL_curcop);
851b527a 1502 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1503 got_more = lex_next_chunk(flags);
65c68e17 1504 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1505 s = PL_parser->bufptr;
1506 bufend = PL_parser->bufend;
1507 if (!got_more)
1508 break;
21791330 1509 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1510 incline(s);
1511 need_incline = 0;
1512 }
1513 } else {
1514 break;
1515 }
1516 }
f0e67a1d
Z
1517 PL_parser->bufptr = s;
1518}
1519
1520/*
fe788d6b
PM
1521
1522=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1523
1524This function performs syntax checking on a prototype, C<proto>.
1525If C<warn> is true, any illegal characters or mismatched brackets
1526will trigger illegalproto warnings, declaring that they were
1527detected in the prototype for C<name>.
1528
1529The return value is C<true> if this is a valid prototype, and
1530C<false> if it is not, regardless of whether C<warn> was C<true> or
1531C<false>.
1532
1533Note that C<NULL> is a valid C<proto> and will always return C<true>.
1534
1535=cut
1536
1537 */
1538
1539bool
1540Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1541{
1542 STRLEN len, origlen;
1543 char *p = proto ? SvPV(proto, len) : NULL;
1544 bool bad_proto = FALSE;
1545 bool in_brackets = FALSE;
1546 bool after_slash = FALSE;
1547 char greedy_proto = ' ';
1548 bool proto_after_greedy_proto = FALSE;
1549 bool must_be_last = FALSE;
1550 bool underscore = FALSE;
f791a21a 1551 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1552
1553 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1554
1555 if (!proto)
1556 return TRUE;
1557
1558 origlen = len;
1559 for (; len--; p++) {
1560 if (!isSPACE(*p)) {
1561 if (must_be_last)
1562 proto_after_greedy_proto = TRUE;
f791a21a
PM
1563 if (underscore) {
1564 if (!strchr(";@%", *p))
1565 bad_proto_after_underscore = TRUE;
1566 underscore = FALSE;
1567 }
fe788d6b
PM
1568 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1569 bad_proto = TRUE;
1570 }
1571 else {
fe788d6b
PM
1572 if (*p == '[')
1573 in_brackets = TRUE;
1574 else if (*p == ']')
1575 in_brackets = FALSE;
1576 else if ((*p == '@' || *p == '%') &&
1577 !after_slash &&
1578 !in_brackets ) {
1579 must_be_last = TRUE;
1580 greedy_proto = *p;
1581 }
1582 else if (*p == '_')
f791a21a 1583 underscore = TRUE;
fe788d6b
PM
1584 }
1585 if (*p == '\\')
1586 after_slash = TRUE;
1587 else
1588 after_slash = FALSE;
1589 }
1590 }
1591
1592 if (warn) {
b54d603d 1593 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1594 p -= origlen;
b54d603d
PM
1595 p = SvUTF8(proto)
1596 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1597 origlen, UNI_DISPLAY_ISPRINT)
1598 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1599
fe788d6b
PM
1600 if (proto_after_greedy_proto)
1601 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1602 "Prototype after '%c' for %"SVf" : %s",
1603 greedy_proto, SVfARG(name), p);
50278ed0
PM
1604 if (in_brackets)
1605 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1606 "Missing ']' in prototype for %"SVf" : %s",
1607 SVfARG(name), p);
b54d603d 1608 if (bad_proto)
fe788d6b 1609 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1610 "Illegal character in prototype for %"SVf" : %s",
1611 SVfARG(name), p);
1612 if (bad_proto_after_underscore)
1613 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1614 "Illegal character after '_' in prototype for %"SVf" : %s",
1615 SVfARG(name), p);
fe788d6b
PM
1616 }
1617
1618 return (! (proto_after_greedy_proto || bad_proto) );
1619}
1620
1621/*
ffb4593c
NT
1622 * S_incline
1623 * This subroutine has nothing to do with tilting, whether at windmills
1624 * or pinball tables. Its name is short for "increment line". It
57843af0 1625 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1626 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1627 * # line 500 "foo.pm"
1628 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1629 */
1630
76e3520e 1631STATIC void
d9095cec 1632S_incline(pTHX_ const char *s)
463ee0b2 1633{
d9095cec
NC
1634 const char *t;
1635 const char *n;
1636 const char *e;
8818d409 1637 line_t line_num;
463ee0b2 1638
7918f24d
NC
1639 PERL_ARGS_ASSERT_INCLINE;
1640
83944c01 1641 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1642 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1643 && s+1 == PL_bufend && *s == ';') {
1644 /* fake newline in string eval */
1645 CopLINE_dec(PL_curcop);
1646 return;
1647 }
463ee0b2
LW
1648 if (*s++ != '#')
1649 return;
d4c19fe8
AL
1650 while (SPACE_OR_TAB(*s))
1651 s++;
73659bf1
GS
1652 if (strnEQ(s, "line", 4))
1653 s += 4;
1654 else
1655 return;
084592ab 1656 if (SPACE_OR_TAB(*s))
73659bf1 1657 s++;
4e553d73 1658 else
73659bf1 1659 return;
d4c19fe8
AL
1660 while (SPACE_OR_TAB(*s))
1661 s++;
463ee0b2
LW
1662 if (!isDIGIT(*s))
1663 return;
d4c19fe8 1664
463ee0b2
LW
1665 n = s;
1666 while (isDIGIT(*s))
1667 s++;
07714eb4 1668 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1669 return;
bf4acbe4 1670 while (SPACE_OR_TAB(*s))
463ee0b2 1671 s++;
73659bf1 1672 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1673 s++;
73659bf1
GS
1674 e = t + 1;
1675 }
463ee0b2 1676 else {
c35e046a
AL
1677 t = s;
1678 while (!isSPACE(*t))
1679 t++;
73659bf1 1680 e = t;
463ee0b2 1681 }
bf4acbe4 1682 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1683 e++;
1684 if (*e != '\n' && *e != '\0')
1685 return; /* false alarm */
1686
96e440d2 1687 line_num = grok_atou(n, &e) - 1;
8818d409 1688
f4dd75d9 1689 if (t - s > 0) {
d9095cec 1690 const STRLEN len = t - s;
3df32bda 1691
d36ee5be 1692 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1693 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1694 * to *{"::_<newfilename"} */
44867030
NC
1695 /* However, the long form of evals is only turned on by the
1696 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1697 GV * const cfgv = CopFILEGV(PL_curcop);
1698 if (cfgv) {
38bd7ad8
FC
1699 char smallbuf[128];
1700 STRLEN tmplen2 = len;
44867030 1701 char *tmpbuf2;
449dd039 1702 GV *gv2;
44867030
NC
1703
1704 if (tmplen2 + 2 <= sizeof smallbuf)
1705 tmpbuf2 = smallbuf;
1706 else
1707 Newx(tmpbuf2, tmplen2 + 2, char);
1708
38bd7ad8
FC
1709 tmpbuf2[0] = '_';
1710 tmpbuf2[1] = '<';
44867030
NC
1711
1712 memcpy(tmpbuf2 + 2, s, tmplen2);
1713 tmplen2 += 2;
1714
8a5ee598 1715 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1716 if (!isGV(gv2)) {
8a5ee598 1717 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1718 /* adjust ${"::_<newfilename"} to store the new file name */
1719 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1720 /* The line number may differ. If that is the case,
1721 alias the saved lines that are in the array.
1722 Otherwise alias the whole array. */
1723 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1724 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1725 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1726 }
38bd7ad8
FC
1727 else if (GvAV(cfgv)) {
1728 AV * const av = GvAV(cfgv);
8818d409
FC
1729 const I32 start = CopLINE(PL_curcop)+1;
1730 I32 items = AvFILLp(av) - start;
1731 if (items > 0) {
1732 AV * const av2 = GvAVn(gv2);
1733 SV **svp = AvARRAY(av) + start;
1734 I32 l = (I32)line_num+1;
1735 while (items--)
1736 av_store(av2, l++, SvREFCNT_inc(*svp++));
1737 }
1738 }
e5527e4b 1739 }
44867030
NC
1740
1741 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1742 }
e66cf94c 1743 }
05ec9bb3 1744 CopFILE_free(PL_curcop);
449dd039 1745 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1746 }
8818d409 1747 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1748}
1749
21791330
FC
1750#define skipspace(s) skipspace_flags(s, 0)
1751
29595ff2 1752
80a702cd 1753STATIC void
15f169a1 1754S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1755{
1756 AV *av = CopFILEAVx(PL_curcop);
1757 if (av) {
7f1c3e8c
FC
1758 SV * sv;
1759 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1760 else {
1761 sv = *av_fetch(av, 0, 1);
1762 SvUPGRADE(sv, SVt_PVMG);
1763 }
1764 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1765 if (orig_sv)
7f1c3e8c 1766 sv_catsv(sv, orig_sv);
5fa550fb 1767 else
7f1c3e8c
FC
1768 sv_catpvn(sv, buf, len);
1769 if (!SvIOK(sv)) {
1770 (void)SvIOK_on(sv);
1771 SvIV_set(sv, 0);
1772 }
1773 if (PL_parser->preambling == NOLINE)
1774 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1775 }
1776}
1777
ffb4593c
NT
1778/*
1779 * S_skipspace
1780 * Called to gobble the appropriate amount and type of whitespace.
1781 * Skips comments as well.
1782 */
1783
76e3520e 1784STATIC char *
21791330 1785S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1786{
21791330 1787 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1788 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1789 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1790 s++;
f0e67a1d
Z
1791 } else {
1792 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1793 PL_bufptr = s;
21791330 1794 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1795 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1796 LEX_NO_NEXT_CHUNK : 0));
3280af22 1797 s = PL_bufptr;
f0e67a1d
Z
1798 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1799 if (PL_linestart > PL_bufptr)
1800 PL_bufptr = PL_linestart;
1801 return s;
463ee0b2 1802 }
5db06880 1803 return s;
a687059c 1804}
378cc40b 1805
ffb4593c
NT
1806/*
1807 * S_check_uni
1808 * Check the unary operators to ensure there's no ambiguity in how they're
1809 * used. An ambiguous piece of code would be:
1810 * rand + 5
1811 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1812 * the +5 is its argument.
1813 */
1814
76e3520e 1815STATIC void
cea2e8a9 1816S_check_uni(pTHX)
ba106d47 1817{
d4c19fe8
AL
1818 const char *s;
1819 const char *t;
2f3197b3 1820
3280af22 1821 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1822 return;
3280af22
NIS
1823 while (isSPACE(*PL_last_uni))
1824 PL_last_uni++;
c35e046a 1825 s = PL_last_uni;
8a2bca12 1826 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1827 s++;
3280af22 1828 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1829 return;
6136c704 1830
9b387841
NC
1831 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1832 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1833 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1834}
1835
ffb4593c
NT
1836/*
1837 * LOP : macro to build a list operator. Its behaviour has been replaced
1838 * with a subroutine, S_lop() for which LOP is just another name.
1839 */
1840
a0d0e21e
LW
1841#define LOP(f,x) return lop(f,x,s)
1842
ffb4593c
NT
1843/*
1844 * S_lop
1845 * Build a list operator (or something that might be one). The rules:
1846 * - if we have a next token, then it's a list operator [why?]
1847 * - if the next thing is an opening paren, then it's a function
1848 * - else it's a list operator
1849 */
1850
76e3520e 1851STATIC I32
a0be28da 1852S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1853{
7918f24d
NC
1854 PERL_ARGS_ASSERT_LOP;
1855
6154021b 1856 pl_yylval.ival = f;
35c8bce7 1857 CLINE;
3280af22
NIS
1858 PL_expect = x;
1859 PL_bufptr = s;
1860 PL_last_lop = PL_oldbufptr;
eb160463 1861 PL_last_lop_op = (OPCODE)f;
3280af22 1862 if (PL_nexttoke)
78cdf107 1863 goto lstop;
79072805 1864 if (*s == '(')
bbf60fe6 1865 return REPORT(FUNC);
29595ff2 1866 s = PEEKSPACE(s);
79072805 1867 if (*s == '(')
bbf60fe6 1868 return REPORT(FUNC);
78cdf107
Z
1869 else {
1870 lstop:
1871 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1872 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1873 return REPORT(LSTOP);
78cdf107 1874 }
79072805
LW
1875}
1876
ffb4593c
NT
1877/*
1878 * S_force_next
9cbb5ea2 1879 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1880 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1881 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1882 * will need to set PL_nextval[] and possibly PL_expect to ensure
1883 * the lexer handles the token correctly.
ffb4593c
NT
1884 */
1885
4e553d73 1886STATIC void
cea2e8a9 1887S_force_next(pTHX_ I32 type)
79072805 1888{
704d4215
GG
1889#ifdef DEBUGGING
1890 if (DEBUG_T_TEST) {
1891 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1892 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1893 }
1894#endif
3280af22
NIS
1895 PL_nexttype[PL_nexttoke] = type;
1896 PL_nexttoke++;
1897 if (PL_lex_state != LEX_KNOWNEXT) {
1898 PL_lex_defer = PL_lex_state;
1899 PL_lex_expect = PL_expect;
1900 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1901 }
1902}
1903
89f35911
FC
1904/*
1905 * S_postderef
1906 *
1907 * This subroutine handles postfix deref syntax after the arrow has already
1908 * been emitted. @* $* etc. are emitted as two separate token right here.
1909 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1910 * only the first, leaving yylex to find the next.
89f35911
FC
1911 */
1912
1913static int
ff25e5db 1914S_postderef(pTHX_ int const funny, char const next)
89f35911 1915{
ff25e5db 1916 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1917 assert(strchr("*[{", next));
1918 if (next == '*') {
1919 PL_expect = XOPERATOR;
cc624add 1920 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1921 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1922 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1923 force_next(POSTJOIN);
1924 }
89f35911
FC
1925 force_next(next);
1926 PL_bufptr+=2;
1927 }
1928 else {
760ca746
FC
1929 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1930 && !PL_lex_brackets)
1931 PL_lex_dojoin = 2;
89f35911
FC
1932 PL_expect = XOPERATOR;
1933 PL_bufptr++;
1934 }
1935 return funny;
1936}
1937
28ac2b49
Z
1938void
1939Perl_yyunlex(pTHX)
1940{
a7aaec61
Z
1941 int yyc = PL_parser->yychar;
1942 if (yyc != YYEMPTY) {
1943 if (yyc) {
a7aaec61
Z
1944 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1945 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1946 PL_lex_allbrackets--;
a7aaec61 1947 PL_lex_brackets--;
78cdf107
Z
1948 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1949 } else if (yyc == '('/*)*/) {
1950 PL_lex_allbrackets--;
1951 yyc |= (2<<24);
a7aaec61
Z
1952 }
1953 force_next(yyc);
1954 }
28ac2b49
Z
1955 PL_parser->yychar = YYEMPTY;
1956 }
1957}
1958
d0a148a6 1959STATIC SV *
15f169a1 1960S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1961{
740cce10 1962 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1963 !IN_BYTES
1964 && UTF
1965 && !is_ascii_string((const U8*)start, len)
740cce10 1966 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1967 return sv;
1968}
1969
ffb4593c
NT
1970/*
1971 * S_force_word
1972 * When the lexer knows the next thing is a word (for instance, it has
1973 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1974 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1975 * lookahead.
ffb4593c
NT
1976 *
1977 * Arguments:
b1b65b59 1978 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1979 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1980 * int check_keyword : if true, Perl checks to make sure the word isn't
1981 * a keyword (do this if the word is a label, e.g. goto FOO)
1982 * int allow_pack : if true, : characters will also be allowed (require,
1983 * use, etc. do this)
9cbb5ea2 1984 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1985 */
1986
76e3520e 1987STATIC char *
345b3785 1988S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 1989{
eb578fdb 1990 char *s;
463ee0b2 1991 STRLEN len;
4e553d73 1992
7918f24d
NC
1993 PERL_ARGS_ASSERT_FORCE_WORD;
1994
29595ff2 1995 start = SKIPSPACE1(start);
463ee0b2 1996 s = start;
7e2040f0 1997 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 1998 (allow_pack && *s == ':') )
a0d0e21e 1999 {
3280af22 2000 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2001 if (check_keyword) {
2002 char *s2 = PL_tokenbuf;
2003 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2004 s2 += 6, len -= 6;
2005 if (keyword(s2, len, 0))
463ee0b2 2006 return start;
01b5ef50 2007 }
463ee0b2 2008 if (token == METHOD) {
29595ff2 2009 s = SKIPSPACE1(s);
463ee0b2 2010 if (*s == '(')
3280af22 2011 PL_expect = XTERM;
463ee0b2 2012 else {
3280af22 2013 PL_expect = XOPERATOR;
463ee0b2 2014 }
79072805 2015 }
9ded7720 2016 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2017 = (OP*)newSVOP(OP_CONST,0,
2018 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2019 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2020 force_next(token);
2021 }
2022 return s;
2023}
2024
ffb4593c
NT
2025/*
2026 * S_force_ident
9cbb5ea2 2027 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2028 * text only contains the "foo" portion. The first argument is a pointer
2029 * to the "foo", and the second argument is the type symbol to prefix.
2030 * Forces the next token to be a "WORD".
9cbb5ea2 2031 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2032 */
2033
76e3520e 2034STATIC void
5aaab254 2035S_force_ident(pTHX_ const char *s, int kind)
79072805 2036{
7918f24d
NC
2037 PERL_ARGS_ASSERT_FORCE_IDENT;
2038
c9b48522
DD
2039 if (s[0]) {
2040 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2041 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2042 UTF ? SVf_UTF8 : 0));
9ded7720 2043 NEXTVAL_NEXTTOKE.opval = o;
79072805 2044 force_next(WORD);
748a9306 2045 if (kind) {
11343788 2046 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2047 /* XXX see note in pp_entereval() for why we forgo typo
2048 warnings if the symbol must be introduced in an eval.
2049 GSAR 96-10-12 */
90e5519e 2050 gv_fetchpvn_flags(s, len,
728847b1
BF
2051 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2052 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2053 kind == '$' ? SVt_PV :
2054 kind == '@' ? SVt_PVAV :
2055 kind == '%' ? SVt_PVHV :
a0d0e21e 2056 SVt_PVGV
90e5519e 2057 );
748a9306 2058 }
79072805
LW
2059 }
2060}
2061
3f33d153
FC
2062static void
2063S_force_ident_maybe_lex(pTHX_ char pit)
2064{
3f33d153
FC
2065 NEXTVAL_NEXTTOKE.ival = pit;
2066 force_next('p');
2067}
2068
1571675a
GS
2069NV
2070Perl_str_to_version(pTHX_ SV *sv)
2071{
2072 NV retval = 0.0;
2073 NV nshift = 1.0;
2074 STRLEN len;
cfd0369c 2075 const char *start = SvPV_const(sv,len);
9d4ba2ae 2076 const char * const end = start + len;
504618e9 2077 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2078
2079 PERL_ARGS_ASSERT_STR_TO_VERSION;
2080
1571675a 2081 while (start < end) {
ba210ebe 2082 STRLEN skip;
1571675a
GS
2083 UV n;
2084 if (utf)
9041c2e3 2085 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2086 else {
2087 n = *(U8*)start;
2088 skip = 1;
2089 }
2090 retval += ((NV)n)/nshift;
2091 start += skip;
2092 nshift *= 1000;
2093 }
2094 return retval;
2095}
2096
4e553d73 2097/*
ffb4593c
NT
2098 * S_force_version
2099 * Forces the next token to be a version number.
e759cc13
RGS
2100 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2101 * and if "guessing" is TRUE, then no new token is created (and the caller
2102 * must use an alternative parsing method).
ffb4593c
NT
2103 */
2104
76e3520e 2105STATIC char *
e759cc13 2106S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2107{
5f66b61c 2108 OP *version = NULL;
44dcb63b 2109 char *d;
89bfa8cd 2110
7918f24d
NC
2111 PERL_ARGS_ASSERT_FORCE_VERSION;
2112
29595ff2 2113 s = SKIPSPACE1(s);
89bfa8cd 2114
44dcb63b 2115 d = s;
dd629d5b 2116 if (*d == 'v')
44dcb63b 2117 d++;
44dcb63b 2118 if (isDIGIT(*d)) {
e759cc13
RGS
2119 while (isDIGIT(*d) || *d == '_' || *d == '.')
2120 d++;
4e4da3ac 2121 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2122 SV *ver;
6154021b
RGS
2123 s = scan_num(s, &pl_yylval);
2124 version = pl_yylval.opval;
dd629d5b
GS
2125 ver = cSVOPx(version)->op_sv;
2126 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2127 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2128 SvNV_set(ver, str_to_version(ver));
1571675a 2129 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2130 }
89bfa8cd 2131 }
5db06880 2132 else if (guessing) {
e759cc13 2133 return s;
5db06880 2134 }
89bfa8cd
PP
2135 }
2136
2137 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2138 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2139 force_next(WORD);
89bfa8cd 2140
e759cc13 2141 return s;
89bfa8cd
PP
2142}
2143
ffb4593c 2144/*
91152fc1
DG
2145 * S_force_strict_version
2146 * Forces the next token to be a version number using strict syntax rules.
2147 */
2148
2149STATIC char *
2150S_force_strict_version(pTHX_ char *s)
2151{
91152fc1 2152 OP *version = NULL;
91152fc1
DG
2153 const char *errstr = NULL;
2154
2155 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2156
2157 while (isSPACE(*s)) /* leading whitespace */
2158 s++;
2159
2160 if (is_STRICT_VERSION(s,&errstr)) {
2161 SV *ver = newSV(0);
2162 s = (char *)scan_version(s, ver, 0);
2163 version = newSVOP(OP_CONST, 0, ver);
2164 }
4e4da3ac
Z
2165 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2166 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2167 {
91152fc1
DG
2168 PL_bufptr = s;
2169 if (errstr)
2170 yyerror(errstr); /* version required */
2171 return s;
2172 }
2173
91152fc1 2174 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2175 NEXTVAL_NEXTTOKE.opval = version;
2176 force_next(WORD);
2177
2178 return s;
2179}
2180
2181/*
ffb4593c
NT
2182 * S_tokeq
2183 * Tokenize a quoted string passed in as an SV. It finds the next
2184 * chunk, up to end of string or a backslash. It may make a new
2185 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2186 * turns \\ into \.
2187 */
2188
76e3520e 2189STATIC SV *
cea2e8a9 2190S_tokeq(pTHX_ SV *sv)
79072805 2191{
eb578fdb
KW
2192 char *s;
2193 char *send;
2194 char *d;
b3ac6de7 2195 SV *pv = sv;
79072805 2196
7918f24d
NC
2197 PERL_ARGS_ASSERT_TOKEQ;
2198
279b35ad
FC
2199 assert (SvPOK(sv));
2200 assert (SvLEN(sv));
2201 assert (!SvIsCOW(sv));
307ed071 2202 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2203 goto finish;
279b35ad
FC
2204 s = SvPVX(sv);
2205 send = SvEND(sv);
dcb21ed6
NC
2206 /* This is relying on the SV being "well formed" with a trailing '\0' */
2207 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2208 s++;
2209 if (s == send)
b3ac6de7 2210 goto finish;
79072805 2211 d = s;
be4731d2 2212 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2213 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2214 SVs_TEMP | SvUTF8(sv));
be4731d2 2215 }
79072805
LW
2216 while (s < send) {
2217 if (*s == '\\') {
a0d0e21e 2218 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2219 s++; /* all that, just for this */
2220 }
2221 *d++ = *s++;
2222 }
2223 *d = '\0';
95a20fc0 2224 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2225 finish:
3280af22 2226 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2227 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2228 return sv;
2229}
2230
ffb4593c
NT
2231/*
2232 * Now come three functions related to double-quote context,
2233 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2234 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2235 * interact with PL_lex_state, and create fake ( ... ) argument lists
2236 * to handle functions and concatenation.
ecd24171
DM
2237 * For example,
2238 * "foo\lbar"
2239 * is tokenised as
2240 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2241 */
2242
2243/*
2244 * S_sublex_start
6154021b 2245 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2246 *
2247 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2248 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2249 *
2250 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2251 *
2252 * Everything else becomes a FUNC.
2253 *
2254 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2255 * had an OP_CONST or OP_READLINE). This just sets us up for a
2256 * call to S_sublex_push().
2257 */
2258
76e3520e 2259STATIC I32
cea2e8a9 2260S_sublex_start(pTHX)
79072805 2261{
eb578fdb 2262 const I32 op_type = pl_yylval.ival;
79072805
LW
2263
2264 if (op_type == OP_NULL) {
6154021b 2265 pl_yylval.opval = PL_lex_op;
5f66b61c 2266 PL_lex_op = NULL;
79072805
LW
2267 return THING;
2268 }
466112bb 2269 if (op_type == OP_CONST) {
3280af22 2270 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2271
2272 if (SvTYPE(sv) == SVt_PVIV) {
2273 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2274 STRLEN len;
96a5add6 2275 const char * const p = SvPV_const(sv, len);
740cce10 2276 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2277 SvREFCNT_dec(sv);
2278 sv = nsv;
4e553d73 2279 }
6154021b 2280 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2281 PL_lex_stuff = NULL;
79072805
LW
2282 return THING;
2283 }
2284
3280af22 2285 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2286 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2287 PL_sublex_info.sub_op = PL_lex_op;
2288 PL_lex_state = LEX_INTERPPUSH;
55497cff 2289
3280af22
NIS
2290 PL_expect = XTERM;
2291 if (PL_lex_op) {
6154021b 2292 pl_yylval.opval = PL_lex_op;
5f66b61c 2293 PL_lex_op = NULL;
55497cff
PP
2294 return PMFUNC;
2295 }
2296 else
2297 return FUNC;
2298}
2299
ffb4593c
NT
2300/*
2301 * S_sublex_push
2302 * Create a new scope to save the lexing state. The scope will be
2303 * ended in S_sublex_done. Returns a '(', starting the function arguments
2304 * to the uc, lc, etc. found before.
2305 * Sets PL_lex_state to LEX_INTERPCONCAT.
2306 */
2307
76e3520e 2308STATIC I32
cea2e8a9 2309S_sublex_push(pTHX)
55497cff 2310{
78a635de 2311 LEXSHARED *shared;
801d32ac 2312 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2313 ENTER;
55497cff 2314
3280af22 2315 PL_lex_state = PL_sublex_info.super_state;
cc624add 2316 SAVEI8(PL_lex_dojoin);
3280af22 2317 SAVEI32(PL_lex_brackets);
78cdf107 2318 SAVEI32(PL_lex_allbrackets);
b27dce25 2319 SAVEI32(PL_lex_formbrack);
78cdf107 2320 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2321 SAVEI32(PL_lex_casemods);
2322 SAVEI32(PL_lex_starts);
651b5b28 2323 SAVEI8(PL_lex_state);
7cc34111 2324 SAVESPTR(PL_lex_repl);
7766f137 2325 SAVEVPTR(PL_lex_inpat);
98246f1e 2326 SAVEI16(PL_lex_inwhat);
ffdb8b16 2327 if (is_heredoc)
b42366d4 2328 {
ffdb8b16 2329 SAVECOPLINE(PL_curcop);
b42366d4 2330 SAVEI32(PL_multi_end);
851b527a
FC
2331 SAVEI32(PL_parser->herelines);
2332 PL_parser->herelines = 0;
b42366d4
FC
2333 }
2334 SAVEI8(PL_multi_close);
3280af22 2335 SAVEPPTR(PL_bufptr);
8452ff4b 2336 SAVEPPTR(PL_bufend);
3280af22
NIS
2337 SAVEPPTR(PL_oldbufptr);
2338 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2339 SAVEPPTR(PL_last_lop);
2340 SAVEPPTR(PL_last_uni);
3280af22
NIS
2341 SAVEPPTR(PL_linestart);
2342 SAVESPTR(PL_linestr);
8edd5f42
RGS
2343 SAVEGENERICPV(PL_lex_brackstack);
2344 SAVEGENERICPV(PL_lex_casestack);
78a635de 2345 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2346 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2347 SAVEI32(PL_copline);
3280af22 2348
99bd9d90 2349 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2350 scopes to find the body of the here-doc. So we put PL_linestr and
2351 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2352 */
60f40a38
FC
2353 PL_parser->lex_shared->ls_linestr = PL_linestr;
2354 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2355
3280af22 2356 PL_linestr = PL_lex_stuff;
7cc34111 2357 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2358 PL_lex_stuff = NULL;
7cc34111 2359 PL_sublex_info.repl = NULL;
3280af22 2360
9cbb5ea2
GS
2361 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2362 = SvPVX(PL_linestr);
3280af22 2363 PL_bufend += SvCUR(PL_linestr);
bd61b366 2364 PL_last_lop = PL_last_uni = NULL;
3280af22 2365 SAVEFREESV(PL_linestr);
4dc843bc 2366 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2367
2368 PL_lex_dojoin = FALSE;
b27dce25 2369 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2370 PL_lex_allbrackets = 0;
2371 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2372 Newx(PL_lex_brackstack, 120, char);
2373 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2374 PL_lex_casemods = 0;
2375 *PL_lex_casestack = '\0';
2376 PL_lex_starts = 0;
2377 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2378 if (is_heredoc)
6ddcf93b 2379 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2380 PL_copline = NOLINE;
78a635de
FC
2381
2382 Newxz(shared, 1, LEXSHARED);
2383 shared->ls_prev = PL_parser->lex_shared;
2384 PL_parser->lex_shared = shared;
3280af22
NIS
2385
2386 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2387 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2388 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2389 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2390 else
5f66b61c 2391 PL_lex_inpat = NULL;
79072805 2392
3a54fd60
DM
2393 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2394 PL_in_eval &= ~EVAL_RE_REPARSING;
2395
55497cff 2396 return '(';
79072805
LW
2397}
2398
ffb4593c
NT
2399/*
2400 * S_sublex_done
2401 * Restores lexer state after a S_sublex_push.
2402 */
2403
76e3520e 2404STATIC I32
cea2e8a9 2405S_sublex_done(pTHX)
79072805 2406{
3280af22 2407 if (!PL_lex_starts++) {
396482e1 2408 SV * const sv = newSVpvs("");
9aa983d2
JH
2409 if (SvUTF8(PL_linestr))
2410 SvUTF8_on(sv);
3280af22 2411 PL_expect = XOPERATOR;
6154021b 2412 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2413 return THING;
2414 }
2415
3280af22
NIS
2416 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2417 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2418 return yylex();
79072805
LW
2419 }
2420
ffb4593c 2421 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2422 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2423 if (PL_lex_repl) {
2424 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2425 PL_linestr = PL_lex_repl;
2426 PL_lex_inpat = 0;
2427 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2428 PL_bufend += SvCUR(PL_linestr);
bd61b366 2429 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2430 PL_lex_dojoin = FALSE;
2431 PL_lex_brackets = 0;
78cdf107
Z
2432 PL_lex_allbrackets = 0;
2433 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2434 PL_lex_casemods = 0;
2435 *PL_lex_casestack = '\0';
2436 PL_lex_starts = 0;
25da4f38 2437 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2438 PL_lex_state = LEX_INTERPNORMAL;
2439 PL_lex_starts++;
e9fa98b2
HS
2440 /* we don't clear PL_lex_repl here, so that we can check later
2441 whether this is an evalled subst; that means we rely on the
2442 logic to ensure sublex_done() is called again only via the
2443 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2444 }
e9fa98b2 2445 else {
3280af22 2446 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2447 PL_lex_repl = NULL;
e9fa98b2 2448 }
ffdb8b16
FC
2449 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2450 CopLINE(PL_curcop) +=
2451 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2452 + PL_parser->herelines;
2453 PL_parser->herelines = 0;
ffdb8b16 2454 }
79072805 2455 return ',';
ffed7fef
LW
2456 }
2457 else {
b42366d4 2458 const line_t l = CopLINE(PL_curcop);
f46d017c 2459 LEAVE;
b42366d4 2460 if (PL_multi_close == '<')
851b527a 2461 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2462 PL_bufend = SvPVX(PL_linestr);
2463 PL_bufend += SvCUR(PL_linestr);
2464 PL_expect = XOPERATOR;
09bef843 2465 PL_sublex_info.sub_inwhat = 0;
79072805 2466 return ')';
ffed7fef
LW
2467 }
2468}
2469
6f613c73
KW
2470PERL_STATIC_INLINE SV*
2471S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2472{
140b12ad
KW
2473 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2474 * interior, hence to the "}". Finds what the name resolves to, returning
2475 * an SV* containing it; NULL if no valid one found */
2476
dd2b1b72 2477 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2478
0c415a79
KW
2479 HV * table;
2480 SV **cvp;
2481 SV *cv;
2482 SV *rv;
2483 HV *stash;
2484 const U8* first_bad_char_loc;
2485 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2486
6f613c73
KW
2487 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2488
107160e2
KW
2489 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2490 e - backslash_ptr,
2491 &first_bad_char_loc))
2492 {
2493 /* If warnings are on, this will print a more detailed analysis of what
2494 * is wrong than the error message below */
c80e42f3 2495 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2496 e - ((char *) first_bad_char_loc),
2497 NULL, 0);
2498
2499 /* We deliberately don't try to print the malformed character, which
2500 * might not print very well; it also may be just the first of many
2501 * malformations, so don't print what comes after it */
2502 yyerror(Perl_form(aTHX_
2503 "Malformed UTF-8 character immediately after '%.*s'",
2504 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2505 return NULL;
2506 }
2507
2508 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2509 /* include the <}> */
2510 e - backslash_ptr + 1);
6f613c73 2511 if (! SvPOK(res)) {
b6407c49 2512 SvREFCNT_dec_NN(res);
6f613c73
KW
2513 return NULL;
2514 }
2515
0c415a79
KW
2516 /* See if the charnames handler is the Perl core's, and if so, we can skip
2517 * the validation needed for a user-supplied one, as Perl's does its own
2518 * validation. */
2519 table = GvHV(PL_hintgv); /* ^H */
2520 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2521 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2522 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2523 {
2524 const char * const name = HvNAME(stash);
6a642c21
FC
2525 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2526 && strEQ(name, "_charnames")) {
0c415a79
KW
2527 return res;
2528 }
2529 }
2530
bde9e88d
KW
2531 /* Here, it isn't Perl's charname handler. We can't rely on a
2532 * user-supplied handler to validate the input name. For non-ut8 input,
2533 * look to see that the first character is legal. Then loop through the
2534 * rest checking that each is a continuation */
6f613c73 2535
36897d64
KW
2536 /* This code makes the reasonable assumption that the only Latin1-range
2537 * characters that begin a character name alias are alphabetic, otherwise
2538 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2539
b6ba1137 2540 if (! UTF) {
bde9e88d 2541 if (! isALPHAU(*s)) {
b6ba1137
KW
2542 goto bad_charname;
2543 }
bde9e88d
KW
2544 s++;
2545 while (s < e) {
2546 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2547 goto bad_charname;
2548 }
2d8eb851
KW
2549 if (*s == ' ' && *(s-1) == ' ') {
2550 goto multi_spaces;
bd299e29 2551 }
df758df2
KW
2552 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2553 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2554 "NO-BREAK SPACE in a charnames "
2555 "alias definition is deprecated");
2556 }
bde9e88d 2557 s++;
b6ba1137
KW
2558 }
2559 }
2560 else {
bde9e88d
KW
2561 /* Similarly for utf8. For invariants can check directly; for other
2562 * Latin1, can calculate their code point and check; otherwise use a
2563 * swash */
2564 if (UTF8_IS_INVARIANT(*s)) {
2565 if (! isALPHAU(*s)) {
140b12ad
KW
2566 goto bad_charname;
2567 }
bde9e88d
KW
2568 s++;
2569 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2570 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2571 goto bad_charname;
6f613c73 2572 }
bde9e88d 2573 s += 2;
6f613c73 2574 }
bde9e88d
KW
2575 else {
2576 if (! PL_utf8_charname_begin) {
2577 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2578 PL_utf8_charname_begin = _core_swash_init("utf8",
2579 "_Perl_Charname_Begin",
2580 &PL_sv_undef,
2581 1, 0, NULL, &flags);
2582 }
2583 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2584 goto bad_charname;
2585 }
2586 s += UTF8SKIP(s);
2587 }
2588
2589 while (s < e) {
2590 if (UTF8_IS_INVARIANT(*s)) {
2591 if (! isCHARNAME_CONT(*s)) {
2592 goto bad_charname;
2593 }
2d8eb851
KW
2594 if (*s == ' ' && *(s-1) == ' ') {
2595 goto multi_spaces;
bd299e29 2596 }
bde9e88d
KW
2597 s++;
2598 }
2599 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2600 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2601 {
2602 goto bad_charname;
2603 }
df758df2
KW
2604 if (*s == *NBSP_UTF8
2605 && *(s+1) == *(NBSP_UTF8+1)
2606 && ckWARN_d(WARN_DEPRECATED))
2607 {
2608 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2609 "NO-BREAK SPACE in a charnames "
2610 "alias definition is deprecated");
2611 }
bde9e88d
KW
2612 s += 2;
2613 }
2614 else {
2615 if (! PL_utf8_charname_continue) {
2616 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2617 PL_utf8_charname_continue = _core_swash_init("utf8",
2618 "_Perl_Charname_Continue",
2619 &PL_sv_undef,
2620 1, 0, NULL, &flags);
2621 }
2622 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2623 goto bad_charname;
2624 }
2625 s += UTF8SKIP(s);
6f613c73
KW
2626 }
2627 }
2d8eb851
KW
2628 }
2629 if (*(s-1) == ' ') {
2630 yyerror_pv(
2631 Perl_form(aTHX_
2632 "charnames alias definitions may not contain trailing "
2633 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2634 (int)(s - backslash_ptr + 1), backslash_ptr,
2635 (int)(e - s + 1), s + 1
2636 ),
2637 UTF ? SVf_UTF8 : 0);
2638 return NULL;
6f613c73
KW
2639 }
2640
94ca1619 2641 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2642 const U8* first_bad_char_loc;
2643 STRLEN len;
2644 const char* const str = SvPV_const(res, len);
2645 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2646 /* If warnings are on, this will print a more detailed analysis of
2647 * what is wrong than the error message below */
c80e42f3 2648 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2649 (char *) first_bad_char_loc - str,
2650 NULL, 0);
2651
2652 /* We deliberately don't try to print the malformed character,
2653 * which might not print very well; it also may be just the first
2654 * of many malformations, so don't print what comes after it */
2655 yyerror_pv(
2656 Perl_form(aTHX_
2657 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2658 (int) (e - backslash_ptr + 1), backslash_ptr,
2659 (int) ((char *) first_bad_char_loc - str), str
2660 ),
2661 SVf_UTF8);
2662 return NULL;
2663 }
2664 }
140b12ad 2665
bde9e88d 2666 return res;
140b12ad 2667
bde9e88d 2668 bad_charname: {
bde9e88d
KW
2669
2670 /* The final %.*s makes sure that should the trailing NUL be missing
2671 * that this print won't run off the end of the string */
2672 yyerror_pv(
2673 Perl_form(aTHX_
2674 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2675 (int)(s - backslash_ptr + 1), backslash_ptr,
2676 (int)(e - s + 1), s + 1
bde9e88d
KW
2677 ),
2678 UTF ? SVf_UTF8 : 0);
2679 return NULL;
2680 }
2d8eb851
KW
2681
2682 multi_spaces:
2683 yyerror_pv(
2684 Perl_form(aTHX_
2685 "charnames alias definitions may not contain a sequence of "
2686 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2687 (int)(s - backslash_ptr + 1), backslash_ptr,
2688 (int)(e - s + 1), s + 1
2689 ),
2690 UTF ? SVf_UTF8 : 0);
2691 return NULL;
6f613c73
KW
2692}
2693
02aa26ce
NT
2694/*
2695 scan_const
2696
9da1dd8f
DM
2697 Extracts the next constant part of a pattern, double-quoted string,
2698 or transliteration. This is terrifying code.
2699
2700 For example, in parsing the double-quoted string "ab\x63$d", it would
2701 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2702
94def140 2703 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2704 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2705 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2706
94def140
ST
2707 Returns a pointer to the character scanned up to. If this is
2708 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2709 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2710 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2711 by looking at the next characters herself.
2712
02aa26ce 2713 In patterns:
9da1dd8f 2714 expand:
537124e4
KW
2715 \N{FOO} => \N{U+hex_for_character_FOO}
2716 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2717
2718 pass through:
2719 all other \-char, including \N and \N{ apart from \N{ABC}
2720
2721 stops on:
2722 @ and $ where it appears to be a var, but not for $ as tail anchor
2723 \l \L \u \U \Q \E
2724 (?{ or (??{
2725
02aa26ce
NT
2726
2727 In transliterations:
2728 characters are VERY literal, except for - not at the start or end
94def140
ST
2729 of the string, which indicates a range. If the range is in bytes,
2730 scan_const expands the range to the full set of intermediate
2731 characters. If the range is in utf8, the hyphen is replaced with
2732 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2733
2734 In double-quoted strings:
2735 backslashes:
2736 double-quoted style: \r and \n
ff3f963a 2737 constants: \x31, etc.
94def140 2738 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2739 case and quoting: \U \Q \E
2740 stops on @ and $
2741
2742 scan_const does *not* construct ops to handle interpolated strings.
2743 It stops processing as soon as it finds an embedded $ or @ variable
2744 and leaves it to the caller to work out what's going on.
2745
94def140
ST
2746 embedded arrays (whether in pattern or not) could be:
2747 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2748
2749 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2750
2751 $ in pattern could be $foo or could be tail anchor. Assumption:
2752 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2753 followed by one of "()| \r\n\t"
02aa26ce 2754
9da1dd8f 2755 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2756
2757 The structure of the code is
2758 while (there's a character to process) {
94def140
ST
2759 handle transliteration ranges
2760 skip regexp comments /(?#comment)/ and codes /(?{code})/
2761 skip #-initiated comments in //x patterns
2762 check for embedded arrays
02aa26ce
NT
2763 check for embedded scalars
2764 if (backslash) {
94def140 2765 deprecate \1 in substitution replacements
02aa26ce
NT
2766 handle string-changing backslashes \l \U \Q \E, etc.
2767 switch (what was escaped) {
94def140 2768 handle \- in a transliteration (becomes a literal -)
ff3f963a 2769 if a pattern and not \N{, go treat as regular character
94def140
ST
2770 handle \132 (octal characters)
2771 handle \x15 and \x{1234} (hex characters)
ff3f963a 2772 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2773 handle \cV (control characters)
2774 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2775 } (end switch)
77a135fe 2776 continue
02aa26ce 2777 } (end if backslash)
77a135fe 2778 handle regular character
02aa26ce 2779 } (end while character to read)
4e553d73 2780
02aa26ce
NT
2781*/
2782
76e3520e 2783STATIC char *
cea2e8a9 2784S_scan_const(pTHX_ char *start)
79072805 2785{
eb578fdb 2786 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2787 SV *sv = newSV(send - start); /* sv for the constant. See note below
2788 on sizing. */
eb578fdb
KW
2789 char *s = start; /* start of the constant */
2790 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2791 bool dorange = FALSE; /* are we in a translit range? */
2792 bool didrange = FALSE; /* did we just finish a range? */
2793 bool in_charclass = FALSE; /* within /[...]/ */
2794 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2795 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2796 UTF8? But, this can show as true
2797 when the source isn't utf8, as for
2798 example when it is entirely composed
2799 of hex constants */
6f613c73 2800 SV *res; /* result from charnames */
77a135fe
KW
2801
2802 /* Note on sizing: The scanned constant is placed into sv, which is
2803 * initialized by newSV() assuming one byte of output for every byte of
2804 * input. This routine expects newSV() to allocate an extra byte for a
2805 * trailing NUL, which this routine will append if it gets to the end of
2806 * the input. There may be more bytes of input than output (eg., \N{LATIN
2807 * CAPITAL LETTER A}), or more output than input if the constant ends up
2808 * recoded to utf8, but each time a construct is found that might increase
2809 * the needed size, SvGROW() is called. Its size parameter each time is
2810 * based on the best guess estimate at the time, namely the length used so
2811 * far, plus the length the current construct will occupy, plus room for
2812 * the trailing NUL, plus one byte for every input byte still unscanned */
2813
c3320c2a
KW
2814 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2815 before set */
4c3a8340
ST
2816#ifdef EBCDIC
2817 UV literal_endpoint = 0;
e294cc5d 2818 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2819#endif
012bcf8d 2820
7918f24d
NC
2821 PERL_ARGS_ASSERT_SCAN_CONST;
2822
bb16bae8 2823 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2824 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2825 /* If we are doing a trans and we know we want UTF8 set expectation */
2826 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2827 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2828 }
2829
b899e89d
FC
2830 /* Protect sv from errors and fatal warnings. */
2831 ENTER_with_name("scan_const");
2832 SAVEFREESV(sv);
2b9d42f0 2833
79072805 2834 while (s < send || dorange) {
ff3f963a 2835
02aa26ce 2836 /* get transliterations out of the way (they're most literal) */
3280af22 2837 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2838 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2839 if (dorange) {
1ba5c669
JH
2840 I32 i; /* current expanded character */
2841 I32 min; /* first character in range */
2842 I32 max; /* last character in range */
02aa26ce 2843
e294cc5d
JH
2844#ifdef EBCDIC
2845 UV uvmax = 0;
2846#endif
2847
2848 if (has_utf8
2849#ifdef EBCDIC
2850 && !native_range
2851#endif
1953db30 2852 ) {
9d4ba2ae 2853 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2854 char *e = d++;
2855 while (e-- > c)
2856 *(e + 1) = *e;
e7214ce8 2857 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2858 /* mark the range as done, and continue */
2859 dorange = FALSE;
2860 didrange = TRUE;
2861 continue;
2862 }
2b9d42f0 2863
95a20fc0 2864 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2865#ifdef EBCDIC
2866 SvGROW(sv,
dc023dbb
KW
2867 SvLEN(sv) + ((has_utf8)
2868 ? (512 - UTF_CONTINUATION_MARK
2869 + UNISKIP(0x100))
e294cc5d
JH
2870 : 256));
2871 /* How many two-byte within 0..255: 128 in UTF-8,
2872 * 96 in UTF-8-mod. */
2873#else
9cbb5ea2 2874 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2875#endif
9cbb5ea2 2876 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2877#ifdef EBCDIC
2878 if (has_utf8) {
2879 int j;
2880 for (j = 0; j <= 1; j++) {
2881 char * const c = (char*)utf8_hop((U8*)d, -1);
2882 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2883 if (j)
2884 min = (U8)uv;
2885 else if (uv < 256)
2886 max = (U8)uv;
2887 else {
2888 max = (U8)0xff; /* only to \xff */
2889 uvmax = uv; /* \x{100} to uvmax */
2890 }
2891 d = c; /* eat endpoint chars */
2892 }
2893 }
2894 else {
2895#endif
2896 d -= 2; /* eat the first char and the - */
2897 min = (U8)*d; /* first char in range */
2898 max = (U8)d[1]; /* last char in range */
2899#ifdef EBCDIC
2900 }
2901#endif
8ada0baa 2902
c2e66d9e 2903 if (min > max) {
01ec43d0 2904 Perl_croak(aTHX_
d1573ac7 2905 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2906 (char)min, (char)max);
c2e66d9e
GS
2907 }
2908
c7f1f016 2909#ifdef EBCDIC
dc023dbb
KW
2910 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2911 * any subsets of these ranges into individual characters */
4c3a8340 2912 if (literal_endpoint == 2 &&
c286a9a6 2913 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2914 (isUPPER_A(min) && isUPPER_A(max))))
2915 {
2916 for (i = min; i <= max; i++) {
2917 if (isALPHA_A(i))
2918 *d++ = i;
8ada0baa
JH
2919 }
2920 }
2921 else
2922#endif
2923 for (i = min; i <= max; i++)
e294cc5d
JH
2924#ifdef EBCDIC
2925 if (has_utf8) {
55d09dc8 2926 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2927 }
2928 else
2929#endif
2930 *d++ = (char)i;
2931
2932#ifdef EBCDIC
2933 if (uvmax) {
2934 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2935 if (uvmax > 0x101)
e7214ce8 2936 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2937 if (uvmax > 0x100)
2938 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2939 }
2940#endif
02aa26ce
NT
2941
2942 /* mark the range as done, and continue */
79072805 2943 dorange = FALSE;
01ec43d0 2944 didrange = TRUE;
4c3a8340
ST
2945#ifdef EBCDIC
2946 literal_endpoint = 0;
2947#endif
79072805 2948 continue;
4e553d73 2949 }
02aa26ce
NT
2950
2951 /* range begins (ignore - as first or last char) */
79072805 2952 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2953 if (didrange) {
1fafa243 2954 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2955 }
e294cc5d
JH
2956 if (has_utf8
2957#ifdef EBCDIC
2958 && !native_range
2959#endif
2960 ) {
e7214ce8 2961 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2962 s++;
2963 continue;
2964 }
79072805
LW
2965 dorange = TRUE;
2966 s++;
01ec43d0
GS
2967 }
2968 else {
2969 didrange = FALSE;
4c3a8340
ST
2970#ifdef EBCDIC
2971 literal_endpoint = 0;
e294cc5d 2972 native_range = TRUE;
4c3a8340 2973#endif
01ec43d0 2974 }
79072805 2975 }
02aa26ce
NT
2976
2977 /* if we get here, we're not doing a transliteration */
2978
e4a2df84
DM
2979 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2980 char *s1 = s-1;
2981 int esc = 0;
2982 while (s1 >= start && *s1-- == '\\')
2983 esc = !esc;
2984 if (!esc)
2985 in_charclass = TRUE;
2986 }
2866decb 2987
e4a2df84
DM
2988 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2989 char *s1 = s-1;
2990 int esc = 0;
2991 while (s1 >= start && *s1-- == '\\')
2992 esc = !esc;
2993 if (!esc)
2994 in_charclass = FALSE;
2995 }
2866decb 2996
9da1dd8f
DM
2997 /* skip for regexp comments /(?#comment)/, except for the last
2998 * char, which will be done separately.
2999 * Stop on (?{..}) and friends */
3000
c30fc27b 3001 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3002 if (s[2] == '#') {
e994fd66 3003 while (s+1 < send && *s != ')')
5ff03569 3004 *d++ = *s++;
155aba94 3005 }
c30fc27b 3006 else if (!PL_lex_casemods &&
d3cec5e5
DM
3007 ( s[2] == '{' /* This should match regcomp.c */
3008 || (s[2] == '?' && s[3] == '{')))
155aba94 3009 {
9da1dd8f 3010 break;
cc6b7395 3011 }
748a9306 3012 }
02aa26ce
NT
3013
3014 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3015 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3016 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3017 while (s+1 < send && *s != '\n')
5ff03569 3018 *d++ = *s++;
748a9306 3019 }
02aa26ce 3020
9da1dd8f
DM
3021 /* no further processing of single-quoted regex */
3022 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3023 goto default_action;
3024
5d1d4326 3025 /* check for embedded arrays
da6eedaa 3026 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3027 */
1749ea0d 3028 else if (*s == '@' && s[1]) {
8a2bca12 3029 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3030 break;
3031 if (strchr(":'{$", s[1]))
3032 break;
3033 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3034 break; /* in regexp, neither @+ nor @- are interpolated */
3035 }
02aa26ce
NT
3036
3037 /* check for embedded scalars. only stop if we're sure it's a
3038 variable.
3039 */
79072805 3040 else if (*s == '$') {
3280af22 3041 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3042 break;
77772344 3043 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3044 if (s[1] == '\\') {
3045 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3046 "Possible unintended interpolation of $\\ in regex");
77772344 3047 }
79072805 3048 break; /* in regexp, $ might be tail anchor */
77772344 3049 }
79072805 3050 }
02aa26ce 3051
2b9d42f0
NIS
3052 /* End of else if chain - OP_TRANS rejoin rest */
3053
02aa26ce 3054 /* backslashes */
79072805 3055 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3056 char* e; /* Can be used for ending '}', etc. */
3057
79072805 3058 s++;
02aa26ce 3059
7d0fc23c
KW
3060 /* warn on \1 - \9 in substitution replacements, but note that \11
3061 * is an octal; and \19 is \1 followed by '9' */
3280af22 3062 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3063 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3064 {
c782d7ee 3065 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3066 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3067 *--s = '$';
3068 break;
3069 }
02aa26ce
NT
3070
3071 /* string-change backslash escapes */
838f2281 3072 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3073 --s;
3074 break;
3075 }
ff3f963a
KW
3076 /* In a pattern, process \N, but skip any other backslash escapes.
3077 * This is because we don't want to translate an escape sequence
3078 * into a meta symbol and have the regex compiler use the meta
3079 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3080 * in spite of this, we do have to process \N here while the proper
3081 * charnames handler is in scope. See bugs #56444 and #62056.
3082 * There is a complication because \N in a pattern may also stand
3083 * for 'match a non-nl', and not mean a charname, in which case its
3084 * processing should be deferred to the regex compiler. To be a
3085 * charname it must be followed immediately by a '{', and not look
3086 * like \N followed by a curly quantifier, i.e., not something like
3087 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3088 * quantifier */
3089 else if (PL_lex_inpat
3090 && (*s != 'N'
3091 || s[1] != '{'
412f55bb 3092 || regcurly(s + 1)))
ff3f963a 3093 {
4d73d076 3094 *d++ = '\\';
cc74c5bd
ST
3095 goto default_action;
3096 }
02aa26ce 3097
79072805 3098 switch (*s) {
02aa26ce
NT
3099
3100 /* quoted - in transliterations */
79072805 3101 case '-':
3280af22 3102 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3103 *d++ = *s++;
3104 continue;
3105 }
924ba076 3106 /* FALLTHROUGH */
79072805 3107 default:
11b8faa4 3108 {
15861f94 3109 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3110 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3111 "Unrecognized escape \\%c passed through",
3112 *s);
11b8faa4 3113 /* default action is to copy the quoted character */
f9a63242 3114 goto default_action;
11b8faa4 3115 }
02aa26ce 3116
632403cc 3117 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3118 case '0': case '1': case '2': case '3':
3119 case '4': case '5': case '6': case '7':
ba210ebe 3120 {
5e0a247b 3121 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3122 STRLEN len = 3;
06972766 3123 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3124 s += len;
5e0a247b
KW
3125 if (len < 3 && s < send && isDIGIT(*s)
3126 && ckWARN(WARN_MISC))
3127 {
3128 Perl_warner(aTHX_ packWARN(WARN_MISC),
3129 "%s", form_short_octal_warning(s, len));
3130 }
ba210ebe 3131 }
012bcf8d 3132 goto NUM_ESCAPE_INSERT;
02aa26ce 3133
f0a2b745
KW
3134 /* eg. \o{24} indicates the octal constant \024 */
3135 case 'o':
3136 {
454155d9 3137 const char* error;
f0a2b745 3138
00ce5563 3139 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3140 TRUE, /* Output warning */
3141 FALSE, /* Not strict */
17896a85
KW
3142 TRUE, /* Output warnings for
3143 non-portables */
80f4111b 3144 UTF);
454155d9 3145 if (! valid) {
f0a2b745
KW
3146 yyerror(error);
3147 continue;
3148 }
3149 goto NUM_ESCAPE_INSERT;
3150 }
3151
77a135fe 3152 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3153 case 'x':
a0481293 3154 {
a0481293 3155 const char* error;
355860ce 3156
00ce5563 3157 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3158 TRUE, /* Output warning */
3159 FALSE, /* Not strict */
17896a85
KW
3160 TRUE, /* Output warnings for
3161 non-portables */
80f4111b 3162 UTF);
a0481293
KW
3163 if (! valid) {
3164 yyerror(error);
355860ce 3165 continue;
ba210ebe 3166 }
012bcf8d
GS
3167 }
3168
3169 NUM_ESCAPE_INSERT:
ff3f963a
KW
3170 /* Insert oct or hex escaped character. There will always be
3171 * enough room in sv since such escapes will be longer than any
3172 * UTF-8 sequence they can end up as, except if they force us
3173 * to recode the rest of the string into utf8 */
ba7cea30 3174
06972766 3175 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3176 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3177 if (!has_utf8 && uv > 255) {
77a135fe
KW
3178 /* Might need to recode whatever we have accumulated so
3179 * far if it contains any chars variant in utf8 or
3180 * utf-ebcdic. */
3181
3182 SvCUR_set(sv, d - SvPVX_const(sv));
3183 SvPOK_on(sv);
3184 *d = '\0';
77a135fe 3185 /* See Note on sizing above. */
7bf79863
KW
3186 sv_utf8_upgrade_flags_grow(sv,
3187 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3188 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3189 d = SvPVX(sv) + SvCUR(sv);
3190 has_utf8 = TRUE;
012bcf8d
GS
3191 }
3192
77a135fe 3193 if (has_utf8) {
c80e42f3 3194 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3195 if (PL_lex_inwhat == OP_TRANS &&
3196 PL_sublex_info.sub_op) {
3197 PL_sublex_info.sub_op->op_private |=
3198 (PL_lex_repl ? OPpTRANS_FROM_UTF
3199 : OPpTRANS_TO_UTF);
f9a63242 3200 }
e294cc5d
JH
3201#ifdef EBCDIC
3202 if (uv > 255 && !dorange)
3203 native_range = FALSE;
3204#endif
012bcf8d 3205 }
a0ed51b3 3206 else {
012bcf8d 3207 *d++ = (char)uv;
a0ed51b3 3208 }
012bcf8d
GS
3209 }
3210 else {
c4d5f83a 3211 *d++ = (char) uv;
a0ed51b3 3212 }
79072805 3213 continue;
02aa26ce 3214
4a2d328f 3215 case 'N':
ff3f963a
KW
3216 /* In a non-pattern \N must be a named character, like \N{LATIN
3217 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3218 * mean to match a non-newline. For non-patterns, named
3219 * characters are converted to their string equivalents. In
3220 * patterns, named characters are not converted to their
3221 * ultimate forms for the same reasons that other escapes
3222 * aren't. Instead, they are converted to the \N{U+...} form
3223 * to get the value from the charnames that is in effect right
3224 * now, while preserving the fact that it was a named character
3225 * so that the regex compiler knows this */
3226
ff3f963a
KW
3227 /* The structure of this section of code (besides checking for
3228 * errors and upgrading to utf8) is:
3229 * Further disambiguate between the two meanings of \N, and if
3230 * not a charname, go process it elsewhere
0a96133f
KW
3231 * If of form \N{U+...}, pass it through if a pattern;
3232 * otherwise convert to utf8
3233 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3234 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3235
3236 /* Here, s points to the 'N'; the test below is guaranteed to
3237 * succeed if we are being called on a pattern as we already
3238 * know from a test above that the next character is a '{'.
3239 * On a non-pattern \N must mean 'named sequence, which
3240 * requires braces */
3241 s++;
3242 if (*s != '{') {
3243 yyerror("Missing braces on \\N{}");
3244 continue;
3245 }
3246 s++;
3247
0a96133f 3248 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3249 if (! (e = strchr(s, '}'))) {
3250 if (! PL_lex_inpat) {
5777a3f7 3251 yyerror("Missing right brace on \\N{}");
0a96133f 3252 } else {
4407f1b8 3253 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3254 }
0a96133f 3255 continue;
ff3f963a 3256 }
cddc7ef4 3257
ff3f963a 3258 /* Here it looks like a named character */
cddc7ef4 3259
ff3f963a
KW
3260 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3261 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3262 | PERL_SCAN_DISALLOW_PREFIX;
3263 STRLEN len;
3264
3265 /* For \N{U+...}, the '...' is a unicode value even on
3266 * EBCDIC machines */
3267 s += 2; /* Skip to next char after the 'U+' */
3268 len = e - s;
3269 uv = grok_hex(s, &len, &flags, NULL);
3270 if (len == 0 || len != (STRLEN)(e - s)) {
3271 yyerror("Invalid hexadecimal number in \\N{U+...}");
3272 s = e + 1;
3273 continue;
3274 }
3275
3276 if (PL_lex_inpat) {
3277
e2a7e165
KW
3278 /* On non-EBCDIC platforms, pass through to the regex
3279 * compiler unchanged. The reason we evaluated the
3280 * number above is to make sure there wasn't a syntax
3281 * error. But on EBCDIC we convert to native so
3282 * downstream code can continue to assume it's native
3283 */
ff3f963a 3284 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3285#ifdef EBCDIC
3286 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3287 and the \0 */
3288 "\\N{U+%X}",
3289 (unsigned int) UNI_TO_NATIVE(uv));
3290#else
ff3f963a
KW
3291 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3292 d += e - s + 1;
e2a7e165 3293#endif
ff3f963a
KW
3294 }
3295 else { /* Not a pattern: convert the hex to string */
3296
3297 /* If destination is not in utf8, unconditionally
3298 * recode it to be so. This is because \N{} implies
3299 * Unicode semantics, and scalars have to be in utf8
3300 * to guarantee those semantics */
3301 if (! has_utf8) {
3302 SvCUR_set(sv, d - SvPVX_const(sv));
3303 SvPOK_on(sv);
3304 *d = '\0';
3305 /* See Note on sizing above. */
3306 sv_utf8_upgrade_flags_grow(
3307 sv,
3308 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3309 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3310 d = SvPVX(sv) + SvCUR(sv);
3311 has_utf8 = TRUE;
3312 }
3313
a46469e6 3314 /* Add the (Unicode) code point to the output. */
ff3f963a 3315 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3316 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3317 }
a46469e6
KW
3318 else {
3319 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3320 }
ff3f963a
KW
3321 }
3322 }
6f613c73
KW
3323 else /* Here is \N{NAME} but not \N{U+...}. */
3324 if ((res = get_and_check_backslash_N_name(s, e)))
3325 {
3326 STRLEN len;
3327 const char *str = SvPV_const(res, len);
3328 if (PL_lex_inpat) {
ff3f963a
KW
3329
3330 if (! len) { /* The name resolved to an empty string */
3331 Copy("\\N{}", d, 4, char);
3332 d += 4;
3333 }
3334 else {
3335 /* In order to not lose information for the regex
3336 * compiler, pass the result in the specially made
3337 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3338 * the code points in hex of each character
3339 * returned by charnames */
3340
3341 const char *str_end = str + len;
3b721c4f 3342 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3343
3344 if (! SvUTF8(res)) {
3345 /* For the non-UTF-8 case, we can determine the
3346 * exact length needed without having to parse
3347 * through the string. Each character takes up
3348 * 2 hex digits plus either a trailing dot or
3349 * the "}" */
3350 d = off + SvGROW(sv, off
3351 + 3 * len
3352 + 6 /* For the "\N{U+", and
3353 trailing NUL */
3354 + (STRLEN)(send - e));
3355 Copy("\\N{U+", d, 5, char);
3356 d += 5;
3357 while (str < str_end) {
3358 char hex_string[4];
e8549682 3359 int len =
51f14a05
JH
3360 my_snprintf(hex_string,
3361 sizeof(hex_string),
e8549682
JH
3362 "%02X.", (U8) *str);
3363 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3364 Copy(hex_string, d, 3, char);
3365 d += 3;
3366 str++;
3367 }
3368 d--; /* We will overwrite below the final
3369 dot with a right brace */
3370 }
3371 else {
1953db30
KW
3372 STRLEN char_length; /* cur char's byte length */
3373
3374 /* and the number of bytes after this is
3375 * translated into hex digits */
3376 STRLEN output_length;
3377
3378 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3379 * for max('U+', '.'); and 1 for NUL */
3380 char hex_string[2 * UTF8_MAXBYTES + 5];
3381
3382 /* Get the first character of the result. */
a46469e6 3383 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3384 len,
3385 &char_length,
3386 UTF8_ALLOW_ANYUV);
3387 /* Convert first code point to hex, including
a46469e6 3388 * the boiler plate before it. */
1953db30
KW
3389 output_length =
3390 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3391 "\\N{U+%X",
3392 (unsigned int) uv);
1953db30
KW
3393
3394 /* Make sure there is enough space to hold it */
3395 d = off + SvGROW(sv, off
3396 + output_length
3397 + (STRLEN)(send - e)
3398 + 2); /* '}' + NUL */
3399 /* And output it */
3400 Copy(hex_string, d, output_length, char);
3401 d += output_length;
3402
3403 /* For each subsequent character, append dot and
3404 * its ordinal in hex */
3405 while ((str += char_length) < str_end) {
3406 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3407 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3408 str_end - str,
3409 &char_length,
3410 UTF8_ALLOW_ANYUV);
3411 output_length =
3412 my_snprintf(hex_string,
a46469e6
KW
3413 sizeof(hex_string),
3414 ".%X",
3415 (unsigned int) uv);
1953db30
KW
3416
3417 d = off + SvGROW(sv, off
3418 + output_length
3419 + (STRLEN)(send - e)
3420 + 2); /* '}' + NUL */
3421 Copy(hex_string, d, output_length, char);
3422 d += output_length;
3423 }
94ca1619 3424 }
ff3f963a
KW
3425
3426 *d++ = '}'; /* Done. Add the trailing brace */
3427 }
3428 }
3429 else { /* Here, not in a pattern. Convert the name to a
3430 * string. */
3431
3432 /* If destination is not in utf8, unconditionally
3433 * recode it to be so. This is because \N{} implies
3434 * Unicode semantics, and scalars have to be in utf8
3435 * to guarantee those semantics */
3436 if (! has_utf8) {
3437 SvCUR_set(sv, d - SvPVX_const(sv));
3438 SvPOK_on(sv);
3439 *d = '\0';
3440 /* See Note on sizing above. */
3441 sv_utf8_upgrade_flags_grow(sv,
3442 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3443 len + (STRLEN)(send - s) + 1);
3444 d = SvPVX(sv) + SvCUR(sv);
3445 has_utf8 = TRUE;
3446 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3447
3448 /* See Note on sizing above. (NOTE: SvCUR() is not
3449 * set correctly here). */
3450 const STRLEN off = d - SvPVX_const(sv);
3451 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3452 }
7fc82458
KW
3453 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3454 sv_utf8_upgrade(res);
3455 str = SvPV_const(res, len);
3456 }
ff3f963a
KW
3457 Copy(str, d, len, char);
3458 d += len;
423cee85 3459 }
6f613c73 3460
423cee85 3461 SvREFCNT_dec(res);
cb233ae3 3462
cb233ae3 3463 } /* End \N{NAME} */
ff3f963a
KW
3464#ifdef EBCDIC
3465 if (!dorange)
3466 native_range = FALSE; /* \N{} is defined to be Unicode */
3467#endif
3468 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3469 continue;
3470
02aa26ce 3471 /* \c is a control character */
79072805
LW
3472 case 'c':
3473 s++;
961ce445 3474 if (s < send) {
421e43ba 3475 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3476 }
961ce445
RGS
3477 else {
3478 yyerror("Missing control char name in \\c");
3479 }
79072805 3480 continue;
02aa26ce
NT
3481
3482 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3483 case 'b':
4d73d076 3484 *d++ = '\b';
79072805
LW
3485 break;
3486 case 'n':
4d73d076 3487 *d++ = '\n';
79072805
LW
3488 break;
3489 case 'r':
4d73d076 3490 *d++ = '\r';
79072805
LW
3491 break;
3492 case 'f':
4d73d076 3493 *d++ = '\f';
79072805
LW
3494 break;
3495 case 't':
4d73d076 3496 *d++ = '\t';
79072805 3497 break;
34a3fe2a 3498 case 'e':
4d73d076 3499 *d++ = ASCII_TO_NATIVE('\033');
34a3fe2a
PP
3500 break;
3501 case 'a':
4d73d076 3502 *d++ = '\a';
79072805 3503 break;
02aa26ce
NT
3504 } /* end switch */
3505
79072805
LW
3506 s++;
3507 continue;
02aa26ce 3508 } /* end if (backslash) */
4c3a8340
ST
3509#ifdef EBCDIC
3510 else
3511 literal_endpoint++;
3512#endif
02aa26ce 3513
f9a63242 3514 default_action:
77a135fe
KW
3515 /* If we started with encoded form, or already know we want it,
3516 then encode the next character */
6f2d5cbc 3517 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3518 STRLEN len = 1;
77a135fe
KW
3519
3520
3521 /* One might think that it is wasted effort in the case of the
3522 * source being utf8 (this_utf8 == TRUE) to take the next character
3523 * in the source, convert it to an unsigned value, and then convert
3524 * it back again. But the source has not been validated here. The
3525 * routine that does the conversion checks for errors like
3526 * malformed utf8 */
3527
233ca360
KW
3528 const UV nextuv = (this_utf8)
3529 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3530 : (UV) ((U8) *s);
5aaebcb3 3531 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3532 if (!has_utf8) {
3533 SvCUR_set(sv, d - SvPVX_const(sv));
3534 SvPOK_on(sv);
3535 *d = '\0';
77a135fe 3536 /* See Note on sizing above. */
7bf79863
KW
3537 sv_utf8_upgrade_flags_grow(sv,
3538 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3539 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3540 d = SvPVX(sv) + SvCUR(sv);
3541 has_utf8 = TRUE;
3542 } else if (need > len) {
3543 /* encoded value larger than old, may need extra space (NOTE:
3544 * SvCUR() is not set correctly here). See Note on sizing
3545 * above. */
9d4ba2ae 3546 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3547 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3548 }
77a135fe
KW
3549 s += len;
3550
5f66b61c 3551 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3552#ifdef EBCDIC
3553 if (uv > 255 && !dorange)
3554 native_range = FALSE;
3555#endif
2b9d42f0
NIS
3556 }
3557 else {
5ff03569 3558 *d++ = *s++;
2b9d42f0 3559 }
02aa26ce
NT
3560 } /* while loop to process each character */
3561
3562 /* terminate the string and set up the sv */
79072805 3563 *d = '\0';
95a20fc0 3564 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3565 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3566 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3567 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3568
79072805 3569 SvPOK_on(sv);
9f4817db 3570 if (PL_encoding && !has_utf8) {
d0063567
DK
3571 sv_recode_to_utf8(sv, PL_encoding);
3572 if (SvUTF8(sv))
3573 has_utf8 = TRUE;
9f4817db 3574 }
2b9d42f0 3575 if (has_utf8) {
7e2040f0 3576 SvUTF8_on(sv);
2b9d42f0 3577 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3578 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3579 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3580 }
3581 }
79072805 3582
02aa26ce 3583 /* shrink the sv if we allocated more than we used */
79072805 3584 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3585 SvPV_shrink_to_cur(sv);
79072805 3586 }
02aa26ce 3587
6154021b 3588 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3589 if (s > start) {
3590 char *s2 = start;
ffdb8b16
FC
3591 for (; s2 < s; s2++) {
3592 if (*s2 == '\n')
3593 COPLINE_INC_WITH_HERELINES;
3594 }
b899e89d 3595 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3596 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3597 && ! PL_parser->lex_re_reparsing)
3598 {
eb0d8d16
NC
3599 const char *const key = PL_lex_inpat ? "qr" : "q";
3600 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3601 const char *type;
3602 STRLEN typelen;
3603
3604 if (PL_lex_inwhat == OP_TRANS) {
3605 type = "tr";
3606 typelen = 2;
3607 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3608 type = "s";
3609 typelen = 1;
9da1dd8f
DM
3610 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3611 type = "q";
3612 typelen = 1;
eb0d8d16
NC
3613 } else {
3614 type = "qq";
3615 typelen = 2;
3616 }
3617
3618 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3619 type, typelen);
3620 }
6154021b 3621 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3622 }
3623 LEAVE_with_name("scan_const");
79072805
LW
3624 return s;
3625}
3626
ffb4593c
NT
3627/* S_intuit_more
3628 * Returns TRUE if there's more to the expression (e.g., a subscript),
3629 * FALSE otherwise.
ffb4593c
NT
3630 *
3631 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3632 *
3633 * ->[ and ->{ return TRUE
ff25e5db 3634 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3635 * { and [ outside a pattern are always subscripts, so return TRUE
3636 * if we're outside a pattern and it's not { or [, then return FALSE
3637 * if we're in a pattern and the first char is a {
3638 * {4,5} (any digits around the comma) returns FALSE
3639 * if we're in a pattern and the first char is a [
3640 * [] returns FALSE
3641 * [SOMETHING] has a funky algorithm to decide whether it's a
3642 * character class or not. It has to deal with things like
3643 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3644 * anything else returns TRUE
3645 */
3646
9cbb5ea2
GS
3647/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3648
76e3520e 3649STATIC int
5aaab254 3650S_intuit_more(pTHX_ char *s)
79072805 3651{
7918f24d
NC
3652 PERL_ARGS_ASSERT_INTUIT_MORE;
3653
3280af22 3654 if (PL_lex_brackets)
79072805
LW
3655 return TRUE;
3656 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3657 return TRUE;
cc624add
FC
3658 if (*s == '-' && s[1] == '>'
3659 && FEATURE_POSTDEREF_QQ_IS_ENABLED
ff25e5db 3660 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
cc624add
FC
3661 ||(s[2] == '@' && strchr("*[{",s[3])) ))