This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Expand a comment in toke.c (eliminate ‘why?’)
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
57#define PL_lex_expect (PL_parser->lex_expect)
58#define PL_lex_formbrack (PL_parser->lex_formbrack)
59#define PL_lex_inpat (PL_parser->lex_inpat)
60#define PL_lex_inwhat (PL_parser->lex_inwhat)
61#define PL_lex_op (PL_parser->lex_op)
62#define PL_lex_repl (PL_parser->lex_repl)
63#define PL_lex_starts (PL_parser->lex_starts)
64#define PL_lex_stuff (PL_parser->lex_stuff)
65#define PL_multi_start (PL_parser->multi_start)
66#define PL_multi_open (PL_parser->multi_open)
67#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7 89
fb205e7a
DM
90# define PL_nexttoke (PL_parser->nexttoke)
91# define PL_nexttype (PL_parser->nexttype)
92# define PL_nextval (PL_parser->nextval)
199e78b7 93
a1894d81 94static const char* const ident_too_long = "Identifier too long";
8903cb82 95
9ded7720 96# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 97
a7aaec61
Z
98#define XENUMMASK 0x3f
99#define XFAKEEOF 0x40
100#define XFAKEBRACK 0x80
9059aa12 101
39e02b42
JH
102#ifdef USE_UTF8_SCRIPTS
103# define UTF (!IN_BYTES)
2b9d42f0 104#else
802a15e9 105# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 106#endif
a0ed51b3 107
b1fc3636
CJ
108/* The maximum number of characters preceding the unrecognized one to display */
109#define UNRECOGNIZED_PRECEDE_COUNT 10
110
61f0cdd9 111/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
112 * 1999-02-27 mjd-perl-patch@plover.com */
113#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
114
14bd96d0 115#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 116
9ff909cf
JH
117#define HEXFP_PEEK(s) \
118 (((s[0] == '.') && \
119 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
120 isALPHA_FOLD_EQ(s[0], 'p'))
121
ffb4593c
NT
122/* LEX_* are values for PL_lex_state, the state of the lexer.
123 * They are arranged oddly so that the guard on the switch statement
79072805 124 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
125 *
126 * These values refer to the various states within a sublex parse,
127 * i.e. within a double quotish string
79072805
LW
128 */
129
fb73857a
PP
130/* #define LEX_NOTPARSING 11 is done in perl.h. */
131
b6007c36
DM
132#define LEX_NORMAL 10 /* normal code (ie not within "...") */
133#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
134#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
135#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
136#define LEX_INTERPSTART 6 /* expecting the start of a $var */
137
138 /* at end of code, eg "$x" followed by: */
139#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
140#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
141
142#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
143 string or after \E, $foo, etc */
144#define LEX_INTERPCONST 2 /* NOT USED */
145#define LEX_FORMLINE 1 /* expecting a format line */
146#define LEX_KNOWNEXT 0 /* next token known; just return it */
147
79072805 148
bbf60fe6 149#ifdef DEBUGGING
27da23d5 150static const char* const lex_state_names[] = {
bbf60fe6
DM
151 "KNOWNEXT",
152 "FORMLINE",
153 "INTERPCONST",
154 "INTERPCONCAT",
155 "INTERPENDMAYBE",
156 "INTERPEND",
157 "INTERPSTART",
158 "INTERPPUSH",
159 "INTERPCASEMOD",
160 "INTERPNORMAL",
161 "NORMAL"
162};
163#endif
164
79072805 165#include "keywords.h"
fe14fcc3 166
ffb4593c
NT
167/* CLINE is a macro that ensures PL_copline has a sane value */
168
57843af0 169#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 170
29595ff2
NC
171# define SKIPSPACE0(s) skipspace(s)
172# define SKIPSPACE1(s) skipspace(s)
173# define SKIPSPACE2(s,tsv) skipspace(s)
174# define PEEKSPACE(s) skipspace(s)
29595ff2 175
ffb4593c
NT
176/*
177 * Convenience functions to return different tokens and prime the
9cbb5ea2 178 * lexer for the next token. They all take an argument.
ffb4593c
NT
179 *
180 * TOKEN : generic token (used for '(', DOLSHARP, etc)
181 * OPERATOR : generic operator
182 * AOPERATOR : assignment operator
183 * PREBLOCK : beginning the block after an if, while, foreach, ...
184 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
185 * PREREF : *EXPR where EXPR is not a simple identifier
186 * TERM : expression term
89f35911 187 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
188 * LOOPX : loop exiting command (goto, last, dump, etc)
189 * FTST : file test operator
190 * FUN0 : zero-argument function
7eb971ee 191 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 192 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
193 * BOop : bitwise or or xor
194 * BAop : bitwise and
195 * SHop : shift operator
196 * PWop : power operator
9cbb5ea2 197 * PMop : pattern-matching operator
ffb4593c
NT
198 * Aop : addition-level operator
199 * Mop : multiplication-level operator
200 * Eop : equality-testing operator
e5edeb50 201 * Rop : relational operator <= != gt
ffb4593c
NT
202 *
203 * Also see LOP and lop() below.
204 */
205
998054bd 206#ifdef DEBUGGING /* Serve -DT. */
704d4215 207# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 208#else
bbf60fe6 209# define REPORT(retval) (retval)
998054bd
SC
210#endif
211
bbf60fe6
DM
212#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
213#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
214#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
215#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
216#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
217#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
218#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 219#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
6154021b
RGS
220#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
221#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
222#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 223#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
224#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
225#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
226#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
227#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
228#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
229#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
230#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
231#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
232#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
233#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 234
a687059c
LW
235/* This bit of chicanery makes a unary function followed by
236 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
237 * The UNIDOR macro is for unary functions that can be followed by the //
238 * operator (such as C<shift // 0>).
a687059c 239 */
d68ce4ac 240#define UNI3(f,x,have_x) { \
6154021b 241 pl_yylval.ival = f; \
d68ce4ac 242 if (have_x) PL_expect = x; \
376fcdbf
AL
243 PL_bufptr = s; \
244 PL_last_uni = PL_oldbufptr; \
245 PL_last_lop_op = f; \
246 if (*s == '(') \
247 return REPORT( (int)FUNC1 ); \
29595ff2 248 s = PEEKSPACE(s); \
376fcdbf
AL
249 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
250 }
d68ce4ac
FC
251#define UNI(f) UNI3(f,XTERM,1)
252#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
253#define UNIPROTO(f,optional) { \
254 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
255 OPERATOR(f); \
256 }
a687059c 257
d68ce4ac 258#define UNIBRACK(f) UNI3(f,0,0)
79072805 259
9f68db38 260/* grandfather return to old style */
78cdf107
Z
261#define OLDLOP(f) \
262 do { \
263 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
264 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
265 pl_yylval.ival = (f); \
266 PL_expect = XTERM; \
267 PL_bufptr = s; \
268 return (int)LSTOP; \
269 } while(0)
79072805 270
83944c01
FC
271#define COPLINE_INC_WITH_HERELINES \
272 STMT_START { \
273 CopLINE_inc(PL_curcop); \
851b527a
FC
274 if (PL_parser->herelines) \
275 CopLINE(PL_curcop) += PL_parser->herelines, \
276 PL_parser->herelines = 0; \
83944c01 277 } STMT_END
ffdb8b16
FC
278/* Called after scan_str to update CopLINE(PL_curcop), but only when there
279 * is no sublex_push to follow. */
280#define COPLINE_SET_FROM_MULTI_END \
281 STMT_START { \
282 CopLINE_set(PL_curcop, PL_multi_end); \
283 if (PL_multi_end != PL_multi_start) \
851b527a 284 PL_parser->herelines = 0; \
ffdb8b16 285 } STMT_END
83944c01
FC
286
287
8fa7f367
JH
288#ifdef DEBUGGING
289
6154021b 290/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
291enum token_type {
292 TOKENTYPE_NONE,
293 TOKENTYPE_IVAL,
6154021b 294 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 295 TOKENTYPE_PVAL,
aeaef349 296 TOKENTYPE_OPVAL
bbf60fe6
DM
297};
298
6d4a66ac
NC
299static struct debug_tokens {
300 const int token;
301 enum token_type type;
302 const char *name;
303} const debug_tokens[] =
9041c2e3 304{
bbf60fe6
DM
305 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
306 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
307 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
308 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
309 { ARROW, TOKENTYPE_NONE, "ARROW" },
310 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
311 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
312 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
313 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
314 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 315 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
316 { DO, TOKENTYPE_NONE, "DO" },
317 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
318 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
319 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
320 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
321 { ELSE, TOKENTYPE_NONE, "ELSE" },
322 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
323 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
324 { FOR, TOKENTYPE_IVAL, "FOR" },
325 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
326 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
327 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
328 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
329 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 330 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
331 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
332 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
333 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 334 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
335 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
336 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 337 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
338 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
339 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
340 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
341 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
342 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
343 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
344 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
345 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
346 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
347 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
348 { OROP, TOKENTYPE_IVAL, "OROP" },
349 { OROR, TOKENTYPE_NONE, "OROR" },
350 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
351 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
352 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 353 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 354 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
355 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
356 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
357 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
358 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
359 { PREINC, TOKENTYPE_NONE, "PREINC" },
360 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 361 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
362 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
363 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 364 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
365 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
366 { SUB, TOKENTYPE_NONE, "SUB" },
367 { THING, TOKENTYPE_OPVAL, "THING" },
368 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
369 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
370 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
371 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
372 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
373 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 374 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
375 { WHILE, TOKENTYPE_IVAL, "WHILE" },
376 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 377 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 378 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
379};
380
6154021b 381/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 382
bbf60fe6 383STATIC int
704d4215 384S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 385{
7918f24d
NC
386 PERL_ARGS_ASSERT_TOKEREPORT;
387
bbf60fe6 388 if (DEBUG_T_TEST) {
bd61b366 389 const char *name = NULL;
bbf60fe6 390 enum token_type type = TOKENTYPE_NONE;
f54cb97a 391 const struct debug_tokens *p;
396482e1 392 SV* const report = newSVpvs("<== ");
bbf60fe6 393
f54cb97a 394 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
395 if (p->token == (int)rv) {
396 name = p->name;
397 type = p->type;
398 break;
399 }
400 }
401 if (name)
54667de8 402 Perl_sv_catpv(aTHX_ report, name);
74736ae6 403 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 404 {
bbf60fe6 405 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
406 if ((char)rv == 'p')
407 sv_catpvs(report, " (pending identifier)");
408 }
bbf60fe6 409 else if (!rv)
396482e1 410 sv_catpvs(report, "EOF");
bbf60fe6
DM
411 else
412 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
413 switch (type) {
414 case TOKENTYPE_NONE:
bbf60fe6
DM
415 break;
416 case TOKENTYPE_IVAL:
704d4215 417 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
418 break;
419 case TOKENTYPE_OPNUM:
420 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 421 PL_op_name[lvalp->ival]);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_PVAL:
704d4215 424 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
425 break;
426 case TOKENTYPE_OPVAL:
704d4215 427 if (lvalp->opval) {
401441c0 428 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
429 PL_op_name[lvalp->opval->op_type]);
430 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 431 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 432 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
433 }
434
435 }
401441c0 436 else
396482e1 437 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
438 break;
439 }
b6007c36 440 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
441 };
442 return (int)rv;
998054bd
SC
443}
444
b6007c36
DM
445
446/* print the buffer with suitable escapes */
447
448STATIC void
15f169a1 449S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 450{
396482e1 451 SV* const tmp = newSVpvs("");
7918f24d
NC
452
453 PERL_ARGS_ASSERT_PRINTBUF;
454
5d37acd6 455 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 456 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 457 GCC_DIAG_RESTORE;
b6007c36
DM
458 SvREFCNT_dec(tmp);
459}
460
8fa7f367
JH
461#endif
462
8290c323
NC
463static int
464S_deprecate_commaless_var_list(pTHX) {
465 PL_expect = XTERM;
466 deprecate("comma-less variable list");
467 return REPORT(','); /* grandfather non-comma-format format */
468}
469
ffb4593c
NT
470/*
471 * S_ao
472 *
c963b151
BD
473 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
474 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
475 */
476
76e3520e 477STATIC int
cea2e8a9 478S_ao(pTHX_ int toketype)
a0d0e21e 479{
3280af22
NIS
480 if (*PL_bufptr == '=') {
481 PL_bufptr++;
a0d0e21e 482 if (toketype == ANDAND)
6154021b 483 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 484 else if (toketype == OROR)
6154021b 485 pl_yylval.ival = OP_ORASSIGN;
c963b151 486 else if (toketype == DORDOR)
6154021b 487 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
488 toketype = ASSIGNOP;
489 }
490 return toketype;
491}
492
ffb4593c
NT
493/*
494 * S_no_op
495 * When Perl expects an operator and finds something else, no_op
496 * prints the warning. It always prints "<something> found where
497 * operator expected. It prints "Missing semicolon on previous line?"
498 * if the surprise occurs at the start of the line. "do you need to
499 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
500 * where the compiler doesn't know if foo is a method call or a function.
501 * It prints "Missing operator before end of line" if there's nothing
502 * after the missing operator, or "... before <...>" if there is something
503 * after the missing operator.
504 */
505
76e3520e 506STATIC void
15f169a1 507S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 508{
9d4ba2ae
AL
509 char * const oldbp = PL_bufptr;
510 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 511
7918f24d
NC
512 PERL_ARGS_ASSERT_NO_OP;
513
1189a94a
GS
514 if (!s)
515 s = oldbp;
07c798fb 516 else
1189a94a 517 PL_bufptr = s;
734ab321 518 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
519 if (ckWARN_d(WARN_SYNTAX)) {
520 if (is_first)
521 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
522 "\t(Missing semicolon on previous line?)\n");
523 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 524 const char *t;
8a2bca12 525 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 526 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 527 NOOP;
56da5a46
RGS
528 if (t < PL_bufptr && isSPACE(*t))
529 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
530 "\t(Do you need to predeclare %"UTF8f"?)\n",
531 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
532 }
533 else {
534 assert(s >= oldbp);
535 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
536 "\t(Missing operator before %"UTF8f"?)\n",
537 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 538 }
07c798fb 539 }
3280af22 540 PL_bufptr = oldbp;
8990e307
LW
541}
542
ffb4593c
NT
543/*
544 * S_missingterm
545 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 546 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
547 * If we're in a delimited string and the delimiter is a control
548 * character, it's reformatted into a two-char sequence like ^C.
549 * This is fatal.
550 */
551
76e3520e 552STATIC void
cea2e8a9 553S_missingterm(pTHX_ char *s)
8990e307
LW
554{
555 char tmpbuf[3];
556 char q;
557 if (s) {
9d4ba2ae 558 char * const nl = strrchr(s,'\n');
d2719217 559 if (nl)
8990e307
LW
560 *nl = '\0';
561 }
ca8b19a7 562 else if ((U8) PL_multi_close < 32) {
8990e307 563 *tmpbuf = '^';
585ec06d 564 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
565 tmpbuf[2] = '\0';
566 s = tmpbuf;
567 }
568 else {
eb160463 569 *tmpbuf = (char)PL_multi_close;
8990e307
LW
570 tmpbuf[1] = '\0';
571 s = tmpbuf;
572 }
573 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 574 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 575}
79072805 576
dd0ac2b9
FC
577#include "feature.h"
578
0d863452 579/*
0d863452
RH
580 * Check whether the named feature is enabled.
581 */
26ea9e12 582bool
3fff3427 583Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 584{
4a731d7b 585 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
586
587 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
588
589 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 590
26ea9e12
NC
591 if (namelen > MAX_FEATURE_LEN)
592 return FALSE;
3fff3427 593 memcpy(&he_name[8], name, namelen);
7d69d4a6 594
c8ca97b0
NC
595 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
596 REFCOUNTED_HE_EXISTS));
0d863452
RH
597}
598
ffb4593c 599/*
9cbb5ea2
GS
600 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
601 * utf16-to-utf8-reversed.
ffb4593c
NT
602 */
603
c39cd008
GS
604#ifdef PERL_CR_FILTER
605static void
606strip_return(SV *sv)
607{
eb578fdb
KW
608 const char *s = SvPVX_const(sv);
609 const char * const e = s + SvCUR(sv);
7918f24d
NC
610
611 PERL_ARGS_ASSERT_STRIP_RETURN;
612
c39cd008
GS
613 /* outer loop optimized to do nothing if there are no CR-LFs */
614 while (s < e) {
615 if (*s++ == '\r' && *s == '\n') {
616 /* hit a CR-LF, need to copy the rest */
eb578fdb 617 char *d = s - 1;
c39cd008
GS
618 *d++ = *s++;
619 while (s < e) {
620 if (*s == '\r' && s[1] == '\n')
621 s++;
622 *d++ = *s++;
623 }
624 SvCUR(sv) -= s - d;
625 return;
626 }
627 }
628}
a868473f 629
76e3520e 630STATIC I32
c39cd008 631S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 632{
f54cb97a 633 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
634 if (count > 0 && !maxlen)
635 strip_return(sv);
636 return count;
a868473f
NIS
637}
638#endif
639
ffb4593c 640/*
8eaa0acf
Z
641=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
642
643Creates and initialises a new lexer/parser state object, supplying
644a context in which to lex and parse from a new source of Perl code.
645A pointer to the new state object is placed in L</PL_parser>. An entry
646is made on the save stack so that upon unwinding the new state object
647will be destroyed and the former value of L</PL_parser> will be restored.
648Nothing else need be done to clean up the parsing context.
649
650The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
651non-null, provides a string (in SV form) containing code to be parsed.
652A copy of the string is made, so subsequent modification of I<line>
653does not affect parsing. I<rsfp>, if non-null, provides an input stream
654from which code will be read to be parsed. If both are non-null, the
655code in I<line> comes first and must consist of complete lines of input,
656and I<rsfp> supplies the remainder of the source.
657
e368b3bd
FC
658The I<flags> parameter is reserved for future use. Currently it is only
659used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
660
661=cut
662*/
ffb4593c 663
27fcb6ee 664/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
665 can share filters with the current parser.
666 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
667 caller, hence isn't owned by the parser, so shouldn't be closed on parser
668 destruction. This is used to handle the case of defaulting to reading the
669 script from the standard input because no filename was given on the command
670 line (without getting confused by situation where STDIN has been closed, so
671 the script handle is opened on fd 0) */
27fcb6ee 672
a0d0e21e 673void
8eaa0acf 674Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 675{
6ef55633 676 const char *s = NULL;
5486870f 677 yy_parser *parser, *oparser;
60d63348 678 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 679 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
680
681 /* create and initialise a parser */
682
199e78b7 683 Newxz(parser, 1, yy_parser);
5486870f 684 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
685 PL_parser = parser;
686
28ac2b49
Z
687 parser->stack = NULL;
688 parser->ps = NULL;
689 parser->stack_size = 0;
acdf0a21 690
e3abe207
DM
691 /* on scope exit, free this parser and restore any outer one */
692 SAVEPARSER(parser);
7c4baf47 693 parser->saved_curcop = PL_curcop;
e3abe207 694
acdf0a21 695 /* initialise lexer state */
8990e307 696
fb205e7a 697 parser->nexttoke = 0;
ca4cfd28 698 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 699 parser->copline = parser->preambling = NOLINE;
5afb0a62 700 parser->lex_state = LEX_NORMAL;
c2598295 701 parser->expect = XSTATE;
2f9285f8 702 parser->rsfp = rsfp;
27fcb6ee
FC
703 parser->rsfp_filters =
704 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
705 ? NULL
706 : MUTABLE_AV(SvREFCNT_inc(
707 oparser->rsfp_filters
708 ? oparser->rsfp_filters
709 : (oparser->rsfp_filters = newAV())
710 ));
2f9285f8 711
199e78b7
DM
712 Newx(parser->lex_brackstack, 120, char);
713 Newx(parser->lex_casestack, 12, char);
714 *parser->lex_casestack = '\0';
d794b522 715 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 716
10efb74f 717 if (line) {
0528fd32 718 STRLEN len;
10efb74f 719 s = SvPV_const(line, len);
0abcdfa4
FC
720 parser->linestr = flags & LEX_START_COPIED
721 ? SvREFCNT_inc_simple_NN(line)
722 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 723 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 724 } else {
bf1b738b 725 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 726 }
f06b5848
DM
727 parser->oldoldbufptr =
728 parser->oldbufptr =
729 parser->bufptr =
730 parser->linestart = SvPVX(parser->linestr);
731 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
732 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
733
734 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
735 |LEX_DONT_CLOSE_RSFP));
736 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
737 |LEX_DONT_CLOSE_RSFP));
737c24fc 738
60d63348 739 parser->in_pod = parser->filtered = 0;
79072805 740}
a687059c 741
e3abe207
DM
742
743/* delete a parser object */
744
745void
746Perl_parser_free(pTHX_ const yy_parser *parser)
747{
7918f24d
NC
748 PERL_ARGS_ASSERT_PARSER_FREE;
749
7c4baf47 750 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
751 SvREFCNT_dec(parser->linestr);
752
87606032 753 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 754 PerlIO_clearerr(parser->rsfp);
799361c3
SH
755 else if (parser->rsfp && (!parser->old_parser ||
756 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 757 PerlIO_close(parser->rsfp);
5486870f 758 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
759 SvREFCNT_dec(parser->lex_stuff);
760 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
761
762 Safefree(parser->lex_brackstack);
763 Safefree(parser->lex_casestack);
764 Safefree(parser->lex_shared);
765 PL_parser = parser->old_parser;
766 Safefree(parser);
767}
768
769void
770Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
771{
3ac7ff8f 772 I32 nexttoke = parser->nexttoke;
3ac7ff8f 773 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 774 while (nexttoke--) {
3ac7ff8f
FC
775 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
776 && parser->nextval[nexttoke].opval
777 && parser->nextval[nexttoke].opval->op_slabbed
778 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 779 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
780 parser->nextval[nexttoke].opval = NULL;
781 }
3ce3dcd9 782 }
e3abe207
DM
783}
784
785
ffb4593c 786/*
f0e67a1d
Z
787=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
788
789Buffer scalar containing the chunk currently under consideration of the
790text currently being lexed. This is always a plain string scalar (for
791which C<SvPOK> is true). It is not intended to be used as a scalar by
792normal scalar means; instead refer to the buffer directly by the pointer
793variables described below.
794
795The lexer maintains various C<char*> pointers to things in the
796C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
797reallocated, all of these pointers must be updated. Don't attempt to
798do this manually, but rather use L</lex_grow_linestr> if you need to
799reallocate the buffer.
800
801The content of the text chunk in the buffer is commonly exactly one
802complete line of input, up to and including a newline terminator,
803but there are situations where it is otherwise. The octets of the
804buffer may be intended to be interpreted as either UTF-8 or Latin-1.
805The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
806flag on this scalar, which may disagree with it.
807
808For direct examination of the buffer, the variable
809L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
810lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
811of these pointers is usually preferable to examination of the scalar
812through normal scalar means.
813
814=for apidoc AmxU|char *|PL_parser-E<gt>bufend
815
816Direct pointer to the end of the chunk of text currently being lexed, the
817end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 818+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
819always located at the end of the buffer, and does not count as part of
820the buffer's contents.
821
822=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
823
824Points to the current position of lexing inside the lexer buffer.
825Characters around this point may be freely examined, within
826the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
827L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
828interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
829
830Lexing code (whether in the Perl core or not) moves this pointer past
831the characters that it consumes. It is also expected to perform some
832bookkeeping whenever a newline character is consumed. This movement
833can be more conveniently performed by the function L</lex_read_to>,
834which handles newlines appropriately.
835
836Interpretation of the buffer's octets can be abstracted out by
837using the slightly higher-level functions L</lex_peek_unichar> and
838L</lex_read_unichar>.
839
840=for apidoc AmxU|char *|PL_parser-E<gt>linestart
841
842Points to the start of the current line inside the lexer buffer.
843This is useful for indicating at which column an error occurred, and
844not much else. This must be updated by any lexing code that consumes
845a newline; the function L</lex_read_to> handles this detail.
846
847=cut
848*/
849
850/*
851=for apidoc Amx|bool|lex_bufutf8
852
853Indicates whether the octets in the lexer buffer
854(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
855of Unicode characters. If not, they should be interpreted as Latin-1
856characters. This is analogous to the C<SvUTF8> flag for scalars.
857
858In UTF-8 mode, it is not guaranteed that the lexer buffer actually
859contains valid UTF-8. Lexing code must be robust in the face of invalid
860encoding.
861
862The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
863is significant, but not the whole story regarding the input character
864encoding. Normally, when a file is being read, the scalar contains octets
865and its C<SvUTF8> flag is off, but the octets should be interpreted as
866UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
867however, the scalar may have the C<SvUTF8> flag on, and in this case its
868octets should be interpreted as UTF-8 unless the C<use bytes> pragma
869is in effect. This logic may change in the future; use this function
870instead of implementing the logic yourself.
871
872=cut
873*/
874
875bool
876Perl_lex_bufutf8(pTHX)
877{
878 return UTF;
879}
880
881/*
882=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
883
884Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 885at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
886pointer to the reallocated buffer. This is necessary before making
887any direct modification of the buffer that would increase its length.
888L</lex_stuff_pvn> provides a more convenient way to insert text into
889the buffer.
890
891Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
892this function updates all of the lexer's variables that point directly
893into the buffer.
894
895=cut
896*/
897
898char *
899Perl_lex_grow_linestr(pTHX_ STRLEN len)
900{
901 SV *linestr;
902 char *buf;
903 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 904 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
905 linestr = PL_parser->linestr;
906 buf = SvPVX(linestr);
907 if (len <= SvLEN(linestr))
908 return buf;
909 bufend_pos = PL_parser->bufend - buf;
910 bufptr_pos = PL_parser->bufptr - buf;
911 oldbufptr_pos = PL_parser->oldbufptr - buf;
912 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
913 linestart_pos = PL_parser->linestart - buf;
914 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
915 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
916 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
917 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 918
f0e67a1d 919 buf = sv_grow(linestr, len);
c7641931 920
f0e67a1d
Z
921 PL_parser->bufend = buf + bufend_pos;
922 PL_parser->bufptr = buf + bufptr_pos;
923 PL_parser->oldbufptr = buf + oldbufptr_pos;
924 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
925 PL_parser->linestart = buf + linestart_pos;
926 if (PL_parser->last_uni)
927 PL_parser->last_uni = buf + last_uni_pos;
928 if (PL_parser->last_lop)
929 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
930 if (PL_parser->lex_shared->re_eval_start)
931 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
932 return buf;
933}
934
935/*
83aa740e 936=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
937
938Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
939immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
940reallocating the buffer if necessary. This means that lexing code that
941runs later will see the characters as if they had appeared in the input.
942It is not recommended to do this as part of normal parsing, and most
943uses of this facility run the risk of the inserted characters being
944interpreted in an unintended manner.
945
946The string to be inserted is represented by I<len> octets starting
947at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
948according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
949The characters are recoded for the lexer buffer, according to how the
950buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 951to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
952function is more convenient.
953
954=cut
955*/
956
957void
83aa740e 958Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 959{
749123ff 960 dVAR;
f0e67a1d
Z
961 char *bufptr;
962 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
963 if (flags & ~(LEX_STUFF_UTF8))
964 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
965 if (UTF) {
966 if (flags & LEX_STUFF_UTF8) {
967 goto plain_copy;
968 } else {
54d004e8 969 STRLEN highhalf = 0; /* Count of variants */
83aa740e 970 const char *p, *e = pv+len;
54d004e8
KW
971 for (p = pv; p != e; p++) {
972 if (! UTF8_IS_INVARIANT(*p)) {
973 highhalf++;
974 }
975 }
f0e67a1d
Z
976 if (!highhalf)
977 goto plain_copy;
978 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
979 bufptr = PL_parser->bufptr;
980 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
981 SvCUR_set(PL_parser->linestr,
982 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
983 PL_parser->bufend += len+highhalf;
984 for (p = pv; p != e; p++) {
985 U8 c = (U8)*p;
54d004e8
KW
986 if (! UTF8_IS_INVARIANT(c)) {
987 *bufptr++ = UTF8_TWO_BYTE_HI(c);
988 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
989 } else {
990 *bufptr++ = (char)c;
991 }
992 }
993 }
994 } else {
995 if (flags & LEX_STUFF_UTF8) {
996 STRLEN highhalf = 0;
83aa740e 997 const char *p, *e = pv+len;
f0e67a1d
Z
998 for (p = pv; p != e; p++) {
999 U8 c = (U8)*p;
54d004e8 1000 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1001 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1002 "non-Latin-1 character into Latin-1 input");
54d004e8 1003 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1004 p++;
1005 highhalf++;
54d004e8 1006 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1007 /* malformed UTF-8 */
1008 ENTER;
1009 SAVESPTR(PL_warnhook);
1010 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1011 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1012 LEAVE;
1013 }
1014 }
1015 if (!highhalf)
1016 goto plain_copy;
1017 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1018 bufptr = PL_parser->bufptr;
1019 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1020 SvCUR_set(PL_parser->linestr,
1021 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1022 PL_parser->bufend += len-highhalf;
54d004e8
KW
1023 p = pv;
1024 while (p < e) {
1025 if (UTF8_IS_INVARIANT(*p)) {
1026 *bufptr++ = *p;
1027 p++;
f0e67a1d 1028 }
54d004e8
KW
1029 else {
1030 assert(p < e -1 );
94bb8c36 1031 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1032 p += 2;
1033 }
f0e67a1d
Z
1034 }
1035 } else {
54d004e8 1036 plain_copy:
f0e67a1d
Z
1037 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1038 bufptr = PL_parser->bufptr;
1039 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1040 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1041 PL_parser->bufend += len;
1042 Copy(pv, bufptr, len, char);
1043 }
1044 }
1045}
1046
1047/*
9dcc53ea
Z
1048=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1049
1050Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1051immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1052reallocating the buffer if necessary. This means that lexing code that
1053runs later will see the characters as if they had appeared in the input.
1054It is not recommended to do this as part of normal parsing, and most
1055uses of this facility run the risk of the inserted characters being
1056interpreted in an unintended manner.
1057
1058The string to be inserted is represented by octets starting at I<pv>
1059and continuing to the first nul. These octets are interpreted as either
1060UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1061in I<flags>. The characters are recoded for the lexer buffer, according
1062to how the buffer is currently being interpreted (L</lex_bufutf8>).
1063If it is not convenient to nul-terminate a string to be inserted, the
1064L</lex_stuff_pvn> function is more appropriate.
1065
1066=cut
1067*/
1068
1069void
1070Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1071{
1072 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1073 lex_stuff_pvn(pv, strlen(pv), flags);
1074}
1075
1076/*
f0e67a1d
Z
1077=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1078
1079Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1080immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1081reallocating the buffer if necessary. This means that lexing code that
1082runs later will see the characters as if they had appeared in the input.
1083It is not recommended to do this as part of normal parsing, and most
1084uses of this facility run the risk of the inserted characters being
1085interpreted in an unintended manner.
1086
1087The string to be inserted is the string value of I<sv>. The characters
1088are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1089being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1090not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1091need to construct a scalar.
1092
1093=cut
1094*/
1095
1096void
1097Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1098{
1099 char *pv;
1100 STRLEN len;
1101 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1102 if (flags)
1103 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1104 pv = SvPV(sv, len);
1105 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1106}
1107
1108/*
1109=for apidoc Amx|void|lex_unstuff|char *ptr
1110
1111Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1112I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1113This hides the discarded text from any lexing code that runs later,
1114as if the text had never appeared.
1115
1116This is not the normal way to consume lexed text. For that, use
1117L</lex_read_to>.
1118
1119=cut
1120*/
1121
1122void
1123Perl_lex_unstuff(pTHX_ char *ptr)
1124{
1125 char *buf, *bufend;
1126 STRLEN unstuff_len;
1127 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1128 buf = PL_parser->bufptr;
1129 if (ptr < buf)
1130 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1131 if (ptr == buf)
1132 return;
1133 bufend = PL_parser->bufend;
1134 if (ptr > bufend)
1135 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1136 unstuff_len = ptr - buf;
1137 Move(ptr, buf, bufend+1-ptr, char);
1138 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1139 PL_parser->bufend = bufend - unstuff_len;
1140}
1141
1142/*
1143=for apidoc Amx|void|lex_read_to|char *ptr
1144
1145Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1146to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1147performing the correct bookkeeping whenever a newline character is passed.
1148This is the normal way to consume lexed text.
1149
1150Interpretation of the buffer's octets can be abstracted out by
1151using the slightly higher-level functions L</lex_peek_unichar> and
1152L</lex_read_unichar>.
1153
1154=cut
1155*/
1156
1157void
1158Perl_lex_read_to(pTHX_ char *ptr)
1159{
1160 char *s;
1161 PERL_ARGS_ASSERT_LEX_READ_TO;
1162 s = PL_parser->bufptr;
1163 if (ptr < s || ptr > PL_parser->bufend)
1164 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1165 for (; s != ptr; s++)
1166 if (*s == '\n') {
83944c01 1167 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1168 PL_parser->linestart = s+1;
1169 }
1170 PL_parser->bufptr = ptr;
1171}
1172
1173/*
1174=for apidoc Amx|void|lex_discard_to|char *ptr
1175
1176Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1177up to I<ptr>. The remaining content of the buffer will be moved, and
1178all pointers into the buffer updated appropriately. I<ptr> must not
1179be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1180it is not permitted to discard text that has yet to be lexed.
1181
1182Normally it is not necessarily to do this directly, because it suffices to
1183use the implicit discarding behaviour of L</lex_next_chunk> and things
1184based on it. However, if a token stretches across multiple lines,
1f317c95 1185and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1186that purpose, then after completion of the token it would be wise to
1187explicitly discard the now-unneeded earlier lines, to avoid future
1188multi-line tokens growing the buffer without bound.
1189
1190=cut
1191*/
1192
1193void
1194Perl_lex_discard_to(pTHX_ char *ptr)
1195{
1196 char *buf;
1197 STRLEN discard_len;
1198 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1199 buf = SvPVX(PL_parser->linestr);
1200 if (ptr < buf)
1201 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1202 if (ptr == buf)
1203 return;
1204 if (ptr > PL_parser->bufptr)
1205 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1206 discard_len = ptr - buf;
1207 if (PL_parser->oldbufptr < ptr)
1208 PL_parser->oldbufptr = ptr;
1209 if (PL_parser->oldoldbufptr < ptr)
1210 PL_parser->oldoldbufptr = ptr;
1211 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1212 PL_parser->last_uni = NULL;
1213 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1214 PL_parser->last_lop = NULL;
1215 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1216 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1217 PL_parser->bufend -= discard_len;
1218 PL_parser->bufptr -= discard_len;
1219 PL_parser->oldbufptr -= discard_len;
1220 PL_parser->oldoldbufptr -= discard_len;
1221 if (PL_parser->last_uni)
1222 PL_parser->last_uni -= discard_len;
1223 if (PL_parser->last_lop)
1224 PL_parser->last_lop -= discard_len;
1225}
1226
1227/*
1228=for apidoc Amx|bool|lex_next_chunk|U32 flags
1229
1230Reads in the next chunk of text to be lexed, appending it to
1231L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1232looked to the end of the current chunk and wants to know more. It is
1233usual, but not necessary, for lexing to have consumed the entirety of
1234the current chunk at this time.
1235
1236If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1237chunk (i.e., the current chunk has been entirely consumed), normally the
1238current chunk will be discarded at the same time that the new chunk is
1239read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1240will not be discarded. If the current chunk has not been entirely
1241consumed, then it will not be discarded regardless of the flag.
1242
1243Returns true if some new text was added to the buffer, or false if the
1244buffer has reached the end of the input text.
1245
1246=cut
1247*/
1248
1249#define LEX_FAKE_EOF 0x80000000
112d1284 1250#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1251
1252bool
1253Perl_lex_next_chunk(pTHX_ U32 flags)
1254{
1255 SV *linestr;
1256 char *buf;
1257 STRLEN old_bufend_pos, new_bufend_pos;
1258 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1259 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1260 bool got_some_for_debugger = 0;
f0e67a1d 1261 bool got_some;
112d1284 1262 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1263 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1264 linestr = PL_parser->linestr;
1265 buf = SvPVX(linestr);
1266 if (!(flags & LEX_KEEP_PREVIOUS) &&
1267 PL_parser->bufptr == PL_parser->bufend) {
1268 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1269 linestart_pos = 0;
1270 if (PL_parser->last_uni != PL_parser->bufend)
1271 PL_parser->last_uni = NULL;
1272 if (PL_parser->last_lop != PL_parser->bufend)
1273 PL_parser->last_lop = NULL;
1274 last_uni_pos = last_lop_pos = 0;
1275 *buf = 0;
1276 SvCUR(linestr) = 0;
1277 } else {
1278 old_bufend_pos = PL_parser->bufend - buf;
1279 bufptr_pos = PL_parser->bufptr - buf;
1280 oldbufptr_pos = PL_parser->oldbufptr - buf;
1281 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1282 linestart_pos = PL_parser->linestart - buf;
1283 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1284 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1285 }
1286 if (flags & LEX_FAKE_EOF) {
1287 goto eof;
60d63348 1288 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1289 got_some = 0;
1290 } else if (filter_gets(linestr, old_bufend_pos)) {
1291 got_some = 1;
17cc9359 1292 got_some_for_debugger = 1;
112d1284
FC
1293 } else if (flags & LEX_NO_TERM) {
1294 got_some = 0;
f0e67a1d 1295 } else {
580561a3
Z
1296 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1297 sv_setpvs(linestr, "");
f0e67a1d
Z
1298 eof:
1299 /* End of real input. Close filehandle (unless it was STDIN),
1300 * then add implicit termination.
1301 */
87606032 1302 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1303 PerlIO_clearerr(PL_parser->rsfp);
1304 else if (PL_parser->rsfp)
1305 (void)PerlIO_close(PL_parser->rsfp);
1306 PL_parser->rsfp = NULL;
60d63348 1307 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1308 if (!PL_in_eval && PL_minus_p) {
1309 sv_catpvs(linestr,
1310 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1311 PL_minus_n = PL_minus_p = 0;
1312 } else if (!PL_in_eval && PL_minus_n) {
1313 sv_catpvs(linestr, /*{*/";}");
1314 PL_minus_n = 0;
1315 } else
1316 sv_catpvs(linestr, ";");
1317 got_some = 1;
1318 }
1319 buf = SvPVX(linestr);
1320 new_bufend_pos = SvCUR(linestr);
1321 PL_parser->bufend = buf + new_bufend_pos;
1322 PL_parser->bufptr = buf + bufptr_pos;
1323 PL_parser->oldbufptr = buf + oldbufptr_pos;
1324 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1325 PL_parser->linestart = buf + linestart_pos;
1326 if (PL_parser->last_uni)
1327 PL_parser->last_uni = buf + last_uni_pos;
1328 if (PL_parser->last_lop)
1329 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1330 if (PL_parser->preambling != NOLINE) {
1331 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1332 PL_parser->preambling = NOLINE;
1333 }
17cc9359 1334 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1335 PL_curstash != PL_debstash) {
1336 /* debugger active and we're not compiling the debugger code,
1337 * so store the line into the debugger's array of lines
1338 */
1339 update_debugger_info(NULL, buf+old_bufend_pos,
1340 new_bufend_pos-old_bufend_pos);
1341 }
1342 return got_some;
1343}
1344
1345/*
1346=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1347
1348Looks ahead one (Unicode) character in the text currently being lexed.
1349Returns the codepoint (unsigned integer value) of the next character,
1350or -1 if lexing has reached the end of the input text. To consume the
1351peeked character, use L</lex_read_unichar>.
1352
1353If the next character is in (or extends into) the next chunk of input
1354text, the next chunk will be read in. Normally the current chunk will be
1355discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1356then the current chunk will not be discarded.
1357
1358If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1359is encountered, an exception is generated.
1360
1361=cut
1362*/
1363
1364I32
1365Perl_lex_peek_unichar(pTHX_ U32 flags)
1366{
749123ff 1367 dVAR;
f0e67a1d
Z
1368 char *s, *bufend;
1369 if (flags & ~(LEX_KEEP_PREVIOUS))
1370 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1371 s = PL_parser->bufptr;
1372 bufend = PL_parser->bufend;
1373 if (UTF) {
1374 U8 head;
1375 I32 unichar;
1376 STRLEN len, retlen;
1377 if (s == bufend) {
1378 if (!lex_next_chunk(flags))
1379 return -1;
1380 s = PL_parser->bufptr;
1381 bufend = PL_parser->bufend;
1382 }
1383 head = (U8)*s;
54d004e8 1384 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1385 return head;
54d004e8
KW
1386 if (UTF8_IS_START(head)) {
1387 len = UTF8SKIP(&head);
f0e67a1d
Z
1388 while ((STRLEN)(bufend-s) < len) {
1389 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1390 break;
1391 s = PL_parser->bufptr;
1392 bufend = PL_parser->bufend;
1393 }
1394 }
c80e42f3 1395 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1396 if (retlen == (STRLEN)-1) {
1397 /* malformed UTF-8 */
1398 ENTER;
1399 SAVESPTR(PL_warnhook);
1400 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1401 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1402 LEAVE;
1403 }
1404 return unichar;
1405 } else {
1406 if (s == bufend) {
1407 if (!lex_next_chunk(flags))
1408 return -1;
1409 s = PL_parser->bufptr;
1410 }
1411 return (U8)*s;
1412 }
1413}
1414
1415/*
1416=for apidoc Amx|I32|lex_read_unichar|U32 flags
1417
1418Reads the next (Unicode) character in the text currently being lexed.
1419Returns the codepoint (unsigned integer value) of the character read,
1420and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1421if lexing has reached the end of the input text. To non-destructively
1422examine the next character, use L</lex_peek_unichar> instead.
1423
1424If the next character is in (or extends into) the next chunk of input
1425text, the next chunk will be read in. Normally the current chunk will be
1426discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1427then the current chunk will not be discarded.
1428
1429If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1430is encountered, an exception is generated.
1431
1432=cut
1433*/
1434
1435I32
1436Perl_lex_read_unichar(pTHX_ U32 flags)
1437{
1438 I32 c;
1439 if (flags & ~(LEX_KEEP_PREVIOUS))
1440 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1441 c = lex_peek_unichar(flags);
1442 if (c != -1) {
1443 if (c == '\n')
83944c01 1444 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1445 if (UTF)
1446 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1447 else
1448 ++(PL_parser->bufptr);
f0e67a1d
Z
1449 }
1450 return c;
1451}
1452
1453/*
1454=for apidoc Amx|void|lex_read_space|U32 flags
1455
1456Reads optional spaces, in Perl style, in the text currently being
1457lexed. The spaces may include ordinary whitespace characters and
1458Perl-style comments. C<#line> directives are processed if encountered.
1459L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1460at a non-space character (or the end of the input text).
1461
1462If spaces extend into the next chunk of input text, the next chunk will
1463be read in. Normally the current chunk will be discarded at the same
1464time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1465chunk will not be discarded.
1466
1467=cut
1468*/
1469
21791330 1470#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1471#define LEX_NO_NEXT_CHUNK 0x80000000
1472
f0e67a1d
Z
1473void
1474Perl_lex_read_space(pTHX_ U32 flags)
1475{
1476 char *s, *bufend;
21791330 1477 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1478 bool need_incline = 0;
21791330 1479 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1480 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1481 s = PL_parser->bufptr;
1482 bufend = PL_parser->bufend;
1483 while (1) {
1484 char c = *s;
1485 if (c == '#') {
1486 do {
1487 c = *++s;
1488 } while (!(c == '\n' || (c == 0 && s == bufend)));
1489 } else if (c == '\n') {
1490 s++;
21791330
FC
1491 if (can_incline) {
1492 PL_parser->linestart = s;
1493 if (s == bufend)
1494 need_incline = 1;
1495 else
1496 incline(s);
1497 }
f0e67a1d
Z
1498 } else if (isSPACE(c)) {
1499 s++;
1500 } else if (c == 0 && s == bufend) {
1501 bool got_more;
65c68e17 1502 line_t l;
f0998909
Z
1503 if (flags & LEX_NO_NEXT_CHUNK)
1504 break;
f0e67a1d 1505 PL_parser->bufptr = s;
65c68e17 1506 l = CopLINE(PL_curcop);
851b527a 1507 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1508 got_more = lex_next_chunk(flags);
65c68e17 1509 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1510 s = PL_parser->bufptr;
1511 bufend = PL_parser->bufend;
1512 if (!got_more)
1513 break;
21791330 1514 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1515 incline(s);
1516 need_incline = 0;
1517 }
1518 } else {
1519 break;
1520 }
1521 }
f0e67a1d
Z
1522 PL_parser->bufptr = s;
1523}
1524
1525/*
fe788d6b
PM
1526
1527=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1528
1529This function performs syntax checking on a prototype, C<proto>.
1530If C<warn> is true, any illegal characters or mismatched brackets
1531will trigger illegalproto warnings, declaring that they were
1532detected in the prototype for C<name>.
1533
1534The return value is C<true> if this is a valid prototype, and
1535C<false> if it is not, regardless of whether C<warn> was C<true> or
1536C<false>.
1537
1538Note that C<NULL> is a valid C<proto> and will always return C<true>.
1539
1540=cut
1541
1542 */
1543
1544bool
1545Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1546{
1547 STRLEN len, origlen;
1548 char *p = proto ? SvPV(proto, len) : NULL;
1549 bool bad_proto = FALSE;
1550 bool in_brackets = FALSE;
1551 bool after_slash = FALSE;
1552 char greedy_proto = ' ';
1553 bool proto_after_greedy_proto = FALSE;
1554 bool must_be_last = FALSE;
1555 bool underscore = FALSE;
f791a21a 1556 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1557
1558 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1559
1560 if (!proto)
1561 return TRUE;
1562
1563 origlen = len;
1564 for (; len--; p++) {
1565 if (!isSPACE(*p)) {
1566 if (must_be_last)
1567 proto_after_greedy_proto = TRUE;
f791a21a
PM
1568 if (underscore) {
1569 if (!strchr(";@%", *p))
1570 bad_proto_after_underscore = TRUE;
1571 underscore = FALSE;
1572 }
fe788d6b
PM
1573 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1574 bad_proto = TRUE;
1575 }
1576 else {
fe788d6b
PM
1577 if (*p == '[')
1578 in_brackets = TRUE;
1579 else if (*p == ']')
1580 in_brackets = FALSE;
1581 else if ((*p == '@' || *p == '%') &&
1582 !after_slash &&
1583 !in_brackets ) {
1584 must_be_last = TRUE;
1585 greedy_proto = *p;
1586 }
1587 else if (*p == '_')
f791a21a 1588 underscore = TRUE;
fe788d6b
PM
1589 }
1590 if (*p == '\\')
1591 after_slash = TRUE;
1592 else
1593 after_slash = FALSE;
1594 }
1595 }
1596
1597 if (warn) {
b54d603d 1598 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1599 p -= origlen;
b54d603d
PM
1600 p = SvUTF8(proto)
1601 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1602 origlen, UNI_DISPLAY_ISPRINT)
1603 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1604
fe788d6b
PM
1605 if (proto_after_greedy_proto)
1606 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1607 "Prototype after '%c' for %"SVf" : %s",
1608 greedy_proto, SVfARG(name), p);
50278ed0
PM
1609 if (in_brackets)
1610 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1611 "Missing ']' in prototype for %"SVf" : %s",
1612 SVfARG(name), p);
b54d603d 1613 if (bad_proto)
fe788d6b 1614 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1615 "Illegal character in prototype for %"SVf" : %s",
1616 SVfARG(name), p);
1617 if (bad_proto_after_underscore)
1618 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1619 "Illegal character after '_' in prototype for %"SVf" : %s",
1620 SVfARG(name), p);
fe788d6b
PM
1621 }
1622
1623 return (! (proto_after_greedy_proto || bad_proto) );
1624}
1625
1626/*
ffb4593c
NT
1627 * S_incline
1628 * This subroutine has nothing to do with tilting, whether at windmills
1629 * or pinball tables. Its name is short for "increment line". It
57843af0 1630 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1631 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1632 * # line 500 "foo.pm"
1633 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1634 */
1635
76e3520e 1636STATIC void
d9095cec 1637S_incline(pTHX_ const char *s)
463ee0b2 1638{
d9095cec
NC
1639 const char *t;
1640 const char *n;
1641 const char *e;
8818d409 1642 line_t line_num;
463ee0b2 1643
7918f24d
NC
1644 PERL_ARGS_ASSERT_INCLINE;
1645
83944c01 1646 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1647 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1648 && s+1 == PL_bufend && *s == ';') {
1649 /* fake newline in string eval */
1650 CopLINE_dec(PL_curcop);
1651 return;
1652 }
463ee0b2
LW
1653 if (*s++ != '#')
1654 return;
d4c19fe8
AL
1655 while (SPACE_OR_TAB(*s))
1656 s++;
73659bf1
GS
1657 if (strnEQ(s, "line", 4))
1658 s += 4;
1659 else
1660 return;
084592ab 1661 if (SPACE_OR_TAB(*s))
73659bf1 1662 s++;
4e553d73 1663 else
73659bf1 1664 return;
d4c19fe8
AL
1665 while (SPACE_OR_TAB(*s))
1666 s++;
463ee0b2
LW
1667 if (!isDIGIT(*s))
1668 return;
d4c19fe8 1669
463ee0b2
LW
1670 n = s;
1671 while (isDIGIT(*s))
1672 s++;
07714eb4 1673 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1674 return;
bf4acbe4 1675 while (SPACE_OR_TAB(*s))
463ee0b2 1676 s++;
73659bf1 1677 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1678 s++;
73659bf1
GS
1679 e = t + 1;
1680 }
463ee0b2 1681 else {
c35e046a
AL
1682 t = s;
1683 while (!isSPACE(*t))
1684 t++;
73659bf1 1685 e = t;
463ee0b2 1686 }
bf4acbe4 1687 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1688 e++;
1689 if (*e != '\n' && *e != '\0')
1690 return; /* false alarm */
1691
96e440d2 1692 line_num = grok_atou(n, &e) - 1;
8818d409 1693
f4dd75d9 1694 if (t - s > 0) {
d9095cec 1695 const STRLEN len = t - s;
3df32bda 1696
d36ee5be 1697 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1698 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1699 * to *{"::_<newfilename"} */
44867030
NC
1700 /* However, the long form of evals is only turned on by the
1701 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1702 GV * const cfgv = CopFILEGV(PL_curcop);
1703 if (cfgv) {
38bd7ad8
FC
1704 char smallbuf[128];
1705 STRLEN tmplen2 = len;
44867030 1706 char *tmpbuf2;
449dd039 1707 GV *gv2;
44867030
NC
1708
1709 if (tmplen2 + 2 <= sizeof smallbuf)
1710 tmpbuf2 = smallbuf;
1711 else
1712 Newx(tmpbuf2, tmplen2 + 2, char);
1713
38bd7ad8
FC
1714 tmpbuf2[0] = '_';
1715 tmpbuf2[1] = '<';
44867030
NC
1716
1717 memcpy(tmpbuf2 + 2, s, tmplen2);
1718 tmplen2 += 2;
1719
8a5ee598 1720 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1721 if (!isGV(gv2)) {
8a5ee598 1722 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1723 /* adjust ${"::_<newfilename"} to store the new file name */
1724 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1725 /* The line number may differ. If that is the case,
1726 alias the saved lines that are in the array.
1727 Otherwise alias the whole array. */
1728 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1729 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1730 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1731 }
38bd7ad8
FC
1732 else if (GvAV(cfgv)) {
1733 AV * const av = GvAV(cfgv);
8818d409
FC
1734 const I32 start = CopLINE(PL_curcop)+1;
1735 I32 items = AvFILLp(av) - start;
1736 if (items > 0) {
1737 AV * const av2 = GvAVn(gv2);
1738 SV **svp = AvARRAY(av) + start;
1739 I32 l = (I32)line_num+1;
1740 while (items--)
1741 av_store(av2, l++, SvREFCNT_inc(*svp++));
1742 }
1743 }
e5527e4b 1744 }
44867030
NC
1745
1746 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1747 }
e66cf94c 1748 }
05ec9bb3 1749 CopFILE_free(PL_curcop);
449dd039 1750 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1751 }
8818d409 1752 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1753}
1754
21791330
FC
1755#define skipspace(s) skipspace_flags(s, 0)
1756
29595ff2 1757
80a702cd 1758STATIC void
15f169a1 1759S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1760{
1761 AV *av = CopFILEAVx(PL_curcop);
1762 if (av) {
7f1c3e8c
FC
1763 SV * sv;
1764 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1765 else {
1766 sv = *av_fetch(av, 0, 1);
1767 SvUPGRADE(sv, SVt_PVMG);
1768 }
1769 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1770 if (orig_sv)
7f1c3e8c 1771 sv_catsv(sv, orig_sv);
5fa550fb 1772 else
7f1c3e8c
FC
1773 sv_catpvn(sv, buf, len);
1774 if (!SvIOK(sv)) {
1775 (void)SvIOK_on(sv);
1776 SvIV_set(sv, 0);
1777 }
1778 if (PL_parser->preambling == NOLINE)
1779 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1780 }
1781}
1782
ffb4593c
NT
1783/*
1784 * S_skipspace
1785 * Called to gobble the appropriate amount and type of whitespace.
1786 * Skips comments as well.
1787 */
1788
76e3520e 1789STATIC char *
21791330 1790S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1791{
21791330 1792 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1793 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1794 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1795 s++;
f0e67a1d
Z
1796 } else {
1797 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1798 PL_bufptr = s;
21791330 1799 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1800 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1801 LEX_NO_NEXT_CHUNK : 0));
3280af22 1802 s = PL_bufptr;
f0e67a1d
Z
1803 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1804 if (PL_linestart > PL_bufptr)
1805 PL_bufptr = PL_linestart;
1806 return s;
463ee0b2 1807 }
5db06880 1808 return s;
a687059c 1809}
378cc40b 1810
ffb4593c
NT
1811/*
1812 * S_check_uni
1813 * Check the unary operators to ensure there's no ambiguity in how they're
1814 * used. An ambiguous piece of code would be:
1815 * rand + 5
1816 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1817 * the +5 is its argument.
1818 */
1819
76e3520e 1820STATIC void
cea2e8a9 1821S_check_uni(pTHX)
ba106d47 1822{
d4c19fe8
AL
1823 const char *s;
1824 const char *t;
2f3197b3 1825
3280af22 1826 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1827 return;
3280af22
NIS
1828 while (isSPACE(*PL_last_uni))
1829 PL_last_uni++;
c35e046a 1830 s = PL_last_uni;
8a2bca12 1831 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1832 s++;
3280af22 1833 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1834 return;
6136c704 1835
9b387841
NC
1836 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1837 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1838 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1839}
1840
ffb4593c
NT
1841/*
1842 * LOP : macro to build a list operator. Its behaviour has been replaced
1843 * with a subroutine, S_lop() for which LOP is just another name.
1844 */
1845
a0d0e21e
LW
1846#define LOP(f,x) return lop(f,x,s)
1847
ffb4593c
NT
1848/*
1849 * S_lop
1850 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1851 * - if we have a next token, then it's a list operator (no parens) for
1852 * which the next token has already been parsed; e.g.,
1853 * sort foo @args
1854 * sort foo (@args)
ffb4593c
NT
1855 * - if the next thing is an opening paren, then it's a function
1856 * - else it's a list operator
1857 */
1858
76e3520e 1859STATIC I32
a0be28da 1860S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1861{
7918f24d
NC
1862 PERL_ARGS_ASSERT_LOP;
1863
6154021b 1864 pl_yylval.ival = f;
35c8bce7 1865 CLINE;
3280af22
NIS
1866 PL_expect = x;
1867 PL_bufptr = s;
1868 PL_last_lop = PL_oldbufptr;
eb160463 1869 PL_last_lop_op = (OPCODE)f;
3280af22 1870 if (PL_nexttoke)
78cdf107 1871 goto lstop;
79072805 1872 if (*s == '(')
bbf60fe6 1873 return REPORT(FUNC);
29595ff2 1874 s = PEEKSPACE(s);
79072805 1875 if (*s == '(')
bbf60fe6 1876 return REPORT(FUNC);
78cdf107
Z
1877 else {
1878 lstop:
1879 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1880 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1881 return REPORT(LSTOP);
78cdf107 1882 }
79072805
LW
1883}
1884
ffb4593c
NT
1885/*
1886 * S_force_next
9cbb5ea2 1887 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1888 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1889 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1890 * will need to set PL_nextval[] and possibly PL_expect to ensure
1891 * the lexer handles the token correctly.
ffb4593c
NT
1892 */
1893
4e553d73 1894STATIC void
cea2e8a9 1895S_force_next(pTHX_ I32 type)
79072805 1896{
704d4215
GG
1897#ifdef DEBUGGING
1898 if (DEBUG_T_TEST) {
1899 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1900 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1901 }
1902#endif
3280af22
NIS
1903 PL_nexttype[PL_nexttoke] = type;
1904 PL_nexttoke++;
1905 if (PL_lex_state != LEX_KNOWNEXT) {
1906 PL_lex_defer = PL_lex_state;
1907 PL_lex_expect = PL_expect;
1908 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1909 }
1910}
1911
89f35911
FC
1912/*
1913 * S_postderef
1914 *
1915 * This subroutine handles postfix deref syntax after the arrow has already
1916 * been emitted. @* $* etc. are emitted as two separate token right here.
1917 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1918 * only the first, leaving yylex to find the next.
89f35911
FC
1919 */
1920
1921static int
ff25e5db 1922S_postderef(pTHX_ int const funny, char const next)
89f35911 1923{
ff25e5db 1924 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1925 assert(strchr("*[{", next));
1926 if (next == '*') {
1927 PL_expect = XOPERATOR;
cc624add 1928 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1929 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1930 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1931 force_next(POSTJOIN);
1932 }
89f35911
FC
1933 force_next(next);
1934 PL_bufptr+=2;
1935 }
1936 else {
760ca746
FC
1937 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1938 && !PL_lex_brackets)
1939 PL_lex_dojoin = 2;
89f35911
FC
1940 PL_expect = XOPERATOR;
1941 PL_bufptr++;
1942 }
1943 return funny;
1944}
1945
28ac2b49
Z
1946void
1947Perl_yyunlex(pTHX)
1948{
a7aaec61
Z
1949 int yyc = PL_parser->yychar;
1950 if (yyc != YYEMPTY) {
1951 if (yyc) {
a7aaec61
Z
1952 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1953 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1954 PL_lex_allbrackets--;
a7aaec61 1955 PL_lex_brackets--;
78cdf107
Z
1956 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1957 } else if (yyc == '('/*)*/) {
1958 PL_lex_allbrackets--;
1959 yyc |= (2<<24);
a7aaec61
Z
1960 }
1961 force_next(yyc);
1962 }
28ac2b49
Z
1963 PL_parser->yychar = YYEMPTY;
1964 }
1965}
1966
d0a148a6 1967STATIC SV *
15f169a1 1968S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1969{
740cce10 1970 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1971 !IN_BYTES
1972 && UTF
1973 && !is_ascii_string((const U8*)start, len)
740cce10 1974 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1975 return sv;
1976}
1977
ffb4593c
NT
1978/*
1979 * S_force_word
1980 * When the lexer knows the next thing is a word (for instance, it has
1981 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1982 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1983 * lookahead.
ffb4593c
NT
1984 *
1985 * Arguments:
b1b65b59 1986 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1987 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1988 * int check_keyword : if true, Perl checks to make sure the word isn't
1989 * a keyword (do this if the word is a label, e.g. goto FOO)
1990 * int allow_pack : if true, : characters will also be allowed (require,
1991 * use, etc. do this)
9cbb5ea2 1992 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1993 */
1994
76e3520e 1995STATIC char *
345b3785 1996S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 1997{
eb578fdb 1998 char *s;
463ee0b2 1999 STRLEN len;
4e553d73 2000
7918f24d
NC
2001 PERL_ARGS_ASSERT_FORCE_WORD;
2002
29595ff2 2003 start = SKIPSPACE1(start);
463ee0b2 2004 s = start;
7e2040f0 2005 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2006 (allow_pack && *s == ':') )
a0d0e21e 2007 {
3280af22 2008 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2009 if (check_keyword) {
2010 char *s2 = PL_tokenbuf;
2011 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2012 s2 += 6, len -= 6;
2013 if (keyword(s2, len, 0))
463ee0b2 2014 return start;
01b5ef50 2015 }
463ee0b2 2016 if (token == METHOD) {
29595ff2 2017 s = SKIPSPACE1(s);
463ee0b2 2018 if (*s == '(')
3280af22 2019 PL_expect = XTERM;
463ee0b2 2020 else {
3280af22 2021 PL_expect = XOPERATOR;
463ee0b2 2022 }
79072805 2023 }
9ded7720 2024 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2025 = (OP*)newSVOP(OP_CONST,0,
2026 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2027 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2028 force_next(token);
2029 }
2030 return s;
2031}
2032
ffb4593c
NT
2033/*
2034 * S_force_ident
9cbb5ea2 2035 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2036 * text only contains the "foo" portion. The first argument is a pointer
2037 * to the "foo", and the second argument is the type symbol to prefix.
2038 * Forces the next token to be a "WORD".
9cbb5ea2 2039 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2040 */
2041
76e3520e 2042STATIC void
5aaab254 2043S_force_ident(pTHX_ const char *s, int kind)
79072805 2044{
7918f24d
NC
2045 PERL_ARGS_ASSERT_FORCE_IDENT;
2046
c9b48522
DD
2047 if (s[0]) {
2048 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2049 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2050 UTF ? SVf_UTF8 : 0));
9ded7720 2051 NEXTVAL_NEXTTOKE.opval = o;
79072805 2052 force_next(WORD);
748a9306 2053 if (kind) {
11343788 2054 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2055 /* XXX see note in pp_entereval() for why we forgo typo
2056 warnings if the symbol must be introduced in an eval.
2057 GSAR 96-10-12 */
90e5519e 2058 gv_fetchpvn_flags(s, len,
728847b1
BF
2059 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2060 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2061 kind == '$' ? SVt_PV :
2062 kind == '@' ? SVt_PVAV :
2063 kind == '%' ? SVt_PVHV :
a0d0e21e 2064 SVt_PVGV
90e5519e 2065 );
748a9306 2066 }
79072805
LW
2067 }
2068}
2069
3f33d153
FC
2070static void
2071S_force_ident_maybe_lex(pTHX_ char pit)
2072{
3f33d153
FC
2073 NEXTVAL_NEXTTOKE.ival = pit;
2074 force_next('p');
2075}
2076
1571675a
GS
2077NV
2078Perl_str_to_version(pTHX_ SV *sv)
2079{
2080 NV retval = 0.0;
2081 NV nshift = 1.0;
2082 STRLEN len;
cfd0369c 2083 const char *start = SvPV_const(sv,len);
9d4ba2ae 2084 const char * const end = start + len;
504618e9 2085 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2086
2087 PERL_ARGS_ASSERT_STR_TO_VERSION;
2088
1571675a 2089 while (start < end) {
ba210ebe 2090 STRLEN skip;
1571675a
GS
2091 UV n;
2092 if (utf)
9041c2e3 2093 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2094 else {
2095 n = *(U8*)start;
2096 skip = 1;
2097 }
2098 retval += ((NV)n)/nshift;
2099 start += skip;
2100 nshift *= 1000;
2101 }
2102 return retval;
2103}
2104
4e553d73 2105/*
ffb4593c
NT
2106 * S_force_version
2107 * Forces the next token to be a version number.
e759cc13
RGS
2108 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2109 * and if "guessing" is TRUE, then no new token is created (and the caller
2110 * must use an alternative parsing method).
ffb4593c
NT
2111 */
2112
76e3520e 2113STATIC char *
e759cc13 2114S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2115{
5f66b61c 2116 OP *version = NULL;
44dcb63b 2117 char *d;
89bfa8cd 2118
7918f24d
NC
2119 PERL_ARGS_ASSERT_FORCE_VERSION;
2120
29595ff2 2121 s = SKIPSPACE1(s);
89bfa8cd 2122
44dcb63b 2123 d = s;
dd629d5b 2124 if (*d == 'v')
44dcb63b 2125 d++;
44dcb63b 2126 if (isDIGIT(*d)) {
e759cc13
RGS
2127 while (isDIGIT(*d) || *d == '_' || *d == '.')
2128 d++;
4e4da3ac 2129 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2130 SV *ver;
6154021b
RGS
2131 s = scan_num(s, &pl_yylval);
2132 version = pl_yylval.opval;
dd629d5b
GS
2133 ver = cSVOPx(version)->op_sv;
2134 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2135 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2136 SvNV_set(ver, str_to_version(ver));
1571675a 2137 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2138 }
89bfa8cd 2139 }
5db06880 2140 else if (guessing) {
e759cc13 2141 return s;
5db06880 2142 }
89bfa8cd
PP
2143 }
2144
2145 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2146 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2147 force_next(WORD);
89bfa8cd 2148
e759cc13 2149 return s;
89bfa8cd
PP
2150}
2151
ffb4593c 2152/*
91152fc1
DG
2153 * S_force_strict_version
2154 * Forces the next token to be a version number using strict syntax rules.
2155 */
2156
2157STATIC char *
2158S_force_strict_version(pTHX_ char *s)
2159{
91152fc1 2160 OP *version = NULL;
91152fc1
DG
2161 const char *errstr = NULL;
2162
2163 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2164
2165 while (isSPACE(*s)) /* leading whitespace */
2166 s++;
2167
2168 if (is_STRICT_VERSION(s,&errstr)) {
2169 SV *ver = newSV(0);
2170 s = (char *)scan_version(s, ver, 0);
2171 version = newSVOP(OP_CONST, 0, ver);
2172 }
4e4da3ac
Z
2173 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2174 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2175 {
91152fc1
DG
2176 PL_bufptr = s;
2177 if (errstr)
2178 yyerror(errstr); /* version required */
2179 return s;
2180 }
2181
91152fc1 2182 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2183 NEXTVAL_NEXTTOKE.opval = version;
2184 force_next(WORD);
2185
2186 return s;
2187}
2188
2189/*
ffb4593c
NT
2190 * S_tokeq
2191 * Tokenize a quoted string passed in as an SV. It finds the next
2192 * chunk, up to end of string or a backslash. It may make a new
2193 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2194 * turns \\ into \.
2195 */
2196
76e3520e 2197STATIC SV *
cea2e8a9 2198S_tokeq(pTHX_ SV *sv)
79072805 2199{
eb578fdb
KW
2200 char *s;
2201 char *send;
2202 char *d;
b3ac6de7 2203 SV *pv = sv;
79072805 2204
7918f24d
NC
2205 PERL_ARGS_ASSERT_TOKEQ;
2206
279b35ad
FC
2207 assert (SvPOK(sv));
2208 assert (SvLEN(sv));
2209 assert (!SvIsCOW(sv));
307ed071 2210 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2211 goto finish;
279b35ad
FC
2212 s = SvPVX(sv);
2213 send = SvEND(sv);
dcb21ed6
NC
2214 /* This is relying on the SV being "well formed" with a trailing '\0' */
2215 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2216 s++;
2217 if (s == send)
b3ac6de7 2218 goto finish;
79072805 2219 d = s;
be4731d2 2220 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2221 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2222 SVs_TEMP | SvUTF8(sv));
be4731d2 2223 }
79072805
LW
2224 while (s < send) {
2225 if (*s == '\\') {
a0d0e21e 2226 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2227 s++; /* all that, just for this */
2228 }
2229 *d++ = *s++;
2230 }
2231 *d = '\0';
95a20fc0 2232 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2233 finish:
3280af22 2234 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2235 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2236 return sv;
2237}
2238
ffb4593c
NT
2239/*
2240 * Now come three functions related to double-quote context,
2241 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2242 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2243 * interact with PL_lex_state, and create fake ( ... ) argument lists
2244 * to handle functions and concatenation.
ecd24171
DM
2245 * For example,
2246 * "foo\lbar"
2247 * is tokenised as
2248 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2249 */
2250
2251/*
2252 * S_sublex_start
6154021b 2253 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2254 *
2255 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2256 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2257 *
2258 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2259 *
2260 * Everything else becomes a FUNC.
2261 *
2262 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2263 * had an OP_CONST or OP_READLINE). This just sets us up for a
2264 * call to S_sublex_push().
2265 */
2266
76e3520e 2267STATIC I32
cea2e8a9 2268S_sublex_start(pTHX)
79072805 2269{
eb578fdb 2270 const I32 op_type = pl_yylval.ival;
79072805
LW
2271
2272 if (op_type == OP_NULL) {
6154021b 2273 pl_yylval.opval = PL_lex_op;
5f66b61c 2274 PL_lex_op = NULL;
79072805
LW
2275 return THING;
2276 }
466112bb 2277 if (op_type == OP_CONST) {
3280af22 2278 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2279
2280 if (SvTYPE(sv) == SVt_PVIV) {
2281 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2282 STRLEN len;
96a5add6 2283 const char * const p = SvPV_const(sv, len);
740cce10 2284 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2285 SvREFCNT_dec(sv);
2286 sv = nsv;
4e553d73 2287 }
6154021b 2288 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2289 PL_lex_stuff = NULL;
79072805
LW
2290 return THING;
2291 }
2292
3280af22 2293 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2294 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2295 PL_sublex_info.sub_op = PL_lex_op;
2296 PL_lex_state = LEX_INTERPPUSH;
55497cff 2297
3280af22
NIS
2298 PL_expect = XTERM;
2299 if (PL_lex_op) {
6154021b 2300 pl_yylval.opval = PL_lex_op;
5f66b61c 2301 PL_lex_op = NULL;
55497cff
PP
2302 return PMFUNC;
2303 }
2304 else
2305 return FUNC;
2306}
2307
ffb4593c
NT
2308/*
2309 * S_sublex_push
2310 * Create a new scope to save the lexing state. The scope will be
2311 * ended in S_sublex_done. Returns a '(', starting the function arguments
2312 * to the uc, lc, etc. found before.
2313 * Sets PL_lex_state to LEX_INTERPCONCAT.
2314 */
2315
76e3520e 2316STATIC I32
cea2e8a9 2317S_sublex_push(pTHX)
55497cff 2318{
78a635de 2319 LEXSHARED *shared;
801d32ac 2320 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2321 ENTER;
55497cff 2322
3280af22 2323 PL_lex_state = PL_sublex_info.super_state;
cc624add 2324 SAVEI8(PL_lex_dojoin);
3280af22 2325 SAVEI32(PL_lex_brackets);
78cdf107 2326 SAVEI32(PL_lex_allbrackets);
b27dce25 2327 SAVEI32(PL_lex_formbrack);
78cdf107 2328 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2329 SAVEI32(PL_lex_casemods);
2330 SAVEI32(PL_lex_starts);
651b5b28 2331 SAVEI8(PL_lex_state);
7cc34111 2332 SAVESPTR(PL_lex_repl);
7766f137 2333 SAVEVPTR(PL_lex_inpat);
98246f1e 2334 SAVEI16(PL_lex_inwhat);
ffdb8b16 2335 if (is_heredoc)
b42366d4 2336 {
ffdb8b16 2337 SAVECOPLINE(PL_curcop);
b42366d4 2338 SAVEI32(PL_multi_end);
851b527a
FC
2339 SAVEI32(PL_parser->herelines);
2340 PL_parser->herelines = 0;
b42366d4
FC
2341 }
2342 SAVEI8(PL_multi_close);
3280af22 2343 SAVEPPTR(PL_bufptr);
8452ff4b 2344 SAVEPPTR(PL_bufend);
3280af22
NIS
2345 SAVEPPTR(PL_oldbufptr);
2346 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2347 SAVEPPTR(PL_last_lop);
2348 SAVEPPTR(PL_last_uni);
3280af22
NIS
2349 SAVEPPTR(PL_linestart);
2350 SAVESPTR(PL_linestr);
8edd5f42
RGS
2351 SAVEGENERICPV(PL_lex_brackstack);
2352 SAVEGENERICPV(PL_lex_casestack);
78a635de 2353 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2354 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2355 SAVEI32(PL_copline);
3280af22 2356
99bd9d90 2357 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2358 scopes to find the body of the here-doc. So we put PL_linestr and
2359 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2360 */
60f40a38
FC
2361 PL_parser->lex_shared->ls_linestr = PL_linestr;
2362 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2363
3280af22 2364 PL_linestr = PL_lex_stuff;
7cc34111 2365 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2366 PL_lex_stuff = NULL;
7cc34111 2367 PL_sublex_info.repl = NULL;
3280af22 2368
9cbb5ea2
GS
2369 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2370 = SvPVX(PL_linestr);
3280af22 2371 PL_bufend += SvCUR(PL_linestr);
bd61b366 2372 PL_last_lop = PL_last_uni = NULL;
3280af22 2373 SAVEFREESV(PL_linestr);
4dc843bc 2374 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2375
2376 PL_lex_dojoin = FALSE;
b27dce25 2377 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2378 PL_lex_allbrackets = 0;
2379 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2380 Newx(PL_lex_brackstack, 120, char);
2381 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2382 PL_lex_casemods = 0;
2383 *PL_lex_casestack = '\0';
2384 PL_lex_starts = 0;
2385 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2386 if (is_heredoc)
6ddcf93b 2387 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2388 PL_copline = NOLINE;
78a635de
FC
2389
2390 Newxz(shared, 1, LEXSHARED);
2391 shared->ls_prev = PL_parser->lex_shared;
2392 PL_parser->lex_shared = shared;
3280af22
NIS
2393
2394 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2395 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2396 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2397 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2398 else
5f66b61c 2399 PL_lex_inpat = NULL;
79072805 2400
3a54fd60
DM
2401 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2402 PL_in_eval &= ~EVAL_RE_REPARSING;
2403
55497cff 2404 return '(';
79072805
LW
2405}
2406
ffb4593c
NT
2407/*
2408 * S_sublex_done
2409 * Restores lexer state after a S_sublex_push.
2410 */
2411
76e3520e 2412STATIC I32
cea2e8a9 2413S_sublex_done(pTHX)
79072805 2414{
3280af22 2415 if (!PL_lex_starts++) {
396482e1 2416 SV * const sv = newSVpvs("");
9aa983d2
JH
2417 if (SvUTF8(PL_linestr))
2418 SvUTF8_on(sv);
3280af22 2419 PL_expect = XOPERATOR;
6154021b 2420 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2421 return THING;
2422 }
2423
3280af22
NIS
2424 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2425 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2426 return yylex();
79072805
LW
2427 }
2428
ffb4593c 2429 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2430 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2431 if (PL_lex_repl) {
2432 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2433 PL_linestr = PL_lex_repl;
2434 PL_lex_inpat = 0;
2435 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2436 PL_bufend += SvCUR(PL_linestr);
bd61b366 2437 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2438 PL_lex_dojoin = FALSE;
2439 PL_lex_brackets = 0;
78cdf107
Z
2440 PL_lex_allbrackets = 0;
2441 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2442 PL_lex_casemods = 0;
2443 *PL_lex_casestack = '\0';
2444 PL_lex_starts = 0;
25da4f38 2445 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2446 PL_lex_state = LEX_INTERPNORMAL;
2447 PL_lex_starts++;
e9fa98b2
HS
2448 /* we don't clear PL_lex_repl here, so that we can check later
2449 whether this is an evalled subst; that means we rely on the
2450 logic to ensure sublex_done() is called again only via the
2451 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2452 }
e9fa98b2 2453 else {
3280af22 2454 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2455 PL_lex_repl = NULL;
e9fa98b2 2456 }
ffdb8b16
FC
2457 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2458 CopLINE(PL_curcop) +=
2459 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2460 + PL_parser->herelines;
2461 PL_parser->herelines = 0;
ffdb8b16 2462 }
79072805 2463 return ',';
ffed7fef
LW
2464 }
2465 else {
b42366d4 2466 const line_t l = CopLINE(PL_curcop);
f46d017c 2467 LEAVE;
b42366d4 2468 if (PL_multi_close == '<')
851b527a 2469 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2470 PL_bufend = SvPVX(PL_linestr);
2471 PL_bufend += SvCUR(PL_linestr);
2472 PL_expect = XOPERATOR;
09bef843 2473 PL_sublex_info.sub_inwhat = 0;
79072805 2474 return ')';
ffed7fef
LW
2475 }
2476}
2477
6f613c73
KW
2478PERL_STATIC_INLINE SV*
2479S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2480{
140b12ad
KW
2481 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2482 * interior, hence to the "}". Finds what the name resolves to, returning
2483 * an SV* containing it; NULL if no valid one found */
2484
dd2b1b72 2485 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2486
0c415a79
KW
2487 HV * table;
2488 SV **cvp;
2489 SV *cv;
2490 SV *rv;
2491 HV *stash;
2492 const U8* first_bad_char_loc;
2493 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2494
6f613c73
KW
2495 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2496
107160e2
KW
2497 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2498 e - backslash_ptr,
2499 &first_bad_char_loc))
2500 {
2501 /* If warnings are on, this will print a more detailed analysis of what
2502 * is wrong than the error message below */
c80e42f3 2503 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2504 e - ((char *) first_bad_char_loc),
2505 NULL, 0);
2506
2507 /* We deliberately don't try to print the malformed character, which
2508 * might not print very well; it also may be just the first of many
2509 * malformations, so don't print what comes after it */
2510 yyerror(Perl_form(aTHX_
2511 "Malformed UTF-8 character immediately after '%.*s'",
2512 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2513 return NULL;
2514 }
2515
2516 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2517 /* include the <}> */
2518 e - backslash_ptr + 1);
6f613c73 2519 if (! SvPOK(res)) {
b6407c49 2520 SvREFCNT_dec_NN(res);
6f613c73
KW
2521 return NULL;
2522 }
2523
0c415a79
KW
2524 /* See if the charnames handler is the Perl core's, and if so, we can skip
2525 * the validation needed for a user-supplied one, as Perl's does its own
2526 * validation. */
2527 table = GvHV(PL_hintgv); /* ^H */
2528 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2529 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2530 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2531 {
2532 const char * const name = HvNAME(stash);
6a642c21
FC
2533 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2534 && strEQ(name, "_charnames")) {
0c415a79
KW
2535 return res;
2536 }
2537 }
2538
bde9e88d
KW
2539 /* Here, it isn't Perl's charname handler. We can't rely on a
2540 * user-supplied handler to validate the input name. For non-ut8 input,
2541 * look to see that the first character is legal. Then loop through the
2542 * rest checking that each is a continuation */
6f613c73 2543
36897d64
KW
2544 /* This code makes the reasonable assumption that the only Latin1-range
2545 * characters that begin a character name alias are alphabetic, otherwise
2546 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2547
b6ba1137 2548 if (! UTF) {
bde9e88d 2549 if (! isALPHAU(*s)) {
b6ba1137
KW
2550 goto bad_charname;
2551 }
bde9e88d
KW
2552 s++;
2553 while (s < e) {
2554 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2555 goto bad_charname;
2556 }
2d8eb851
KW
2557 if (*s == ' ' && *(s-1) == ' ') {
2558 goto multi_spaces;
bd299e29 2559 }
df758df2
KW
2560 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2561 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2562 "NO-BREAK SPACE in a charnames "
2563 "alias definition is deprecated");
2564 }
bde9e88d 2565 s++;
b6ba1137
KW
2566 }
2567 }
2568 else {
bde9e88d
KW
2569 /* Similarly for utf8. For invariants can check directly; for other
2570 * Latin1, can calculate their code point and check; otherwise use a
2571 * swash */
2572 if (UTF8_IS_INVARIANT(*s)) {
2573 if (! isALPHAU(*s)) {
140b12ad
KW
2574 goto bad_charname;
2575 }
bde9e88d
KW
2576 s++;
2577 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2578 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2579 goto bad_charname;
6f613c73 2580 }
bde9e88d 2581 s += 2;
6f613c73 2582 }
bde9e88d
KW
2583 else {
2584 if (! PL_utf8_charname_begin) {
2585 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2586 PL_utf8_charname_begin = _core_swash_init("utf8",
2587 "_Perl_Charname_Begin",
2588 &PL_sv_undef,
2589 1, 0, NULL, &flags);
2590 }
2591 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2592 goto bad_charname;
2593 }
2594 s += UTF8SKIP(s);
2595 }
2596
2597 while (s < e) {
2598 if (UTF8_IS_INVARIANT(*s)) {
2599 if (! isCHARNAME_CONT(*s)) {
2600 goto bad_charname;
2601 }
2d8eb851
KW
2602 if (*s == ' ' && *(s-1) == ' ') {
2603 goto multi_spaces;
bd299e29 2604 }
bde9e88d
KW
2605 s++;
2606 }
2607 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2608 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2609 {
2610 goto bad_charname;
2611 }
df758df2
KW
2612 if (*s == *NBSP_UTF8
2613 && *(s+1) == *(NBSP_UTF8+1)
2614 && ckWARN_d(WARN_DEPRECATED))
2615 {
2616 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2617 "NO-BREAK SPACE in a charnames "
2618 "alias definition is deprecated");
2619 }
bde9e88d
KW
2620 s += 2;
2621 }
2622 else {
2623 if (! PL_utf8_charname_continue) {
2624 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2625 PL_utf8_charname_continue = _core_swash_init("utf8",
2626 "_Perl_Charname_Continue",
2627 &PL_sv_undef,
2628 1, 0, NULL, &flags);
2629 }
2630 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2631 goto bad_charname;
2632 }
2633 s += UTF8SKIP(s);
6f613c73
KW
2634 }
2635 }
2d8eb851
KW
2636 }
2637 if (*(s-1) == ' ') {
2638 yyerror_pv(
2639 Perl_form(aTHX_
2640 "charnames alias definitions may not contain trailing "
2641 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2642 (int)(s - backslash_ptr + 1), backslash_ptr,
2643 (int)(e - s + 1), s + 1
2644 ),
2645 UTF ? SVf_UTF8 : 0);
2646 return NULL;
6f613c73
KW
2647 }
2648
94ca1619 2649 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2650 const U8* first_bad_char_loc;
2651 STRLEN len;
2652 const char* const str = SvPV_const(res, len);
2653 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2654 /* If warnings are on, this will print a more detailed analysis of
2655 * what is wrong than the error message below */
c80e42f3 2656 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2657 (char *) first_bad_char_loc - str,
2658 NULL, 0);
2659
2660 /* We deliberately don't try to print the malformed character,
2661 * which might not print very well; it also may be just the first
2662 * of many malformations, so don't print what comes after it */
2663 yyerror_pv(
2664 Perl_form(aTHX_
2665 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2666 (int) (e - backslash_ptr + 1), backslash_ptr,
2667 (int) ((char *) first_bad_char_loc - str), str
2668 ),
2669 SVf_UTF8);
2670 return NULL;
2671 }
2672 }
140b12ad 2673
bde9e88d 2674 return res;
140b12ad 2675
bde9e88d 2676 bad_charname: {
bde9e88d
KW
2677
2678 /* The final %.*s makes sure that should the trailing NUL be missing
2679 * that this print won't run off the end of the string */
2680 yyerror_pv(
2681 Perl_form(aTHX_
2682 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2683 (int)(s - backslash_ptr + 1), backslash_ptr,
2684 (int)(e - s + 1), s + 1
bde9e88d
KW
2685 ),
2686 UTF ? SVf_UTF8 : 0);
2687 return NULL;
2688 }
2d8eb851
KW
2689
2690 multi_spaces:
2691 yyerror_pv(
2692 Perl_form(aTHX_
2693 "charnames alias definitions may not contain a sequence of "
2694 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2695 (int)(s - backslash_ptr + 1), backslash_ptr,
2696 (int)(e - s + 1), s + 1
2697 ),
2698 UTF ? SVf_UTF8 : 0);
2699 return NULL;
6f613c73
KW
2700}
2701
02aa26ce
NT
2702/*
2703 scan_const
2704
9da1dd8f
DM
2705 Extracts the next constant part of a pattern, double-quoted string,
2706 or transliteration. This is terrifying code.
2707
2708 For example, in parsing the double-quoted string "ab\x63$d", it would
2709 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2710
94def140 2711 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2712 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2713 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2714
94def140
ST
2715 Returns a pointer to the character scanned up to. If this is
2716 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2717 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2718 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2719 by looking at the next characters herself.
2720
02aa26ce 2721 In patterns:
9da1dd8f 2722 expand:
537124e4
KW
2723 \N{FOO} => \N{U+hex_for_character_FOO}
2724 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2725
2726 pass through:
2727 all other \-char, including \N and \N{ apart from \N{ABC}
2728
2729 stops on:
2730 @ and $ where it appears to be a var, but not for $ as tail anchor
2731 \l \L \u \U \Q \E
2732 (?{ or (??{
2733
02aa26ce
NT
2734
2735 In transliterations:
2736 characters are VERY literal, except for - not at the start or end
94def140
ST
2737 of the string, which indicates a range. If the range is in bytes,
2738 scan_const expands the range to the full set of intermediate
2739 characters. If the range is in utf8, the hyphen is replaced with
2740 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2741
2742 In double-quoted strings:
2743 backslashes:
2744 double-quoted style: \r and \n
ff3f963a 2745 constants: \x31, etc.
94def140 2746 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2747 case and quoting: \U \Q \E
2748 stops on @ and $
2749
2750 scan_const does *not* construct ops to handle interpolated strings.
2751 It stops processing as soon as it finds an embedded $ or @ variable
2752 and leaves it to the caller to work out what's going on.
2753
94def140
ST
2754 embedded arrays (whether in pattern or not) could be:
2755 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2756
2757 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2758
2759 $ in pattern could be $foo or could be tail anchor. Assumption:
2760 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2761 followed by one of "()| \r\n\t"
02aa26ce 2762
9da1dd8f 2763 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2764
2765 The structure of the code is
2766 while (there's a character to process) {
94def140
ST
2767 handle transliteration ranges
2768 skip regexp comments /(?#comment)/ and codes /(?{code})/
2769 skip #-initiated comments in //x patterns
2770 check for embedded arrays
02aa26ce
NT
2771 check for embedded scalars
2772 if (backslash) {
94def140 2773 deprecate \1 in substitution replacements
02aa26ce
NT
2774 handle string-changing backslashes \l \U \Q \E, etc.
2775 switch (what was escaped) {
94def140 2776 handle \- in a transliteration (becomes a literal -)
ff3f963a 2777 if a pattern and not \N{, go treat as regular character
94def140
ST
2778 handle \132 (octal characters)
2779 handle \x15 and \x{1234} (hex characters)
ff3f963a 2780 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2781 handle \cV (control characters)
2782 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2783 } (end switch)
77a135fe 2784 continue
02aa26ce 2785 } (end if backslash)
77a135fe 2786 handle regular character
02aa26ce 2787 } (end while character to read)
4e553d73 2788
02aa26ce
NT
2789*/
2790
76e3520e 2791STATIC char *
cea2e8a9 2792S_scan_const(pTHX_ char *start)
79072805 2793{
eb578fdb 2794 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2795 SV *sv = newSV(send - start); /* sv for the constant. See note below
2796 on sizing. */
eb578fdb
KW
2797 char *s = start; /* start of the constant */
2798 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2799 bool dorange = FALSE; /* are we in a translit range? */
2800 bool didrange = FALSE; /* did we just finish a range? */
2801 bool in_charclass = FALSE; /* within /[...]/ */
2802 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2803 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2804 UTF8? But, this can show as true
2805 when the source isn't utf8, as for
2806 example when it is entirely composed
2807 of hex constants */
6f613c73 2808 SV *res; /* result from charnames */
77a135fe
KW
2809
2810 /* Note on sizing: The scanned constant is placed into sv, which is
2811 * initialized by newSV() assuming one byte of output for every byte of
2812 * input. This routine expects newSV() to allocate an extra byte for a
2813 * trailing NUL, which this routine will append if it gets to the end of
2814 * the input. There may be more bytes of input than output (eg., \N{LATIN
2815 * CAPITAL LETTER A}), or more output than input if the constant ends up
2816 * recoded to utf8, but each time a construct is found that might increase
2817 * the needed size, SvGROW() is called. Its size parameter each time is
2818 * based on the best guess estimate at the time, namely the length used so
2819 * far, plus the length the current construct will occupy, plus room for
2820 * the trailing NUL, plus one byte for every input byte still unscanned */
2821
c3320c2a
KW
2822 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2823 before set */
4c3a8340
ST
2824#ifdef EBCDIC
2825 UV literal_endpoint = 0;
e294cc5d 2826 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2827#endif
012bcf8d 2828
7918f24d
NC
2829 PERL_ARGS_ASSERT_SCAN_CONST;
2830
bb16bae8 2831 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2832 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2833 /* If we are doing a trans and we know we want UTF8 set expectation */
2834 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2835 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2836 }
2837
b899e89d
FC
2838 /* Protect sv from errors and fatal warnings. */
2839 ENTER_with_name("scan_const");
2840 SAVEFREESV(sv);
2b9d42f0 2841
79072805 2842 while (s < send || dorange) {
ff3f963a 2843
02aa26ce 2844 /* get transliterations out of the way (they're most literal) */
3280af22 2845 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2846 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2847 if (dorange) {
1ba5c669
JH
2848 I32 i; /* current expanded character */
2849 I32 min; /* first character in range */
2850 I32 max; /* last character in range */
02aa26ce 2851
e294cc5d
JH
2852#ifdef EBCDIC
2853 UV uvmax = 0;
2854#endif
2855
2856 if (has_utf8
2857#ifdef EBCDIC
2858 && !native_range
2859#endif
1953db30 2860 ) {
9d4ba2ae 2861 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2862 char *e = d++;
2863 while (e-- > c)
2864 *(e + 1) = *e;
e7214ce8 2865 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2866 /* mark the range as done, and continue */
2867 dorange = FALSE;
2868 didrange = TRUE;
2869 continue;
2870 }
2b9d42f0 2871
95a20fc0 2872 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2873#ifdef EBCDIC
2874 SvGROW(sv,
dc023dbb
KW
2875 SvLEN(sv) + ((has_utf8)
2876 ? (512 - UTF_CONTINUATION_MARK
2877 + UNISKIP(0x100))
e294cc5d
JH
2878 : 256));
2879 /* How many two-byte within 0..255: 128 in UTF-8,
2880 * 96 in UTF-8-mod. */
2881#else
9cbb5ea2 2882 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2883#endif
9cbb5ea2 2884 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2885#ifdef EBCDIC
2886 if (has_utf8) {
2887 int j;
2888 for (j = 0; j <= 1; j++) {
2889 char * const c = (char*)utf8_hop((U8*)d, -1);
2890 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2891 if (j)
2892 min = (U8)uv;
2893 else if (uv < 256)
2894 max = (U8)uv;
2895 else {
2896 max = (U8)0xff; /* only to \xff */
2897 uvmax = uv; /* \x{100} to uvmax */
2898 }
2899 d = c; /* eat endpoint chars */
2900 }
2901 }
2902 else {
2903#endif
2904 d -= 2; /* eat the first char and the - */
2905 min = (U8)*d; /* first char in range */
2906 max = (U8)d[1]; /* last char in range */
2907#ifdef EBCDIC
2908 }
2909#endif
8ada0baa 2910
c2e66d9e 2911 if (min > max) {
01ec43d0 2912 Perl_croak(aTHX_
d1573ac7 2913 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2914 (char)min, (char)max);
c2e66d9e
GS
2915 }
2916
c7f1f016 2917#ifdef EBCDIC
dc023dbb
KW
2918 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2919 * any subsets of these ranges into individual characters */
4c3a8340 2920 if (literal_endpoint == 2 &&
c286a9a6 2921 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2922 (isUPPER_A(min) && isUPPER_A(max))))
2923 {
2924 for (i = min; i <= max; i++) {
2925 if (isALPHA_A(i))
2926 *d++ = i;
8ada0baa
JH
2927 }
2928 }
2929 else
2930#endif
2931 for (i = min; i <= max; i++)
e294cc5d
JH
2932#ifdef EBCDIC
2933 if (has_utf8) {
55d09dc8 2934 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2935 }
2936 else
2937#endif
2938 *d++ = (char)i;
2939
2940#ifdef EBCDIC
2941 if (uvmax) {
2942 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2943 if (uvmax > 0x101)
e7214ce8 2944 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2945 if (uvmax > 0x100)
2946 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2947 }
2948#endif
02aa26ce
NT
2949
2950 /* mark the range as done, and continue */
79072805 2951 dorange = FALSE;
01ec43d0 2952 didrange = TRUE;
4c3a8340
ST
2953#ifdef EBCDIC
2954 literal_endpoint = 0;
2955#endif
79072805 2956 continue;
4e553d73 2957 }
02aa26ce
NT
2958
2959 /* range begins (ignore - as first or last char) */
79072805 2960 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2961 if (didrange) {
1fafa243 2962 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2963 }
e294cc5d
JH
2964 if (has_utf8
2965#ifdef EBCDIC
2966 && !native_range
2967#endif
2968 ) {
e7214ce8 2969 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2970 s++;
2971 continue;
2972 }
79072805
LW
2973 dorange = TRUE;
2974 s++;
01ec43d0
GS
2975 }
2976 else {
2977 didrange = FALSE;
4c3a8340
ST
2978#ifdef EBCDIC
2979 literal_endpoint = 0;
e294cc5d 2980 native_range = TRUE;
4c3a8340 2981#endif
01ec43d0 2982 }
79072805 2983 }
02aa26ce
NT
2984
2985 /* if we get here, we're not doing a transliteration */
2986
e4a2df84
DM
2987 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2988 char *s1 = s-1;
2989 int esc = 0;
2990 while (s1 >= start && *s1-- == '\\')
2991 esc = !esc;
2992 if (!esc)
2993 in_charclass = TRUE;
2994 }
2866decb 2995
e4a2df84
DM
2996 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2997 char *s1 = s-1;
2998 int esc = 0;
2999 while (s1 >= start && *s1-- == '\\')
3000 esc = !esc;
3001 if (!esc)
3002 in_charclass = FALSE;
3003 }
2866decb 3004
9da1dd8f
DM
3005 /* skip for regexp comments /(?#comment)/, except for the last
3006 * char, which will be done separately.
3007 * Stop on (?{..}) and friends */
3008
c30fc27b 3009 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3010 if (s[2] == '#') {
e994fd66 3011 while (s+1 < send && *s != ')')
5ff03569 3012 *d++ = *s++;
155aba94 3013 }
c30fc27b 3014 else if (!PL_lex_casemods &&
d3cec5e5
DM
3015 ( s[2] == '{' /* This should match regcomp.c */
3016 || (s[2] == '?' && s[3] == '{')))
155aba94 3017 {
9da1dd8f 3018 break;
cc6b7395 3019 }
748a9306 3020 }
02aa26ce
NT
3021
3022 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3023 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3024 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3025 while (s+1 < send && *s != '\n')
5ff03569 3026 *d++ = *s++;
748a9306 3027 }
02aa26ce 3028
9da1dd8f
DM
3029 /* no further processing of single-quoted regex */
3030 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3031 goto default_action;
3032
5d1d4326 3033 /* check for embedded arrays
da6eedaa 3034 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3035 */
1749ea0d 3036 else if (*s == '@' && s[1]) {
8a2bca12 3037 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3038 break;
3039 if (strchr(":'{$", s[1]))
3040 break;
3041 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3042 break; /* in regexp, neither @+ nor @- are interpolated */
3043 }
02aa26ce
NT
3044
3045 /* check for embedded scalars. only stop if we're sure it's a
3046 variable.
3047 */
79072805 3048 else if (*s == '$') {
3280af22 3049 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3050 break;
77772344 3051 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3052 if (s[1] == '\\') {
3053 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3054 "Possible unintended interpolation of $\\ in regex");
77772344 3055 }
79072805 3056 break; /* in regexp, $ might be tail anchor */
77772344 3057 }
79072805 3058 }
02aa26ce 3059
2b9d42f0
NIS
3060 /* End of else if chain - OP_TRANS rejoin rest */
3061
02aa26ce 3062 /* backslashes */
79072805 3063 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3064 char* e; /* Can be used for ending '}', etc. */
3065
79072805 3066 s++;
02aa26ce 3067
7d0fc23c
KW
3068 /* warn on \1 - \9 in substitution replacements, but note that \11
3069 * is an octal; and \19 is \1 followed by '9' */
3280af22 3070 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3071 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3072 {
c782d7ee 3073 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3074 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3075 *--s = '$';
3076 break;
3077 }
02aa26ce
NT
3078
3079 /* string-change backslash escapes */
838f2281 3080 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3081 --s;
3082 break;
3083 }
ff3f963a
KW
3084 /* In a pattern, process \N, but skip any other backslash escapes.
3085 * This is because we don't want to translate an escape sequence
3086 * into a meta symbol and have the regex compiler use the meta
3087 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3088 * in spite of this, we do have to process \N here while the proper
3089 * charnames handler is in scope. See bugs #56444 and #62056.
3090 * There is a complication because \N in a pattern may also stand
3091 * for 'match a non-nl', and not mean a charname, in which case its
3092 * processing should be deferred to the regex compiler. To be a
3093 * charname it must be followed immediately by a '{', and not look
3094 * like \N followed by a curly quantifier, i.e., not something like
3095 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3096 * quantifier */
3097 else if (PL_lex_inpat
3098 && (*s != 'N'
3099 || s[1] != '{'
412f55bb 3100 || regcurly(s + 1)))
ff3f963a 3101 {
4d73d076 3102 *d++ = '\\';
cc74c5bd
ST
3103 goto default_action;
3104 }
02aa26ce 3105
79072805 3106 switch (*s) {
02aa26ce
NT
3107
3108 /* quoted - in transliterations */
79072805 3109 case '-':
3280af22 3110 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3111 *d++ = *s++;
3112 continue;
3113 }
924ba076 3114 /* FALLTHROUGH */
79072805 3115 default:
11b8faa4 3116 {
15861f94 3117 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3118 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3119 "Unrecognized escape \\%c passed through",
3120 *s);
11b8faa4 3121 /* default action is to copy the quoted character */
f9a63242 3122 goto default_action;
11b8faa4 3123 }
02aa26ce 3124
632403cc 3125 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3126 case '0': case '1': case '2': case '3':
3127 case '4': case '5': case '6': case '7':
ba210ebe 3128 {
5e0a247b 3129 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3130 STRLEN len = 3;
06972766 3131 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3132 s += len;
5e0a247b
KW
3133 if (len < 3 && s < send && isDIGIT(*s)
3134 && ckWARN(WARN_MISC))
3135 {
3136 Perl_warner(aTHX_ packWARN(WARN_MISC),
3137 "%s", form_short_octal_warning(s, len));
3138 }
ba210ebe 3139 }
012bcf8d 3140 goto NUM_ESCAPE_INSERT;
02aa26ce 3141
f0a2b745
KW
3142 /* eg. \o{24} indicates the octal constant \024 */
3143 case 'o':
3144 {
454155d9 3145 const char* error;
f0a2b745 3146
00ce5563 3147 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3148 TRUE, /* Output warning */
3149 FALSE, /* Not strict */
17896a85
KW
3150 TRUE, /* Output warnings for
3151 non-portables */
80f4111b 3152 UTF);
454155d9 3153 if (! valid) {
f0a2b745
KW
3154 yyerror(error);
3155 continue;
3156 }
3157 goto NUM_ESCAPE_INSERT;
3158 }
3159
77a135fe 3160 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3161 case 'x':
a0481293 3162 {
a0481293 3163 const char* error;
355860ce 3164
00ce5563 3165 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3166 TRUE, /* Output warning */
3167 FALSE, /* Not strict */
17896a85
KW
3168 TRUE, /* Output warnings for
3169 non-portables */
80f4111b 3170 UTF);
a0481293
KW
3171 if (! valid) {
3172 yyerror(error);
355860ce 3173 continue;
ba210ebe 3174 }
012bcf8d
GS
3175 }
3176
3177 NUM_ESCAPE_INSERT:
ff3f963a
KW
3178 /* Insert oct or hex escaped character. There will always be
3179 * enough room in sv since such escapes will be longer than any
3180 * UTF-8 sequence they can end up as, except if they force us
3181 * to recode the rest of the string into utf8 */
ba7cea30 3182
06972766 3183 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3184 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3185 if (!has_utf8 && uv > 255) {
77a135fe
KW
3186 /* Might need to recode whatever we have accumulated so
3187 * far if it contains any chars variant in utf8 or
3188 * utf-ebcdic. */
3189
3190 SvCUR_set(sv, d - SvPVX_const(sv));
3191 SvPOK_on(sv);
3192 *d = '\0';
77a135fe 3193 /* See Note on sizing above. */
7bf79863
KW
3194 sv_utf8_upgrade_flags_grow(sv,
3195 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3196 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3197 d = SvPVX(sv) + SvCUR(sv);
3198 has_utf8 = TRUE;
012bcf8d
GS
3199 }
3200
77a135fe 3201 if (has_utf8) {
c80e42f3 3202 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3203 if (PL_lex_inwhat == OP_TRANS &&
3204 PL_sublex_info.sub_op) {
3205 PL_sublex_info.sub_op->op_private |=
3206 (PL_lex_repl ? OPpTRANS_FROM_UTF
3207 : OPpTRANS_TO_UTF);
f9a63242 3208 }
e294cc5d
JH
3209#ifdef EBCDIC
3210 if (uv > 255 && !dorange)
3211 native_range = FALSE;
3212#endif
012bcf8d 3213 }
a0ed51b3 3214 else {
012bcf8d 3215 *d++ = (char)uv;
a0ed51b3 3216 }
012bcf8d
GS
3217 }
3218 else {
c4d5f83a 3219 *d++ = (char) uv;
a0ed51b3 3220 }
79072805 3221 continue;
02aa26ce 3222
4a2d328f 3223 case 'N':
ff3f963a
KW
3224 /* In a non-pattern \N must be a named character, like \N{LATIN
3225 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3226 * mean to match a non-newline. For non-patterns, named
3227 * characters are converted to their string equivalents. In
3228 * patterns, named characters are not converted to their
3229 * ultimate forms for the same reasons that other escapes
3230 * aren't. Instead, they are converted to the \N{U+...} form
3231 * to get the value from the charnames that is in effect right
3232 * now, while preserving the fact that it was a named character
3233 * so that the regex compiler knows this */
3234
ff3f963a
KW
3235 /* The structure of this section of code (besides checking for
3236 * errors and upgrading to utf8) is:
3237 * Further disambiguate between the two meanings of \N, and if
3238 * not a charname, go process it elsewhere
0a96133f
KW
3239 * If of form \N{U+...}, pass it through if a pattern;
3240 * otherwise convert to utf8
3241 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3242 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3243
3244 /* Here, s points to the 'N'; the test below is guaranteed to
3245 * succeed if we are being called on a pattern as we already
3246 * know from a test above that the next character is a '{'.
3247 * On a non-pattern \N must mean 'named sequence, which
3248 * requires braces */
3249 s++;
3250 if (*s != '{') {
3251 yyerror("Missing braces on \\N{}");
3252 continue;
3253 }
3254 s++;
3255
0a96133f 3256 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3257 if (! (e = strchr(s, '}'))) {
3258 if (! PL_lex_inpat) {
5777a3f7 3259 yyerror("Missing right brace on \\N{}");
0a96133f 3260 } else {
4407f1b8 3261 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3262 }
0a96133f 3263 continue;
ff3f963a 3264 }
cddc7ef4 3265
ff3f963a 3266 /* Here it looks like a named character */
cddc7ef4 3267
ff3f963a
KW
3268 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3269 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3270 | PERL_SCAN_DISALLOW_PREFIX;
3271 STRLEN len;
3272
3273 /* For \N{U+...}, the '...' is a unicode value even on
3274 * EBCDIC machines */
3275 s += 2; /* Skip to next char after the 'U+' */
3276 len = e - s;
3277 uv = grok_hex(s, &len, &flags, NULL);
3278 if (len == 0 || len != (STRLEN)(e - s)) {
3279 yyerror("Invalid hexadecimal number in \\N{U+...}");
3280 s = e + 1;
3281 continue;
3282 }
3283
3284 if (PL_lex_inpat) {
3285
e2a7e165
KW
3286 /* On non-EBCDIC platforms, pass through to the regex
3287 * compiler unchanged. The reason we evaluated the
3288 * number above is to make sure there wasn't a syntax
3289 * error. But on EBCDIC we convert to native so
3290 * downstream code can continue to assume it's native
3291 */
ff3f963a 3292 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3293#ifdef EBCDIC
3294 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3295 and the \0 */
3296 "\\N{U+%X}",
3297 (unsigned int) UNI_TO_NATIVE(uv));
3298#else
ff3f963a
KW
3299 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3300 d += e - s + 1;
e2a7e165 3301#endif
ff3f963a
KW
3302 }
3303 else { /* Not a pattern: convert the hex to string */
3304
3305 /* If destination is not in utf8, unconditionally
3306 * recode it to be so. This is because \N{} implies
3307 * Unicode semantics, and scalars have to be in utf8
3308 * to guarantee those semantics */
3309 if (! has_utf8) {
3310 SvCUR_set(sv, d - SvPVX_const(sv));
3311 SvPOK_on(sv);
3312 *d = '\0';
3313 /* See Note on sizing above. */
3314 sv_utf8_upgrade_flags_grow(
3315 sv,
3316 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3317 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3318 d = SvPVX(sv) + SvCUR(sv);
3319 has_utf8 = TRUE;
3320 }
3321
a46469e6 3322 /* Add the (Unicode) code point to the output. */
ff3f963a 3323 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3324 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3325 }
a46469e6
KW
3326 else {
3327 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3328 }
ff3f963a
KW
3329 }
3330 }
6f613c73
KW
3331 else /* Here is \N{NAME} but not \N{U+...}. */
3332 if ((res = get_and_check_backslash_N_name(s, e)))
3333 {
3334 STRLEN len;
3335 const char *str = SvPV_const(res, len);
3336 if (PL_lex_inpat) {
ff3f963a
KW
3337
3338 if (! len) { /* The name resolved to an empty string */
3339 Copy("\\N{}", d, 4, char);
3340 d += 4;
3341 }
3342 else {
3343 /* In order to not lose information for the regex
3344 * compiler, pass the result in the specially made
3345 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3346 * the code points in hex of each character
3347 * returned by charnames */
3348
3349 const char *str_end = str + len;
3b721c4f 3350 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3351
3352 if (! SvUTF8(res)) {
3353 /* For the non-UTF-8 case, we can determine the
3354 * exact length needed without having to parse
3355 * through the string. Each character takes up
3356 * 2 hex digits plus either a trailing dot or
3357 * the "}" */
3358 d = off + SvGROW(sv, off
3359 + 3 * len
3360 + 6 /* For the "\N{U+", and
3361 trailing NUL */
3362 + (STRLEN)(send - e));
3363 Copy("\\N{U+", d, 5, char);
3364 d += 5;
3365 while (str < str_end) {
3366 char hex_string[4];
e8549682 3367 int len =
51f14a05
JH
3368 my_snprintf(hex_string,
3369 sizeof(hex_string),
e8549682
JH
3370 "%02X.", (U8) *str);
3371 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3372 Copy(hex_string, d, 3, char);
3373 d += 3;
3374 str++;
3375 }
3376 d--; /* We will overwrite below the final
3377 dot with a right brace */
3378 }
3379 else {
1953db30
KW
3380 STRLEN char_length; /* cur char's byte length */
3381
3382 /* and the number of bytes after this is
3383 * translated into hex digits */
3384 STRLEN output_length;
3385
3386 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3387 * for max('U+', '.'); and 1 for NUL */
3388 char hex_string[2 * UTF8_MAXBYTES + 5];
3389
3390 /* Get the first character of the result. */
a46469e6 3391 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3392 len,
3393 &char_length,
3394 UTF8_ALLOW_ANYUV);
3395 /* Convert first code point to hex, including
a46469e6 3396 * the boiler plate before it. */
1953db30
KW
3397 output_length =
3398 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3399 "\\N{U+%X",
3400 (unsigned int) uv);
1953db30
KW
3401
3402 /* Make sure there is enough space to hold it */
3403 d = off + SvGROW(sv, off
3404 + output_length
3405 + (STRLEN)(send - e)
3406 + 2); /* '}' + NUL */
3407 /* And output it */
3408 Copy(hex_string, d, output_length, char);
3409 d += output_length;
3410
3411 /* For each subsequent character, append dot and
3412 * its ordinal in hex */
3413 while ((str += char_length) < str_end) {
3414 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3415 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3416 str_end - str,
3417 &char_length,
3418 UTF8_ALLOW_ANYUV);
3419 output_length =
3420 my_snprintf(hex_string,
a46469e6
KW
3421 sizeof(hex_string),
3422 ".%X",
3423 (unsigned int) uv);
1953db30
KW
3424
3425 d = off + SvGROW(sv, off
3426 + output_length
3427 + (STRLEN)(send - e)
3428 + 2); /* '}' + NUL */
3429 Copy(hex_string, d, output_length, char);
3430 d += output_length;
3431 }
94ca1619 3432 }
ff3f963a
KW
3433
3434 *d++ = '}'; /* Done. Add the trailing brace */
3435 }
3436 }
3437 else { /* Here, not in a pattern. Convert the name to a
3438 * string. */
3439
3440 /* If destination is not in utf8, unconditionally
3441 * recode it to be so. This is because \N{} implies
3442 * Unicode semantics, and scalars have to be in utf8
3443 * to guarantee those semantics */
3444 if (! has_utf8) {
3445 SvCUR_set(sv, d - SvPVX_const(sv));
3446 SvPOK_on(sv);
3447 *d = '\0';
3448 /* See Note on sizing above. */
3449 sv_utf8_upgrade_flags_grow(sv,
3450 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3451 len + (STRLEN)(send - s) + 1);
3452 d = SvPVX(sv) + SvCUR(sv);
3453 has_utf8 = TRUE;
3454 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3455
3456 /* See Note on sizing above. (NOTE: SvCUR() is not
3457 * set correctly here). */
3458 const STRLEN off = d - SvPVX_const(sv);
3459 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3460 }
7fc82458
KW
3461 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3462 sv_utf8_upgrade(res);
3463 str = SvPV_const(res, len);
3464 }
ff3f963a
KW
3465 Copy(str, d, len, char);
3466 d += len;
423cee85 3467 }
6f613c73 3468
423cee85 3469 SvREFCNT_dec(res);
cb233ae3 3470
cb233ae3 3471 } /* End \N{NAME} */
ff3f963a
KW
3472#ifdef EBCDIC
3473 if (!dorange)
3474 native_range = FALSE; /* \N{} is defined to be Unicode */
3475#endif
3476 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3477 continue;
3478
02aa26ce 3479 /* \c is a control character */
79072805
LW
3480 case 'c':
3481 s++;
961ce445 3482 if (s < send) {
421e43ba 3483 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3484 }
961ce445
RGS
3485 else {
3486 yyerror("Missing control char name in \\c");
3487 }
79072805 3488 continue;
02aa26ce
NT
3489
3490 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3491 case 'b':
4d73d076 3492 *d++ = '\b';
79072805
LW
3493 break;
3494 case 'n':
4d73d076 3495 *d++ = '\n';
79072805
LW
3496 break;
3497 case 'r':
4d73d076 3498 *d++ = '\r';
79072805
LW
3499 break;
3500 case 'f':
4d73d076 3501 *d++ = '\f';
79072805
LW
3502 break;
3503 case 't':
4d73d076 3504 *d++ = '\t';
79072805 3505 break;
34a3fe2a 3506 case 'e':
da0334e8 3507 *d++ = ESC_NATIVE;
34a3fe2a
PP
3508 break;
3509 case 'a':
4d73d076 3510 *d++ = '\a';
79072805 3511 break;
02aa26ce
NT
3512 } /* end switch */
3513
79072805
LW
3514 s++;
3515 continue;
02aa26ce 3516 } /* end if (backslash) */
4c3a8340
ST
3517#ifdef EBCDIC
3518 else
3519 literal_endpoint++;
3520#endif
02aa26ce 3521
f9a63242 3522 default_action:
77a135fe
KW
3523 /* If we started with encoded form, or already know we want it,
3524 then encode the next character */
6f2d5cbc 3525 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3526 STRLEN len = 1;
77a135fe
KW
3527
3528
3529 /* One might think that it is wasted effort in the case of the
3530 * source being utf8 (this_utf8 == TRUE) to take the next character
3531 * in the source, convert it to an unsigned value, and then convert
3532 * it back again. But the source has not been validated here. The
3533 * routine that does the conversion checks for errors like
3534 * malformed utf8 */
3535
233ca360
KW
3536 const UV nextuv = (this_utf8)
3537 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3538 : (UV) ((U8) *s);
5aaebcb3 3539 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3540 if (!has_utf8) {
3541 SvCUR_set(sv, d - SvPVX_const(sv));
3542 SvPOK_on(sv);
3543 *d = '\0';
77a135fe 3544 /* See Note on sizing above. */
7bf79863
KW
3545 sv_utf8_upgrade_flags_grow(sv,
3546 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3547 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3548 d = SvPVX(sv) + SvCUR(sv);
3549 has_utf8 = TRUE;
3550 } else if (need > len) {
3551 /* encoded value larger than old, may need extra space (NOTE:
3552 * SvCUR() is not set correctly here). See Note on sizing
3553 * above. */
9d4ba2ae 3554 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3555 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3556 }
77a135fe
KW
3557 s += len;
3558
5f66b61c 3559 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3560#ifdef EBCDIC
3561 if (uv > 255 && !dorange)
3562 native_range = FALSE;
3563#endif
2b9d42f0
NIS
3564 }
3565 else {
5ff03569 3566 *d++ = *s++;
2b9d42f0 3567 }
02aa26ce
NT
3568 } /* while loop to process each character */
3569
3570 /* terminate the string and set up the sv */
79072805 3571 *d = '\0';
95a20fc0 3572 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3573 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3574 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3575 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3576
79072805 3577 SvPOK_on(sv);
9f4817db 3578 if (PL_encoding && !has_utf8) {
d0063567
DK
3579 sv_recode_to_utf8(sv, PL_encoding);
3580 if (SvUTF8(sv))
3581 has_utf8 = TRUE;
9f4817db 3582 }
2b9d42f0 3583 if (has_utf8) {
7e2040f0 3584 SvUTF8_on(sv);
2b9d42f0 3585 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3586 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3587 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3588 }
3589 }
79072805 3590
02aa26ce 3591 /* shrink the sv if we allocated more than we used */
79072805 3592 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3593 SvPV_shrink_to_cur(sv);
79072805 3594 }
02aa26ce 3595
6154021b 3596 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3597 if (s > start) {
3598 char *s2 = start;
ffdb8b16
FC
3599 for (; s2 < s; s2++) {
3600 if (*s2 == '\n')
3601 COPLINE_INC_WITH_HERELINES;
3602 }
b899e89d 3603 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3604 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3605 && ! PL_parser->lex_re_reparsing)
3606 {
eb0d8d16
NC
3607 const char *const key = PL_lex_inpat ? "qr" : "q";
3608 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3609 const char *type;
3610 STRLEN typelen;
3611
3612 if (PL_lex_inwhat == OP_TRANS) {
3613 type = "tr";
3614 typelen = 2;
3615 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3616 type = "s";
3617 typelen = 1;
9da1dd8f
DM
3618 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3619 type = "q";
3620 typelen = 1;
eb0d8d16
NC
3621 } else {
3622 type = "qq";
3623 typelen = 2;
3624 }
3625
3626 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3627 type, typelen);
3628 }
6154021b 3629 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3630 }
3631 LEAVE_with_name("scan_const");
79072805
LW
3632 return s;
3633}
3634
ffb4593c
NT
3635/* S_intuit_more
3636 * Returns TRUE if there's more to the expression (e.g., a subscript),
3637 * FALSE otherwise.
ffb4593c
NT
3638 *
3639 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3640 *
3641 * ->[ and ->{ return TRUE
ff25e5db 3642 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3643 * { and [ outside a pattern are always subscripts, so return TRUE
3644 * if we're outside a pattern and it's not { or [, then return FALSE
3645 * if we're in a pattern and the first char is a {
3646 * {4,5} (any digits around the comma) returns FALSE
3647 * if we're in a pattern and the first char is a [
3648 * [] returns FALSE
3649 * [SOMETHING] has a funky algorithm to decide whether it's a
3650 * character class or not. It has to deal with things like
3651 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3652 * anything else returns TRUE
3653 */
3654
9cbb5ea2
GS
3655/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3656
76e3520e 3657STATIC int
5aaab254 3658S_intuit_more(pTHX_ char *s)
79072805 3659{
7918f24d
NC
3660 PERL_ARGS_ASSERT_INTUIT_MORE;
3661
3280af22