This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Touch PL_expect less for implicit [.,] in quotes
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
57#define PL_lex_expect (PL_parser->lex_expect)
58#define PL_lex_formbrack (PL_parser->lex_formbrack)
59#define PL_lex_inpat (PL_parser->lex_inpat)
60#define PL_lex_inwhat (PL_parser->lex_inwhat)
61#define PL_lex_op (PL_parser->lex_op)
62#define PL_lex_repl (PL_parser->lex_repl)
63#define PL_lex_starts (PL_parser->lex_starts)
64#define PL_lex_stuff (PL_parser->lex_stuff)
65#define PL_multi_start (PL_parser->multi_start)
66#define PL_multi_open (PL_parser->multi_open)
67#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7 89
fb205e7a
DM
90# define PL_nexttoke (PL_parser->nexttoke)
91# define PL_nexttype (PL_parser->nexttype)
92# define PL_nextval (PL_parser->nextval)
199e78b7 93
a1894d81 94static const char* const ident_too_long = "Identifier too long";
8903cb82 95
9ded7720 96# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 97
a7aaec61
Z
98#define XENUMMASK 0x3f
99#define XFAKEEOF 0x40
100#define XFAKEBRACK 0x80
9059aa12 101
39e02b42
JH
102#ifdef USE_UTF8_SCRIPTS
103# define UTF (!IN_BYTES)
2b9d42f0 104#else
802a15e9 105# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 106#endif
a0ed51b3 107
b1fc3636
CJ
108/* The maximum number of characters preceding the unrecognized one to display */
109#define UNRECOGNIZED_PRECEDE_COUNT 10
110
61f0cdd9 111/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
112 * 1999-02-27 mjd-perl-patch@plover.com */
113#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
114
14bd96d0 115#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 116
9ff909cf
JH
117#define HEXFP_PEEK(s) \
118 (((s[0] == '.') && \
119 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
120 isALPHA_FOLD_EQ(s[0], 'p'))
121
ffb4593c
NT
122/* LEX_* are values for PL_lex_state, the state of the lexer.
123 * They are arranged oddly so that the guard on the switch statement
79072805 124 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
125 *
126 * These values refer to the various states within a sublex parse,
127 * i.e. within a double quotish string
79072805
LW
128 */
129
fb73857a 130/* #define LEX_NOTPARSING 11 is done in perl.h. */
131
b6007c36
DM
132#define LEX_NORMAL 10 /* normal code (ie not within "...") */
133#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
134#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
135#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
136#define LEX_INTERPSTART 6 /* expecting the start of a $var */
137
138 /* at end of code, eg "$x" followed by: */
139#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
140#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
141
142#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
143 string or after \E, $foo, etc */
144#define LEX_INTERPCONST 2 /* NOT USED */
145#define LEX_FORMLINE 1 /* expecting a format line */
146#define LEX_KNOWNEXT 0 /* next token known; just return it */
147
79072805 148
bbf60fe6 149#ifdef DEBUGGING
27da23d5 150static const char* const lex_state_names[] = {
bbf60fe6
DM
151 "KNOWNEXT",
152 "FORMLINE",
153 "INTERPCONST",
154 "INTERPCONCAT",
155 "INTERPENDMAYBE",
156 "INTERPEND",
157 "INTERPSTART",
158 "INTERPPUSH",
159 "INTERPCASEMOD",
160 "INTERPNORMAL",
161 "NORMAL"
162};
163#endif
164
79072805 165#include "keywords.h"
fe14fcc3 166
ffb4593c
NT
167/* CLINE is a macro that ensures PL_copline has a sane value */
168
57843af0 169#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 170
29595ff2
NC
171# define SKIPSPACE0(s) skipspace(s)
172# define SKIPSPACE1(s) skipspace(s)
173# define SKIPSPACE2(s,tsv) skipspace(s)
174# define PEEKSPACE(s) skipspace(s)
29595ff2 175
ffb4593c
NT
176/*
177 * Convenience functions to return different tokens and prime the
9cbb5ea2 178 * lexer for the next token. They all take an argument.
ffb4593c
NT
179 *
180 * TOKEN : generic token (used for '(', DOLSHARP, etc)
181 * OPERATOR : generic operator
182 * AOPERATOR : assignment operator
183 * PREBLOCK : beginning the block after an if, while, foreach, ...
184 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
185 * PREREF : *EXPR where EXPR is not a simple identifier
186 * TERM : expression term
89f35911 187 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
188 * LOOPX : loop exiting command (goto, last, dump, etc)
189 * FTST : file test operator
190 * FUN0 : zero-argument function
7eb971ee 191 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 192 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
193 * BOop : bitwise or or xor
194 * BAop : bitwise and
195 * SHop : shift operator
196 * PWop : power operator
9cbb5ea2 197 * PMop : pattern-matching operator
ffb4593c 198 * Aop : addition-level operator
e4916dd1 199 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
200 * Mop : multiplication-level operator
201 * Eop : equality-testing operator
e5edeb50 202 * Rop : relational operator <= != gt
ffb4593c
NT
203 *
204 * Also see LOP and lop() below.
205 */
206
998054bd 207#ifdef DEBUGGING /* Serve -DT. */
704d4215 208# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 209#else
bbf60fe6 210# define REPORT(retval) (retval)
998054bd
SC
211#endif
212
bbf60fe6
DM
213#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
214#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
215#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
216#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
217#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
218#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
219#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 220#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
7a61bf3c
FC
221#define LOOPX(f) return (PL_expect = XOPERATOR, \
222 PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
223 pl_yylval.ival=f, \
224 (void)(PL_nexttoke || (PL_expect = XTERM)), \
225 REPORT((int)LOOPEX))
6154021b
RGS
226#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
227#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 228#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
229#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
230#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
231#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
232#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
233#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
234#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
235#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
e4916dd1 236#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
6154021b
RGS
237#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
238#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
239#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 240
a687059c
LW
241/* This bit of chicanery makes a unary function followed by
242 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
243 * The UNIDOR macro is for unary functions that can be followed by the //
244 * operator (such as C<shift // 0>).
a687059c 245 */
d68ce4ac 246#define UNI3(f,x,have_x) { \
6154021b 247 pl_yylval.ival = f; \
d68ce4ac 248 if (have_x) PL_expect = x; \
376fcdbf
AL
249 PL_bufptr = s; \
250 PL_last_uni = PL_oldbufptr; \
251 PL_last_lop_op = f; \
252 if (*s == '(') \
253 return REPORT( (int)FUNC1 ); \
29595ff2 254 s = PEEKSPACE(s); \
376fcdbf
AL
255 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
256 }
d68ce4ac
FC
257#define UNI(f) UNI3(f,XTERM,1)
258#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
259#define UNIPROTO(f,optional) { \
260 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
261 OPERATOR(f); \
262 }
a687059c 263
d68ce4ac 264#define UNIBRACK(f) UNI3(f,0,0)
79072805 265
9f68db38 266/* grandfather return to old style */
78cdf107
Z
267#define OLDLOP(f) \
268 do { \
269 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
270 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
271 pl_yylval.ival = (f); \
272 PL_expect = XTERM; \
273 PL_bufptr = s; \
274 return (int)LSTOP; \
275 } while(0)
79072805 276
83944c01
FC
277#define COPLINE_INC_WITH_HERELINES \
278 STMT_START { \
279 CopLINE_inc(PL_curcop); \
851b527a
FC
280 if (PL_parser->herelines) \
281 CopLINE(PL_curcop) += PL_parser->herelines, \
282 PL_parser->herelines = 0; \
83944c01 283 } STMT_END
ffdb8b16
FC
284/* Called after scan_str to update CopLINE(PL_curcop), but only when there
285 * is no sublex_push to follow. */
286#define COPLINE_SET_FROM_MULTI_END \
287 STMT_START { \
288 CopLINE_set(PL_curcop, PL_multi_end); \
289 if (PL_multi_end != PL_multi_start) \
851b527a 290 PL_parser->herelines = 0; \
ffdb8b16 291 } STMT_END
83944c01
FC
292
293
8fa7f367
JH
294#ifdef DEBUGGING
295
6154021b 296/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
297enum token_type {
298 TOKENTYPE_NONE,
299 TOKENTYPE_IVAL,
6154021b 300 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 301 TOKENTYPE_PVAL,
aeaef349 302 TOKENTYPE_OPVAL
bbf60fe6
DM
303};
304
6d4a66ac
NC
305static struct debug_tokens {
306 const int token;
307 enum token_type type;
308 const char *name;
309} const debug_tokens[] =
9041c2e3 310{
bbf60fe6
DM
311 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
312 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
313 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
314 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
315 { ARROW, TOKENTYPE_NONE, "ARROW" },
316 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
317 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
318 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
319 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
320 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 321 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
322 { DO, TOKENTYPE_NONE, "DO" },
323 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
324 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
325 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
326 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
327 { ELSE, TOKENTYPE_NONE, "ELSE" },
328 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
329 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
330 { FOR, TOKENTYPE_IVAL, "FOR" },
331 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
332 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
333 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
334 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
335 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 336 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
337 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
338 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
339 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 340 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
341 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
342 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 343 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
344 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
345 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
346 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
347 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
348 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
349 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
350 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
351 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
352 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
353 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
354 { OROP, TOKENTYPE_IVAL, "OROP" },
355 { OROR, TOKENTYPE_NONE, "OROR" },
356 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
357 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
358 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 359 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 360 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
361 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
362 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
363 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
364 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
365 { PREINC, TOKENTYPE_NONE, "PREINC" },
366 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 367 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
368 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
369 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 370 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
371 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
372 { SUB, TOKENTYPE_NONE, "SUB" },
373 { THING, TOKENTYPE_OPVAL, "THING" },
374 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
375 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
376 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
377 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
378 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
379 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 380 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
381 { WHILE, TOKENTYPE_IVAL, "WHILE" },
382 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 383 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 384 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
385};
386
6154021b 387/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 388
bbf60fe6 389STATIC int
704d4215 390S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 391{
7918f24d
NC
392 PERL_ARGS_ASSERT_TOKEREPORT;
393
bbf60fe6 394 if (DEBUG_T_TEST) {
bd61b366 395 const char *name = NULL;
bbf60fe6 396 enum token_type type = TOKENTYPE_NONE;
f54cb97a 397 const struct debug_tokens *p;
396482e1 398 SV* const report = newSVpvs("<== ");
bbf60fe6 399
f54cb97a 400 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
401 if (p->token == (int)rv) {
402 name = p->name;
403 type = p->type;
404 break;
405 }
406 }
407 if (name)
54667de8 408 Perl_sv_catpv(aTHX_ report, name);
74736ae6 409 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 410 {
bbf60fe6 411 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
412 if ((char)rv == 'p')
413 sv_catpvs(report, " (pending identifier)");
414 }
bbf60fe6 415 else if (!rv)
396482e1 416 sv_catpvs(report, "EOF");
bbf60fe6
DM
417 else
418 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
419 switch (type) {
420 case TOKENTYPE_NONE:
bbf60fe6
DM
421 break;
422 case TOKENTYPE_IVAL:
704d4215 423 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
424 break;
425 case TOKENTYPE_OPNUM:
426 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 427 PL_op_name[lvalp->ival]);
bbf60fe6
DM
428 break;
429 case TOKENTYPE_PVAL:
704d4215 430 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
431 break;
432 case TOKENTYPE_OPVAL:
704d4215 433 if (lvalp->opval) {
401441c0 434 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
435 PL_op_name[lvalp->opval->op_type]);
436 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 437 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 438 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
439 }
440
441 }
401441c0 442 else
396482e1 443 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
444 break;
445 }
b6007c36 446 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
447 };
448 return (int)rv;
998054bd
SC
449}
450
b6007c36
DM
451
452/* print the buffer with suitable escapes */
453
454STATIC void
15f169a1 455S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 456{
396482e1 457 SV* const tmp = newSVpvs("");
7918f24d
NC
458
459 PERL_ARGS_ASSERT_PRINTBUF;
460
5d37acd6 461 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 462 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 463 GCC_DIAG_RESTORE;
b6007c36
DM
464 SvREFCNT_dec(tmp);
465}
466
8fa7f367
JH
467#endif
468
8290c323
NC
469static int
470S_deprecate_commaless_var_list(pTHX) {
471 PL_expect = XTERM;
472 deprecate("comma-less variable list");
473 return REPORT(','); /* grandfather non-comma-format format */
474}
475
ffb4593c
NT
476/*
477 * S_ao
478 *
f393a21a
FC
479 * This subroutine looks for an '=' next to the operator that has just been
480 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
481 */
482
76e3520e 483STATIC int
cea2e8a9 484S_ao(pTHX_ int toketype)
a0d0e21e 485{
3280af22
NIS
486 if (*PL_bufptr == '=') {
487 PL_bufptr++;
a0d0e21e 488 if (toketype == ANDAND)
6154021b 489 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 490 else if (toketype == OROR)
6154021b 491 pl_yylval.ival = OP_ORASSIGN;
c963b151 492 else if (toketype == DORDOR)
6154021b 493 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
494 toketype = ASSIGNOP;
495 }
496 return toketype;
497}
498
ffb4593c
NT
499/*
500 * S_no_op
501 * When Perl expects an operator and finds something else, no_op
502 * prints the warning. It always prints "<something> found where
503 * operator expected. It prints "Missing semicolon on previous line?"
504 * if the surprise occurs at the start of the line. "do you need to
505 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
506 * where the compiler doesn't know if foo is a method call or a function.
507 * It prints "Missing operator before end of line" if there's nothing
508 * after the missing operator, or "... before <...>" if there is something
509 * after the missing operator.
510 */
511
76e3520e 512STATIC void
15f169a1 513S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 514{
9d4ba2ae
AL
515 char * const oldbp = PL_bufptr;
516 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 517
7918f24d
NC
518 PERL_ARGS_ASSERT_NO_OP;
519
1189a94a
GS
520 if (!s)
521 s = oldbp;
07c798fb 522 else
1189a94a 523 PL_bufptr = s;
734ab321 524 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
525 if (ckWARN_d(WARN_SYNTAX)) {
526 if (is_first)
527 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
528 "\t(Missing semicolon on previous line?)\n");
529 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 530 const char *t;
8a2bca12 531 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 532 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 533 NOOP;
56da5a46
RGS
534 if (t < PL_bufptr && isSPACE(*t))
535 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
536 "\t(Do you need to predeclare %"UTF8f"?)\n",
537 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
538 }
539 else {
540 assert(s >= oldbp);
541 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
542 "\t(Missing operator before %"UTF8f"?)\n",
543 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 544 }
07c798fb 545 }
3280af22 546 PL_bufptr = oldbp;
8990e307
LW
547}
548
ffb4593c
NT
549/*
550 * S_missingterm
551 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 552 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
553 * If we're in a delimited string and the delimiter is a control
554 * character, it's reformatted into a two-char sequence like ^C.
555 * This is fatal.
556 */
557
76e3520e 558STATIC void
cea2e8a9 559S_missingterm(pTHX_ char *s)
8990e307
LW
560{
561 char tmpbuf[3];
562 char q;
563 if (s) {
9d4ba2ae 564 char * const nl = strrchr(s,'\n');
d2719217 565 if (nl)
8990e307
LW
566 *nl = '\0';
567 }
ca8b19a7 568 else if ((U8) PL_multi_close < 32) {
8990e307 569 *tmpbuf = '^';
585ec06d 570 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
571 tmpbuf[2] = '\0';
572 s = tmpbuf;
573 }
574 else {
eb160463 575 *tmpbuf = (char)PL_multi_close;
8990e307
LW
576 tmpbuf[1] = '\0';
577 s = tmpbuf;
578 }
579 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 580 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 581}
79072805 582
dd0ac2b9
FC
583#include "feature.h"
584
0d863452 585/*
0d863452
RH
586 * Check whether the named feature is enabled.
587 */
26ea9e12 588bool
3fff3427 589Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 590{
4a731d7b 591 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
592
593 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
594
595 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 596
26ea9e12
NC
597 if (namelen > MAX_FEATURE_LEN)
598 return FALSE;
3fff3427 599 memcpy(&he_name[8], name, namelen);
7d69d4a6 600
c8ca97b0
NC
601 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
602 REFCOUNTED_HE_EXISTS));
0d863452
RH
603}
604
ffb4593c 605/*
9cbb5ea2
GS
606 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
607 * utf16-to-utf8-reversed.
ffb4593c
NT
608 */
609
c39cd008
GS
610#ifdef PERL_CR_FILTER
611static void
612strip_return(SV *sv)
613{
eb578fdb
KW
614 const char *s = SvPVX_const(sv);
615 const char * const e = s + SvCUR(sv);
7918f24d
NC
616
617 PERL_ARGS_ASSERT_STRIP_RETURN;
618
c39cd008
GS
619 /* outer loop optimized to do nothing if there are no CR-LFs */
620 while (s < e) {
621 if (*s++ == '\r' && *s == '\n') {
622 /* hit a CR-LF, need to copy the rest */
eb578fdb 623 char *d = s - 1;
c39cd008
GS
624 *d++ = *s++;
625 while (s < e) {
626 if (*s == '\r' && s[1] == '\n')
627 s++;
628 *d++ = *s++;
629 }
630 SvCUR(sv) -= s - d;
631 return;
632 }
633 }
634}
a868473f 635
76e3520e 636STATIC I32
c39cd008 637S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 638{
f54cb97a 639 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
640 if (count > 0 && !maxlen)
641 strip_return(sv);
642 return count;
a868473f
NIS
643}
644#endif
645
ffb4593c 646/*
8eaa0acf
Z
647=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
648
649Creates and initialises a new lexer/parser state object, supplying
650a context in which to lex and parse from a new source of Perl code.
651A pointer to the new state object is placed in L</PL_parser>. An entry
652is made on the save stack so that upon unwinding the new state object
653will be destroyed and the former value of L</PL_parser> will be restored.
654Nothing else need be done to clean up the parsing context.
655
656The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
657non-null, provides a string (in SV form) containing code to be parsed.
658A copy of the string is made, so subsequent modification of I<line>
659does not affect parsing. I<rsfp>, if non-null, provides an input stream
660from which code will be read to be parsed. If both are non-null, the
661code in I<line> comes first and must consist of complete lines of input,
662and I<rsfp> supplies the remainder of the source.
663
e368b3bd
FC
664The I<flags> parameter is reserved for future use. Currently it is only
665used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
666
667=cut
668*/
ffb4593c 669
27fcb6ee 670/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
671 can share filters with the current parser.
672 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
673 caller, hence isn't owned by the parser, so shouldn't be closed on parser
674 destruction. This is used to handle the case of defaulting to reading the
675 script from the standard input because no filename was given on the command
676 line (without getting confused by situation where STDIN has been closed, so
677 the script handle is opened on fd 0) */
27fcb6ee 678
a0d0e21e 679void
8eaa0acf 680Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 681{
6ef55633 682 const char *s = NULL;
5486870f 683 yy_parser *parser, *oparser;
60d63348 684 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 685 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
686
687 /* create and initialise a parser */
688
199e78b7 689 Newxz(parser, 1, yy_parser);
5486870f 690 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
691 PL_parser = parser;
692
28ac2b49
Z
693 parser->stack = NULL;
694 parser->ps = NULL;
695 parser->stack_size = 0;
acdf0a21 696
e3abe207
DM
697 /* on scope exit, free this parser and restore any outer one */
698 SAVEPARSER(parser);
7c4baf47 699 parser->saved_curcop = PL_curcop;
e3abe207 700
acdf0a21 701 /* initialise lexer state */
8990e307 702
fb205e7a 703 parser->nexttoke = 0;
ca4cfd28 704 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 705 parser->copline = parser->preambling = NOLINE;
5afb0a62 706 parser->lex_state = LEX_NORMAL;
c2598295 707 parser->expect = XSTATE;
2f9285f8 708 parser->rsfp = rsfp;
27fcb6ee
FC
709 parser->rsfp_filters =
710 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
711 ? NULL
712 : MUTABLE_AV(SvREFCNT_inc(
713 oparser->rsfp_filters
714 ? oparser->rsfp_filters
715 : (oparser->rsfp_filters = newAV())
716 ));
2f9285f8 717
199e78b7
DM
718 Newx(parser->lex_brackstack, 120, char);
719 Newx(parser->lex_casestack, 12, char);
720 *parser->lex_casestack = '\0';
d794b522 721 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 722
10efb74f 723 if (line) {
0528fd32 724 STRLEN len;
10efb74f 725 s = SvPV_const(line, len);
0abcdfa4
FC
726 parser->linestr = flags & LEX_START_COPIED
727 ? SvREFCNT_inc_simple_NN(line)
728 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 729 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 730 } else {
bf1b738b 731 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 732 }
f06b5848
DM
733 parser->oldoldbufptr =
734 parser->oldbufptr =
735 parser->bufptr =
736 parser->linestart = SvPVX(parser->linestr);
737 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
738 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
739
740 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
741 |LEX_DONT_CLOSE_RSFP));
742 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
743 |LEX_DONT_CLOSE_RSFP));
737c24fc 744
60d63348 745 parser->in_pod = parser->filtered = 0;
79072805 746}
a687059c 747
e3abe207
DM
748
749/* delete a parser object */
750
751void
752Perl_parser_free(pTHX_ const yy_parser *parser)
753{
7918f24d
NC
754 PERL_ARGS_ASSERT_PARSER_FREE;
755
7c4baf47 756 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
757 SvREFCNT_dec(parser->linestr);
758
87606032 759 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 760 PerlIO_clearerr(parser->rsfp);
799361c3
SH
761 else if (parser->rsfp && (!parser->old_parser ||
762 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 763 PerlIO_close(parser->rsfp);
5486870f 764 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
765 SvREFCNT_dec(parser->lex_stuff);
766 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
767
768 Safefree(parser->lex_brackstack);
769 Safefree(parser->lex_casestack);
770 Safefree(parser->lex_shared);
771 PL_parser = parser->old_parser;
772 Safefree(parser);
773}
774
775void
776Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
777{
3ac7ff8f 778 I32 nexttoke = parser->nexttoke;
3ac7ff8f 779 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 780 while (nexttoke--) {
3ac7ff8f
FC
781 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
782 && parser->nextval[nexttoke].opval
783 && parser->nextval[nexttoke].opval->op_slabbed
784 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 785 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
786 parser->nextval[nexttoke].opval = NULL;
787 }
3ce3dcd9 788 }
e3abe207
DM
789}
790
791
ffb4593c 792/*
f0e67a1d
Z
793=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
794
795Buffer scalar containing the chunk currently under consideration of the
796text currently being lexed. This is always a plain string scalar (for
797which C<SvPOK> is true). It is not intended to be used as a scalar by
798normal scalar means; instead refer to the buffer directly by the pointer
799variables described below.
800
801The lexer maintains various C<char*> pointers to things in the
802C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
803reallocated, all of these pointers must be updated. Don't attempt to
804do this manually, but rather use L</lex_grow_linestr> if you need to
805reallocate the buffer.
806
807The content of the text chunk in the buffer is commonly exactly one
808complete line of input, up to and including a newline terminator,
809but there are situations where it is otherwise. The octets of the
810buffer may be intended to be interpreted as either UTF-8 or Latin-1.
811The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
812flag on this scalar, which may disagree with it.
813
814For direct examination of the buffer, the variable
815L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
816lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
817of these pointers is usually preferable to examination of the scalar
818through normal scalar means.
819
820=for apidoc AmxU|char *|PL_parser-E<gt>bufend
821
822Direct pointer to the end of the chunk of text currently being lexed, the
823end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 824+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
825always located at the end of the buffer, and does not count as part of
826the buffer's contents.
827
828=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
829
830Points to the current position of lexing inside the lexer buffer.
831Characters around this point may be freely examined, within
832the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
833L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
834interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
835
836Lexing code (whether in the Perl core or not) moves this pointer past
837the characters that it consumes. It is also expected to perform some
838bookkeeping whenever a newline character is consumed. This movement
839can be more conveniently performed by the function L</lex_read_to>,
840which handles newlines appropriately.
841
842Interpretation of the buffer's octets can be abstracted out by
843using the slightly higher-level functions L</lex_peek_unichar> and
844L</lex_read_unichar>.
845
846=for apidoc AmxU|char *|PL_parser-E<gt>linestart
847
848Points to the start of the current line inside the lexer buffer.
849This is useful for indicating at which column an error occurred, and
850not much else. This must be updated by any lexing code that consumes
851a newline; the function L</lex_read_to> handles this detail.
852
853=cut
854*/
855
856/*
857=for apidoc Amx|bool|lex_bufutf8
858
859Indicates whether the octets in the lexer buffer
860(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
861of Unicode characters. If not, they should be interpreted as Latin-1
862characters. This is analogous to the C<SvUTF8> flag for scalars.
863
864In UTF-8 mode, it is not guaranteed that the lexer buffer actually
865contains valid UTF-8. Lexing code must be robust in the face of invalid
866encoding.
867
868The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
869is significant, but not the whole story regarding the input character
870encoding. Normally, when a file is being read, the scalar contains octets
871and its C<SvUTF8> flag is off, but the octets should be interpreted as
872UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
873however, the scalar may have the C<SvUTF8> flag on, and in this case its
874octets should be interpreted as UTF-8 unless the C<use bytes> pragma
875is in effect. This logic may change in the future; use this function
876instead of implementing the logic yourself.
877
878=cut
879*/
880
881bool
882Perl_lex_bufutf8(pTHX)
883{
884 return UTF;
885}
886
887/*
888=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
889
890Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 891at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
892pointer to the reallocated buffer. This is necessary before making
893any direct modification of the buffer that would increase its length.
894L</lex_stuff_pvn> provides a more convenient way to insert text into
895the buffer.
896
897Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
898this function updates all of the lexer's variables that point directly
899into the buffer.
900
901=cut
902*/
903
904char *
905Perl_lex_grow_linestr(pTHX_ STRLEN len)
906{
907 SV *linestr;
908 char *buf;
909 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 910 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
911 linestr = PL_parser->linestr;
912 buf = SvPVX(linestr);
913 if (len <= SvLEN(linestr))
914 return buf;
915 bufend_pos = PL_parser->bufend - buf;
916 bufptr_pos = PL_parser->bufptr - buf;
917 oldbufptr_pos = PL_parser->oldbufptr - buf;
918 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
919 linestart_pos = PL_parser->linestart - buf;
920 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
921 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
922 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
923 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 924
f0e67a1d 925 buf = sv_grow(linestr, len);
c7641931 926
f0e67a1d
Z
927 PL_parser->bufend = buf + bufend_pos;
928 PL_parser->bufptr = buf + bufptr_pos;
929 PL_parser->oldbufptr = buf + oldbufptr_pos;
930 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
931 PL_parser->linestart = buf + linestart_pos;
932 if (PL_parser->last_uni)
933 PL_parser->last_uni = buf + last_uni_pos;
934 if (PL_parser->last_lop)
935 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
936 if (PL_parser->lex_shared->re_eval_start)
937 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
938 return buf;
939}
940
941/*
83aa740e 942=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
943
944Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
945immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
946reallocating the buffer if necessary. This means that lexing code that
947runs later will see the characters as if they had appeared in the input.
948It is not recommended to do this as part of normal parsing, and most
949uses of this facility run the risk of the inserted characters being
950interpreted in an unintended manner.
951
952The string to be inserted is represented by I<len> octets starting
953at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
954according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
955The characters are recoded for the lexer buffer, according to how the
956buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 957to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
958function is more convenient.
959
960=cut
961*/
962
963void
83aa740e 964Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 965{
749123ff 966 dVAR;
f0e67a1d
Z
967 char *bufptr;
968 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
969 if (flags & ~(LEX_STUFF_UTF8))
970 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
971 if (UTF) {
972 if (flags & LEX_STUFF_UTF8) {
973 goto plain_copy;
974 } else {
54d004e8 975 STRLEN highhalf = 0; /* Count of variants */
83aa740e 976 const char *p, *e = pv+len;
54d004e8
KW
977 for (p = pv; p != e; p++) {
978 if (! UTF8_IS_INVARIANT(*p)) {
979 highhalf++;
980 }
981 }
f0e67a1d
Z
982 if (!highhalf)
983 goto plain_copy;
984 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
985 bufptr = PL_parser->bufptr;
986 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
987 SvCUR_set(PL_parser->linestr,
988 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
989 PL_parser->bufend += len+highhalf;
990 for (p = pv; p != e; p++) {
991 U8 c = (U8)*p;
54d004e8
KW
992 if (! UTF8_IS_INVARIANT(c)) {
993 *bufptr++ = UTF8_TWO_BYTE_HI(c);
994 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
995 } else {
996 *bufptr++ = (char)c;
997 }
998 }
999 }
1000 } else {
1001 if (flags & LEX_STUFF_UTF8) {
1002 STRLEN highhalf = 0;
83aa740e 1003 const char *p, *e = pv+len;
f0e67a1d
Z
1004 for (p = pv; p != e; p++) {
1005 U8 c = (U8)*p;
54d004e8 1006 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1007 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1008 "non-Latin-1 character into Latin-1 input");
54d004e8 1009 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1010 p++;
1011 highhalf++;
54d004e8 1012 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1013 /* malformed UTF-8 */
1014 ENTER;
1015 SAVESPTR(PL_warnhook);
1016 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1017 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1018 LEAVE;
1019 }
1020 }
1021 if (!highhalf)
1022 goto plain_copy;
1023 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1024 bufptr = PL_parser->bufptr;
1025 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1026 SvCUR_set(PL_parser->linestr,
1027 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1028 PL_parser->bufend += len-highhalf;
54d004e8
KW
1029 p = pv;
1030 while (p < e) {
1031 if (UTF8_IS_INVARIANT(*p)) {
1032 *bufptr++ = *p;
1033 p++;
f0e67a1d 1034 }
54d004e8
KW
1035 else {
1036 assert(p < e -1 );
94bb8c36 1037 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1038 p += 2;
1039 }
f0e67a1d
Z
1040 }
1041 } else {
54d004e8 1042 plain_copy:
f0e67a1d
Z
1043 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1044 bufptr = PL_parser->bufptr;
1045 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1046 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1047 PL_parser->bufend += len;
1048 Copy(pv, bufptr, len, char);
1049 }
1050 }
1051}
1052
1053/*
9dcc53ea
Z
1054=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1055
1056Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1057immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1058reallocating the buffer if necessary. This means that lexing code that
1059runs later will see the characters as if they had appeared in the input.
1060It is not recommended to do this as part of normal parsing, and most
1061uses of this facility run the risk of the inserted characters being
1062interpreted in an unintended manner.
1063
1064The string to be inserted is represented by octets starting at I<pv>
1065and continuing to the first nul. These octets are interpreted as either
1066UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1067in I<flags>. The characters are recoded for the lexer buffer, according
1068to how the buffer is currently being interpreted (L</lex_bufutf8>).
1069If it is not convenient to nul-terminate a string to be inserted, the
1070L</lex_stuff_pvn> function is more appropriate.
1071
1072=cut
1073*/
1074
1075void
1076Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1077{
1078 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1079 lex_stuff_pvn(pv, strlen(pv), flags);
1080}
1081
1082/*
f0e67a1d
Z
1083=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1084
1085Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1086immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1087reallocating the buffer if necessary. This means that lexing code that
1088runs later will see the characters as if they had appeared in the input.
1089It is not recommended to do this as part of normal parsing, and most
1090uses of this facility run the risk of the inserted characters being
1091interpreted in an unintended manner.
1092
1093The string to be inserted is the string value of I<sv>. The characters
1094are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1095being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1096not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1097need to construct a scalar.
1098
1099=cut
1100*/
1101
1102void
1103Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1104{
1105 char *pv;
1106 STRLEN len;
1107 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1108 if (flags)
1109 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1110 pv = SvPV(sv, len);
1111 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1112}
1113
1114/*
1115=for apidoc Amx|void|lex_unstuff|char *ptr
1116
1117Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1118I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1119This hides the discarded text from any lexing code that runs later,
1120as if the text had never appeared.
1121
1122This is not the normal way to consume lexed text. For that, use
1123L</lex_read_to>.
1124
1125=cut
1126*/
1127
1128void
1129Perl_lex_unstuff(pTHX_ char *ptr)
1130{
1131 char *buf, *bufend;
1132 STRLEN unstuff_len;
1133 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1134 buf = PL_parser->bufptr;
1135 if (ptr < buf)
1136 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1137 if (ptr == buf)
1138 return;
1139 bufend = PL_parser->bufend;
1140 if (ptr > bufend)
1141 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1142 unstuff_len = ptr - buf;
1143 Move(ptr, buf, bufend+1-ptr, char);
1144 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1145 PL_parser->bufend = bufend - unstuff_len;
1146}
1147
1148/*
1149=for apidoc Amx|void|lex_read_to|char *ptr
1150
1151Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1152to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1153performing the correct bookkeeping whenever a newline character is passed.
1154This is the normal way to consume lexed text.
1155
1156Interpretation of the buffer's octets can be abstracted out by
1157using the slightly higher-level functions L</lex_peek_unichar> and
1158L</lex_read_unichar>.
1159
1160=cut
1161*/
1162
1163void
1164Perl_lex_read_to(pTHX_ char *ptr)
1165{
1166 char *s;
1167 PERL_ARGS_ASSERT_LEX_READ_TO;
1168 s = PL_parser->bufptr;
1169 if (ptr < s || ptr > PL_parser->bufend)
1170 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1171 for (; s != ptr; s++)
1172 if (*s == '\n') {
83944c01 1173 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1174 PL_parser->linestart = s+1;
1175 }
1176 PL_parser->bufptr = ptr;
1177}
1178
1179/*
1180=for apidoc Amx|void|lex_discard_to|char *ptr
1181
1182Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1183up to I<ptr>. The remaining content of the buffer will be moved, and
1184all pointers into the buffer updated appropriately. I<ptr> must not
1185be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1186it is not permitted to discard text that has yet to be lexed.
1187
1188Normally it is not necessarily to do this directly, because it suffices to
1189use the implicit discarding behaviour of L</lex_next_chunk> and things
1190based on it. However, if a token stretches across multiple lines,
1f317c95 1191and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1192that purpose, then after completion of the token it would be wise to
1193explicitly discard the now-unneeded earlier lines, to avoid future
1194multi-line tokens growing the buffer without bound.
1195
1196=cut
1197*/
1198
1199void
1200Perl_lex_discard_to(pTHX_ char *ptr)
1201{
1202 char *buf;
1203 STRLEN discard_len;
1204 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1205 buf = SvPVX(PL_parser->linestr);
1206 if (ptr < buf)
1207 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1208 if (ptr == buf)
1209 return;
1210 if (ptr > PL_parser->bufptr)
1211 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1212 discard_len = ptr - buf;
1213 if (PL_parser->oldbufptr < ptr)
1214 PL_parser->oldbufptr = ptr;
1215 if (PL_parser->oldoldbufptr < ptr)
1216 PL_parser->oldoldbufptr = ptr;
1217 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1218 PL_parser->last_uni = NULL;
1219 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1220 PL_parser->last_lop = NULL;
1221 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1222 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1223 PL_parser->bufend -= discard_len;
1224 PL_parser->bufptr -= discard_len;
1225 PL_parser->oldbufptr -= discard_len;
1226 PL_parser->oldoldbufptr -= discard_len;
1227 if (PL_parser->last_uni)
1228 PL_parser->last_uni -= discard_len;
1229 if (PL_parser->last_lop)
1230 PL_parser->last_lop -= discard_len;
1231}
1232
1233/*
1234=for apidoc Amx|bool|lex_next_chunk|U32 flags
1235
1236Reads in the next chunk of text to be lexed, appending it to
1237L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1238looked to the end of the current chunk and wants to know more. It is
1239usual, but not necessary, for lexing to have consumed the entirety of
1240the current chunk at this time.
1241
1242If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1243chunk (i.e., the current chunk has been entirely consumed), normally the
1244current chunk will be discarded at the same time that the new chunk is
1245read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1246will not be discarded. If the current chunk has not been entirely
1247consumed, then it will not be discarded regardless of the flag.
1248
1249Returns true if some new text was added to the buffer, or false if the
1250buffer has reached the end of the input text.
1251
1252=cut
1253*/
1254
1255#define LEX_FAKE_EOF 0x80000000
112d1284 1256#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1257
1258bool
1259Perl_lex_next_chunk(pTHX_ U32 flags)
1260{
1261 SV *linestr;
1262 char *buf;
1263 STRLEN old_bufend_pos, new_bufend_pos;
1264 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1265 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1266 bool got_some_for_debugger = 0;
f0e67a1d 1267 bool got_some;
112d1284 1268 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1269 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1270 linestr = PL_parser->linestr;
1271 buf = SvPVX(linestr);
1272 if (!(flags & LEX_KEEP_PREVIOUS) &&
1273 PL_parser->bufptr == PL_parser->bufend) {
1274 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1275 linestart_pos = 0;
1276 if (PL_parser->last_uni != PL_parser->bufend)
1277 PL_parser->last_uni = NULL;
1278 if (PL_parser->last_lop != PL_parser->bufend)
1279 PL_parser->last_lop = NULL;
1280 last_uni_pos = last_lop_pos = 0;
1281 *buf = 0;
1282 SvCUR(linestr) = 0;
1283 } else {
1284 old_bufend_pos = PL_parser->bufend - buf;
1285 bufptr_pos = PL_parser->bufptr - buf;
1286 oldbufptr_pos = PL_parser->oldbufptr - buf;
1287 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1288 linestart_pos = PL_parser->linestart - buf;
1289 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1290 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1291 }
1292 if (flags & LEX_FAKE_EOF) {
1293 goto eof;
60d63348 1294 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1295 got_some = 0;
1296 } else if (filter_gets(linestr, old_bufend_pos)) {
1297 got_some = 1;
17cc9359 1298 got_some_for_debugger = 1;
112d1284
FC
1299 } else if (flags & LEX_NO_TERM) {
1300 got_some = 0;
f0e67a1d 1301 } else {
580561a3
Z
1302 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1303 sv_setpvs(linestr, "");
f0e67a1d
Z
1304 eof:
1305 /* End of real input. Close filehandle (unless it was STDIN),
1306 * then add implicit termination.
1307 */
87606032 1308 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1309 PerlIO_clearerr(PL_parser->rsfp);
1310 else if (PL_parser->rsfp)
1311 (void)PerlIO_close(PL_parser->rsfp);
1312 PL_parser->rsfp = NULL;
60d63348 1313 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1314 if (!PL_in_eval && PL_minus_p) {
1315 sv_catpvs(linestr,
1316 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1317 PL_minus_n = PL_minus_p = 0;
1318 } else if (!PL_in_eval && PL_minus_n) {
1319 sv_catpvs(linestr, /*{*/";}");
1320 PL_minus_n = 0;
1321 } else
1322 sv_catpvs(linestr, ";");
1323 got_some = 1;
1324 }
1325 buf = SvPVX(linestr);
1326 new_bufend_pos = SvCUR(linestr);
1327 PL_parser->bufend = buf + new_bufend_pos;
1328 PL_parser->bufptr = buf + bufptr_pos;
1329 PL_parser->oldbufptr = buf + oldbufptr_pos;
1330 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1331 PL_parser->linestart = buf + linestart_pos;
1332 if (PL_parser->last_uni)
1333 PL_parser->last_uni = buf + last_uni_pos;
1334 if (PL_parser->last_lop)
1335 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1336 if (PL_parser->preambling != NOLINE) {
1337 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1338 PL_parser->preambling = NOLINE;
1339 }
17cc9359 1340 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1341 PL_curstash != PL_debstash) {
1342 /* debugger active and we're not compiling the debugger code,
1343 * so store the line into the debugger's array of lines
1344 */
1345 update_debugger_info(NULL, buf+old_bufend_pos,
1346 new_bufend_pos-old_bufend_pos);
1347 }
1348 return got_some;
1349}
1350
1351/*
1352=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1353
1354Looks ahead one (Unicode) character in the text currently being lexed.
1355Returns the codepoint (unsigned integer value) of the next character,
1356or -1 if lexing has reached the end of the input text. To consume the
1357peeked character, use L</lex_read_unichar>.
1358
1359If the next character is in (or extends into) the next chunk of input
1360text, the next chunk will be read in. Normally the current chunk will be
1361discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1362then the current chunk will not be discarded.
1363
1364If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1365is encountered, an exception is generated.
1366
1367=cut
1368*/
1369
1370I32
1371Perl_lex_peek_unichar(pTHX_ U32 flags)
1372{
749123ff 1373 dVAR;
f0e67a1d
Z
1374 char *s, *bufend;
1375 if (flags & ~(LEX_KEEP_PREVIOUS))
1376 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1377 s = PL_parser->bufptr;
1378 bufend = PL_parser->bufend;
1379 if (UTF) {
1380 U8 head;
1381 I32 unichar;
1382 STRLEN len, retlen;
1383 if (s == bufend) {
1384 if (!lex_next_chunk(flags))
1385 return -1;
1386 s = PL_parser->bufptr;
1387 bufend = PL_parser->bufend;
1388 }
1389 head = (U8)*s;
54d004e8 1390 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1391 return head;
54d004e8
KW
1392 if (UTF8_IS_START(head)) {
1393 len = UTF8SKIP(&head);
f0e67a1d
Z
1394 while ((STRLEN)(bufend-s) < len) {
1395 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1396 break;
1397 s = PL_parser->bufptr;
1398 bufend = PL_parser->bufend;
1399 }
1400 }
c80e42f3 1401 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1402 if (retlen == (STRLEN)-1) {
1403 /* malformed UTF-8 */
1404 ENTER;
1405 SAVESPTR(PL_warnhook);
1406 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1407 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1408 LEAVE;
1409 }
1410 return unichar;
1411 } else {
1412 if (s == bufend) {
1413 if (!lex_next_chunk(flags))
1414 return -1;
1415 s = PL_parser->bufptr;
1416 }
1417 return (U8)*s;
1418 }
1419}
1420
1421/*
1422=for apidoc Amx|I32|lex_read_unichar|U32 flags
1423
1424Reads the next (Unicode) character in the text currently being lexed.
1425Returns the codepoint (unsigned integer value) of the character read,
1426and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1427if lexing has reached the end of the input text. To non-destructively
1428examine the next character, use L</lex_peek_unichar> instead.
1429
1430If the next character is in (or extends into) the next chunk of input
1431text, the next chunk will be read in. Normally the current chunk will be
1432discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1433then the current chunk will not be discarded.
1434
1435If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1436is encountered, an exception is generated.
1437
1438=cut
1439*/
1440
1441I32
1442Perl_lex_read_unichar(pTHX_ U32 flags)
1443{
1444 I32 c;
1445 if (flags & ~(LEX_KEEP_PREVIOUS))
1446 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1447 c = lex_peek_unichar(flags);
1448 if (c != -1) {
1449 if (c == '\n')
83944c01 1450 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1451 if (UTF)
1452 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1453 else
1454 ++(PL_parser->bufptr);
f0e67a1d
Z
1455 }
1456 return c;
1457}
1458
1459/*
1460=for apidoc Amx|void|lex_read_space|U32 flags
1461
1462Reads optional spaces, in Perl style, in the text currently being
1463lexed. The spaces may include ordinary whitespace characters and
1464Perl-style comments. C<#line> directives are processed if encountered.
1465L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1466at a non-space character (or the end of the input text).
1467
1468If spaces extend into the next chunk of input text, the next chunk will
1469be read in. Normally the current chunk will be discarded at the same
1470time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1471chunk will not be discarded.
1472
1473=cut
1474*/
1475
21791330 1476#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1477#define LEX_NO_NEXT_CHUNK 0x80000000
1478
f0e67a1d
Z
1479void
1480Perl_lex_read_space(pTHX_ U32 flags)
1481{
1482 char *s, *bufend;
21791330 1483 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1484 bool need_incline = 0;
21791330 1485 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1486 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1487 s = PL_parser->bufptr;
1488 bufend = PL_parser->bufend;
1489 while (1) {
1490 char c = *s;
1491 if (c == '#') {
1492 do {
1493 c = *++s;
1494 } while (!(c == '\n' || (c == 0 && s == bufend)));
1495 } else if (c == '\n') {
1496 s++;
21791330
FC
1497 if (can_incline) {
1498 PL_parser->linestart = s;
1499 if (s == bufend)
1500 need_incline = 1;
1501 else
1502 incline(s);
1503 }
f0e67a1d
Z
1504 } else if (isSPACE(c)) {
1505 s++;
1506 } else if (c == 0 && s == bufend) {
1507 bool got_more;
65c68e17 1508 line_t l;
f0998909
Z
1509 if (flags & LEX_NO_NEXT_CHUNK)
1510 break;
f0e67a1d 1511 PL_parser->bufptr = s;
65c68e17 1512 l = CopLINE(PL_curcop);
851b527a 1513 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1514 got_more = lex_next_chunk(flags);
65c68e17 1515 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1516 s = PL_parser->bufptr;
1517 bufend = PL_parser->bufend;
1518 if (!got_more)
1519 break;
21791330 1520 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1521 incline(s);
1522 need_incline = 0;
1523 }
1524 } else {
1525 break;
1526 }
1527 }
f0e67a1d
Z
1528 PL_parser->bufptr = s;
1529}
1530
1531/*
fe788d6b
PM
1532
1533=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1534
1535This function performs syntax checking on a prototype, C<proto>.
1536If C<warn> is true, any illegal characters or mismatched brackets
1537will trigger illegalproto warnings, declaring that they were
1538detected in the prototype for C<name>.
1539
1540The return value is C<true> if this is a valid prototype, and
1541C<false> if it is not, regardless of whether C<warn> was C<true> or
1542C<false>.
1543
1544Note that C<NULL> is a valid C<proto> and will always return C<true>.
1545
1546=cut
1547
1548 */
1549
1550bool
1551Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1552{
1553 STRLEN len, origlen;
1554 char *p = proto ? SvPV(proto, len) : NULL;
1555 bool bad_proto = FALSE;
1556 bool in_brackets = FALSE;
1557 bool after_slash = FALSE;
1558 char greedy_proto = ' ';
1559 bool proto_after_greedy_proto = FALSE;
1560 bool must_be_last = FALSE;
1561 bool underscore = FALSE;
f791a21a 1562 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1563
1564 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1565
1566 if (!proto)
1567 return TRUE;
1568
1569 origlen = len;
1570 for (; len--; p++) {
1571 if (!isSPACE(*p)) {
1572 if (must_be_last)
1573 proto_after_greedy_proto = TRUE;
f791a21a
PM
1574 if (underscore) {
1575 if (!strchr(";@%", *p))
1576 bad_proto_after_underscore = TRUE;
1577 underscore = FALSE;
1578 }
fe788d6b
PM
1579 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1580 bad_proto = TRUE;
1581 }
1582 else {
fe788d6b
PM
1583 if (*p == '[')
1584 in_brackets = TRUE;
1585 else if (*p == ']')
1586 in_brackets = FALSE;
1587 else if ((*p == '@' || *p == '%') &&
1588 !after_slash &&
1589 !in_brackets ) {
1590 must_be_last = TRUE;
1591 greedy_proto = *p;
1592 }
1593 else if (*p == '_')
f791a21a 1594 underscore = TRUE;
fe788d6b
PM
1595 }
1596 if (*p == '\\')
1597 after_slash = TRUE;
1598 else
1599 after_slash = FALSE;
1600 }
1601 }
1602
1603 if (warn) {
b54d603d 1604 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1605 p -= origlen;
b54d603d
PM
1606 p = SvUTF8(proto)
1607 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1608 origlen, UNI_DISPLAY_ISPRINT)
1609 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1610
fe788d6b
PM
1611 if (proto_after_greedy_proto)
1612 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1613 "Prototype after '%c' for %"SVf" : %s",
1614 greedy_proto, SVfARG(name), p);
50278ed0
PM
1615 if (in_brackets)
1616 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1617 "Missing ']' in prototype for %"SVf" : %s",
1618 SVfARG(name), p);
b54d603d 1619 if (bad_proto)
fe788d6b 1620 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1621 "Illegal character in prototype for %"SVf" : %s",
1622 SVfARG(name), p);
1623 if (bad_proto_after_underscore)
1624 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1625 "Illegal character after '_' in prototype for %"SVf" : %s",
1626 SVfARG(name), p);
fe788d6b
PM
1627 }
1628
1629 return (! (proto_after_greedy_proto || bad_proto) );
1630}
1631
1632/*
ffb4593c
NT
1633 * S_incline
1634 * This subroutine has nothing to do with tilting, whether at windmills
1635 * or pinball tables. Its name is short for "increment line". It
57843af0 1636 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1637 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1638 * # line 500 "foo.pm"
1639 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1640 */
1641
76e3520e 1642STATIC void
d9095cec 1643S_incline(pTHX_ const char *s)
463ee0b2 1644{
d9095cec
NC
1645 const char *t;
1646 const char *n;
1647 const char *e;
8818d409 1648 line_t line_num;
463ee0b2 1649
7918f24d
NC
1650 PERL_ARGS_ASSERT_INCLINE;
1651
83944c01 1652 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1653 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1654 && s+1 == PL_bufend && *s == ';') {
1655 /* fake newline in string eval */
1656 CopLINE_dec(PL_curcop);
1657 return;
1658 }
463ee0b2
LW
1659 if (*s++ != '#')
1660 return;
d4c19fe8
AL
1661 while (SPACE_OR_TAB(*s))
1662 s++;
73659bf1
GS
1663 if (strnEQ(s, "line", 4))
1664 s += 4;
1665 else
1666 return;
084592ab 1667 if (SPACE_OR_TAB(*s))
73659bf1 1668 s++;
4e553d73 1669 else
73659bf1 1670 return;
d4c19fe8
AL
1671 while (SPACE_OR_TAB(*s))
1672 s++;
463ee0b2
LW
1673 if (!isDIGIT(*s))
1674 return;
d4c19fe8 1675
463ee0b2
LW
1676 n = s;
1677 while (isDIGIT(*s))
1678 s++;
07714eb4 1679 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1680 return;
bf4acbe4 1681 while (SPACE_OR_TAB(*s))
463ee0b2 1682 s++;
73659bf1 1683 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1684 s++;
73659bf1
GS
1685 e = t + 1;
1686 }
463ee0b2 1687 else {
c35e046a
AL
1688 t = s;
1689 while (!isSPACE(*t))
1690 t++;
73659bf1 1691 e = t;
463ee0b2 1692 }
bf4acbe4 1693 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1694 e++;
1695 if (*e != '\n' && *e != '\0')
1696 return; /* false alarm */
1697
96e440d2 1698 line_num = grok_atou(n, &e) - 1;
8818d409 1699
f4dd75d9 1700 if (t - s > 0) {
d9095cec 1701 const STRLEN len = t - s;
3df32bda 1702
d36ee5be 1703 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1704 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1705 * to *{"::_<newfilename"} */
44867030
NC
1706 /* However, the long form of evals is only turned on by the
1707 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1708 GV * const cfgv = CopFILEGV(PL_curcop);
1709 if (cfgv) {
38bd7ad8
FC
1710 char smallbuf[128];
1711 STRLEN tmplen2 = len;
44867030 1712 char *tmpbuf2;
449dd039 1713 GV *gv2;
44867030
NC
1714
1715 if (tmplen2 + 2 <= sizeof smallbuf)
1716 tmpbuf2 = smallbuf;
1717 else
1718 Newx(tmpbuf2, tmplen2 + 2, char);
1719
38bd7ad8
FC
1720 tmpbuf2[0] = '_';
1721 tmpbuf2[1] = '<';
44867030
NC
1722
1723 memcpy(tmpbuf2 + 2, s, tmplen2);
1724 tmplen2 += 2;
1725
8a5ee598 1726 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1727 if (!isGV(gv2)) {
8a5ee598 1728 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1729 /* adjust ${"::_<newfilename"} to store the new file name */
1730 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1731 /* The line number may differ. If that is the case,
1732 alias the saved lines that are in the array.
1733 Otherwise alias the whole array. */
1734 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1735 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1736 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1737 }
38bd7ad8
FC
1738 else if (GvAV(cfgv)) {
1739 AV * const av = GvAV(cfgv);
8818d409
FC
1740 const I32 start = CopLINE(PL_curcop)+1;
1741 I32 items = AvFILLp(av) - start;
1742 if (items > 0) {
1743 AV * const av2 = GvAVn(gv2);
1744 SV **svp = AvARRAY(av) + start;
1745 I32 l = (I32)line_num+1;
1746 while (items--)
1747 av_store(av2, l++, SvREFCNT_inc(*svp++));
1748 }
1749 }
e5527e4b 1750 }
44867030
NC
1751
1752 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1753 }
e66cf94c 1754 }
05ec9bb3 1755 CopFILE_free(PL_curcop);
449dd039 1756 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1757 }
8818d409 1758 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1759}
1760
21791330
FC
1761#define skipspace(s) skipspace_flags(s, 0)
1762
29595ff2 1763
80a702cd 1764STATIC void
15f169a1 1765S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1766{
1767 AV *av = CopFILEAVx(PL_curcop);
1768 if (av) {
7f1c3e8c
FC
1769 SV * sv;
1770 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1771 else {
1772 sv = *av_fetch(av, 0, 1);
1773 SvUPGRADE(sv, SVt_PVMG);
1774 }
1775 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1776 if (orig_sv)
7f1c3e8c 1777 sv_catsv(sv, orig_sv);
5fa550fb 1778 else
7f1c3e8c
FC
1779 sv_catpvn(sv, buf, len);
1780 if (!SvIOK(sv)) {
1781 (void)SvIOK_on(sv);
1782 SvIV_set(sv, 0);
1783 }
1784 if (PL_parser->preambling == NOLINE)
1785 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1786 }
1787}
1788
ffb4593c
NT
1789/*
1790 * S_skipspace
1791 * Called to gobble the appropriate amount and type of whitespace.
1792 * Skips comments as well.
1793 */
1794
76e3520e 1795STATIC char *
21791330 1796S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1797{
21791330 1798 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1799 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1800 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1801 s++;
f0e67a1d
Z
1802 } else {
1803 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1804 PL_bufptr = s;
21791330 1805 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1806 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1807 LEX_NO_NEXT_CHUNK : 0));
3280af22 1808 s = PL_bufptr;
f0e67a1d
Z
1809 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1810 if (PL_linestart > PL_bufptr)
1811 PL_bufptr = PL_linestart;
1812 return s;
463ee0b2 1813 }
5db06880 1814 return s;
a687059c 1815}
378cc40b 1816
ffb4593c
NT
1817/*
1818 * S_check_uni
1819 * Check the unary operators to ensure there's no ambiguity in how they're
1820 * used. An ambiguous piece of code would be:
1821 * rand + 5
1822 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1823 * the +5 is its argument.
1824 */
1825
76e3520e 1826STATIC void
cea2e8a9 1827S_check_uni(pTHX)
ba106d47 1828{
d4c19fe8
AL
1829 const char *s;
1830 const char *t;
2f3197b3 1831
3280af22 1832 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1833 return;
3280af22
NIS
1834 while (isSPACE(*PL_last_uni))
1835 PL_last_uni++;
c35e046a 1836 s = PL_last_uni;
8a2bca12 1837 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1838 s++;
3280af22 1839 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1840 return;
6136c704 1841
9b387841
NC
1842 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1843 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1844 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1845}
1846
ffb4593c
NT
1847/*
1848 * LOP : macro to build a list operator. Its behaviour has been replaced
1849 * with a subroutine, S_lop() for which LOP is just another name.
1850 */
1851
a0d0e21e
LW
1852#define LOP(f,x) return lop(f,x,s)
1853
ffb4593c
NT
1854/*
1855 * S_lop
1856 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1857 * - if we have a next token, then it's a list operator (no parens) for
1858 * which the next token has already been parsed; e.g.,
1859 * sort foo @args
1860 * sort foo (@args)
ffb4593c
NT
1861 * - if the next thing is an opening paren, then it's a function
1862 * - else it's a list operator
1863 */
1864
76e3520e 1865STATIC I32
a0be28da 1866S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1867{
7918f24d
NC
1868 PERL_ARGS_ASSERT_LOP;
1869
6154021b 1870 pl_yylval.ival = f;
35c8bce7 1871 CLINE;
3280af22
NIS
1872 PL_bufptr = s;
1873 PL_last_lop = PL_oldbufptr;
eb160463 1874 PL_last_lop_op = (OPCODE)f;
3280af22 1875 if (PL_nexttoke)
78cdf107 1876 goto lstop;
19f1898a 1877 PL_expect = x;
79072805 1878 if (*s == '(')
bbf60fe6 1879 return REPORT(FUNC);
29595ff2 1880 s = PEEKSPACE(s);
79072805 1881 if (*s == '(')
bbf60fe6 1882 return REPORT(FUNC);
78cdf107
Z
1883 else {
1884 lstop:
1885 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1886 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1887 return REPORT(LSTOP);
78cdf107 1888 }
79072805
LW
1889}
1890
ffb4593c
NT
1891/*
1892 * S_force_next
9cbb5ea2 1893 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1894 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1895 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1896 * will need to set PL_nextval[] and possibly PL_expect to ensure
1897 * the lexer handles the token correctly.
ffb4593c
NT
1898 */
1899
4e553d73 1900STATIC void
cea2e8a9 1901S_force_next(pTHX_ I32 type)
79072805 1902{
704d4215
GG
1903#ifdef DEBUGGING
1904 if (DEBUG_T_TEST) {
1905 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1906 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1907 }
1908#endif
3280af22
NIS
1909 PL_nexttype[PL_nexttoke] = type;
1910 PL_nexttoke++;
1911 if (PL_lex_state != LEX_KNOWNEXT) {
1912 PL_lex_defer = PL_lex_state;
1913 PL_lex_expect = PL_expect;
1914 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1915 }
1916}
1917
89f35911
FC
1918/*
1919 * S_postderef
1920 *
1921 * This subroutine handles postfix deref syntax after the arrow has already
1922 * been emitted. @* $* etc. are emitted as two separate token right here.
1923 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1924 * only the first, leaving yylex to find the next.
89f35911
FC
1925 */
1926
1927static int
ff25e5db 1928S_postderef(pTHX_ int const funny, char const next)
89f35911 1929{
ff25e5db 1930 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1931 assert(strchr("*[{", next));
1932 if (next == '*') {
1933 PL_expect = XOPERATOR;
cc624add 1934 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1935 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1936 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1937 force_next(POSTJOIN);
1938 }
89f35911
FC
1939 force_next(next);
1940 PL_bufptr+=2;
1941 }
1942 else {
760ca746
FC
1943 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1944 && !PL_lex_brackets)
1945 PL_lex_dojoin = 2;
89f35911
FC
1946 PL_expect = XOPERATOR;
1947 PL_bufptr++;
1948 }
1949 return funny;
1950}
1951
28ac2b49
Z
1952void
1953Perl_yyunlex(pTHX)
1954{
a7aaec61
Z
1955 int yyc = PL_parser->yychar;
1956 if (yyc != YYEMPTY) {
1957 if (yyc) {
a7aaec61
Z
1958 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1959 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1960 PL_lex_allbrackets--;
a7aaec61 1961 PL_lex_brackets--;
78cdf107
Z
1962 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1963 } else if (yyc == '('/*)*/) {
1964 PL_lex_allbrackets--;
1965 yyc |= (2<<24);
a7aaec61
Z
1966 }
1967 force_next(yyc);
1968 }
28ac2b49
Z
1969 PL_parser->yychar = YYEMPTY;
1970 }
1971}
1972
d0a148a6 1973STATIC SV *
15f169a1 1974S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1975{
740cce10 1976 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1977 !IN_BYTES
1978 && UTF
1979 && !is_ascii_string((const U8*)start, len)
740cce10 1980 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1981 return sv;
1982}
1983
ffb4593c
NT
1984/*
1985 * S_force_word
1986 * When the lexer knows the next thing is a word (for instance, it has
1987 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1988 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1989 * lookahead.
ffb4593c
NT
1990 *
1991 * Arguments:
b1b65b59 1992 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1993 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1994 * int check_keyword : if true, Perl checks to make sure the word isn't
1995 * a keyword (do this if the word is a label, e.g. goto FOO)
1996 * int allow_pack : if true, : characters will also be allowed (require,
1997 * use, etc. do this)
9cbb5ea2 1998 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1999 */
2000
76e3520e 2001STATIC char *
345b3785 2002S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2003{
eb578fdb 2004 char *s;
463ee0b2 2005 STRLEN len;
4e553d73 2006
7918f24d
NC
2007 PERL_ARGS_ASSERT_FORCE_WORD;
2008
29595ff2 2009 start = SKIPSPACE1(start);
463ee0b2 2010 s = start;
7e2040f0 2011 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2012 (allow_pack && *s == ':') )
a0d0e21e 2013 {
3280af22 2014 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2015 if (check_keyword) {
2016 char *s2 = PL_tokenbuf;
2017 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2018 s2 += 6, len -= 6;
2019 if (keyword(s2, len, 0))
463ee0b2 2020 return start;
01b5ef50 2021 }
463ee0b2 2022 if (token == METHOD) {
29595ff2 2023 s = SKIPSPACE1(s);
463ee0b2 2024 if (*s == '(')
3280af22 2025 PL_expect = XTERM;
463ee0b2 2026 else {
3280af22 2027 PL_expect = XOPERATOR;
463ee0b2 2028 }
79072805 2029 }
9ded7720 2030 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2031 = (OP*)newSVOP(OP_CONST,0,
2032 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2033 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2034 force_next(token);
2035 }
2036 return s;
2037}
2038
ffb4593c
NT
2039/*
2040 * S_force_ident
9cbb5ea2 2041 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2042 * text only contains the "foo" portion. The first argument is a pointer
2043 * to the "foo", and the second argument is the type symbol to prefix.
2044 * Forces the next token to be a "WORD".
9cbb5ea2 2045 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2046 */
2047
76e3520e 2048STATIC void
5aaab254 2049S_force_ident(pTHX_ const char *s, int kind)
79072805 2050{
7918f24d
NC
2051 PERL_ARGS_ASSERT_FORCE_IDENT;
2052
c9b48522
DD
2053 if (s[0]) {
2054 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2055 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2056 UTF ? SVf_UTF8 : 0));
9ded7720 2057 NEXTVAL_NEXTTOKE.opval = o;
79072805 2058 force_next(WORD);
748a9306 2059 if (kind) {
11343788 2060 o->op_private = OPpCONST_ENTERED;
55497cff 2061 /* XXX see note in pp_entereval() for why we forgo typo
2062 warnings if the symbol must be introduced in an eval.
2063 GSAR 96-10-12 */
90e5519e 2064 gv_fetchpvn_flags(s, len,
728847b1
BF
2065 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2066 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2067 kind == '$' ? SVt_PV :
2068 kind == '@' ? SVt_PVAV :
2069 kind == '%' ? SVt_PVHV :
a0d0e21e 2070 SVt_PVGV
90e5519e 2071 );
748a9306 2072 }
79072805
LW
2073 }
2074}
2075
3f33d153
FC
2076static void
2077S_force_ident_maybe_lex(pTHX_ char pit)
2078{
3f33d153
FC
2079 NEXTVAL_NEXTTOKE.ival = pit;
2080 force_next('p');
2081}
2082
1571675a
GS
2083NV
2084Perl_str_to_version(pTHX_ SV *sv)
2085{
2086 NV retval = 0.0;
2087 NV nshift = 1.0;
2088 STRLEN len;
cfd0369c 2089 const char *start = SvPV_const(sv,len);
9d4ba2ae 2090 const char * const end = start + len;
504618e9 2091 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2092
2093 PERL_ARGS_ASSERT_STR_TO_VERSION;
2094
1571675a 2095 while (start < end) {
ba210ebe 2096 STRLEN skip;
1571675a
GS
2097 UV n;
2098 if (utf)
9041c2e3 2099 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2100 else {
2101 n = *(U8*)start;
2102 skip = 1;
2103 }
2104 retval += ((NV)n)/nshift;
2105 start += skip;
2106 nshift *= 1000;
2107 }
2108 return retval;
2109}
2110
4e553d73 2111/*
ffb4593c
NT
2112 * S_force_version
2113 * Forces the next token to be a version number.
e759cc13
RGS
2114 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2115 * and if "guessing" is TRUE, then no new token is created (and the caller
2116 * must use an alternative parsing method).
ffb4593c
NT
2117 */
2118
76e3520e 2119STATIC char *
e759cc13 2120S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2121{
5f66b61c 2122 OP *version = NULL;
44dcb63b 2123 char *d;
89bfa8cd 2124
7918f24d
NC
2125 PERL_ARGS_ASSERT_FORCE_VERSION;
2126
29595ff2 2127 s = SKIPSPACE1(s);
89bfa8cd 2128
44dcb63b 2129 d = s;
dd629d5b 2130 if (*d == 'v')
44dcb63b 2131 d++;
44dcb63b 2132 if (isDIGIT(*d)) {
e759cc13
RGS
2133 while (isDIGIT(*d) || *d == '_' || *d == '.')
2134 d++;
4e4da3ac 2135 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2136 SV *ver;
6154021b
RGS
2137 s = scan_num(s, &pl_yylval);
2138 version = pl_yylval.opval;
dd629d5b
GS
2139 ver = cSVOPx(version)->op_sv;
2140 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2141 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2142 SvNV_set(ver, str_to_version(ver));
1571675a 2143 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2144 }
89bfa8cd 2145 }
5db06880 2146 else if (guessing) {
e759cc13 2147 return s;
5db06880 2148 }
89bfa8cd 2149 }
2150
2151 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2152 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2153 force_next(WORD);
89bfa8cd 2154
e759cc13 2155 return s;
89bfa8cd 2156}
2157
ffb4593c 2158/*
91152fc1
DG
2159 * S_force_strict_version
2160 * Forces the next token to be a version number using strict syntax rules.
2161 */
2162
2163STATIC char *
2164S_force_strict_version(pTHX_ char *s)
2165{
91152fc1 2166 OP *version = NULL;
91152fc1
DG
2167 const char *errstr = NULL;
2168
2169 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2170
2171 while (isSPACE(*s)) /* leading whitespace */
2172 s++;
2173
2174 if (is_STRICT_VERSION(s,&errstr)) {
2175 SV *ver = newSV(0);
2176 s = (char *)scan_version(s, ver, 0);
2177 version = newSVOP(OP_CONST, 0, ver);
2178 }
4e4da3ac
Z
2179 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2180 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2181 {
91152fc1
DG
2182 PL_bufptr = s;
2183 if (errstr)
2184 yyerror(errstr); /* version required */
2185 return s;
2186 }
2187
91152fc1 2188 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2189 NEXTVAL_NEXTTOKE.opval = version;
2190 force_next(WORD);
2191
2192 return s;
2193}
2194
2195/*
ffb4593c
NT
2196 * S_tokeq
2197 * Tokenize a quoted string passed in as an SV. It finds the next
2198 * chunk, up to end of string or a backslash. It may make a new
2199 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2200 * turns \\ into \.
2201 */
2202
76e3520e 2203STATIC SV *
cea2e8a9 2204S_tokeq(pTHX_ SV *sv)
79072805 2205{
eb578fdb
KW
2206 char *s;
2207 char *send;
2208 char *d;
b3ac6de7 2209 SV *pv = sv;
79072805 2210
7918f24d
NC
2211 PERL_ARGS_ASSERT_TOKEQ;
2212
279b35ad
FC
2213 assert (SvPOK(sv));
2214 assert (SvLEN(sv));
2215 assert (!SvIsCOW(sv));
307ed071 2216 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2217 goto finish;
279b35ad
FC
2218 s = SvPVX(sv);
2219 send = SvEND(sv);
dcb21ed6
NC
2220 /* This is relying on the SV being "well formed" with a trailing '\0' */
2221 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2222 s++;
2223 if (s == send)
b3ac6de7 2224 goto finish;
79072805 2225 d = s;
be4731d2 2226 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2227 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2228 SVs_TEMP | SvUTF8(sv));
be4731d2 2229 }
79072805
LW
2230 while (s < send) {
2231 if (*s == '\\') {
a0d0e21e 2232 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2233 s++; /* all that, just for this */
2234 }
2235 *d++ = *s++;
2236 }
2237 *d = '\0';
95a20fc0 2238 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2239 finish:
3280af22 2240 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2241 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2242 return sv;
2243}
2244
ffb4593c
NT
2245/*
2246 * Now come three functions related to double-quote context,
2247 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2248 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2249 * interact with PL_lex_state, and create fake ( ... ) argument lists
2250 * to handle functions and concatenation.
ecd24171
DM
2251 * For example,
2252 * "foo\lbar"
2253 * is tokenised as
2254 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2255 */
2256
2257/*
2258 * S_sublex_start
6154021b 2259 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2260 *
2261 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2262 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2263 *
2264 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2265 *
2266 * Everything else becomes a FUNC.
2267 *
2268 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2269 * had an OP_CONST or OP_READLINE). This just sets us up for a
2270 * call to S_sublex_push().
2271 */
2272
76e3520e 2273STATIC I32
cea2e8a9 2274S_sublex_start(pTHX)
79072805 2275{
eb578fdb 2276 const I32 op_type = pl_yylval.ival;
79072805
LW
2277
2278 if (op_type == OP_NULL) {
6154021b 2279 pl_yylval.opval = PL_lex_op;
5f66b61c 2280 PL_lex_op = NULL;
79072805
LW
2281 return THING;
2282 }
466112bb 2283 if (op_type == OP_CONST) {
3280af22 2284 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2285
2286 if (SvTYPE(sv) == SVt_PVIV) {
2287 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2288 STRLEN len;
96a5add6 2289 const char * const p = SvPV_const(sv, len);
740cce10 2290 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2291 SvREFCNT_dec(sv);
2292 sv = nsv;
4e553d73 2293 }
6154021b 2294 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2295 PL_lex_stuff = NULL;
79072805
LW
2296 return THING;
2297 }
2298
3280af22 2299 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2300 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2301 PL_sublex_info.sub_op = PL_lex_op;
2302 PL_lex_state = LEX_INTERPPUSH;
55497cff 2303
3280af22
NIS
2304 PL_expect = XTERM;
2305 if (PL_lex_op) {
6154021b 2306 pl_yylval.opval = PL_lex_op;
5f66b61c 2307 PL_lex_op = NULL;
55497cff 2308 return PMFUNC;
2309 }
2310 else
2311 return FUNC;
2312}
2313
ffb4593c
NT
2314/*
2315 * S_sublex_push
2316 * Create a new scope to save the lexing state. The scope will be
2317 * ended in S_sublex_done. Returns a '(', starting the function arguments
2318 * to the uc, lc, etc. found before.
2319 * Sets PL_lex_state to LEX_INTERPCONCAT.
2320 */
2321
76e3520e 2322STATIC I32
cea2e8a9 2323S_sublex_push(pTHX)
55497cff 2324{
78a635de 2325 LEXSHARED *shared;
801d32ac 2326 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2327 ENTER;
55497cff 2328
3280af22 2329 PL_lex_state = PL_sublex_info.super_state;
cc624add 2330 SAVEI8(PL_lex_dojoin);
3280af22 2331 SAVEI32(PL_lex_brackets);
78cdf107 2332 SAVEI32(PL_lex_allbrackets);
b27dce25 2333 SAVEI32(PL_lex_formbrack);
78cdf107 2334 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2335 SAVEI32(PL_lex_casemods);
2336 SAVEI32(PL_lex_starts);
651b5b28 2337 SAVEI8(PL_lex_state);
7cc34111 2338 SAVESPTR(PL_lex_repl);
7766f137 2339 SAVEVPTR(PL_lex_inpat);
98246f1e 2340 SAVEI16(PL_lex_inwhat);
ffdb8b16 2341 if (is_heredoc)
b42366d4 2342 {
ffdb8b16 2343 SAVECOPLINE(PL_curcop);
b42366d4 2344 SAVEI32(PL_multi_end);
851b527a
FC
2345 SAVEI32(PL_parser->herelines);
2346 PL_parser->herelines = 0;
b42366d4
FC
2347 }
2348 SAVEI8(PL_multi_close);
3280af22 2349 SAVEPPTR(PL_bufptr);
8452ff4b 2350 SAVEPPTR(PL_bufend);
3280af22
NIS
2351 SAVEPPTR(PL_oldbufptr);
2352 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2353 SAVEPPTR(PL_last_lop);
2354 SAVEPPTR(PL_last_uni);
3280af22
NIS
2355 SAVEPPTR(PL_linestart);
2356 SAVESPTR(PL_linestr);
8edd5f42
RGS
2357 SAVEGENERICPV(PL_lex_brackstack);
2358 SAVEGENERICPV(PL_lex_casestack);
78a635de 2359 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2360 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2361 SAVEI32(PL_copline);
3280af22 2362
99bd9d90 2363 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2364 scopes to find the body of the here-doc. So we put PL_linestr and
2365 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2366 */
60f40a38
FC
2367 PL_parser->lex_shared->ls_linestr = PL_linestr;
2368 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2369
3280af22 2370 PL_linestr = PL_lex_stuff;
7cc34111 2371 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2372 PL_lex_stuff = NULL;
7cc34111 2373 PL_sublex_info.repl = NULL;
3280af22 2374
9cbb5ea2
GS
2375 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2376 = SvPVX(PL_linestr);
3280af22 2377 PL_bufend += SvCUR(PL_linestr);
bd61b366 2378 PL_last_lop = PL_last_uni = NULL;
3280af22 2379 SAVEFREESV(PL_linestr);
4dc843bc 2380 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2381
2382 PL_lex_dojoin = FALSE;
b27dce25 2383 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2384 PL_lex_allbrackets = 0;
2385 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2386 Newx(PL_lex_brackstack, 120, char);
2387 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2388 PL_lex_casemods = 0;
2389 *PL_lex_casestack = '\0';
2390 PL_lex_starts = 0;
2391 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2392 if (is_heredoc)
6ddcf93b 2393 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2394 PL_copline = NOLINE;
78a635de
FC
2395
2396 Newxz(shared, 1, LEXSHARED);
2397 shared->ls_prev = PL_parser->lex_shared;
2398 PL_parser->lex_shared = shared;
3280af22
NIS
2399
2400 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2401 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2402 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2403 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2404 else
5f66b61c 2405 PL_lex_inpat = NULL;
79072805 2406
3a54fd60
DM
2407 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2408 PL_in_eval &= ~EVAL_RE_REPARSING;
2409
55497cff 2410 return '(';
79072805
LW
2411}
2412
ffb4593c
NT
2413/*
2414 * S_sublex_done
2415 * Restores lexer state after a S_sublex_push.
2416 */
2417
76e3520e 2418STATIC I32
cea2e8a9 2419S_sublex_done(pTHX)
79072805 2420{
3280af22 2421 if (!PL_lex_starts++) {
396482e1 2422 SV * const sv = newSVpvs("");
9aa983d2
JH
2423 if (SvUTF8(PL_linestr))
2424 SvUTF8_on(sv);
3280af22 2425 PL_expect = XOPERATOR;
6154021b 2426 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2427 return THING;
2428 }
2429
3280af22
NIS
2430 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2431 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2432 return yylex();
79072805
LW
2433 }
2434
ffb4593c 2435 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2436 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2437 if (PL_lex_repl) {
2438 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2439 PL_linestr = PL_lex_repl;
2440 PL_lex_inpat = 0;
2441 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2442 PL_bufend += SvCUR(PL_linestr);
bd61b366 2443 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2444 PL_lex_dojoin = FALSE;
2445 PL_lex_brackets = 0;
78cdf107
Z
2446 PL_lex_allbrackets = 0;
2447 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2448 PL_lex_casemods = 0;
2449 *PL_lex_casestack = '\0';
2450 PL_lex_starts = 0;
25da4f38 2451 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2452 PL_lex_state = LEX_INTERPNORMAL;
2453 PL_lex_starts++;
e9fa98b2
HS
2454 /* we don't clear PL_lex_repl here, so that we can check later
2455 whether this is an evalled subst; that means we rely on the
2456 logic to ensure sublex_done() is called again only via the
2457 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2458 }
e9fa98b2 2459 else {
3280af22 2460 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2461 PL_lex_repl = NULL;
e9fa98b2 2462 }
ffdb8b16
FC
2463 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2464 CopLINE(PL_curcop) +=
2465 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2466 + PL_parser->herelines;
2467 PL_parser->herelines = 0;
ffdb8b16 2468 }
79072805 2469 return ',';
ffed7fef
LW
2470 }
2471 else {
b42366d4 2472 const line_t l = CopLINE(PL_curcop);
f46d017c 2473 LEAVE;
b42366d4 2474 if (PL_multi_close == '<')
851b527a 2475 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2476 PL_bufend = SvPVX(PL_linestr);
2477 PL_bufend += SvCUR(PL_linestr);
2478 PL_expect = XOPERATOR;
09bef843 2479 PL_sublex_info.sub_inwhat = 0;
79072805 2480 return ')';
ffed7fef
LW
2481 }
2482}
2483
6f613c73
KW
2484PERL_STATIC_INLINE SV*
2485S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2486{
140b12ad
KW
2487 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2488 * interior, hence to the "}". Finds what the name resolves to, returning
2489 * an SV* containing it; NULL if no valid one found */
2490
dd2b1b72 2491 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2492
0c415a79
KW
2493 HV * table;
2494 SV **cvp;
2495 SV *cv;
2496 SV *rv;
2497 HV *stash;
2498 const U8* first_bad_char_loc;
2499 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2500
6f613c73
KW
2501 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2502
107160e2
KW
2503 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2504 e - backslash_ptr,
2505 &first_bad_char_loc))
2506 {
2507 /* If warnings are on, this will print a more detailed analysis of what
2508 * is wrong than the error message below */
c80e42f3 2509 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2510 e - ((char *) first_bad_char_loc),
2511 NULL, 0);
2512
2513 /* We deliberately don't try to print the malformed character, which
2514 * might not print very well; it also may be just the first of many
2515 * malformations, so don't print what comes after it */
2516 yyerror(Perl_form(aTHX_
2517 "Malformed UTF-8 character immediately after '%.*s'",
2518 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2519 return NULL;
2520 }
2521
2522 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2523 /* include the <}> */
2524 e - backslash_ptr + 1);
6f613c73 2525 if (! SvPOK(res)) {
b6407c49 2526 SvREFCNT_dec_NN(res);
6f613c73
KW
2527 return NULL;
2528 }
2529
0c415a79
KW
2530 /* See if the charnames handler is the Perl core's, and if so, we can skip
2531 * the validation needed for a user-supplied one, as Perl's does its own
2532 * validation. */
2533 table = GvHV(PL_hintgv); /* ^H */
2534 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2535 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2536 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2537 {
2538 const char * const name = HvNAME(stash);
6a642c21
FC
2539 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2540 && strEQ(name, "_charnames")) {
0c415a79
KW
2541 return res;
2542 }
2543 }
2544
bde9e88d
KW
2545 /* Here, it isn't Perl's charname handler. We can't rely on a
2546 * user-supplied handler to validate the input name. For non-ut8 input,
2547 * look to see that the first character is legal. Then loop through the
2548 * rest checking that each is a continuation */
6f613c73 2549
36897d64
KW
2550 /* This code makes the reasonable assumption that the only Latin1-range
2551 * characters that begin a character name alias are alphabetic, otherwise
2552 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2553
b6ba1137 2554 if (! UTF) {
bde9e88d 2555 if (! isALPHAU(*s)) {
b6ba1137
KW
2556 goto bad_charname;
2557 }
bde9e88d
KW
2558 s++;
2559 while (s < e) {
2560 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2561 goto bad_charname;
2562 }
2d8eb851
KW
2563 if (*s == ' ' && *(s-1) == ' ') {
2564 goto multi_spaces;
bd299e29 2565 }
df758df2
KW
2566 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2567 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2568 "NO-BREAK SPACE in a charnames "
2569 "alias definition is deprecated");
2570 }
bde9e88d 2571 s++;
b6ba1137
KW
2572 }
2573 }
2574 else {
bde9e88d
KW
2575 /* Similarly for utf8. For invariants can check directly; for other
2576 * Latin1, can calculate their code point and check; otherwise use a
2577 * swash */
2578 if (UTF8_IS_INVARIANT(*s)) {
2579 if (! isALPHAU(*s)) {
140b12ad
KW
2580 goto bad_charname;
2581 }
bde9e88d
KW
2582 s++;
2583 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2584 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2585 goto bad_charname;
6f613c73 2586 }
bde9e88d 2587 s += 2;
6f613c73 2588 }
bde9e88d
KW
2589 else {
2590 if (! PL_utf8_charname_begin) {
2591 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2592 PL_utf8_charname_begin = _core_swash_init("utf8",
2593 "_Perl_Charname_Begin",
2594 &PL_sv_undef,
2595 1, 0, NULL, &flags);
2596 }
2597 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2598 goto bad_charname;
2599 }
2600 s += UTF8SKIP(s);
2601 }
2602
2603 while (s < e) {
2604 if (UTF8_IS_INVARIANT(*s)) {
2605 if (! isCHARNAME_CONT(*s)) {
2606 goto bad_charname;
2607 }
2d8eb851
KW
2608 if (*s == ' ' && *(s-1) == ' ') {
2609 goto multi_spaces;
bd299e29 2610 }
bde9e88d
KW
2611 s++;
2612 }
2613 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2614 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2615 {
2616 goto bad_charname;
2617 }
df758df2
KW
2618 if (*s == *NBSP_UTF8
2619 && *(s+1) == *(NBSP_UTF8+1)
2620 && ckWARN_d(WARN_DEPRECATED))
2621 {
2622 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2623 "NO-BREAK SPACE in a charnames "
2624 "alias definition is deprecated");
2625 }
bde9e88d
KW
2626 s += 2;
2627 }
2628 else {
2629 if (! PL_utf8_charname_continue) {
2630 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2631 PL_utf8_charname_continue = _core_swash_init("utf8",
2632 "_Perl_Charname_Continue",
2633 &PL_sv_undef,
2634 1, 0, NULL, &flags);
2635 }
2636 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2637 goto bad_charname;
2638 }
2639 s += UTF8SKIP(s);
6f613c73
KW
2640 }
2641 }
2d8eb851
KW
2642 }
2643 if (*(s-1) == ' ') {
2644 yyerror_pv(
2645 Perl_form(aTHX_
2646 "charnames alias definitions may not contain trailing "
2647 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2648 (int)(s - backslash_ptr + 1), backslash_ptr,
2649 (int)(e - s + 1), s + 1
2650 ),
2651 UTF ? SVf_UTF8 : 0);
2652 return NULL;
6f613c73
KW
2653 }
2654
94ca1619 2655 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2656 const U8* first_bad_char_loc;
2657 STRLEN len;
2658 const char* const str = SvPV_const(res, len);
2659 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2660 /* If warnings are on, this will print a more detailed analysis of
2661 * what is wrong than the error message below */
c80e42f3 2662 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2663 (char *) first_bad_char_loc - str,
2664 NULL, 0);
2665
2666 /* We deliberately don't try to print the malformed character,
2667 * which might not print very well; it also may be just the first
2668 * of many malformations, so don't print what comes after it */
2669 yyerror_pv(
2670 Perl_form(aTHX_
2671 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2672 (int) (e - backslash_ptr + 1), backslash_ptr,
2673 (int) ((char *) first_bad_char_loc - str), str
2674 ),
2675 SVf_UTF8);
2676 return NULL;
2677 }
2678 }
140b12ad 2679
bde9e88d 2680 return res;
140b12ad 2681
bde9e88d 2682 bad_charname: {
bde9e88d
KW
2683
2684 /* The final %.*s makes sure that should the trailing NUL be missing
2685 * that this print won't run off the end of the string */
2686 yyerror_pv(
2687 Perl_form(aTHX_
2688 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2689 (int)(s - backslash_ptr + 1), backslash_ptr,
2690 (int)(e - s + 1), s + 1
bde9e88d
KW
2691 ),
2692 UTF ? SVf_UTF8 : 0);
2693 return NULL;
2694 }
2d8eb851
KW
2695
2696 multi_spaces:
2697 yyerror_pv(
2698 Perl_form(aTHX_
2699 "charnames alias definitions may not contain a sequence of "
2700 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2701 (int)(s - backslash_ptr + 1), backslash_ptr,
2702 (int)(e - s + 1), s + 1
2703 ),
2704 UTF ? SVf_UTF8 : 0);
2705 return NULL;
6f613c73
KW
2706}
2707
02aa26ce
NT
2708/*
2709 scan_const
2710
9da1dd8f
DM
2711 Extracts the next constant part of a pattern, double-quoted string,
2712 or transliteration. This is terrifying code.
2713
2714 For example, in parsing the double-quoted string "ab\x63$d", it would
2715 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2716
94def140 2717 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2718 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2719 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2720
94def140
TS
2721 Returns a pointer to the character scanned up to. If this is
2722 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2723 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2724 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2725 by looking at the next characters herself.
2726
02aa26ce 2727 In patterns:
9da1dd8f 2728 expand:
537124e4
KW
2729 \N{FOO} => \N{U+hex_for_character_FOO}
2730 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2731
2732 pass through:
2733 all other \-char, including \N and \N{ apart from \N{ABC}
2734
2735 stops on:
2736 @ and $ where it appears to be a var, but not for $ as tail anchor
2737 \l \L \u \U \Q \E
2738 (?{ or (??{
2739
02aa26ce
NT
2740
2741 In transliterations:
2742 characters are VERY literal, except for - not at the start or end
94def140
TS
2743 of the string, which indicates a range. If the range is in bytes,
2744 scan_const expands the range to the full set of intermediate
2745 characters. If the range is in utf8, the hyphen is replaced with
2746 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2747
2748 In double-quoted strings:
2749 backslashes:
2750 double-quoted style: \r and \n
ff3f963a 2751 constants: \x31, etc.
94def140 2752 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2753 case and quoting: \U \Q \E
2754 stops on @ and $
2755
2756 scan_const does *not* construct ops to handle interpolated strings.
2757 It stops processing as soon as it finds an embedded $ or @ variable
2758 and leaves it to the caller to work out what's going on.
2759
94def140
TS
2760 embedded arrays (whether in pattern or not) could be:
2761 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2762
2763 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2764
2765 $ in pattern could be $foo or could be tail anchor. Assumption:
2766 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2767 followed by one of "()| \r\n\t"
02aa26ce 2768
9da1dd8f 2769 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2770
2771 The structure of the code is
2772 while (there's a character to process) {
94def140
TS
2773 handle transliteration ranges
2774 skip regexp comments /(?#comment)/ and codes /(?{code})/
2775 skip #-initiated comments in //x patterns
2776 check for embedded arrays
02aa26ce
NT
2777 check for embedded scalars
2778 if (backslash) {
94def140 2779 deprecate \1 in substitution replacements
02aa26ce
NT
2780 handle string-changing backslashes \l \U \Q \E, etc.
2781 switch (what was escaped) {
94def140 2782 handle \- in a transliteration (becomes a literal -)
ff3f963a 2783 if a pattern and not \N{, go treat as regular character
94def140
TS
2784 handle \132 (octal characters)
2785 handle \x15 and \x{1234} (hex characters)
ff3f963a 2786 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2787 handle \cV (control characters)
2788 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2789 } (end switch)
77a135fe 2790 continue
02aa26ce 2791 } (end if backslash)
77a135fe 2792 handle regular character
02aa26ce 2793 } (end while character to read)
4e553d73 2794
02aa26ce
NT
2795*/
2796
76e3520e 2797STATIC char *
cea2e8a9 2798S_scan_const(pTHX_ char *start)
79072805 2799{
eb578fdb 2800 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2801 SV *sv = newSV(send - start); /* sv for the constant. See note below
2802 on sizing. */
eb578fdb
KW
2803 char *s = start; /* start of the constant */
2804 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2805 bool dorange = FALSE; /* are we in a translit range? */
2806 bool didrange = FALSE; /* did we just finish a range? */
2807 bool in_charclass = FALSE; /* within /[...]/ */
2808 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2809 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2810 UTF8? But, this can show as true
2811 when the source isn't utf8, as for
2812 example when it is entirely composed
2813 of hex constants */
6f613c73 2814 SV *res; /* result from charnames */
77a135fe
KW
2815
2816 /* Note on sizing: The scanned constant is placed into sv, which is
2817 * initialized by newSV() assuming one byte of output for every byte of
2818 * input. This routine expects newSV() to allocate an extra byte for a
2819 * trailing NUL, which this routine will append if it gets to the end of
2820 * the input. There may be more bytes of input than output (eg., \N{LATIN
2821 * CAPITAL LETTER A}), or more output than input if the constant ends up
2822 * recoded to utf8, but each time a construct is found that might increase
2823 * the needed size, SvGROW() is called. Its size parameter each time is
2824 * based on the best guess estimate at the time, namely the length used so
2825 * far, plus the length the current construct will occupy, plus room for
2826 * the trailing NUL, plus one byte for every input byte still unscanned */
2827
c3320c2a
KW
2828 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2829 before set */
4c3a8340
TS
2830#ifdef EBCDIC
2831 UV literal_endpoint = 0;
e294cc5d 2832 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2833#endif
012bcf8d 2834
7918f24d
NC
2835 PERL_ARGS_ASSERT_SCAN_CONST;
2836
bb16bae8 2837 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2838 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2839 /* If we are doing a trans and we know we want UTF8 set expectation */
2840 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2841 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2842 }
2843
b899e89d
FC
2844 /* Protect sv from errors and fatal warnings. */
2845 ENTER_with_name("scan_const");
2846 SAVEFREESV(sv);
2b9d42f0 2847
79072805 2848 while (s < send || dorange) {
ff3f963a 2849
02aa26ce 2850 /* get transliterations out of the way (they're most literal) */
3280af22 2851 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2852 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2853 if (dorange) {
1ba5c669
JH
2854 I32 i; /* current expanded character */
2855 I32 min; /* first character in range */
2856 I32 max; /* last character in range */
02aa26ce 2857
e294cc5d
JH
2858#ifdef EBCDIC
2859 UV uvmax = 0;
2860#endif
2861
2862 if (has_utf8
2863#ifdef EBCDIC
2864 && !native_range
2865#endif
1953db30 2866 ) {
9d4ba2ae 2867 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2868 char *e = d++;
2869 while (e-- > c)
2870 *(e + 1) = *e;
e7214ce8 2871 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2872 /* mark the range as done, and continue */
2873 dorange = FALSE;
2874 didrange = TRUE;
2875 continue;
2876 }
2b9d42f0 2877
95a20fc0 2878 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2879#ifdef EBCDIC
2880 SvGROW(sv,
dc023dbb
KW
2881 SvLEN(sv) + ((has_utf8)
2882 ? (512 - UTF_CONTINUATION_MARK
2883 + UNISKIP(0x100))
e294cc5d
JH
2884 : 256));
2885 /* How many two-byte within 0..255: 128 in UTF-8,
2886 * 96 in UTF-8-mod. */
2887#else
9cbb5ea2 2888 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2889#endif
9cbb5ea2 2890 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2891#ifdef EBCDIC
2892 if (has_utf8) {
2893 int j;
2894 for (j = 0; j <= 1; j++) {
2895 char * const c = (char*)utf8_hop((U8*)d, -1);
2896 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2897 if (j)
2898 min = (U8)uv;
2899 else if (uv < 256)
2900 max = (U8)uv;
2901 else {
2902 max = (U8)0xff; /* only to \xff */
2903 uvmax = uv; /* \x{100} to uvmax */
2904 }
2905 d = c; /* eat endpoint chars */
2906 }
2907 }
2908 else {
2909#endif
2910 d -= 2; /* eat the first char and the - */
2911 min = (U8)*d; /* first char in range */
2912 max = (U8)d[1]; /* last char in range */
2913#ifdef EBCDIC
2914 }
2915#endif
8ada0baa 2916
c2e66d9e 2917 if (min > max) {
01ec43d0 2918 Perl_croak(aTHX_
d1573ac7 2919 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2920 (char)min, (char)max);
c2e66d9e
GS
2921 }
2922
c7f1f016 2923#ifdef EBCDIC
dc023dbb
KW
2924 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2925 * any subsets of these ranges into individual characters */
4c3a8340 2926 if (literal_endpoint == 2 &&
c286a9a6 2927 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2928 (isUPPER_A(min) && isUPPER_A(max))))
2929 {
2930 for (i = min; i <= max; i++) {
2931 if (isALPHA_A(i))
2932 *d++ = i;
8ada0baa
JH
2933 }
2934 }
2935 else
2936#endif
2937 for (i = min; i <= max; i++)
e294cc5d
JH
2938#ifdef EBCDIC
2939 if (has_utf8) {
55d09dc8 2940 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2941 }
2942 else
2943#endif
2944 *d++ = (char)i;
2945
2946#ifdef EBCDIC
2947 if (uvmax) {
2948 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2949 if (uvmax > 0x101)
e7214ce8 2950 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2951 if (uvmax > 0x100)
2952 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2953 }
2954#endif
02aa26ce
NT
2955
2956 /* mark the range as done, and continue */
79072805 2957 dorange = FALSE;
01ec43d0 2958 didrange = TRUE;
4c3a8340
TS
2959#ifdef EBCDIC
2960 literal_endpoint = 0;
2961#endif
79072805 2962 continue;
4e553d73 2963 }
02aa26ce
NT
2964
2965 /* range begins (ignore - as first or last char) */
79072805 2966 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2967 if (didrange) {
1fafa243 2968 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2969 }
e294cc5d
JH
2970 if (has_utf8
2971#ifdef EBCDIC
2972 && !native_range
2973#endif
2974 ) {
e7214ce8 2975 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2976 s++;
2977 continue;
2978 }
79072805
LW
2979 dorange = TRUE;
2980 s++;
01ec43d0
GS
2981 }
2982 else {
2983 didrange = FALSE;
4c3a8340
TS
2984#ifdef EBCDIC
2985 literal_endpoint = 0;
e294cc5d 2986 native_range = TRUE;
4c3a8340 2987#endif
01ec43d0 2988 }
79072805 2989 }
02aa26ce
NT
2990
2991 /* if we get here, we're not doing a transliteration */
2992
e4a2df84
DM
2993 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2994 char *s1 = s-1;
2995 int esc = 0;
2996 while (s1 >= start && *s1-- == '\\')
2997 esc = !esc;
2998 if (!esc)
2999 in_charclass = TRUE;
3000 }
2866decb 3001
e4a2df84
DM
3002 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3003 char *s1 = s-1;
3004 int esc = 0;
3005 while (s1 >= start && *s1-- == '\\')
3006 esc = !esc;
3007 if (!esc)
3008 in_charclass = FALSE;
3009 }
2866decb 3010
9da1dd8f
DM
3011 /* skip for regexp comments /(?#comment)/, except for the last
3012 * char, which will be done separately.
3013 * Stop on (?{..}) and friends */
3014
c30fc27b 3015 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3016 if (s[2] == '#') {
e994fd66 3017 while (s+1 < send && *s != ')')
5ff03569 3018 *d++ = *s++;
155aba94 3019 }
c30fc27b 3020 else if (!PL_lex_casemods &&
d3cec5e5
DM
3021 ( s[2] == '{' /* This should match regcomp.c */
3022 || (s[2] == '?' && s[3] == '{')))
155aba94 3023 {
9da1dd8f 3024 break;
cc6b7395 3025 }
748a9306 3026 }
02aa26ce
NT
3027
3028 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3029 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3030 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3031 while (s+1 < send && *s != '\n')
5ff03569 3032 *d++ = *s++;
748a9306 3033 }
02aa26ce 3034
9da1dd8f
DM
3035 /* no further processing of single-quoted regex */
3036 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3037 goto default_action;
3038
5d1d4326 3039 /* check for embedded arrays
da6eedaa 3040 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3041 */
1749ea0d 3042 else if (*s == '@' && s[1]) {
8a2bca12 3043 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
TS
3044 break;
3045 if (strchr(":'{$", s[1]))
3046 break;
3047 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3048 break; /* in regexp, neither @+ nor @- are interpolated */
3049 }
02aa26ce
NT
3050
3051 /* check for embedded scalars. only stop if we're sure it's a
3052 variable.
3053 */
79072805 3054 else if (*s == '$') {
3280af22 3055 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3056 break;
77772344 3057 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3058 if (s[1] == '\\') {
3059 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3060 "Possible unintended interpolation of $\\ in regex");
77772344 3061 }
79072805 3062 break; /* in regexp, $ might be tail anchor */
77772344 3063 }
79072805 3064 }
02aa26ce 3065
2b9d42f0
NIS
3066 /* End of else if chain - OP_TRANS rejoin rest */
3067
02aa26ce 3068 /* backslashes */
79072805 3069 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3070 char* e; /* Can be used for ending '}', etc. */
3071
79072805 3072 s++;
02aa26ce 3073
7d0fc23c
KW
3074 /* warn on \1 - \9 in substitution replacements, but note that \11
3075 * is an octal; and \19 is \1 followed by '9' */
3280af22 3076 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3077 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3078 {
c782d7ee 3079 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3080 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3081 *--s = '$';
3082 break;
3083 }
02aa26ce
NT
3084
3085 /* string-change backslash escapes */
838f2281 3086 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3087 --s;
3088 break;
3089 }
ff3f963a
KW
3090 /* In a pattern, process \N, but skip any other backslash escapes.
3091 * This is because we don't want to translate an escape sequence
3092 * into a meta symbol and have the regex compiler use the meta
3093 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3094 * in spite of this, we do have to process \N here while the proper
3095 * charnames handler is in scope. See bugs #56444 and #62056.
3096 * There is a complication because \N in a pattern may also stand
3097 * for 'match a non-nl', and not mean a charname, in which case its
3098 * processing should be deferred to the regex compiler. To be a
3099 * charname it must be followed immediately by a '{', and not look
3100 * like \N followed by a curly quantifier, i.e., not something like
3101 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3102 * quantifier */
3103 else if (PL_lex_inpat
3104 && (*s != 'N'
3105 || s[1] != '{'
412f55bb 3106 || regcurly(s + 1)))
ff3f963a 3107 {
4d73d076 3108 *d++ = '\\';
cc74c5bd
TS
3109 goto default_action;
3110 }
02aa26ce 3111
79072805 3112 switch (*s) {
02aa26ce
NT
3113
3114 /* quoted - in transliterations */
79072805 3115 case '-':
3280af22 3116 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3117 *d++ = *s++;
3118 continue;
3119 }
924ba076 3120 /* FALLTHROUGH */
79072805 3121 default:
11b8faa4 3122 {
15861f94 3123 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3124 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3125 "Unrecognized escape \\%c passed through",
3126 *s);
11b8faa4 3127 /* default action is to copy the quoted character */
f9a63242 3128 goto default_action;
11b8faa4 3129 }
02aa26ce 3130
632403cc 3131 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3132 case '0': case '1': case '2': case '3':
3133 case '4': case '5': case '6': case '7':
ba210ebe 3134 {
5e0a247b 3135 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3136 STRLEN len = 3;
06972766 3137 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3138 s += len;
5e0a247b
KW
3139 if (len < 3 && s < send && isDIGIT(*s)
3140 && ckWARN(WARN_MISC))
3141 {
3142 Perl_warner(aTHX_ packWARN(WARN_MISC),
3143 "%s", form_short_octal_warning(s, len));
3144 }
ba210ebe 3145 }
012bcf8d 3146 goto NUM_ESCAPE_INSERT;
02aa26ce 3147
f0a2b745
KW
3148 /* eg. \o{24} indicates the octal constant \024 */
3149 case 'o':
3150 {
454155d9 3151 const char* error;
f0a2b745 3152
00ce5563 3153 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3154 TRUE, /* Output warning */
3155 FALSE, /* Not strict */
17896a85
KW
3156 TRUE, /* Output warnings for
3157 non-portables */
80f4111b 3158 UTF);
454155d9 3159 if (! valid) {
f0a2b745
KW
3160 yyerror(error);
3161 continue;
3162 }
3163 goto NUM_ESCAPE_INSERT;
3164 }
3165
77a135fe 3166 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3167 case 'x':
a0481293 3168 {
a0481293 3169 const char* error;
355860ce 3170
00ce5563 3171 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3172 TRUE, /* Output warning */
3173 FALSE, /* Not strict */
17896a85
KW
3174 TRUE, /* Output warnings for
3175 non-portables */
80f4111b 3176 UTF);
a0481293
KW
3177 if (! valid) {
3178 yyerror(error);
355860ce 3179 continue;
ba210ebe 3180 }
012bcf8d
GS
3181 }
3182
3183 NUM_ESCAPE_INSERT:
ff3f963a
KW
3184 /* Insert oct or hex escaped character. There will always be
3185 * enough room in sv since such escapes will be longer than any
3186 * UTF-8 sequence they can end up as, except if they force us
3187 * to recode the rest of the string into utf8 */
ba7cea30 3188
06972766 3189 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3190 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3191 if (!has_utf8 && uv > 255) {
77a135fe
KW
3192 /* Might need to recode whatever we have accumulated so
3193 * far if it contains any chars variant in utf8 or
3194 * utf-ebcdic. */
3195
3196 SvCUR_set(sv, d - SvPVX_const(sv));
3197 SvPOK_on(sv);
3198 *d = '\0';
77a135fe 3199 /* See Note on sizing above. */
7bf79863
KW
3200 sv_utf8_upgrade_flags_grow(sv,
3201 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3202 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3203 d = SvPVX(sv) + SvCUR(sv);
3204 has_utf8 = TRUE;
012bcf8d
GS
3205 }
3206
77a135fe 3207 if (has_utf8) {
c80e42f3 3208 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3209 if (PL_lex_inwhat == OP_TRANS &&
3210 PL_sublex_info.sub_op) {
3211 PL_sublex_info.sub_op->op_private |=
3212 (PL_lex_repl ? OPpTRANS_FROM_UTF
3213 : OPpTRANS_TO_UTF);
f9a63242 3214 }
e294cc5d
JH
3215#ifdef EBCDIC
3216 if (uv > 255 && !dorange)
3217 native_range = FALSE;
3218#endif
012bcf8d 3219 }
a0ed51b3 3220 else {
012bcf8d 3221 *d++ = (char)uv;
a0ed51b3 3222 }
012bcf8d
GS
3223 }
3224 else {
c4d5f83a 3225 *d++ = (char) uv;
a0ed51b3 3226 }
79072805 3227 continue;
02aa26ce 3228
4a2d328f 3229 case 'N':
ff3f963a
KW
3230 /* In a non-pattern \N must be a named character, like \N{LATIN
3231 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3232 * mean to match a non-newline. For non-patterns, named
3233 * characters are converted to their string equivalents. In
3234 * patterns, named characters are not converted to their
3235 * ultimate forms for the same reasons that other escapes
3236 * aren't. Instead, they are converted to the \N{U+...} form
3237 * to get the value from the charnames that is in effect right
3238 * now, while preserving the fact that it was a named character
3239 * so that the regex compiler knows this */
3240
ff3f963a
KW
3241 /* The structure of this section of code (besides checking for
3242 * errors and upgrading to utf8) is:
3243 * Further disambiguate between the two meanings of \N, and if
3244 * not a charname, go process it elsewhere
0a96133f
KW
3245 * If of form \N{U+...}, pass it through if a pattern;
3246 * otherwise convert to utf8
3247 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3248 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3249
3250 /* Here, s points to the 'N'; the test below is guaranteed to
3251 * succeed if we are being called on a pattern as we already
3252 * know from a test above that the next character is a '{'.
3253 * On a non-pattern \N must mean 'named sequence, which
3254 * requires braces */
3255 s++;
3256 if (*s != '{') {
3257 yyerror("Missing braces on \\N{}");
3258 continue;
3259 }
3260 s++;
3261
0a96133f 3262 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3263 if (! (e = strchr(s, '}'))) {
3264 if (! PL_lex_inpat) {
5777a3f7 3265 yyerror("Missing right brace on \\N{}");
0a96133f 3266 } else {
4407f1b8 3267 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3268 }
0a96133f 3269 continue;
ff3f963a 3270 }
cddc7ef4 3271
ff3f963a 3272 /* Here it looks like a named character */
cddc7ef4 3273
ff3f963a
KW
3274 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3275 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3276 | PERL_SCAN_DISALLOW_PREFIX;
3277 STRLEN len;
3278
3279 /* For \N{U+...}, the '...' is a unicode value even on
3280 * EBCDIC machines */
3281 s += 2; /* Skip to next char after the 'U+' */
3282 len = e - s;
3283 uv = grok_hex(s, &len, &flags, NULL);
3284 if (len == 0 || len != (STRLEN)(e - s)) {
3285 yyerror("Invalid hexadecimal number in \\N{U+...}");
3286 s = e + 1;
3287 continue;
3288 }
3289
3290 if (PL_lex_inpat) {
3291
e2a7e165
KW
3292 /* On non-EBCDIC platforms, pass through to the regex
3293 * compiler unchanged. The reason we evaluated the
3294 * number above is to make sure there wasn't a syntax
3295 * error. But on EBCDIC we convert to native so
3296 * downstream code can continue to assume it's native
3297 */
ff3f963a 3298 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3299#ifdef EBCDIC
3300 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3301 and the \0 */
3302 "\\N{U+%X}",
3303 (unsigned int) UNI_TO_NATIVE(uv));
3304#else
ff3f963a
KW
3305 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3306 d += e - s + 1;
e2a7e165 3307#endif
ff3f963a
KW
3308 }
3309 else { /* Not a pattern: convert the hex to string */
3310
3311 /* If destination is not in utf8, unconditionally
3312 * recode it to be so. This is because \N{} implies
3313 * Unicode semantics, and scalars have to be in utf8
3314 * to guarantee those semantics */
3315 if (! has_utf8) {
3316 SvCUR_set(sv, d - SvPVX_const(sv));
3317 SvPOK_on(sv);
3318 *d = '\0';
3319 /* See Note on sizing above. */
3320 sv_utf8_upgrade_flags_grow(
3321 sv,
3322 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3323 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3324 d = SvPVX(sv) + SvCUR(sv);
3325 has_utf8 = TRUE;
3326 }
3327
a46469e6 3328 /* Add the (Unicode) code point to the output. */
ff3f963a 3329 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3330 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3331 }
a46469e6
KW
3332 else {
3333 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3334 }
ff3f963a
KW
3335 }
3336 }
6f613c73
KW
3337 else /* Here is \N{NAME} but not \N{U+...}. */
3338 if ((res = get_and_check_backslash_N_name(s, e)))
3339 {
3340 STRLEN len;
3341 const char *str = SvPV_const(res, len);
3342 if (PL_lex_inpat) {
ff3f963a
KW
3343
3344 if (! len) { /* The name resolved to an empty string */
3345 Copy("\\N{}", d, 4, char);
3346 d += 4;
3347 }
3348 else {
3349 /* In order to not lose information for the regex
3350 * compiler, pass the result in the specially made
3351 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3352 * the code points in hex of each character
3353 * returned by charnames */
3354
3355 const char *str_end = str + len;
3b721c4f 3356 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3357
3358 if (! SvUTF8(res)) {
3359 /* For the non-UTF-8 case, we can determine the
3360 * exact length needed without having to parse
3361 * through the string. Each character takes up
3362 * 2 hex digits plus either a trailing dot or
3363 * the "}" */
3364 d = off + SvGROW(sv, off
3365 + 3 * len
3366 + 6 /* For the "\N{U+", and
3367 trailing NUL */
3368 + (STRLEN)(send - e));
3369 Copy("\\N{U+", d, 5, char);
3370 d += 5;
3371 while (str < str_end) {
3372 char hex_string[4];
e8549682 3373 int len =
51f14a05
JH
3374 my_snprintf(hex_string,
3375 sizeof(hex_string),
e8549682
JH
3376 "%02X.", (U8) *str);
3377 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3378 Copy(hex_string, d, 3, char);
3379 d += 3;
3380 str++;
3381 }
3382 d--; /* We will overwrite below the final
3383 dot with a right brace */
3384 }
3385 else {
1953db30
KW
3386 STRLEN char_length; /* cur char's byte length */
3387
3388 /* and the number of bytes after this is
3389 * translated into hex digits */
3390 STRLEN output_length;
3391
3392 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3393 * for max('U+', '.'); and 1 for NUL */
3394 char hex_string[2 * UTF8_MAXBYTES + 5];
3395
3396 /* Get the first character of the result. */
a46469e6 3397 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3398 len,
3399 &char_length,
3400 UTF8_ALLOW_ANYUV);
3401 /* Convert first code point to hex, including
a46469e6 3402 * the boiler plate before it. */
1953db30
KW
3403 output_length =
3404 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3405 "\\N{U+%X",
3406 (unsigned int) uv);
1953db30
KW
3407
3408 /* Make sure there is enough space to hold it */
3409 d = off + SvGROW(sv, off
3410 + output_length
3411 + (STRLEN)(send - e)
3412 + 2); /* '}' + NUL */
3413 /* And output it */
3414 Copy(hex_string, d, output_length, char);
3415 d += output_length;
3416
3417 /* For each subsequent character, append dot and
3418 * its ordinal in hex */
3419 while ((str += char_length) < str_end) {
3420 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3421 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3422 str_end - str,
3423 &char_length,
3424 UTF8_ALLOW_ANYUV);
3425 output_length =
3426 my_snprintf(hex_string,
a46469e6
KW
3427 sizeof(hex_string),
3428 ".%X",
3429 (unsigned int) uv);
1953db30
KW
3430
3431 d = off + SvGROW(sv, off
3432 + output_length
3433 + (STRLEN)(send - e)
3434 + 2); /* '}' + NUL */
3435 Copy(hex_string, d, output_length, char);
3436 d += output_length;
3437 }
94ca1619 3438 }
ff3f963a
KW
3439
3440 *d++ = '}'; /* Done. Add the trailing brace */
3441 }
3442 }
3443 else { /* Here, not in a pattern. Convert the name to a
3444 * string. */
3445
3446 /* If destination is not in utf8, unconditionally
3447 * recode it to be so. This is because \N{} implies
3448 * Unicode semantics, and scalars have to be in utf8
3449 * to guarantee those semantics */
3450 if (! has_utf8) {
3451 SvCUR_set(sv, d - SvPVX_const(sv));
3452 SvPOK_on(sv);
3453 *d = '\0';
3454 /* See Note on sizing above. */
3455 sv_utf8_upgrade_flags_grow(sv,
3456 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3457 len + (STRLEN)(send - s) + 1);
3458 d = SvPVX(sv) + SvCUR(sv);
3459 has_utf8 = TRUE;
3460 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3461
3462 /* See Note on sizing above. (NOTE: SvCUR() is not
3463 * set correctly here). */
3464 const STRLEN off = d - SvPVX_const(sv);
3465 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3466 }
7fc82458
KW
3467 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3468 sv_utf8_upgrade(res);
3469 str = SvPV_const(res, len);
3470 }
ff3f963a
KW
3471 Copy(str, d, len, char);
3472 d += len;
423cee85 3473 }
6f613c73 3474
423cee85 3475 SvREFCNT_dec(res);
cb233ae3 3476
cb233ae3 3477 } /* End \N{NAME} */
ff3f963a
KW
3478#ifdef EBCDIC
3479 if (!dorange)
3480 native_range = FALSE; /* \N{} is defined to be Unicode */
3481#endif
3482 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3483 continue;
3484
02aa26ce 3485 /* \c is a control character */
79072805
LW
3486 case 'c':
3487 s++;
961ce445 3488 if (s < send) {
421e43ba 3489 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3490 }
961ce445
RGS
3491 else {
3492 yyerror("Missing control char name in \\c");
3493 }
79072805 3494 continue;
02aa26ce
NT
3495
3496 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3497 case 'b':
4d73d076 3498 *d++ = '\b';
79072805
LW
3499 break;
3500 case 'n':
4d73d076 3501 *d++ = '\n';
79072805
LW
3502 break;
3503 case 'r':
4d73d076 3504 *d++ = '\r';
79072805
LW
3505 break;
3506 case 'f':
4d73d076 3507 *d++ = '\f';
79072805
LW
3508 break;
3509 case 't':
4d73d076 3510 *d++ = '\t';
79072805 3511 break;
34a3fe2a 3512 case 'e':
da0334e8 3513 *d++ = ESC_NATIVE;
34a3fe2a
PP
3514 break;
3515 case 'a':
4d73d076 3516 *d++ = '\a';
79072805 3517 break;
02aa26ce
NT
3518 } /* end switch */
3519
79072805
LW
3520 s++;
3521 continue;
02aa26ce 3522 } /* end if (backslash) */
4c3a8340
TS
3523#ifdef EBCDIC
3524 else
3525 literal_endpoint++;
3526#endif
02aa26ce 3527
f9a63242 3528 default_action:
77a135fe
KW
3529 /* If we started with encoded form, or already know we want it,
3530 then encode the next character */
6f2d5cbc 3531 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3532 STRLEN len = 1;
77a135fe
KW
3533
3534
3535 /* One might think that it is wasted effort in the case of the
3536 * source being utf8 (this_utf8 == TRUE) to take the next character
3537 * in the source, convert it to an unsigned value, and then convert
3538 * it back again. But the source has not been validated here. The
3539 * routine that does the conversion checks for errors like
3540 * malformed utf8 */
3541
233ca360
KW
3542 const UV nextuv = (this_utf8)
3543 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3544 : (UV) ((U8) *s);
5aaebcb3 3545 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3546 if (!has_utf8) {
3547 SvCUR_set(sv, d - SvPVX_const(sv));
3548 SvPOK_on(sv);
3549 *d = '\0';
77a135fe 3550 /* See Note on sizing above. */
7bf79863
KW
3551 sv_utf8_upgrade_flags_grow(sv,
3552 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3553 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3554 d = SvPVX(sv) + SvCUR(sv);
3555 has_utf8 = TRUE;
3556 } else if (need > len) {
3557 /* encoded value larger than old, may need extra space (NOTE:
3558 * SvCUR() is not set correctly here). See Note on sizing
3559 * above. */
9d4ba2ae 3560 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3561 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3562 }
77a135fe
KW
3563 s += len;
3564
5f66b61c 3565 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3566#ifdef EBCDIC
3567 if (uv > 255 && !dorange)
3568 native_range = FALSE;
3569#endif
2b9d42f0
NIS
3570 }
3571 else {
5ff03569 3572 *d++ = *s++;
2b9d42f0 3573 }
02aa26ce
NT
3574 } /* while loop to process each character */
3575
3576 /* terminate the string and set up the sv */
79072805 3577 *d = '\0';
95a20fc0 3578 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3579 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3580 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3581 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3582
79072805 3583 SvPOK_on(sv);
9f4817db 3584 if (PL_encoding && !has_utf8) {
d0063567
DK
3585 sv_recode_to_utf8(sv, PL_encoding);
3586 if (SvUTF8(sv))
3587 has_utf8 = TRUE;
9f4817db 3588 }
2b9d42f0 3589 if (has_utf8) {
7e2040f0 3590 SvUTF8_on(sv);
2b9d42f0 3591 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3592 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3593 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3594 }
3595 }
79072805 3596
02aa26ce 3597 /* shrink the sv if we allocated more than we used */
79072805 3598 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3599 SvPV_shrink_to_cur(sv);
79072805 3600 }
02aa26ce 3601
6154021b 3602 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3603 if (s > start) {
3604 char *s2 = start;
ffdb8b16
FC
3605 for (; s2 < s; s2++) {
3606 if (*s2 == '\n')
3607 COPLINE_INC_WITH_HERELINES;
3608 }
b899e89d 3609 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3610 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3611 && ! PL_parser->lex_re_reparsing)
3612 {
eb0d8d16
NC
3613 const char *const key = PL_lex_inpat ? "qr" : "q";
3614 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3615 const char *type;
3616 STRLEN typelen;
3617
3618 if (PL_lex_inwhat == OP_TRANS) {
3619 type = "tr";
3620 typelen = 2;
3621 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3622 type = "s";
3623 typelen = 1;
9da1dd8f
DM
3624 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3625 type = "q";
3626 typelen = 1;
eb0d8d16
NC
3627 } else {
3628 type = "qq";
3629 typelen = 2;
3630 }
3631
3632 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3633 type, typelen);
3634 }
6154021b 3635 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3636 }
3637 LEAVE_with_name("scan_const");
79072805
LW
3638 return s;
3639}
3640
ffb4593c
NT
3641/* S_intuit_more
3642 * Returns TRUE if there's more to the expression (e.g., a subscript),
3643 * FALSE otherwise.
ffb4593c
NT
3644 *
3645 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3646 *
3647 * ->[ and ->{ return TRUE
ff25e5db 3648 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3649 * { and [ outside a pattern are always subscripts, so return TRUE
3650 * if we're outside a pattern and it's not { or [, then return FALSE
3651 * if we're in a pattern and the first char is a {
3652 * {4,5} (any digits around the comma) returns FALSE
3653 * if we're in a pattern and the first char is a [
3654 * [] returns FALSE
3655 * [SOMETHING] has a funky algorithm to decide whether it's a
3656 * character class or not. It has to deal with things like
3657 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3658 * anything else returns TRUE
3659 */
3660
9cbb5ea2
GS
3661