This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Skip PL_expect assignment under KEY_require
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
57#define PL_lex_expect (PL_parser->lex_expect)
58#define PL_lex_formbrack (PL_parser->lex_formbrack)
59#define PL_lex_inpat (PL_parser->lex_inpat)
60#define PL_lex_inwhat (PL_parser->lex_inwhat)
61#define PL_lex_op (PL_parser->lex_op)
62#define PL_lex_repl (PL_parser->lex_repl)
63#define PL_lex_starts (PL_parser->lex_starts)
64#define PL_lex_stuff (PL_parser->lex_stuff)
65#define PL_multi_start (PL_parser->multi_start)
66#define PL_multi_open (PL_parser->multi_open)
67#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7 89
fb205e7a
DM
90# define PL_nexttoke (PL_parser->nexttoke)
91# define PL_nexttype (PL_parser->nexttype)
92# define PL_nextval (PL_parser->nextval)
199e78b7 93
a1894d81 94static const char* const ident_too_long = "Identifier too long";
8903cb82 95
9ded7720 96# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 97
a7aaec61
Z
98#define XENUMMASK 0x3f
99#define XFAKEEOF 0x40
100#define XFAKEBRACK 0x80
9059aa12 101
39e02b42
JH
102#ifdef USE_UTF8_SCRIPTS
103# define UTF (!IN_BYTES)
2b9d42f0 104#else
802a15e9 105# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 106#endif
a0ed51b3 107
b1fc3636
CJ
108/* The maximum number of characters preceding the unrecognized one to display */
109#define UNRECOGNIZED_PRECEDE_COUNT 10
110
61f0cdd9 111/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
112 * 1999-02-27 mjd-perl-patch@plover.com */
113#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
114
14bd96d0 115#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 116
9ff909cf
JH
117#define HEXFP_PEEK(s) \
118 (((s[0] == '.') && \
119 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
120 isALPHA_FOLD_EQ(s[0], 'p'))
121
ffb4593c
NT
122/* LEX_* are values for PL_lex_state, the state of the lexer.
123 * They are arranged oddly so that the guard on the switch statement
79072805 124 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
125 *
126 * These values refer to the various states within a sublex parse,
127 * i.e. within a double quotish string
79072805
LW
128 */
129
fb73857a 130/* #define LEX_NOTPARSING 11 is done in perl.h. */
131
b6007c36
DM
132#define LEX_NORMAL 10 /* normal code (ie not within "...") */
133#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
134#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
135#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
136#define LEX_INTERPSTART 6 /* expecting the start of a $var */
137
138 /* at end of code, eg "$x" followed by: */
139#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
140#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
141
142#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
143 string or after \E, $foo, etc */
144#define LEX_INTERPCONST 2 /* NOT USED */
145#define LEX_FORMLINE 1 /* expecting a format line */
146#define LEX_KNOWNEXT 0 /* next token known; just return it */
147
79072805 148
bbf60fe6 149#ifdef DEBUGGING
27da23d5 150static const char* const lex_state_names[] = {
bbf60fe6
DM
151 "KNOWNEXT",
152 "FORMLINE",
153 "INTERPCONST",
154 "INTERPCONCAT",
155 "INTERPENDMAYBE",
156 "INTERPEND",
157 "INTERPSTART",
158 "INTERPPUSH",
159 "INTERPCASEMOD",
160 "INTERPNORMAL",
161 "NORMAL"
162};
163#endif
164
79072805 165#include "keywords.h"
fe14fcc3 166
ffb4593c
NT
167/* CLINE is a macro that ensures PL_copline has a sane value */
168
57843af0 169#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 170
29595ff2
NC
171# define SKIPSPACE0(s) skipspace(s)
172# define SKIPSPACE1(s) skipspace(s)
173# define SKIPSPACE2(s,tsv) skipspace(s)
174# define PEEKSPACE(s) skipspace(s)
29595ff2 175
ffb4593c
NT
176/*
177 * Convenience functions to return different tokens and prime the
9cbb5ea2 178 * lexer for the next token. They all take an argument.
ffb4593c
NT
179 *
180 * TOKEN : generic token (used for '(', DOLSHARP, etc)
181 * OPERATOR : generic operator
182 * AOPERATOR : assignment operator
183 * PREBLOCK : beginning the block after an if, while, foreach, ...
184 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
185 * PREREF : *EXPR where EXPR is not a simple identifier
186 * TERM : expression term
89f35911 187 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
188 * LOOPX : loop exiting command (goto, last, dump, etc)
189 * FTST : file test operator
190 * FUN0 : zero-argument function
7eb971ee 191 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 192 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
193 * BOop : bitwise or or xor
194 * BAop : bitwise and
195 * SHop : shift operator
196 * PWop : power operator
9cbb5ea2 197 * PMop : pattern-matching operator
ffb4593c
NT
198 * Aop : addition-level operator
199 * Mop : multiplication-level operator
200 * Eop : equality-testing operator
e5edeb50 201 * Rop : relational operator <= != gt
ffb4593c
NT
202 *
203 * Also see LOP and lop() below.
204 */
205
998054bd 206#ifdef DEBUGGING /* Serve -DT. */
704d4215 207# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 208#else
bbf60fe6 209# define REPORT(retval) (retval)
998054bd
SC
210#endif
211
bbf60fe6
DM
212#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
213#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
214#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
215#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
216#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
217#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
218#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 219#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
7a61bf3c
FC
220#define LOOPX(f) return (PL_expect = XOPERATOR, \
221 PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
222 pl_yylval.ival=f, \
223 (void)(PL_nexttoke || (PL_expect = XTERM)), \
224 REPORT((int)LOOPEX))
6154021b
RGS
225#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
226#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 227#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
228#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
229#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
230#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
231#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
232#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
233#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
234#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
235#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
236#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 238
a687059c
LW
239/* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
a687059c 243 */
d68ce4ac 244#define UNI3(f,x,have_x) { \
6154021b 245 pl_yylval.ival = f; \
d68ce4ac 246 if (have_x) PL_expect = x; \
376fcdbf
AL
247 PL_bufptr = s; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = f; \
250 if (*s == '(') \
251 return REPORT( (int)FUNC1 ); \
29595ff2 252 s = PEEKSPACE(s); \
376fcdbf
AL
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
254 }
d68ce4ac
FC
255#define UNI(f) UNI3(f,XTERM,1)
256#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
257#define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
259 OPERATOR(f); \
260 }
a687059c 261
d68ce4ac 262#define UNIBRACK(f) UNI3(f,0,0)
79072805 263
9f68db38 264/* grandfather return to old style */
78cdf107
Z
265#define OLDLOP(f) \
266 do { \
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
270 PL_expect = XTERM; \
271 PL_bufptr = s; \
272 return (int)LSTOP; \
273 } while(0)
79072805 274
83944c01
FC
275#define COPLINE_INC_WITH_HERELINES \
276 STMT_START { \
277 CopLINE_inc(PL_curcop); \
851b527a
FC
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
83944c01 281 } STMT_END
ffdb8b16
FC
282/* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284#define COPLINE_SET_FROM_MULTI_END \
285 STMT_START { \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
851b527a 288 PL_parser->herelines = 0; \
ffdb8b16 289 } STMT_END
83944c01
FC
290
291
8fa7f367
JH
292#ifdef DEBUGGING
293
6154021b 294/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
295enum token_type {
296 TOKENTYPE_NONE,
297 TOKENTYPE_IVAL,
6154021b 298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 299 TOKENTYPE_PVAL,
aeaef349 300 TOKENTYPE_OPVAL
bbf60fe6
DM
301};
302
6d4a66ac
NC
303static struct debug_tokens {
304 const int token;
305 enum token_type type;
306 const char *name;
307} const debug_tokens[] =
9041c2e3 308{
bbf60fe6
DM
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 382 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
383};
384
6154021b 385/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 386
bbf60fe6 387STATIC int
704d4215 388S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 389{
7918f24d
NC
390 PERL_ARGS_ASSERT_TOKEREPORT;
391
bbf60fe6 392 if (DEBUG_T_TEST) {
bd61b366 393 const char *name = NULL;
bbf60fe6 394 enum token_type type = TOKENTYPE_NONE;
f54cb97a 395 const struct debug_tokens *p;
396482e1 396 SV* const report = newSVpvs("<== ");
bbf60fe6 397
f54cb97a 398 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
399 if (p->token == (int)rv) {
400 name = p->name;
401 type = p->type;
402 break;
403 }
404 }
405 if (name)
54667de8 406 Perl_sv_catpv(aTHX_ report, name);
74736ae6 407 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 408 {
bbf60fe6 409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
410 if ((char)rv == 'p')
411 sv_catpvs(report, " (pending identifier)");
412 }
bbf60fe6 413 else if (!rv)
396482e1 414 sv_catpvs(report, "EOF");
bbf60fe6
DM
415 else
416 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
417 switch (type) {
418 case TOKENTYPE_NONE:
bbf60fe6
DM
419 break;
420 case TOKENTYPE_IVAL:
704d4215 421 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 425 PL_op_name[lvalp->ival]);
bbf60fe6
DM
426 break;
427 case TOKENTYPE_PVAL:
704d4215 428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
429 break;
430 case TOKENTYPE_OPVAL:
704d4215 431 if (lvalp->opval) {
401441c0 432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 435 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 436 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
437 }
438
439 }
401441c0 440 else
396482e1 441 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
442 break;
443 }
b6007c36 444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
445 };
446 return (int)rv;
998054bd
SC
447}
448
b6007c36
DM
449
450/* print the buffer with suitable escapes */
451
452STATIC void
15f169a1 453S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 454{
396482e1 455 SV* const tmp = newSVpvs("");
7918f24d
NC
456
457 PERL_ARGS_ASSERT_PRINTBUF;
458
5d37acd6 459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 461 GCC_DIAG_RESTORE;
b6007c36
DM
462 SvREFCNT_dec(tmp);
463}
464
8fa7f367
JH
465#endif
466
8290c323
NC
467static int
468S_deprecate_commaless_var_list(pTHX) {
469 PL_expect = XTERM;
470 deprecate("comma-less variable list");
471 return REPORT(','); /* grandfather non-comma-format format */
472}
473
ffb4593c
NT
474/*
475 * S_ao
476 *
c963b151
BD
477 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
478 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
479 */
480
76e3520e 481STATIC int
cea2e8a9 482S_ao(pTHX_ int toketype)
a0d0e21e 483{
3280af22
NIS
484 if (*PL_bufptr == '=') {
485 PL_bufptr++;
a0d0e21e 486 if (toketype == ANDAND)
6154021b 487 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 488 else if (toketype == OROR)
6154021b 489 pl_yylval.ival = OP_ORASSIGN;
c963b151 490 else if (toketype == DORDOR)
6154021b 491 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
492 toketype = ASSIGNOP;
493 }
494 return toketype;
495}
496
ffb4593c
NT
497/*
498 * S_no_op
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
508 */
509
76e3520e 510STATIC void
15f169a1 511S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 512{
9d4ba2ae
AL
513 char * const oldbp = PL_bufptr;
514 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 515
7918f24d
NC
516 PERL_ARGS_ASSERT_NO_OP;
517
1189a94a
GS
518 if (!s)
519 s = oldbp;
07c798fb 520 else
1189a94a 521 PL_bufptr = s;
734ab321 522 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
523 if (ckWARN_d(WARN_SYNTAX)) {
524 if (is_first)
525 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
526 "\t(Missing semicolon on previous line?)\n");
527 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 528 const char *t;
8a2bca12 529 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 530 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 531 NOOP;
56da5a46
RGS
532 if (t < PL_bufptr && isSPACE(*t))
533 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
534 "\t(Do you need to predeclare %"UTF8f"?)\n",
535 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
536 }
537 else {
538 assert(s >= oldbp);
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
540 "\t(Missing operator before %"UTF8f"?)\n",
541 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 542 }
07c798fb 543 }
3280af22 544 PL_bufptr = oldbp;
8990e307
LW
545}
546
ffb4593c
NT
547/*
548 * S_missingterm
549 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 550 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
551 * If we're in a delimited string and the delimiter is a control
552 * character, it's reformatted into a two-char sequence like ^C.
553 * This is fatal.
554 */
555
76e3520e 556STATIC void
cea2e8a9 557S_missingterm(pTHX_ char *s)
8990e307
LW
558{
559 char tmpbuf[3];
560 char q;
561 if (s) {
9d4ba2ae 562 char * const nl = strrchr(s,'\n');
d2719217 563 if (nl)
8990e307
LW
564 *nl = '\0';
565 }
ca8b19a7 566 else if ((U8) PL_multi_close < 32) {
8990e307 567 *tmpbuf = '^';
585ec06d 568 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
569 tmpbuf[2] = '\0';
570 s = tmpbuf;
571 }
572 else {
eb160463 573 *tmpbuf = (char)PL_multi_close;
8990e307
LW
574 tmpbuf[1] = '\0';
575 s = tmpbuf;
576 }
577 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 578 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 579}
79072805 580
dd0ac2b9
FC
581#include "feature.h"
582
0d863452 583/*
0d863452
RH
584 * Check whether the named feature is enabled.
585 */
26ea9e12 586bool
3fff3427 587Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 588{
4a731d7b 589 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
590
591 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
592
593 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 594
26ea9e12
NC
595 if (namelen > MAX_FEATURE_LEN)
596 return FALSE;
3fff3427 597 memcpy(&he_name[8], name, namelen);
7d69d4a6 598
c8ca97b0
NC
599 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
600 REFCOUNTED_HE_EXISTS));
0d863452
RH
601}
602
ffb4593c 603/*
9cbb5ea2
GS
604 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
605 * utf16-to-utf8-reversed.
ffb4593c
NT
606 */
607
c39cd008
GS
608#ifdef PERL_CR_FILTER
609static void
610strip_return(SV *sv)
611{
eb578fdb
KW
612 const char *s = SvPVX_const(sv);
613 const char * const e = s + SvCUR(sv);
7918f24d
NC
614
615 PERL_ARGS_ASSERT_STRIP_RETURN;
616
c39cd008
GS
617 /* outer loop optimized to do nothing if there are no CR-LFs */
618 while (s < e) {
619 if (*s++ == '\r' && *s == '\n') {
620 /* hit a CR-LF, need to copy the rest */
eb578fdb 621 char *d = s - 1;
c39cd008
GS
622 *d++ = *s++;
623 while (s < e) {
624 if (*s == '\r' && s[1] == '\n')
625 s++;
626 *d++ = *s++;
627 }
628 SvCUR(sv) -= s - d;
629 return;
630 }
631 }
632}
a868473f 633
76e3520e 634STATIC I32
c39cd008 635S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 636{
f54cb97a 637 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
638 if (count > 0 && !maxlen)
639 strip_return(sv);
640 return count;
a868473f
NIS
641}
642#endif
643
ffb4593c 644/*
8eaa0acf
Z
645=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
646
647Creates and initialises a new lexer/parser state object, supplying
648a context in which to lex and parse from a new source of Perl code.
649A pointer to the new state object is placed in L</PL_parser>. An entry
650is made on the save stack so that upon unwinding the new state object
651will be destroyed and the former value of L</PL_parser> will be restored.
652Nothing else need be done to clean up the parsing context.
653
654The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
655non-null, provides a string (in SV form) containing code to be parsed.
656A copy of the string is made, so subsequent modification of I<line>
657does not affect parsing. I<rsfp>, if non-null, provides an input stream
658from which code will be read to be parsed. If both are non-null, the
659code in I<line> comes first and must consist of complete lines of input,
660and I<rsfp> supplies the remainder of the source.
661
e368b3bd
FC
662The I<flags> parameter is reserved for future use. Currently it is only
663used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
664
665=cut
666*/
ffb4593c 667
27fcb6ee 668/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
669 can share filters with the current parser.
670 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
671 caller, hence isn't owned by the parser, so shouldn't be closed on parser
672 destruction. This is used to handle the case of defaulting to reading the
673 script from the standard input because no filename was given on the command
674 line (without getting confused by situation where STDIN has been closed, so
675 the script handle is opened on fd 0) */
27fcb6ee 676
a0d0e21e 677void
8eaa0acf 678Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 679{
6ef55633 680 const char *s = NULL;
5486870f 681 yy_parser *parser, *oparser;
60d63348 682 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 683 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
684
685 /* create and initialise a parser */
686
199e78b7 687 Newxz(parser, 1, yy_parser);
5486870f 688 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
689 PL_parser = parser;
690
28ac2b49
Z
691 parser->stack = NULL;
692 parser->ps = NULL;
693 parser->stack_size = 0;
acdf0a21 694
e3abe207
DM
695 /* on scope exit, free this parser and restore any outer one */
696 SAVEPARSER(parser);
7c4baf47 697 parser->saved_curcop = PL_curcop;
e3abe207 698
acdf0a21 699 /* initialise lexer state */
8990e307 700
fb205e7a 701 parser->nexttoke = 0;
ca4cfd28 702 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 703 parser->copline = parser->preambling = NOLINE;
5afb0a62 704 parser->lex_state = LEX_NORMAL;
c2598295 705 parser->expect = XSTATE;
2f9285f8 706 parser->rsfp = rsfp;
27fcb6ee
FC
707 parser->rsfp_filters =
708 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
709 ? NULL
710 : MUTABLE_AV(SvREFCNT_inc(
711 oparser->rsfp_filters
712 ? oparser->rsfp_filters
713 : (oparser->rsfp_filters = newAV())
714 ));
2f9285f8 715
199e78b7
DM
716 Newx(parser->lex_brackstack, 120, char);
717 Newx(parser->lex_casestack, 12, char);
718 *parser->lex_casestack = '\0';
d794b522 719 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 720
10efb74f 721 if (line) {
0528fd32 722 STRLEN len;
10efb74f 723 s = SvPV_const(line, len);
0abcdfa4
FC
724 parser->linestr = flags & LEX_START_COPIED
725 ? SvREFCNT_inc_simple_NN(line)
726 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 727 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 728 } else {
bf1b738b 729 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 730 }
f06b5848
DM
731 parser->oldoldbufptr =
732 parser->oldbufptr =
733 parser->bufptr =
734 parser->linestart = SvPVX(parser->linestr);
735 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
736 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
737
738 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
739 |LEX_DONT_CLOSE_RSFP));
740 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
741 |LEX_DONT_CLOSE_RSFP));
737c24fc 742
60d63348 743 parser->in_pod = parser->filtered = 0;
79072805 744}
a687059c 745
e3abe207
DM
746
747/* delete a parser object */
748
749void
750Perl_parser_free(pTHX_ const yy_parser *parser)
751{
7918f24d
NC
752 PERL_ARGS_ASSERT_PARSER_FREE;
753
7c4baf47 754 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
755 SvREFCNT_dec(parser->linestr);
756
87606032 757 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 758 PerlIO_clearerr(parser->rsfp);
799361c3
SH
759 else if (parser->rsfp && (!parser->old_parser ||
760 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 761 PerlIO_close(parser->rsfp);
5486870f 762 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
763 SvREFCNT_dec(parser->lex_stuff);
764 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
765
766 Safefree(parser->lex_brackstack);
767 Safefree(parser->lex_casestack);
768 Safefree(parser->lex_shared);
769 PL_parser = parser->old_parser;
770 Safefree(parser);
771}
772
773void
774Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
775{
3ac7ff8f 776 I32 nexttoke = parser->nexttoke;
3ac7ff8f 777 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 778 while (nexttoke--) {
3ac7ff8f
FC
779 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
780 && parser->nextval[nexttoke].opval
781 && parser->nextval[nexttoke].opval->op_slabbed
782 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 783 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
784 parser->nextval[nexttoke].opval = NULL;
785 }
3ce3dcd9 786 }
e3abe207
DM
787}
788
789
ffb4593c 790/*
f0e67a1d
Z
791=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
792
793Buffer scalar containing the chunk currently under consideration of the
794text currently being lexed. This is always a plain string scalar (for
795which C<SvPOK> is true). It is not intended to be used as a scalar by
796normal scalar means; instead refer to the buffer directly by the pointer
797variables described below.
798
799The lexer maintains various C<char*> pointers to things in the
800C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
801reallocated, all of these pointers must be updated. Don't attempt to
802do this manually, but rather use L</lex_grow_linestr> if you need to
803reallocate the buffer.
804
805The content of the text chunk in the buffer is commonly exactly one
806complete line of input, up to and including a newline terminator,
807but there are situations where it is otherwise. The octets of the
808buffer may be intended to be interpreted as either UTF-8 or Latin-1.
809The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
810flag on this scalar, which may disagree with it.
811
812For direct examination of the buffer, the variable
813L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
814lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
815of these pointers is usually preferable to examination of the scalar
816through normal scalar means.
817
818=for apidoc AmxU|char *|PL_parser-E<gt>bufend
819
820Direct pointer to the end of the chunk of text currently being lexed, the
821end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 822+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
823always located at the end of the buffer, and does not count as part of
824the buffer's contents.
825
826=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
827
828Points to the current position of lexing inside the lexer buffer.
829Characters around this point may be freely examined, within
830the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
831L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
832interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
833
834Lexing code (whether in the Perl core or not) moves this pointer past
835the characters that it consumes. It is also expected to perform some
836bookkeeping whenever a newline character is consumed. This movement
837can be more conveniently performed by the function L</lex_read_to>,
838which handles newlines appropriately.
839
840Interpretation of the buffer's octets can be abstracted out by
841using the slightly higher-level functions L</lex_peek_unichar> and
842L</lex_read_unichar>.
843
844=for apidoc AmxU|char *|PL_parser-E<gt>linestart
845
846Points to the start of the current line inside the lexer buffer.
847This is useful for indicating at which column an error occurred, and
848not much else. This must be updated by any lexing code that consumes
849a newline; the function L</lex_read_to> handles this detail.
850
851=cut
852*/
853
854/*
855=for apidoc Amx|bool|lex_bufutf8
856
857Indicates whether the octets in the lexer buffer
858(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
859of Unicode characters. If not, they should be interpreted as Latin-1
860characters. This is analogous to the C<SvUTF8> flag for scalars.
861
862In UTF-8 mode, it is not guaranteed that the lexer buffer actually
863contains valid UTF-8. Lexing code must be robust in the face of invalid
864encoding.
865
866The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
867is significant, but not the whole story regarding the input character
868encoding. Normally, when a file is being read, the scalar contains octets
869and its C<SvUTF8> flag is off, but the octets should be interpreted as
870UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
871however, the scalar may have the C<SvUTF8> flag on, and in this case its
872octets should be interpreted as UTF-8 unless the C<use bytes> pragma
873is in effect. This logic may change in the future; use this function
874instead of implementing the logic yourself.
875
876=cut
877*/
878
879bool
880Perl_lex_bufutf8(pTHX)
881{
882 return UTF;
883}
884
885/*
886=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
887
888Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 889at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
890pointer to the reallocated buffer. This is necessary before making
891any direct modification of the buffer that would increase its length.
892L</lex_stuff_pvn> provides a more convenient way to insert text into
893the buffer.
894
895Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
896this function updates all of the lexer's variables that point directly
897into the buffer.
898
899=cut
900*/
901
902char *
903Perl_lex_grow_linestr(pTHX_ STRLEN len)
904{
905 SV *linestr;
906 char *buf;
907 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 908 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
909 linestr = PL_parser->linestr;
910 buf = SvPVX(linestr);
911 if (len <= SvLEN(linestr))
912 return buf;
913 bufend_pos = PL_parser->bufend - buf;
914 bufptr_pos = PL_parser->bufptr - buf;
915 oldbufptr_pos = PL_parser->oldbufptr - buf;
916 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
917 linestart_pos = PL_parser->linestart - buf;
918 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
919 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
920 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
921 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 922
f0e67a1d 923 buf = sv_grow(linestr, len);
c7641931 924
f0e67a1d
Z
925 PL_parser->bufend = buf + bufend_pos;
926 PL_parser->bufptr = buf + bufptr_pos;
927 PL_parser->oldbufptr = buf + oldbufptr_pos;
928 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
929 PL_parser->linestart = buf + linestart_pos;
930 if (PL_parser->last_uni)
931 PL_parser->last_uni = buf + last_uni_pos;
932 if (PL_parser->last_lop)
933 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
934 if (PL_parser->lex_shared->re_eval_start)
935 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
936 return buf;
937}
938
939/*
83aa740e 940=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
941
942Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
943immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
944reallocating the buffer if necessary. This means that lexing code that
945runs later will see the characters as if they had appeared in the input.
946It is not recommended to do this as part of normal parsing, and most
947uses of this facility run the risk of the inserted characters being
948interpreted in an unintended manner.
949
950The string to be inserted is represented by I<len> octets starting
951at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
952according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
953The characters are recoded for the lexer buffer, according to how the
954buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 955to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
956function is more convenient.
957
958=cut
959*/
960
961void
83aa740e 962Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 963{
749123ff 964 dVAR;
f0e67a1d
Z
965 char *bufptr;
966 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
967 if (flags & ~(LEX_STUFF_UTF8))
968 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
969 if (UTF) {
970 if (flags & LEX_STUFF_UTF8) {
971 goto plain_copy;
972 } else {
54d004e8 973 STRLEN highhalf = 0; /* Count of variants */
83aa740e 974 const char *p, *e = pv+len;
54d004e8
KW
975 for (p = pv; p != e; p++) {
976 if (! UTF8_IS_INVARIANT(*p)) {
977 highhalf++;
978 }
979 }
f0e67a1d
Z
980 if (!highhalf)
981 goto plain_copy;
982 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
983 bufptr = PL_parser->bufptr;
984 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
985 SvCUR_set(PL_parser->linestr,
986 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
987 PL_parser->bufend += len+highhalf;
988 for (p = pv; p != e; p++) {
989 U8 c = (U8)*p;
54d004e8
KW
990 if (! UTF8_IS_INVARIANT(c)) {
991 *bufptr++ = UTF8_TWO_BYTE_HI(c);
992 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
993 } else {
994 *bufptr++ = (char)c;
995 }
996 }
997 }
998 } else {
999 if (flags & LEX_STUFF_UTF8) {
1000 STRLEN highhalf = 0;
83aa740e 1001 const char *p, *e = pv+len;
f0e67a1d
Z
1002 for (p = pv; p != e; p++) {
1003 U8 c = (U8)*p;
54d004e8 1004 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1005 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1006 "non-Latin-1 character into Latin-1 input");
54d004e8 1007 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1008 p++;
1009 highhalf++;
54d004e8 1010 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1011 /* malformed UTF-8 */
1012 ENTER;
1013 SAVESPTR(PL_warnhook);
1014 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1015 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1016 LEAVE;
1017 }
1018 }
1019 if (!highhalf)
1020 goto plain_copy;
1021 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1022 bufptr = PL_parser->bufptr;
1023 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1024 SvCUR_set(PL_parser->linestr,
1025 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1026 PL_parser->bufend += len-highhalf;
54d004e8
KW
1027 p = pv;
1028 while (p < e) {
1029 if (UTF8_IS_INVARIANT(*p)) {
1030 *bufptr++ = *p;
1031 p++;
f0e67a1d 1032 }
54d004e8
KW
1033 else {
1034 assert(p < e -1 );
94bb8c36 1035 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1036 p += 2;
1037 }
f0e67a1d
Z
1038 }
1039 } else {
54d004e8 1040 plain_copy:
f0e67a1d
Z
1041 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1042 bufptr = PL_parser->bufptr;
1043 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1044 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1045 PL_parser->bufend += len;
1046 Copy(pv, bufptr, len, char);
1047 }
1048 }
1049}
1050
1051/*
9dcc53ea
Z
1052=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1053
1054Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1055immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1056reallocating the buffer if necessary. This means that lexing code that
1057runs later will see the characters as if they had appeared in the input.
1058It is not recommended to do this as part of normal parsing, and most
1059uses of this facility run the risk of the inserted characters being
1060interpreted in an unintended manner.
1061
1062The string to be inserted is represented by octets starting at I<pv>
1063and continuing to the first nul. These octets are interpreted as either
1064UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1065in I<flags>. The characters are recoded for the lexer buffer, according
1066to how the buffer is currently being interpreted (L</lex_bufutf8>).
1067If it is not convenient to nul-terminate a string to be inserted, the
1068L</lex_stuff_pvn> function is more appropriate.
1069
1070=cut
1071*/
1072
1073void
1074Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1075{
1076 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1077 lex_stuff_pvn(pv, strlen(pv), flags);
1078}
1079
1080/*
f0e67a1d
Z
1081=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1082
1083Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1084immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1085reallocating the buffer if necessary. This means that lexing code that
1086runs later will see the characters as if they had appeared in the input.
1087It is not recommended to do this as part of normal parsing, and most
1088uses of this facility run the risk of the inserted characters being
1089interpreted in an unintended manner.
1090
1091The string to be inserted is the string value of I<sv>. The characters
1092are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1093being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1094not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1095need to construct a scalar.
1096
1097=cut
1098*/
1099
1100void
1101Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1102{
1103 char *pv;
1104 STRLEN len;
1105 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1106 if (flags)
1107 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1108 pv = SvPV(sv, len);
1109 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1110}
1111
1112/*
1113=for apidoc Amx|void|lex_unstuff|char *ptr
1114
1115Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1116I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1117This hides the discarded text from any lexing code that runs later,
1118as if the text had never appeared.
1119
1120This is not the normal way to consume lexed text. For that, use
1121L</lex_read_to>.
1122
1123=cut
1124*/
1125
1126void
1127Perl_lex_unstuff(pTHX_ char *ptr)
1128{
1129 char *buf, *bufend;
1130 STRLEN unstuff_len;
1131 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1132 buf = PL_parser->bufptr;
1133 if (ptr < buf)
1134 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1135 if (ptr == buf)
1136 return;
1137 bufend = PL_parser->bufend;
1138 if (ptr > bufend)
1139 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1140 unstuff_len = ptr - buf;
1141 Move(ptr, buf, bufend+1-ptr, char);
1142 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1143 PL_parser->bufend = bufend - unstuff_len;
1144}
1145
1146/*
1147=for apidoc Amx|void|lex_read_to|char *ptr
1148
1149Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1150to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1151performing the correct bookkeeping whenever a newline character is passed.
1152This is the normal way to consume lexed text.
1153
1154Interpretation of the buffer's octets can be abstracted out by
1155using the slightly higher-level functions L</lex_peek_unichar> and
1156L</lex_read_unichar>.
1157
1158=cut
1159*/
1160
1161void
1162Perl_lex_read_to(pTHX_ char *ptr)
1163{
1164 char *s;
1165 PERL_ARGS_ASSERT_LEX_READ_TO;
1166 s = PL_parser->bufptr;
1167 if (ptr < s || ptr > PL_parser->bufend)
1168 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1169 for (; s != ptr; s++)
1170 if (*s == '\n') {
83944c01 1171 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1172 PL_parser->linestart = s+1;
1173 }
1174 PL_parser->bufptr = ptr;
1175}
1176
1177/*
1178=for apidoc Amx|void|lex_discard_to|char *ptr
1179
1180Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1181up to I<ptr>. The remaining content of the buffer will be moved, and
1182all pointers into the buffer updated appropriately. I<ptr> must not
1183be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1184it is not permitted to discard text that has yet to be lexed.
1185
1186Normally it is not necessarily to do this directly, because it suffices to
1187use the implicit discarding behaviour of L</lex_next_chunk> and things
1188based on it. However, if a token stretches across multiple lines,
1f317c95 1189and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1190that purpose, then after completion of the token it would be wise to
1191explicitly discard the now-unneeded earlier lines, to avoid future
1192multi-line tokens growing the buffer without bound.
1193
1194=cut
1195*/
1196
1197void
1198Perl_lex_discard_to(pTHX_ char *ptr)
1199{
1200 char *buf;
1201 STRLEN discard_len;
1202 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1203 buf = SvPVX(PL_parser->linestr);
1204 if (ptr < buf)
1205 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1206 if (ptr == buf)
1207 return;
1208 if (ptr > PL_parser->bufptr)
1209 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1210 discard_len = ptr - buf;
1211 if (PL_parser->oldbufptr < ptr)
1212 PL_parser->oldbufptr = ptr;
1213 if (PL_parser->oldoldbufptr < ptr)
1214 PL_parser->oldoldbufptr = ptr;
1215 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1216 PL_parser->last_uni = NULL;
1217 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1218 PL_parser->last_lop = NULL;
1219 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1220 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1221 PL_parser->bufend -= discard_len;
1222 PL_parser->bufptr -= discard_len;
1223 PL_parser->oldbufptr -= discard_len;
1224 PL_parser->oldoldbufptr -= discard_len;
1225 if (PL_parser->last_uni)
1226 PL_parser->last_uni -= discard_len;
1227 if (PL_parser->last_lop)
1228 PL_parser->last_lop -= discard_len;
1229}
1230
1231/*
1232=for apidoc Amx|bool|lex_next_chunk|U32 flags
1233
1234Reads in the next chunk of text to be lexed, appending it to
1235L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1236looked to the end of the current chunk and wants to know more. It is
1237usual, but not necessary, for lexing to have consumed the entirety of
1238the current chunk at this time.
1239
1240If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1241chunk (i.e., the current chunk has been entirely consumed), normally the
1242current chunk will be discarded at the same time that the new chunk is
1243read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1244will not be discarded. If the current chunk has not been entirely
1245consumed, then it will not be discarded regardless of the flag.
1246
1247Returns true if some new text was added to the buffer, or false if the
1248buffer has reached the end of the input text.
1249
1250=cut
1251*/
1252
1253#define LEX_FAKE_EOF 0x80000000
112d1284 1254#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1255
1256bool
1257Perl_lex_next_chunk(pTHX_ U32 flags)
1258{
1259 SV *linestr;
1260 char *buf;
1261 STRLEN old_bufend_pos, new_bufend_pos;
1262 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1263 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1264 bool got_some_for_debugger = 0;
f0e67a1d 1265 bool got_some;
112d1284 1266 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1267 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1268 linestr = PL_parser->linestr;
1269 buf = SvPVX(linestr);
1270 if (!(flags & LEX_KEEP_PREVIOUS) &&
1271 PL_parser->bufptr == PL_parser->bufend) {
1272 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1273 linestart_pos = 0;
1274 if (PL_parser->last_uni != PL_parser->bufend)
1275 PL_parser->last_uni = NULL;
1276 if (PL_parser->last_lop != PL_parser->bufend)
1277 PL_parser->last_lop = NULL;
1278 last_uni_pos = last_lop_pos = 0;
1279 *buf = 0;
1280 SvCUR(linestr) = 0;
1281 } else {
1282 old_bufend_pos = PL_parser->bufend - buf;
1283 bufptr_pos = PL_parser->bufptr - buf;
1284 oldbufptr_pos = PL_parser->oldbufptr - buf;
1285 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1286 linestart_pos = PL_parser->linestart - buf;
1287 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1288 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1289 }
1290 if (flags & LEX_FAKE_EOF) {
1291 goto eof;
60d63348 1292 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1293 got_some = 0;
1294 } else if (filter_gets(linestr, old_bufend_pos)) {
1295 got_some = 1;
17cc9359 1296 got_some_for_debugger = 1;
112d1284
FC
1297 } else if (flags & LEX_NO_TERM) {
1298 got_some = 0;
f0e67a1d 1299 } else {
580561a3
Z
1300 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1301 sv_setpvs(linestr, "");
f0e67a1d
Z
1302 eof:
1303 /* End of real input. Close filehandle (unless it was STDIN),
1304 * then add implicit termination.
1305 */
87606032 1306 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1307 PerlIO_clearerr(PL_parser->rsfp);
1308 else if (PL_parser->rsfp)
1309 (void)PerlIO_close(PL_parser->rsfp);
1310 PL_parser->rsfp = NULL;
60d63348 1311 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1312 if (!PL_in_eval && PL_minus_p) {
1313 sv_catpvs(linestr,
1314 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1315 PL_minus_n = PL_minus_p = 0;
1316 } else if (!PL_in_eval && PL_minus_n) {
1317 sv_catpvs(linestr, /*{*/";}");
1318 PL_minus_n = 0;
1319 } else
1320 sv_catpvs(linestr, ";");
1321 got_some = 1;
1322 }
1323 buf = SvPVX(linestr);
1324 new_bufend_pos = SvCUR(linestr);
1325 PL_parser->bufend = buf + new_bufend_pos;
1326 PL_parser->bufptr = buf + bufptr_pos;
1327 PL_parser->oldbufptr = buf + oldbufptr_pos;
1328 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1329 PL_parser->linestart = buf + linestart_pos;
1330 if (PL_parser->last_uni)
1331 PL_parser->last_uni = buf + last_uni_pos;
1332 if (PL_parser->last_lop)
1333 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1334 if (PL_parser->preambling != NOLINE) {
1335 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1336 PL_parser->preambling = NOLINE;
1337 }
17cc9359 1338 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1339 PL_curstash != PL_debstash) {
1340 /* debugger active and we're not compiling the debugger code,
1341 * so store the line into the debugger's array of lines
1342 */
1343 update_debugger_info(NULL, buf+old_bufend_pos,
1344 new_bufend_pos-old_bufend_pos);
1345 }
1346 return got_some;
1347}
1348
1349/*
1350=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1351
1352Looks ahead one (Unicode) character in the text currently being lexed.
1353Returns the codepoint (unsigned integer value) of the next character,
1354or -1 if lexing has reached the end of the input text. To consume the
1355peeked character, use L</lex_read_unichar>.
1356
1357If the next character is in (or extends into) the next chunk of input
1358text, the next chunk will be read in. Normally the current chunk will be
1359discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1360then the current chunk will not be discarded.
1361
1362If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1363is encountered, an exception is generated.
1364
1365=cut
1366*/
1367
1368I32
1369Perl_lex_peek_unichar(pTHX_ U32 flags)
1370{
749123ff 1371 dVAR;
f0e67a1d
Z
1372 char *s, *bufend;
1373 if (flags & ~(LEX_KEEP_PREVIOUS))
1374 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1375 s = PL_parser->bufptr;
1376 bufend = PL_parser->bufend;
1377 if (UTF) {
1378 U8 head;
1379 I32 unichar;
1380 STRLEN len, retlen;
1381 if (s == bufend) {
1382 if (!lex_next_chunk(flags))
1383 return -1;
1384 s = PL_parser->bufptr;
1385 bufend = PL_parser->bufend;
1386 }
1387 head = (U8)*s;
54d004e8 1388 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1389 return head;
54d004e8
KW
1390 if (UTF8_IS_START(head)) {
1391 len = UTF8SKIP(&head);
f0e67a1d
Z
1392 while ((STRLEN)(bufend-s) < len) {
1393 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1394 break;
1395 s = PL_parser->bufptr;
1396 bufend = PL_parser->bufend;
1397 }
1398 }
c80e42f3 1399 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1400 if (retlen == (STRLEN)-1) {
1401 /* malformed UTF-8 */
1402 ENTER;
1403 SAVESPTR(PL_warnhook);
1404 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1405 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1406 LEAVE;
1407 }
1408 return unichar;
1409 } else {
1410 if (s == bufend) {
1411 if (!lex_next_chunk(flags))
1412 return -1;
1413 s = PL_parser->bufptr;
1414 }
1415 return (U8)*s;
1416 }
1417}
1418
1419/*
1420=for apidoc Amx|I32|lex_read_unichar|U32 flags
1421
1422Reads the next (Unicode) character in the text currently being lexed.
1423Returns the codepoint (unsigned integer value) of the character read,
1424and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1425if lexing has reached the end of the input text. To non-destructively
1426examine the next character, use L</lex_peek_unichar> instead.
1427
1428If the next character is in (or extends into) the next chunk of input
1429text, the next chunk will be read in. Normally the current chunk will be
1430discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1431then the current chunk will not be discarded.
1432
1433If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1434is encountered, an exception is generated.
1435
1436=cut
1437*/
1438
1439I32
1440Perl_lex_read_unichar(pTHX_ U32 flags)
1441{
1442 I32 c;
1443 if (flags & ~(LEX_KEEP_PREVIOUS))
1444 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1445 c = lex_peek_unichar(flags);
1446 if (c != -1) {
1447 if (c == '\n')
83944c01 1448 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1449 if (UTF)
1450 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1451 else
1452 ++(PL_parser->bufptr);
f0e67a1d
Z
1453 }
1454 return c;
1455}
1456
1457/*
1458=for apidoc Amx|void|lex_read_space|U32 flags
1459
1460Reads optional spaces, in Perl style, in the text currently being
1461lexed. The spaces may include ordinary whitespace characters and
1462Perl-style comments. C<#line> directives are processed if encountered.
1463L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1464at a non-space character (or the end of the input text).
1465
1466If spaces extend into the next chunk of input text, the next chunk will
1467be read in. Normally the current chunk will be discarded at the same
1468time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1469chunk will not be discarded.
1470
1471=cut
1472*/
1473
21791330 1474#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1475#define LEX_NO_NEXT_CHUNK 0x80000000
1476
f0e67a1d
Z
1477void
1478Perl_lex_read_space(pTHX_ U32 flags)
1479{
1480 char *s, *bufend;
21791330 1481 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1482 bool need_incline = 0;
21791330 1483 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1484 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1485 s = PL_parser->bufptr;
1486 bufend = PL_parser->bufend;
1487 while (1) {
1488 char c = *s;
1489 if (c == '#') {
1490 do {
1491 c = *++s;
1492 } while (!(c == '\n' || (c == 0 && s == bufend)));
1493 } else if (c == '\n') {
1494 s++;
21791330
FC
1495 if (can_incline) {
1496 PL_parser->linestart = s;
1497 if (s == bufend)
1498 need_incline = 1;
1499 else
1500 incline(s);
1501 }
f0e67a1d
Z
1502 } else if (isSPACE(c)) {
1503 s++;
1504 } else if (c == 0 && s == bufend) {
1505 bool got_more;
65c68e17 1506 line_t l;
f0998909
Z
1507 if (flags & LEX_NO_NEXT_CHUNK)
1508 break;
f0e67a1d 1509 PL_parser->bufptr = s;
65c68e17 1510 l = CopLINE(PL_curcop);
851b527a 1511 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1512 got_more = lex_next_chunk(flags);
65c68e17 1513 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1514 s = PL_parser->bufptr;
1515 bufend = PL_parser->bufend;
1516 if (!got_more)
1517 break;
21791330 1518 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1519 incline(s);
1520 need_incline = 0;
1521 }
1522 } else {
1523 break;
1524 }
1525 }
f0e67a1d
Z
1526 PL_parser->bufptr = s;
1527}
1528
1529/*
fe788d6b
PM
1530
1531=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1532
1533This function performs syntax checking on a prototype, C<proto>.
1534If C<warn> is true, any illegal characters or mismatched brackets
1535will trigger illegalproto warnings, declaring that they were
1536detected in the prototype for C<name>.
1537
1538The return value is C<true> if this is a valid prototype, and
1539C<false> if it is not, regardless of whether C<warn> was C<true> or
1540C<false>.
1541
1542Note that C<NULL> is a valid C<proto> and will always return C<true>.
1543
1544=cut
1545
1546 */
1547
1548bool
1549Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1550{
1551 STRLEN len, origlen;
1552 char *p = proto ? SvPV(proto, len) : NULL;
1553 bool bad_proto = FALSE;
1554 bool in_brackets = FALSE;
1555 bool after_slash = FALSE;
1556 char greedy_proto = ' ';
1557 bool proto_after_greedy_proto = FALSE;
1558 bool must_be_last = FALSE;
1559 bool underscore = FALSE;
f791a21a 1560 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1561
1562 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1563
1564 if (!proto)
1565 return TRUE;
1566
1567 origlen = len;
1568 for (; len--; p++) {
1569 if (!isSPACE(*p)) {
1570 if (must_be_last)
1571 proto_after_greedy_proto = TRUE;
f791a21a
PM
1572 if (underscore) {
1573 if (!strchr(";@%", *p))
1574 bad_proto_after_underscore = TRUE;
1575 underscore = FALSE;
1576 }
fe788d6b
PM
1577 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1578 bad_proto = TRUE;
1579 }
1580 else {
fe788d6b
PM
1581 if (*p == '[')
1582 in_brackets = TRUE;
1583 else if (*p == ']')
1584 in_brackets = FALSE;
1585 else if ((*p == '@' || *p == '%') &&
1586 !after_slash &&
1587 !in_brackets ) {
1588 must_be_last = TRUE;
1589 greedy_proto = *p;
1590 }
1591 else if (*p == '_')
f791a21a 1592 underscore = TRUE;
fe788d6b
PM
1593 }
1594 if (*p == '\\')
1595 after_slash = TRUE;
1596 else
1597 after_slash = FALSE;
1598 }
1599 }
1600
1601 if (warn) {
b54d603d 1602 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1603 p -= origlen;
b54d603d
PM
1604 p = SvUTF8(proto)
1605 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1606 origlen, UNI_DISPLAY_ISPRINT)
1607 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1608
fe788d6b
PM
1609 if (proto_after_greedy_proto)
1610 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1611 "Prototype after '%c' for %"SVf" : %s",
1612 greedy_proto, SVfARG(name), p);
50278ed0
PM
1613 if (in_brackets)
1614 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1615 "Missing ']' in prototype for %"SVf" : %s",
1616 SVfARG(name), p);
b54d603d 1617 if (bad_proto)
fe788d6b 1618 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1619 "Illegal character in prototype for %"SVf" : %s",
1620 SVfARG(name), p);
1621 if (bad_proto_after_underscore)
1622 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1623 "Illegal character after '_' in prototype for %"SVf" : %s",
1624 SVfARG(name), p);
fe788d6b
PM
1625 }
1626
1627 return (! (proto_after_greedy_proto || bad_proto) );
1628}
1629
1630/*
ffb4593c
NT
1631 * S_incline
1632 * This subroutine has nothing to do with tilting, whether at windmills
1633 * or pinball tables. Its name is short for "increment line". It
57843af0 1634 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1635 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1636 * # line 500 "foo.pm"
1637 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1638 */
1639
76e3520e 1640STATIC void
d9095cec 1641S_incline(pTHX_ const char *s)
463ee0b2 1642{
d9095cec
NC
1643 const char *t;
1644 const char *n;
1645 const char *e;
8818d409 1646 line_t line_num;
463ee0b2 1647
7918f24d
NC
1648 PERL_ARGS_ASSERT_INCLINE;
1649
83944c01 1650 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1651 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1652 && s+1 == PL_bufend && *s == ';') {
1653 /* fake newline in string eval */
1654 CopLINE_dec(PL_curcop);
1655 return;
1656 }
463ee0b2
LW
1657 if (*s++ != '#')
1658 return;
d4c19fe8
AL
1659 while (SPACE_OR_TAB(*s))
1660 s++;
73659bf1
GS
1661 if (strnEQ(s, "line", 4))
1662 s += 4;
1663 else
1664 return;
084592ab 1665 if (SPACE_OR_TAB(*s))
73659bf1 1666 s++;
4e553d73 1667 else
73659bf1 1668 return;
d4c19fe8
AL
1669 while (SPACE_OR_TAB(*s))
1670 s++;
463ee0b2
LW
1671 if (!isDIGIT(*s))
1672 return;
d4c19fe8 1673
463ee0b2
LW
1674 n = s;
1675 while (isDIGIT(*s))
1676 s++;
07714eb4 1677 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1678 return;
bf4acbe4 1679 while (SPACE_OR_TAB(*s))
463ee0b2 1680 s++;
73659bf1 1681 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1682 s++;
73659bf1
GS
1683 e = t + 1;
1684 }
463ee0b2 1685 else {
c35e046a
AL
1686 t = s;
1687 while (!isSPACE(*t))
1688 t++;
73659bf1 1689 e = t;
463ee0b2 1690 }
bf4acbe4 1691 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1692 e++;
1693 if (*e != '\n' && *e != '\0')
1694 return; /* false alarm */
1695
96e440d2 1696 line_num = grok_atou(n, &e) - 1;
8818d409 1697
f4dd75d9 1698 if (t - s > 0) {
d9095cec 1699 const STRLEN len = t - s;
3df32bda 1700
d36ee5be 1701 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1702 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1703 * to *{"::_<newfilename"} */
44867030
NC
1704 /* However, the long form of evals is only turned on by the
1705 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1706 GV * const cfgv = CopFILEGV(PL_curcop);
1707 if (cfgv) {
38bd7ad8
FC
1708 char smallbuf[128];
1709 STRLEN tmplen2 = len;
44867030 1710 char *tmpbuf2;
449dd039 1711 GV *gv2;
44867030
NC
1712
1713 if (tmplen2 + 2 <= sizeof smallbuf)
1714 tmpbuf2 = smallbuf;
1715 else
1716 Newx(tmpbuf2, tmplen2 + 2, char);
1717
38bd7ad8
FC
1718 tmpbuf2[0] = '_';
1719 tmpbuf2[1] = '<';
44867030
NC
1720
1721 memcpy(tmpbuf2 + 2, s, tmplen2);
1722 tmplen2 += 2;
1723
8a5ee598 1724 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1725 if (!isGV(gv2)) {
8a5ee598 1726 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1727 /* adjust ${"::_<newfilename"} to store the new file name */
1728 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1729 /* The line number may differ. If that is the case,
1730 alias the saved lines that are in the array.
1731 Otherwise alias the whole array. */
1732 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1733 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1734 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1735 }
38bd7ad8
FC
1736 else if (GvAV(cfgv)) {
1737 AV * const av = GvAV(cfgv);
8818d409
FC
1738 const I32 start = CopLINE(PL_curcop)+1;
1739 I32 items = AvFILLp(av) - start;
1740 if (items > 0) {
1741 AV * const av2 = GvAVn(gv2);
1742 SV **svp = AvARRAY(av) + start;
1743 I32 l = (I32)line_num+1;
1744 while (items--)
1745 av_store(av2, l++, SvREFCNT_inc(*svp++));
1746 }
1747 }
e5527e4b 1748 }
44867030
NC
1749
1750 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1751 }
e66cf94c 1752 }
05ec9bb3 1753 CopFILE_free(PL_curcop);
449dd039 1754 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1755 }
8818d409 1756 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1757}
1758
21791330
FC
1759#define skipspace(s) skipspace_flags(s, 0)
1760
29595ff2 1761
80a702cd 1762STATIC void
15f169a1 1763S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1764{
1765 AV *av = CopFILEAVx(PL_curcop);
1766 if (av) {
7f1c3e8c
FC
1767 SV * sv;
1768 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1769 else {
1770 sv = *av_fetch(av, 0, 1);
1771 SvUPGRADE(sv, SVt_PVMG);
1772 }
1773 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1774 if (orig_sv)
7f1c3e8c 1775 sv_catsv(sv, orig_sv);
5fa550fb 1776 else
7f1c3e8c
FC
1777 sv_catpvn(sv, buf, len);
1778 if (!SvIOK(sv)) {
1779 (void)SvIOK_on(sv);
1780 SvIV_set(sv, 0);
1781 }
1782 if (PL_parser->preambling == NOLINE)
1783 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1784 }
1785}
1786
ffb4593c
NT
1787/*
1788 * S_skipspace
1789 * Called to gobble the appropriate amount and type of whitespace.
1790 * Skips comments as well.
1791 */
1792
76e3520e 1793STATIC char *
21791330 1794S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1795{
21791330 1796 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1797 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1798 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1799 s++;
f0e67a1d
Z
1800 } else {
1801 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1802 PL_bufptr = s;
21791330 1803 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1804 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1805 LEX_NO_NEXT_CHUNK : 0));
3280af22 1806 s = PL_bufptr;
f0e67a1d
Z
1807 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1808 if (PL_linestart > PL_bufptr)
1809 PL_bufptr = PL_linestart;
1810 return s;
463ee0b2 1811 }
5db06880 1812 return s;
a687059c 1813}
378cc40b 1814
ffb4593c
NT
1815/*
1816 * S_check_uni
1817 * Check the unary operators to ensure there's no ambiguity in how they're
1818 * used. An ambiguous piece of code would be:
1819 * rand + 5
1820 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1821 * the +5 is its argument.
1822 */
1823
76e3520e 1824STATIC void
cea2e8a9 1825S_check_uni(pTHX)
ba106d47 1826{
d4c19fe8
AL
1827 const char *s;
1828 const char *t;
2f3197b3 1829
3280af22 1830 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1831 return;
3280af22
NIS
1832 while (isSPACE(*PL_last_uni))
1833 PL_last_uni++;
c35e046a 1834 s = PL_last_uni;
8a2bca12 1835 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1836 s++;
3280af22 1837 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1838 return;
6136c704 1839
9b387841
NC
1840 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1841 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1842 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1843}
1844
ffb4593c
NT
1845/*
1846 * LOP : macro to build a list operator. Its behaviour has been replaced
1847 * with a subroutine, S_lop() for which LOP is just another name.
1848 */
1849
a0d0e21e
LW
1850#define LOP(f,x) return lop(f,x,s)
1851
ffb4593c
NT
1852/*
1853 * S_lop
1854 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1855 * - if we have a next token, then it's a list operator (no parens) for
1856 * which the next token has already been parsed; e.g.,
1857 * sort foo @args
1858 * sort foo (@args)
ffb4593c
NT
1859 * - if the next thing is an opening paren, then it's a function
1860 * - else it's a list operator
1861 */
1862
76e3520e 1863STATIC I32
a0be28da 1864S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1865{
7918f24d
NC
1866 PERL_ARGS_ASSERT_LOP;
1867
6154021b 1868 pl_yylval.ival = f;
35c8bce7 1869 CLINE;
3280af22
NIS
1870 PL_bufptr = s;
1871 PL_last_lop = PL_oldbufptr;
eb160463 1872 PL_last_lop_op = (OPCODE)f;
3280af22 1873 if (PL_nexttoke)
78cdf107 1874 goto lstop;
19f1898a 1875 PL_expect = x;
79072805 1876 if (*s == '(')
bbf60fe6 1877 return REPORT(FUNC);
29595ff2 1878 s = PEEKSPACE(s);
79072805 1879 if (*s == '(')
bbf60fe6 1880 return REPORT(FUNC);
78cdf107
Z
1881 else {
1882 lstop:
1883 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1884 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1885 return REPORT(LSTOP);
78cdf107 1886 }
79072805
LW
1887}
1888
ffb4593c
NT
1889/*
1890 * S_force_next
9cbb5ea2 1891 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1892 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1893 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1894 * will need to set PL_nextval[] and possibly PL_expect to ensure
1895 * the lexer handles the token correctly.
ffb4593c
NT
1896 */
1897
4e553d73 1898STATIC void
cea2e8a9 1899S_force_next(pTHX_ I32 type)
79072805 1900{
704d4215
GG
1901#ifdef DEBUGGING
1902 if (DEBUG_T_TEST) {
1903 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1904 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1905 }
1906#endif
3280af22
NIS
1907 PL_nexttype[PL_nexttoke] = type;
1908 PL_nexttoke++;
1909 if (PL_lex_state != LEX_KNOWNEXT) {
1910 PL_lex_defer = PL_lex_state;
1911 PL_lex_expect = PL_expect;
1912 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1913 }
1914}
1915
89f35911
FC
1916/*
1917 * S_postderef
1918 *
1919 * This subroutine handles postfix deref syntax after the arrow has already
1920 * been emitted. @* $* etc. are emitted as two separate token right here.
1921 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1922 * only the first, leaving yylex to find the next.
89f35911
FC
1923 */
1924
1925static int
ff25e5db 1926S_postderef(pTHX_ int const funny, char const next)
89f35911 1927{
ff25e5db 1928 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1929 assert(strchr("*[{", next));
1930 if (next == '*') {
1931 PL_expect = XOPERATOR;
cc624add 1932 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1933 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1934 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1935 force_next(POSTJOIN);
1936 }
89f35911
FC
1937 force_next(next);
1938 PL_bufptr+=2;
1939 }
1940 else {
760ca746
FC
1941 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1942 && !PL_lex_brackets)
1943 PL_lex_dojoin = 2;
89f35911
FC
1944 PL_expect = XOPERATOR;
1945 PL_bufptr++;
1946 }
1947 return funny;
1948}
1949
28ac2b49
Z
1950void
1951Perl_yyunlex(pTHX)
1952{
a7aaec61
Z
1953 int yyc = PL_parser->yychar;
1954 if (yyc != YYEMPTY) {
1955 if (yyc) {
a7aaec61
Z
1956 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1957 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1958 PL_lex_allbrackets--;
a7aaec61 1959 PL_lex_brackets--;
78cdf107
Z
1960 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1961 } else if (yyc == '('/*)*/) {
1962 PL_lex_allbrackets--;
1963 yyc |= (2<<24);
a7aaec61
Z
1964 }
1965 force_next(yyc);
1966 }
28ac2b49
Z
1967 PL_parser->yychar = YYEMPTY;
1968 }
1969}
1970
d0a148a6 1971STATIC SV *
15f169a1 1972S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1973{
740cce10 1974 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1975 !IN_BYTES
1976 && UTF
1977 && !is_ascii_string((const U8*)start, len)
740cce10 1978 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1979 return sv;
1980}
1981
ffb4593c
NT
1982/*
1983 * S_force_word
1984 * When the lexer knows the next thing is a word (for instance, it has
1985 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1986 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1987 * lookahead.
ffb4593c
NT
1988 *
1989 * Arguments:
b1b65b59 1990 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1991 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1992 * int check_keyword : if true, Perl checks to make sure the word isn't
1993 * a keyword (do this if the word is a label, e.g. goto FOO)
1994 * int allow_pack : if true, : characters will also be allowed (require,
1995 * use, etc. do this)
9cbb5ea2 1996 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1997 */
1998
76e3520e 1999STATIC char *
345b3785 2000S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2001{
eb578fdb 2002 char *s;
463ee0b2 2003 STRLEN len;
4e553d73 2004
7918f24d
NC
2005 PERL_ARGS_ASSERT_FORCE_WORD;
2006
29595ff2 2007 start = SKIPSPACE1(start);
463ee0b2 2008 s = start;
7e2040f0 2009 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2010 (allow_pack && *s == ':') )
a0d0e21e 2011 {
3280af22 2012 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2013 if (check_keyword) {
2014 char *s2 = PL_tokenbuf;
2015 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2016 s2 += 6, len -= 6;
2017 if (keyword(s2, len, 0))
463ee0b2 2018 return start;
01b5ef50 2019 }
463ee0b2 2020 if (token == METHOD) {
29595ff2 2021 s = SKIPSPACE1(s);
463ee0b2 2022 if (*s == '(')
3280af22 2023 PL_expect = XTERM;
463ee0b2 2024 else {
3280af22 2025 PL_expect = XOPERATOR;
463ee0b2 2026 }
79072805 2027 }
9ded7720 2028 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2029 = (OP*)newSVOP(OP_CONST,0,
2030 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2031 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2032 force_next(token);
2033 }
2034 return s;
2035}
2036
ffb4593c
NT
2037/*
2038 * S_force_ident
9cbb5ea2 2039 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2040 * text only contains the "foo" portion. The first argument is a pointer
2041 * to the "foo", and the second argument is the type symbol to prefix.
2042 * Forces the next token to be a "WORD".
9cbb5ea2 2043 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2044 */
2045
76e3520e 2046STATIC void
5aaab254 2047S_force_ident(pTHX_ const char *s, int kind)
79072805 2048{
7918f24d
NC
2049 PERL_ARGS_ASSERT_FORCE_IDENT;
2050
c9b48522
DD
2051 if (s[0]) {
2052 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2053 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2054 UTF ? SVf_UTF8 : 0));
9ded7720 2055 NEXTVAL_NEXTTOKE.opval = o;
79072805 2056 force_next(WORD);
748a9306 2057 if (kind) {
11343788 2058 o->op_private = OPpCONST_ENTERED;
55497cff 2059 /* XXX see note in pp_entereval() for why we forgo typo
2060 warnings if the symbol must be introduced in an eval.
2061 GSAR 96-10-12 */
90e5519e 2062 gv_fetchpvn_flags(s, len,
728847b1
BF
2063 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2064 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2065 kind == '$' ? SVt_PV :
2066 kind == '@' ? SVt_PVAV :
2067 kind == '%' ? SVt_PVHV :
a0d0e21e 2068 SVt_PVGV
90e5519e 2069 );
748a9306 2070 }
79072805
LW
2071 }
2072}
2073
3f33d153
FC
2074static void
2075S_force_ident_maybe_lex(pTHX_ char pit)
2076{
3f33d153
FC
2077 NEXTVAL_NEXTTOKE.ival = pit;
2078 force_next('p');
2079}
2080
1571675a
GS
2081NV
2082Perl_str_to_version(pTHX_ SV *sv)
2083{
2084 NV retval = 0.0;
2085 NV nshift = 1.0;
2086 STRLEN len;
cfd0369c 2087 const char *start = SvPV_const(sv,len);
9d4ba2ae 2088 const char * const end = start + len;
504618e9 2089 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2090
2091 PERL_ARGS_ASSERT_STR_TO_VERSION;
2092
1571675a 2093 while (start < end) {
ba210ebe 2094 STRLEN skip;
1571675a
GS
2095 UV n;
2096 if (utf)
9041c2e3 2097 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2098 else {
2099 n = *(U8*)start;
2100 skip = 1;
2101 }
2102 retval += ((NV)n)/nshift;
2103 start += skip;
2104 nshift *= 1000;
2105 }
2106 return retval;
2107}
2108
4e553d73 2109/*
ffb4593c
NT
2110 * S_force_version
2111 * Forces the next token to be a version number.
e759cc13
RGS
2112 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2113 * and if "guessing" is TRUE, then no new token is created (and the caller
2114 * must use an alternative parsing method).
ffb4593c
NT
2115 */
2116
76e3520e 2117STATIC char *
e759cc13 2118S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2119{
5f66b61c 2120 OP *version = NULL;
44dcb63b 2121 char *d;
89bfa8cd 2122
7918f24d
NC
2123 PERL_ARGS_ASSERT_FORCE_VERSION;
2124
29595ff2 2125 s = SKIPSPACE1(s);
89bfa8cd 2126
44dcb63b 2127 d = s;
dd629d5b 2128 if (*d == 'v')
44dcb63b 2129 d++;
44dcb63b 2130 if (isDIGIT(*d)) {
e759cc13
RGS
2131 while (isDIGIT(*d) || *d == '_' || *d == '.')
2132 d++;
4e4da3ac 2133 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2134 SV *ver;
6154021b
RGS
2135 s = scan_num(s, &pl_yylval);
2136 version = pl_yylval.opval;
dd629d5b
GS
2137 ver = cSVOPx(version)->op_sv;
2138 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2139 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2140 SvNV_set(ver, str_to_version(ver));
1571675a 2141 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2142 }
89bfa8cd 2143 }
5db06880 2144 else if (guessing) {
e759cc13 2145 return s;
5db06880 2146 }
89bfa8cd 2147 }
2148
2149 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2150 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2151 force_next(WORD);
89bfa8cd 2152
e759cc13 2153 return s;
89bfa8cd 2154}
2155
ffb4593c 2156/*
91152fc1
DG
2157 * S_force_strict_version
2158 * Forces the next token to be a version number using strict syntax rules.
2159 */
2160
2161STATIC char *
2162S_force_strict_version(pTHX_ char *s)
2163{
91152fc1 2164 OP *version = NULL;
91152fc1
DG
2165 const char *errstr = NULL;
2166
2167 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2168
2169 while (isSPACE(*s)) /* leading whitespace */
2170 s++;
2171
2172 if (is_STRICT_VERSION(s,&errstr)) {
2173 SV *ver = newSV(0);
2174 s = (char *)scan_version(s, ver, 0);
2175 version = newSVOP(OP_CONST, 0, ver);
2176 }
4e4da3ac
Z
2177 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2178 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2179 {
91152fc1
DG
2180 PL_bufptr = s;
2181 if (errstr)
2182 yyerror(errstr); /* version required */
2183 return s;
2184 }
2185
91152fc1 2186 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2187 NEXTVAL_NEXTTOKE.opval = version;
2188 force_next(WORD);
2189
2190 return s;
2191}
2192
2193/*
ffb4593c
NT
2194 * S_tokeq
2195 * Tokenize a quoted string passed in as an SV. It finds the next
2196 * chunk, up to end of string or a backslash. It may make a new
2197 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2198 * turns \\ into \.
2199 */
2200
76e3520e 2201STATIC SV *
cea2e8a9 2202S_tokeq(pTHX_ SV *sv)
79072805 2203{
eb578fdb
KW
2204 char *s;
2205 char *send;
2206 char *d;
b3ac6de7 2207 SV *pv = sv;
79072805 2208
7918f24d
NC
2209 PERL_ARGS_ASSERT_TOKEQ;
2210
279b35ad
FC
2211 assert (SvPOK(sv));
2212 assert (SvLEN(sv));
2213 assert (!SvIsCOW(sv));
307ed071 2214 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2215 goto finish;
279b35ad
FC
2216 s = SvPVX(sv);
2217 send = SvEND(sv);
dcb21ed6
NC
2218 /* This is relying on the SV being "well formed" with a trailing '\0' */
2219 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2220 s++;
2221 if (s == send)
b3ac6de7 2222 goto finish;
79072805 2223 d = s;
be4731d2 2224 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2225 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2226 SVs_TEMP | SvUTF8(sv));
be4731d2 2227 }
79072805
LW
2228 while (s < send) {
2229 if (*s == '\\') {
a0d0e21e 2230 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2231 s++; /* all that, just for this */
2232 }
2233 *d++ = *s++;
2234 }
2235 *d = '\0';
95a20fc0 2236 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2237 finish:
3280af22 2238 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2239 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2240 return sv;
2241}
2242
ffb4593c
NT
2243/*
2244 * Now come three functions related to double-quote context,
2245 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2246 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2247 * interact with PL_lex_state, and create fake ( ... ) argument lists
2248 * to handle functions and concatenation.
ecd24171
DM
2249 * For example,
2250 * "foo\lbar"
2251 * is tokenised as
2252 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2253 */
2254
2255/*
2256 * S_sublex_start
6154021b 2257 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2258 *
2259 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2260 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2261 *
2262 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2263 *
2264 * Everything else becomes a FUNC.
2265 *
2266 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2267 * had an OP_CONST or OP_READLINE). This just sets us up for a
2268 * call to S_sublex_push().
2269 */
2270
76e3520e 2271STATIC I32
cea2e8a9 2272S_sublex_start(pTHX)
79072805 2273{
eb578fdb 2274 const I32 op_type = pl_yylval.ival;
79072805
LW
2275
2276 if (op_type == OP_NULL) {
6154021b 2277 pl_yylval.opval = PL_lex_op;
5f66b61c 2278 PL_lex_op = NULL;
79072805
LW
2279 return THING;
2280 }
466112bb 2281 if (op_type == OP_CONST) {
3280af22 2282 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2283
2284 if (SvTYPE(sv) == SVt_PVIV) {
2285 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2286 STRLEN len;
96a5add6 2287 const char * const p = SvPV_const(sv, len);
740cce10 2288 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2289 SvREFCNT_dec(sv);
2290 sv = nsv;
4e553d73 2291 }
6154021b 2292 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2293 PL_lex_stuff = NULL;
79072805
LW
2294 return THING;
2295 }
2296
3280af22 2297 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2298 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2299 PL_sublex_info.sub_op = PL_lex_op;
2300 PL_lex_state = LEX_INTERPPUSH;
55497cff 2301
3280af22
NIS
2302 PL_expect = XTERM;
2303 if (PL_lex_op) {
6154021b 2304 pl_yylval.opval = PL_lex_op;
5f66b61c 2305 PL_lex_op = NULL;
55497cff 2306 return PMFUNC;
2307 }
2308 else
2309 return FUNC;
2310}
2311
ffb4593c
NT
2312/*
2313 * S_sublex_push
2314 * Create a new scope to save the lexing state. The scope will be
2315 * ended in S_sublex_done. Returns a '(', starting the function arguments
2316 * to the uc, lc, etc. found before.
2317 * Sets PL_lex_state to LEX_INTERPCONCAT.
2318 */
2319
76e3520e 2320STATIC I32
cea2e8a9 2321S_sublex_push(pTHX)
55497cff 2322{
78a635de 2323 LEXSHARED *shared;
801d32ac 2324 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2325 ENTER;
55497cff 2326
3280af22 2327 PL_lex_state = PL_sublex_info.super_state;
cc624add 2328 SAVEI8(PL_lex_dojoin);
3280af22 2329 SAVEI32(PL_lex_brackets);
78cdf107 2330 SAVEI32(PL_lex_allbrackets);
b27dce25 2331 SAVEI32(PL_lex_formbrack);
78cdf107 2332 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2333 SAVEI32(PL_lex_casemods);
2334 SAVEI32(PL_lex_starts);
651b5b28 2335 SAVEI8(PL_lex_state);
7cc34111 2336 SAVESPTR(PL_lex_repl);
7766f137 2337 SAVEVPTR(PL_lex_inpat);
98246f1e 2338 SAVEI16(PL_lex_inwhat);
ffdb8b16 2339 if (is_heredoc)
b42366d4 2340 {
ffdb8b16 2341 SAVECOPLINE(PL_curcop);
b42366d4 2342 SAVEI32(PL_multi_end);
851b527a
FC
2343 SAVEI32(PL_parser->herelines);
2344 PL_parser->herelines = 0;
b42366d4
FC
2345 }
2346 SAVEI8(PL_multi_close);
3280af22 2347 SAVEPPTR(PL_bufptr);
8452ff4b 2348 SAVEPPTR(PL_bufend);
3280af22
NIS
2349 SAVEPPTR(PL_oldbufptr);
2350 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2351 SAVEPPTR(PL_last_lop);
2352 SAVEPPTR(PL_last_uni);
3280af22
NIS
2353 SAVEPPTR(PL_linestart);
2354 SAVESPTR(PL_linestr);
8edd5f42
RGS
2355 SAVEGENERICPV(PL_lex_brackstack);
2356 SAVEGENERICPV(PL_lex_casestack);
78a635de 2357 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2358 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2359 SAVEI32(PL_copline);
3280af22 2360
99bd9d90 2361 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2362 scopes to find the body of the here-doc. So we put PL_linestr and
2363 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2364 */
60f40a38
FC
2365 PL_parser->lex_shared->ls_linestr = PL_linestr;
2366 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2367
3280af22 2368 PL_linestr = PL_lex_stuff;
7cc34111 2369 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2370 PL_lex_stuff = NULL;
7cc34111 2371 PL_sublex_info.repl = NULL;
3280af22 2372
9cbb5ea2
GS
2373 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2374 = SvPVX(PL_linestr);
3280af22 2375 PL_bufend += SvCUR(PL_linestr);
bd61b366 2376 PL_last_lop = PL_last_uni = NULL;
3280af22 2377 SAVEFREESV(PL_linestr);
4dc843bc 2378 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2379
2380 PL_lex_dojoin = FALSE;
b27dce25 2381 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2382 PL_lex_allbrackets = 0;
2383 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2384 Newx(PL_lex_brackstack, 120, char);
2385 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2386 PL_lex_casemods = 0;
2387 *PL_lex_casestack = '\0';
2388 PL_lex_starts = 0;
2389 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2390 if (is_heredoc)
6ddcf93b 2391 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2392 PL_copline = NOLINE;
78a635de
FC
2393
2394 Newxz(shared, 1, LEXSHARED);
2395 shared->ls_prev = PL_parser->lex_shared;
2396 PL_parser->lex_shared = shared;
3280af22
NIS
2397
2398 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2399 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2400 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2401 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2402 else
5f66b61c 2403 PL_lex_inpat = NULL;
79072805 2404
3a54fd60
DM
2405 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2406 PL_in_eval &= ~EVAL_RE_REPARSING;
2407
55497cff 2408 return '(';
79072805
LW
2409}
2410
ffb4593c
NT
2411/*
2412 * S_sublex_done
2413 * Restores lexer state after a S_sublex_push.
2414 */
2415
76e3520e 2416STATIC I32
cea2e8a9 2417S_sublex_done(pTHX)
79072805 2418{
3280af22 2419 if (!PL_lex_starts++) {
396482e1 2420 SV * const sv = newSVpvs("");
9aa983d2
JH
2421 if (SvUTF8(PL_linestr))
2422 SvUTF8_on(sv);
3280af22 2423 PL_expect = XOPERATOR;
6154021b 2424 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2425 return THING;
2426 }
2427
3280af22
NIS
2428 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2429 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2430 return yylex();
79072805
LW
2431 }
2432
ffb4593c 2433 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2434 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2435 if (PL_lex_repl) {
2436 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2437 PL_linestr = PL_lex_repl;
2438 PL_lex_inpat = 0;
2439 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2440 PL_bufend += SvCUR(PL_linestr);
bd61b366 2441 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2442 PL_lex_dojoin = FALSE;
2443 PL_lex_brackets = 0;
78cdf107
Z
2444 PL_lex_allbrackets = 0;
2445 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2446 PL_lex_casemods = 0;
2447 *PL_lex_casestack = '\0';
2448 PL_lex_starts = 0;
25da4f38 2449 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2450 PL_lex_state = LEX_INTERPNORMAL;
2451 PL_lex_starts++;
e9fa98b2
HS
2452 /* we don't clear PL_lex_repl here, so that we can check later
2453 whether this is an evalled subst; that means we rely on the
2454 logic to ensure sublex_done() is called again only via the
2455 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2456 }
e9fa98b2 2457 else {
3280af22 2458 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2459 PL_lex_repl = NULL;
e9fa98b2 2460 }
ffdb8b16
FC
2461 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2462 CopLINE(PL_curcop) +=
2463 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2464 + PL_parser->herelines;
2465 PL_parser->herelines = 0;
ffdb8b16 2466 }
79072805 2467 return ',';
ffed7fef
LW
2468 }
2469 else {
b42366d4 2470 const line_t l = CopLINE(PL_curcop);
f46d017c 2471 LEAVE;
b42366d4 2472 if (PL_multi_close == '<')
851b527a 2473 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2474 PL_bufend = SvPVX(PL_linestr);
2475 PL_bufend += SvCUR(PL_linestr);
2476 PL_expect = XOPERATOR;
09bef843 2477 PL_sublex_info.sub_inwhat = 0;
79072805 2478 return ')';
ffed7fef
LW
2479 }
2480}
2481
6f613c73
KW
2482PERL_STATIC_INLINE SV*
2483S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2484{
140b12ad
KW
2485 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2486 * interior, hence to the "}". Finds what the name resolves to, returning
2487 * an SV* containing it; NULL if no valid one found */
2488
dd2b1b72 2489 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2490
0c415a79
KW
2491 HV * table;
2492 SV **cvp;
2493 SV *cv;
2494 SV *rv;
2495 HV *stash;
2496 const U8* first_bad_char_loc;
2497 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2498
6f613c73
KW
2499 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2500
107160e2
KW
2501 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2502 e - backslash_ptr,
2503 &first_bad_char_loc))
2504 {
2505 /* If warnings are on, this will print a more detailed analysis of what
2506 * is wrong than the error message below */
c80e42f3 2507 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2508 e - ((char *) first_bad_char_loc),
2509 NULL, 0);
2510
2511 /* We deliberately don't try to print the malformed character, which
2512 * might not print very well; it also may be just the first of many
2513 * malformations, so don't print what comes after it */
2514 yyerror(Perl_form(aTHX_
2515 "Malformed UTF-8 character immediately after '%.*s'",
2516 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2517 return NULL;
2518 }
2519
2520 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2521 /* include the <}> */
2522 e - backslash_ptr + 1);
6f613c73 2523 if (! SvPOK(res)) {
b6407c49 2524 SvREFCNT_dec_NN(res);
6f613c73
KW
2525 return NULL;
2526 }
2527
0c415a79
KW
2528 /* See if the charnames handler is the Perl core's, and if so, we can skip
2529 * the validation needed for a user-supplied one, as Perl's does its own
2530 * validation. */
2531 table = GvHV(PL_hintgv); /* ^H */
2532 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2533 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2534 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2535 {
2536 const char * const name = HvNAME(stash);
6a642c21
FC
2537 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2538 && strEQ(name, "_charnames")) {
0c415a79
KW
2539 return res;
2540 }
2541 }
2542
bde9e88d
KW
2543 /* Here, it isn't Perl's charname handler. We can't rely on a
2544 * user-supplied handler to validate the input name. For non-ut8 input,
2545 * look to see that the first character is legal. Then loop through the
2546 * rest checking that each is a continuation */
6f613c73 2547
36897d64
KW
2548 /* This code makes the reasonable assumption that the only Latin1-range
2549 * characters that begin a character name alias are alphabetic, otherwise
2550 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2551
b6ba1137 2552 if (! UTF) {
bde9e88d 2553 if (! isALPHAU(*s)) {
b6ba1137
KW
2554 goto bad_charname;
2555 }
bde9e88d
KW
2556 s++;
2557 while (s < e) {
2558 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2559 goto bad_charname;
2560 }
2d8eb851
KW
2561 if (*s == ' ' && *(s-1) == ' ') {
2562 goto multi_spaces;
bd299e29 2563 }
df758df2
KW
2564 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2565 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2566 "NO-BREAK SPACE in a charnames "
2567 "alias definition is deprecated");
2568 }
bde9e88d 2569 s++;
b6ba1137
KW
2570 }
2571 }
2572 else {
bde9e88d
KW
2573 /* Similarly for utf8. For invariants can check directly; for other
2574 * Latin1, can calculate their code point and check; otherwise use a
2575 * swash */
2576 if (UTF8_IS_INVARIANT(*s)) {
2577 if (! isALPHAU(*s)) {
140b12ad
KW
2578 goto bad_charname;
2579 }
bde9e88d
KW
2580 s++;
2581 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2582 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2583 goto bad_charname;
6f613c73 2584 }
bde9e88d 2585 s += 2;
6f613c73 2586 }
bde9e88d
KW
2587 else {
2588 if (! PL_utf8_charname_begin) {
2589 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2590 PL_utf8_charname_begin = _core_swash_init("utf8",
2591 "_Perl_Charname_Begin",
2592 &PL_sv_undef,
2593 1, 0, NULL, &flags);
2594 }
2595 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2596 goto bad_charname;
2597 }
2598 s += UTF8SKIP(s);
2599 }
2600
2601 while (s < e) {
2602 if (UTF8_IS_INVARIANT(*s)) {
2603 if (! isCHARNAME_CONT(*s)) {
2604 goto bad_charname;
2605 }
2d8eb851
KW
2606 if (*s == ' ' && *(s-1) == ' ') {
2607 goto multi_spaces;
bd299e29 2608 }
bde9e88d
KW
2609 s++;
2610 }
2611 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2612 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2613 {
2614 goto bad_charname;
2615 }
df758df2
KW
2616 if (*s == *NBSP_UTF8
2617 && *(s+1) == *(NBSP_UTF8+1)
2618 && ckWARN_d(WARN_DEPRECATED))
2619 {
2620 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2621 "NO-BREAK SPACE in a charnames "
2622 "alias definition is deprecated");
2623 }
bde9e88d
KW
2624 s += 2;
2625 }
2626 else {
2627 if (! PL_utf8_charname_continue) {
2628 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2629 PL_utf8_charname_continue = _core_swash_init("utf8",
2630 "_Perl_Charname_Continue",
2631 &PL_sv_undef,
2632 1, 0, NULL, &flags);
2633 }
2634 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2635 goto bad_charname;
2636 }
2637 s += UTF8SKIP(s);
6f613c73
KW
2638 }
2639 }
2d8eb851
KW
2640 }
2641 if (*(s-1) == ' ') {
2642 yyerror_pv(
2643 Perl_form(aTHX_
2644 "charnames alias definitions may not contain trailing "
2645 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2646 (int)(s - backslash_ptr + 1), backslash_ptr,
2647 (int)(e - s + 1), s + 1
2648 ),
2649 UTF ? SVf_UTF8 : 0);
2650 return NULL;
6f613c73
KW
2651 }
2652
94ca1619 2653 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2654 const U8* first_bad_char_loc;
2655 STRLEN len;
2656 const char* const str = SvPV_const(res, len);
2657 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2658 /* If warnings are on, this will print a more detailed analysis of
2659 * what is wrong than the error message below */
c80e42f3 2660 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2661 (char *) first_bad_char_loc - str,
2662 NULL, 0);
2663
2664 /* We deliberately don't try to print the malformed character,
2665 * which might not print very well; it also may be just the first
2666 * of many malformations, so don't print what comes after it */
2667 yyerror_pv(
2668 Perl_form(aTHX_
2669 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2670 (int) (e - backslash_ptr + 1), backslash_ptr,
2671 (int) ((char *) first_bad_char_loc - str), str
2672 ),
2673 SVf_UTF8);
2674 return NULL;
2675 }
2676 }
140b12ad 2677
bde9e88d 2678 return res;
140b12ad 2679
bde9e88d 2680 bad_charname: {
bde9e88d
KW
2681
2682 /* The final %.*s makes sure that should the trailing NUL be missing
2683 * that this print won't run off the end of the string */
2684 yyerror_pv(
2685 Perl_form(aTHX_
2686 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2687 (int)(s - backslash_ptr + 1), backslash_ptr,
2688 (int)(e - s + 1), s + 1
bde9e88d
KW
2689 ),
2690 UTF ? SVf_UTF8 : 0);
2691 return NULL;
2692 }
2d8eb851
KW
2693
2694 multi_spaces:
2695 yyerror_pv(
2696 Perl_form(aTHX_
2697 "charnames alias definitions may not contain a sequence of "
2698 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2699 (int)(s - backslash_ptr + 1), backslash_ptr,
2700 (int)(e - s + 1), s + 1
2701 ),
2702 UTF ? SVf_UTF8 : 0);
2703 return NULL;
6f613c73
KW
2704}
2705
02aa26ce
NT
2706/*
2707 scan_const
2708
9da1dd8f
DM
2709 Extracts the next constant part of a pattern, double-quoted string,
2710 or transliteration. This is terrifying code.
2711
2712 For example, in parsing the double-quoted string "ab\x63$d", it would
2713 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2714
94def140 2715 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2716 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2717 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2718
94def140
TS
2719 Returns a pointer to the character scanned up to. If this is
2720 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2721 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2722 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2723 by looking at the next characters herself.
2724
02aa26ce 2725 In patterns:
9da1dd8f 2726 expand:
537124e4
KW
2727 \N{FOO} => \N{U+hex_for_character_FOO}
2728 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2729
2730 pass through:
2731 all other \-char, including \N and \N{ apart from \N{ABC}
2732
2733 stops on:
2734 @ and $ where it appears to be a var, but not for $ as tail anchor
2735 \l \L \u \U \Q \E
2736 (?{ or (??{
2737
02aa26ce
NT
2738
2739 In transliterations:
2740 characters are VERY literal, except for - not at the start or end
94def140
TS
2741 of the string, which indicates a range. If the range is in bytes,
2742 scan_const expands the range to the full set of intermediate
2743 characters. If the range is in utf8, the hyphen is replaced with
2744 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2745
2746 In double-quoted strings:
2747 backslashes:
2748 double-quoted style: \r and \n
ff3f963a 2749 constants: \x31, etc.
94def140 2750 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2751 case and quoting: \U \Q \E
2752 stops on @ and $
2753
2754 scan_const does *not* construct ops to handle interpolated strings.
2755 It stops processing as soon as it finds an embedded $ or @ variable
2756 and leaves it to the caller to work out what's going on.
2757
94def140
TS
2758 embedded arrays (whether in pattern or not) could be:
2759 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2760
2761 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2762
2763 $ in pattern could be $foo or could be tail anchor. Assumption:
2764 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2765 followed by one of "()| \r\n\t"
02aa26ce 2766
9da1dd8f 2767 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2768
2769 The structure of the code is
2770 while (there's a character to process) {
94def140
TS
2771 handle transliteration ranges
2772 skip regexp comments /(?#comment)/ and codes /(?{code})/
2773 skip #-initiated comments in //x patterns
2774 check for embedded arrays
02aa26ce
NT
2775 check for embedded scalars
2776 if (backslash) {
94def140 2777 deprecate \1 in substitution replacements
02aa26ce
NT
2778 handle string-changing backslashes \l \U \Q \E, etc.
2779 switch (what was escaped) {
94def140 2780 handle \- in a transliteration (becomes a literal -)
ff3f963a 2781 if a pattern and not \N{, go treat as regular character
94def140
TS
2782 handle \132 (octal characters)
2783 handle \x15 and \x{1234} (hex characters)
ff3f963a 2784 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2785 handle \cV (control characters)
2786 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2787 } (end switch)
77a135fe 2788 continue
02aa26ce 2789 } (end if backslash)
77a135fe 2790 handle regular character
02aa26ce 2791 } (end while character to read)
4e553d73 2792
02aa26ce
NT
2793*/
2794
76e3520e 2795STATIC char *
cea2e8a9 2796S_scan_const(pTHX_ char *start)
79072805 2797{
eb578fdb 2798 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2799 SV *sv = newSV(send - start); /* sv for the constant. See note below
2800 on sizing. */
eb578fdb
KW
2801 char *s = start; /* start of the constant */
2802 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2803 bool dorange = FALSE; /* are we in a translit range? */
2804 bool didrange = FALSE; /* did we just finish a range? */
2805 bool in_charclass = FALSE; /* within /[...]/ */
2806 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2807 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2808 UTF8? But, this can show as true
2809 when the source isn't utf8, as for
2810 example when it is entirely composed
2811 of hex constants */
6f613c73 2812 SV *res; /* result from charnames */
77a135fe
KW
2813
2814 /* Note on sizing: The scanned constant is placed into sv, which is
2815 * initialized by newSV() assuming one byte of output for every byte of
2816 * input. This routine expects newSV() to allocate an extra byte for a
2817 * trailing NUL, which this routine will append if it gets to the end of
2818 * the input. There may be more bytes of input than output (eg., \N{LATIN
2819 * CAPITAL LETTER A}), or more output than input if the constant ends up
2820 * recoded to utf8, but each time a construct is found that might increase
2821 * the needed size, SvGROW() is called. Its size parameter each time is
2822 * based on the best guess estimate at the time, namely the length used so
2823 * far, plus the length the current construct will occupy, plus room for
2824 * the trailing NUL, plus one byte for every input byte still unscanned */
2825
c3320c2a
KW
2826 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2827 before set */
4c3a8340
TS
2828#ifdef EBCDIC
2829 UV literal_endpoint = 0;
e294cc5d 2830 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2831#endif
012bcf8d 2832
7918f24d
NC
2833 PERL_ARGS_ASSERT_SCAN_CONST;
2834
bb16bae8 2835 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2836 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2837 /* If we are doing a trans and we know we want UTF8 set expectation */
2838 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2839 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2840 }
2841
b899e89d
FC
2842 /* Protect sv from errors and fatal warnings. */
2843 ENTER_with_name("scan_const");
2844 SAVEFREESV(sv);
2b9d42f0 2845
79072805 2846 while (s < send || dorange) {
ff3f963a 2847
02aa26ce 2848 /* get transliterations out of the way (they're most literal) */
3280af22 2849 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2850 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2851 if (dorange) {
1ba5c669
JH
2852 I32 i; /* current expanded character */
2853 I32 min; /* first character in range */
2854 I32 max; /* last character in range */
02aa26ce 2855
e294cc5d
JH
2856#ifdef EBCDIC
2857 UV uvmax = 0;
2858#endif
2859
2860 if (has_utf8
2861#ifdef EBCDIC
2862 && !native_range
2863#endif
1953db30 2864 ) {
9d4ba2ae 2865 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2866 char *e = d++;
2867 while (e-- > c)
2868 *(e + 1) = *e;
e7214ce8 2869 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2870 /* mark the range as done, and continue */
2871 dorange = FALSE;
2872 didrange = TRUE;
2873 continue;
2874 }
2b9d42f0 2875
95a20fc0 2876 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2877#ifdef EBCDIC
2878 SvGROW(sv,
dc023dbb
KW
2879 SvLEN(sv) + ((has_utf8)
2880 ? (512 - UTF_CONTINUATION_MARK
2881 + UNISKIP(0x100))
e294cc5d
JH
2882 : 256));
2883 /* How many two-byte within 0..255: 128 in UTF-8,
2884 * 96 in UTF-8-mod. */
2885#else
9cbb5ea2 2886 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2887#endif
9cbb5ea2 2888 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2889#ifdef EBCDIC
2890 if (has_utf8) {
2891 int j;
2892 for (j = 0; j <= 1; j++) {
2893 char * const c = (char*)utf8_hop((U8*)d, -1);
2894 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2895 if (j)
2896 min = (U8)uv;
2897 else if (uv < 256)
2898 max = (U8)uv;
2899 else {
2900 max = (U8)0xff; /* only to \xff */
2901 uvmax = uv; /* \x{100} to uvmax */
2902 }
2903 d = c; /* eat endpoint chars */
2904 }
2905 }
2906 else {
2907#endif
2908 d -= 2; /* eat the first char and the - */
2909 min = (U8)*d; /* first char in range */
2910 max = (U8)d[1]; /* last char in range */
2911#ifdef EBCDIC
2912 }
2913#endif
8ada0baa 2914
c2e66d9e 2915 if (min > max) {
01ec43d0 2916 Perl_croak(aTHX_
d1573ac7 2917 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2918 (char)min, (char)max);
c2e66d9e
GS
2919 }
2920
c7f1f016 2921#ifdef EBCDIC
dc023dbb
KW
2922 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2923 * any subsets of these ranges into individual characters */
4c3a8340 2924 if (literal_endpoint == 2 &&
c286a9a6 2925 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2926 (isUPPER_A(min) && isUPPER_A(max))))
2927 {
2928 for (i = min; i <= max; i++) {
2929 if (isALPHA_A(i))
2930 *d++ = i;
8ada0baa
JH
2931 }
2932 }
2933 else
2934#endif
2935 for (i = min; i <= max; i++)
e294cc5d
JH
2936#ifdef EBCDIC
2937 if (has_utf8) {
55d09dc8 2938 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2939 }
2940 else
2941#endif
2942 *d++ = (char)i;
2943
2944#ifdef EBCDIC
2945 if (uvmax) {
2946 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2947 if (uvmax > 0x101)
e7214ce8 2948 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2949 if (uvmax > 0x100)
2950 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2951 }
2952#endif
02aa26ce
NT
2953
2954 /* mark the range as done, and continue */
79072805 2955 dorange = FALSE;
01ec43d0 2956 didrange = TRUE;
4c3a8340
TS
2957#ifdef EBCDIC
2958 literal_endpoint = 0;
2959#endif
79072805 2960 continue;
4e553d73 2961 }
02aa26ce
NT
2962
2963 /* range begins (ignore - as first or last char) */
79072805 2964 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2965 if (didrange) {
1fafa243 2966 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2967 }
e294cc5d
JH
2968 if (has_utf8
2969#ifdef EBCDIC
2970 && !native_range
2971#endif
2972 ) {
e7214ce8 2973 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2974 s++;
2975 continue;
2976 }
79072805
LW
2977 dorange = TRUE;
2978 s++;
01ec43d0
GS
2979 }
2980 else {
2981 didrange = FALSE;
4c3a8340
TS
2982#ifdef EBCDIC
2983 literal_endpoint = 0;
e294cc5d 2984 native_range = TRUE;
4c3a8340 2985#endif
01ec43d0 2986 }
79072805 2987 }
02aa26ce
NT
2988
2989 /* if we get here, we're not doing a transliteration */
2990
e4a2df84
DM
2991 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2992 char *s1 = s-1;
2993 int esc = 0;
2994 while (s1 >= start && *s1-- == '\\')
2995 esc = !esc;
2996 if (!esc)
2997 in_charclass = TRUE;
2998 }
2866decb 2999
e4a2df84
DM
3000 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3001 char *s1 = s-1;
3002 int esc = 0;
3003 while (s1 >= start && *s1-- == '\\')
3004 esc = !esc;
3005 if (!esc)
3006 in_charclass = FALSE;
3007 }
2866decb 3008
9da1dd8f
DM
3009 /* skip for regexp comments /(?#comment)/, except for the last
3010 * char, which will be done separately.
3011 * Stop on (?{..}) and friends */
3012
c30fc27b 3013 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3014 if (s[2] == '#') {
e994fd66 3015 while (s+1 < send && *s != ')')
5ff03569 3016 *d++ = *s++;
155aba94 3017 }
c30fc27b 3018 else if (!PL_lex_casemods &&
d3cec5e5
DM
3019 ( s[2] == '{' /* This should match regcomp.c */
3020 || (s[2] == '?' && s[3] == '{')))
155aba94 3021 {
9da1dd8f 3022 break;
cc6b7395 3023 }
748a9306 3024 }
02aa26ce
NT
3025
3026 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3027 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3028 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3029 while (s+1 < send && *s != '\n')
5ff03569 3030 *d++ = *s++;
748a9306 3031 }
02aa26ce 3032
9da1dd8f
DM
3033 /* no further processing of single-quoted regex */
3034 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3035 goto default_action;
3036
5d1d4326 3037 /* check for embedded arrays
da6eedaa 3038 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3039 */
1749ea0d 3040 else if (*s == '@' && s[1]) {
8a2bca12 3041 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
TS
3042 break;
3043 if (strchr(":'{$", s[1]))
3044 break;
3045 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3046 break; /* in regexp, neither @+ nor @- are interpolated */
3047 }
02aa26ce
NT
3048
3049 /* check for embedded scalars. only stop if we're sure it's a
3050 variable.
3051 */
79072805 3052 else if (*s == '$') {
3280af22 3053 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3054 break;
77772344 3055 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3056 if (s[1] == '\\') {
3057 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3058 "Possible unintended interpolation of $\\ in regex");
77772344 3059 }
79072805 3060 break; /* in regexp, $ might be tail anchor */
77772344 3061 }
79072805 3062 }
02aa26ce 3063
2b9d42f0
NIS
3064 /* End of else if chain - OP_TRANS rejoin rest */
3065
02aa26ce 3066 /* backslashes */
79072805 3067 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3068 char* e; /* Can be used for ending '}', etc. */
3069
79072805 3070 s++;
02aa26ce 3071
7d0fc23c
KW
3072 /* warn on \1 - \9 in substitution replacements, but note that \11
3073 * is an octal; and \19 is \1 followed by '9' */
3280af22 3074 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3075 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3076 {
c782d7ee 3077 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3078 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3079 *--s = '$';
3080 break;
3081 }
02aa26ce
NT
3082
3083 /* string-change backslash escapes */
838f2281 3084 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3085 --s;
3086 break;
3087 }
ff3f963a
KW
3088 /* In a pattern, process \N, but skip any other backslash escapes.
3089 * This is because we don't want to translate an escape sequence
3090 * into a meta symbol and have the regex compiler use the meta
3091 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3092 * in spite of this, we do have to process \N here while the proper
3093 * charnames handler is in scope. See bugs #56444 and #62056.
3094 * There is a complication because \N in a pattern may also stand
3095 * for 'match a non-nl', and not mean a charname, in which case its
3096 * processing should be deferred to the regex compiler. To be a
3097 * charname it must be followed immediately by a '{', and not look
3098 * like \N followed by a curly quantifier, i.e., not something like
3099 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3100 * quantifier */
3101 else if (PL_lex_inpat
3102 && (*s != 'N'
3103 || s[1] != '{'
412f55bb 3104 || regcurly(s + 1)))
ff3f963a 3105 {
4d73d076 3106 *d++ = '\\';
cc74c5bd
TS
3107 goto default_action;
3108 }
02aa26ce 3109
79072805 3110 switch (*s) {
02aa26ce
NT
3111
3112 /* quoted - in transliterations */
79072805 3113 case '-':
3280af22 3114 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3115 *d++ = *s++;
3116 continue;
3117 }
924ba076 3118 /* FALLTHROUGH */
79072805 3119 default:
11b8faa4 3120 {
15861f94 3121 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3122 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3123 "Unrecognized escape \\%c passed through",
3124 *s);
11b8faa4 3125 /* default action is to copy the quoted character */
f9a63242 3126 goto default_action;
11b8faa4 3127 }
02aa26ce 3128
632403cc 3129 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3130 case '0': case '1': case '2': case '3':
3131 case '4': case '5': case '6': case '7':
ba210ebe 3132 {
5e0a247b 3133 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3134 STRLEN len = 3;
06972766 3135 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3136 s += len;
5e0a247b
KW
3137 if (len < 3 && s < send && isDIGIT(*s)
3138 && ckWARN(WARN_MISC))
3139 {
3140 Perl_warner(aTHX_ packWARN(WARN_MISC),
3141 "%s", form_short_octal_warning(s, len));
3142 }
ba210ebe 3143 }
012bcf8d 3144 goto NUM_ESCAPE_INSERT;
02aa26ce 3145
f0a2b745
KW
3146 /* eg. \o{24} indicates the octal constant \024 */
3147 case 'o':
3148 {
454155d9 3149 const char* error;
f0a2b745 3150
00ce5563 3151 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3152 TRUE, /* Output warning */
3153 FALSE, /* Not strict */
17896a85
KW
3154 TRUE, /* Output warnings for
3155 non-portables */
80f4111b 3156 UTF);
454155d9 3157 if (! valid) {
f0a2b745
KW
3158 yyerror(error);
3159 continue;
3160 }
3161 goto NUM_ESCAPE_INSERT;
3162 }
3163
77a135fe 3164 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3165 case 'x':
a0481293 3166 {
a0481293 3167 const char* error;
355860ce 3168
00ce5563 3169 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3170 TRUE, /* Output warning */
3171 FALSE, /* Not strict */
17896a85
KW
3172 TRUE, /* Output warnings for
3173 non-portables */
80f4111b 3174 UTF);
a0481293
KW
3175 if (! valid) {
3176 yyerror(error);
355860ce 3177 continue;
ba210ebe 3178 }
012bcf8d
GS
3179 }
3180
3181 NUM_ESCAPE_INSERT:
ff3f963a
KW
3182 /* Insert oct or hex escaped character. There will always be
3183 * enough room in sv since such escapes will be longer than any
3184 * UTF-8 sequence they can end up as, except if they force us
3185 * to recode the rest of the string into utf8 */
ba7cea30 3186
06972766 3187 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3188 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3189 if (!has_utf8 && uv > 255) {
77a135fe
KW
3190 /* Might need to recode whatever we have accumulated so
3191 * far if it contains any chars variant in utf8 or
3192 * utf-ebcdic. */
3193
3194 SvCUR_set(sv, d - SvPVX_const(sv));
3195 SvPOK_on(sv);
3196 *d = '\0';
77a135fe 3197 /* See Note on sizing above. */
7bf79863
KW
3198 sv_utf8_upgrade_flags_grow(sv,
3199 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3200 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3201 d = SvPVX(sv) + SvCUR(sv);
3202 has_utf8 = TRUE;
012bcf8d
GS
3203 }
3204
77a135fe 3205 if (has_utf8) {
c80e42f3 3206 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3207 if (PL_lex_inwhat == OP_TRANS &&
3208 PL_sublex_info.sub_op) {
3209 PL_sublex_info.sub_op->op_private |=
3210 (PL_lex_repl ? OPpTRANS_FROM_UTF
3211 : OPpTRANS_TO_UTF);
f9a63242 3212 }
e294cc5d
JH
3213#ifdef EBCDIC
3214 if (uv > 255 && !dorange)
3215 native_range = FALSE;
3216#endif
012bcf8d 3217 }
a0ed51b3 3218 else {
012bcf8d 3219 *d++ = (char)uv;
a0ed51b3 3220 }
012bcf8d
GS
3221 }
3222 else {
c4d5f83a 3223 *d++ = (char) uv;
a0ed51b3 3224 }
79072805 3225 continue;
02aa26ce 3226
4a2d328f 3227 case 'N':
ff3f963a
KW
3228 /* In a non-pattern \N must be a named character, like \N{LATIN
3229 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3230 * mean to match a non-newline. For non-patterns, named
3231 * characters are converted to their string equivalents. In
3232 * patterns, named characters are not converted to their
3233 * ultimate forms for the same reasons that other escapes
3234 * aren't. Instead, they are converted to the \N{U+...} form
3235 * to get the value from the charnames that is in effect right
3236 * now, while preserving the fact that it was a named character
3237 * so that the regex compiler knows this */
3238
ff3f963a
KW
3239 /* The structure of this section of code (besides checking for
3240 * errors and upgrading to utf8) is:
3241 * Further disambiguate between the two meanings of \N, and if
3242 * not a charname, go process it elsewhere
0a96133f
KW
3243 * If of form \N{U+...}, pass it through if a pattern;
3244 * otherwise convert to utf8
3245 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3246 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3247
3248 /* Here, s points to the 'N'; the test below is guaranteed to
3249 * succeed if we are being called on a pattern as we already
3250 * know from a test above that the next character is a '{'.
3251 * On a non-pattern \N must mean 'named sequence, which
3252 * requires braces */
3253 s++;
3254 if (*s != '{') {
3255 yyerror("Missing braces on \\N{}");
3256 continue;
3257 }
3258 s++;
3259
0a96133f 3260 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3261 if (! (e = strchr(s, '}'))) {
3262 if (! PL_lex_inpat) {
5777a3f7 3263 yyerror("Missing right brace on \\N{}");
0a96133f 3264 } else {
4407f1b8 3265 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3266 }
0a96133f 3267 continue;
ff3f963a 3268 }
cddc7ef4 3269
ff3f963a 3270 /* Here it looks like a named character */
cddc7ef4 3271
ff3f963a
KW
3272 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3273 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3274 | PERL_SCAN_DISALLOW_PREFIX;
3275 STRLEN len;
3276
3277 /* For \N{U+...}, the '...' is a unicode value even on
3278 * EBCDIC machines */
3279 s += 2; /* Skip to next char after the 'U+' */
3280 len = e - s;
3281 uv = grok_hex(s, &len, &flags, NULL);
3282 if (len == 0 || len != (STRLEN)(e - s)) {
3283 yyerror("Invalid hexadecimal number in \\N{U+...}");
3284 s = e + 1;
3285 continue;
3286 }
3287
3288 if (PL_lex_inpat) {
3289
e2a7e165
KW
3290 /* On non-EBCDIC platforms, pass through to the regex
3291 * compiler unchanged. The reason we evaluated the
3292 * number above is to make sure there wasn't a syntax
3293 * error. But on EBCDIC we convert to native so
3294 * downstream code can continue to assume it's native
3295 */
ff3f963a 3296 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3297#ifdef EBCDIC
3298 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3299 and the \0 */
3300 "\\N{U+%X}",
3301 (unsigned int) UNI_TO_NATIVE(uv));
3302#else
ff3f963a
KW
3303 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3304 d += e - s + 1;
e2a7e165 3305#endif
ff3f963a
KW
3306 }
3307 else { /* Not a pattern: convert the hex to string */
3308
3309 /* If destination is not in utf8, unconditionally
3310 * recode it to be so. This is because \N{} implies
3311 * Unicode semantics, and scalars have to be in utf8
3312 * to guarantee those semantics */
3313 if (! has_utf8) {
3314 SvCUR_set(sv, d - SvPVX_const(sv));
3315 SvPOK_on(sv);
3316 *d = '\0';
3317 /* See Note on sizing above. */
3318 sv_utf8_upgrade_flags_grow(
3319 sv,
3320 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3321 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3322 d = SvPVX(sv) + SvCUR(sv);
3323 has_utf8 = TRUE;
3324 }
3325
a46469e6 3326 /* Add the (Unicode) code point to the output. */
ff3f963a 3327 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3328 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3329 }
a46469e6
KW
3330 else {
3331 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3332 }
ff3f963a
KW
3333 }
3334 }
6f613c73
KW
3335 else /* Here is \N{NAME} but not \N{U+...}. */
3336 if ((res = get_and_check_backslash_N_name(s, e)))
3337 {
3338 STRLEN len;
3339 const char *str = SvPV_const(res, len);
3340 if (PL_lex_inpat) {
ff3f963a
KW
3341
3342 if (! len) { /* The name resolved to an empty string */
3343 Copy("\\N{}", d, 4, char);
3344 d += 4;
3345 }
3346 else {
3347 /* In order to not lose information for the regex
3348 * compiler, pass the result in the specially made
3349 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3350 * the code points in hex of each character
3351 * returned by charnames */
3352
3353 const char *str_end = str + len;
3b721c4f 3354 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3355
3356 if (! SvUTF8(res)) {
3357 /* For the non-UTF-8 case, we can determine the
3358 * exact length needed without having to parse
3359 * through the string. Each character takes up
3360 * 2 hex digits plus either a trailing dot or
3361 * the "}" */
3362 d = off + SvGROW(sv, off
3363 + 3 * len
3364 + 6 /* For the "\N{U+", and
3365 trailing NUL */
3366 + (STRLEN)(send - e));
3367 Copy("\\N{U+", d, 5, char);
3368 d += 5;
3369 while (str < str_end) {
3370 char hex_string[4];
e8549682 3371 int len =
51f14a05
JH
3372 my_snprintf(hex_string,
3373 sizeof(hex_string),
e8549682
JH
3374 "%02X.", (U8) *str);
3375 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3376 Copy(hex_string, d, 3, char);
3377 d += 3;
3378 str++;
3379 }
3380 d--; /* We will overwrite below the final
3381 dot with a right brace */
3382 }
3383 else {
1953db30
KW
3384 STRLEN char_length; /* cur char's byte length */
3385
3386 /* and the number of bytes after this is
3387 * translated into hex digits */
3388 STRLEN output_length;
3389
3390 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3391 * for max('U+', '.'); and 1 for NUL */
3392 char hex_string[2 * UTF8_MAXBYTES + 5];
3393
3394 /* Get the first character of the result. */
a46469e6 3395 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3396 len,
3397 &char_length,
3398 UTF8_ALLOW_ANYUV);
3399 /* Convert first code point to hex, including
a46469e6 3400 * the boiler plate before it. */
1953db30
KW
3401 output_length =
3402 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3403 "\\N{U+%X",
3404 (unsigned int) uv);
1953db30
KW
3405
3406 /* Make sure there is enough space to hold it */
3407 d = off + SvGROW(sv, off
3408 + output_length
3409 + (STRLEN)(send - e)
3410 + 2); /* '}' + NUL */
3411 /* And output it */
3412 Copy(hex_string, d, output_length, char);
3413 d += output_length;
3414
3415 /* For each subsequent character, append dot and
3416 * its ordinal in hex */
3417 while ((str += char_length) < str_end) {
3418 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3419 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3420 str_end - str,
3421 &char_length,
3422 UTF8_ALLOW_ANYUV);
3423 output_length =
3424 my_snprintf(hex_string,
a46469e6
KW
3425 sizeof(hex_string),
3426 ".%X",
3427 (unsigned int) uv);
1953db30
KW
3428
3429 d = off + SvGROW(sv, off
3430 + output_length
3431 + (STRLEN)(send - e)
3432 + 2); /* '}' + NUL */
3433 Copy(hex_string, d, output_length, char);
3434 d += output_length;
3435 }
94ca1619 3436 }
ff3f963a
KW
3437
3438 *d++ = '}'; /* Done. Add the trailing brace */
3439 }
3440 }
3441 else { /* Here, not in a pattern. Convert the name to a
3442 * string. */
3443
3444 /* If destination is not in utf8, unconditionally
3445 * recode it to be so. This is because \N{} implies
3446 * Unicode semantics, and scalars have to be in utf8
3447 * to guarantee those semantics */
3448 if (! has_utf8) {
3449 SvCUR_set(sv, d - SvPVX_const(sv));
3450 SvPOK_on(sv);
3451 *d = '\0';
3452 /* See Note on sizing above. */
3453 sv_utf8_upgrade_flags_grow(sv,
3454 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3455 len + (STRLEN)(send - s) + 1);
3456 d = SvPVX(sv) + SvCUR(sv);
3457 has_utf8 = TRUE;
3458 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3459
3460 /* See Note on sizing above. (NOTE: SvCUR() is not
3461 * set correctly here). */
3462 const STRLEN off = d - SvPVX_const(sv);
3463 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3464 }
7fc82458
KW
3465 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3466 sv_utf8_upgrade(res);
3467 str = SvPV_const(res, len);
3468 }
ff3f963a
KW
3469 Copy(str, d, len, char);
3470 d += len;
423cee85 3471 }
6f613c73 3472
423cee85 3473 SvREFCNT_dec(res);
cb233ae3 3474
cb233ae3 3475 } /* End \N{NAME} */
ff3f963a
KW
3476#ifdef EBCDIC
3477 if (!dorange)
3478 native_range = FALSE; /* \N{} is defined to be Unicode */
3479#endif
3480 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3481 continue;
3482
02aa26ce 3483 /* \c is a control character */
79072805
LW
3484 case 'c':
3485 s++;
961ce445 3486 if (s < send) {
421e43ba 3487 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3488 }
961ce445
RGS
3489 else {
3490 yyerror("Missing control char name in \\c");
3491 }
79072805 3492 continue;
02aa26ce
NT
3493
3494 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3495 case 'b':
4d73d076 3496 *d++ = '\b';
79072805
LW
3497 break;
3498 case 'n':
4d73d076 3499 *d++ = '\n';
79072805
LW
3500 break;
3501 case 'r':
4d73d076 3502 *d++ = '\r';
79072805
LW
3503 break;
3504 case 'f':
4d73d076 3505 *d++ = '\f';
79072805
LW
3506 break;
3507 case 't':
4d73d076 3508 *d++ = '\t';
79072805 3509 break;
34a3fe2a 3510 case 'e':
da0334e8 3511 *d++ = ESC_NATIVE;
34a3fe2a
PP
3512 break;
3513 case 'a':
4d73d076 3514 *d++ = '\a';
79072805 3515 break;
02aa26ce
NT
3516 } /* end switch */
3517
79072805
LW
3518 s++;
3519 continue;
02aa26ce 3520 } /* end if (backslash) */
4c3a8340
TS
3521#ifdef EBCDIC
3522 else
3523 literal_endpoint++;
3524#endif
02aa26ce 3525
f9a63242 3526 default_action:
77a135fe
KW
3527 /* If we started with encoded form, or already know we want it,
3528 then encode the next character */
6f2d5cbc 3529 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3530 STRLEN len = 1;
77a135fe
KW
3531
3532
3533 /* One might think that it is wasted effort in the case of the
3534 * source being utf8 (this_utf8 == TRUE) to take the next character
3535 * in the source, convert it to an unsigned value, and then convert
3536 * it back again. But the source has not been validated here. The
3537 * routine that does the conversion checks for errors like
3538 * malformed utf8 */
3539
233ca360
KW
3540 const UV nextuv = (this_utf8)
3541 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3542 : (UV) ((U8) *s);
5aaebcb3 3543 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3544 if (!has_utf8) {
3545 SvCUR_set(sv, d - SvPVX_const(sv));
3546 SvPOK_on(sv);
3547 *d = '\0';
77a135fe 3548 /* See Note on sizing above. */
7bf79863
KW
3549 sv_utf8_upgrade_flags_grow(sv,
3550 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3551 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3552 d = SvPVX(sv) + SvCUR(sv);
3553 has_utf8 = TRUE;
3554 } else if (need > len) {
3555 /* encoded value larger than old, may need extra space (NOTE:
3556 * SvCUR() is not set correctly here). See Note on sizing
3557 * above. */
9d4ba2ae 3558 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3559 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3560 }
77a135fe
KW
3561 s += len;
3562
5f66b61c 3563 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3564#ifdef EBCDIC
3565 if (uv > 255 && !dorange)
3566 native_range = FALSE;
3567#endif
2b9d42f0
NIS
3568 }
3569 else {
5ff03569 3570 *d++ = *s++;
2b9d42f0 3571 }
02aa26ce
NT
3572 } /* while loop to process each character */
3573
3574 /* terminate the string and set up the sv */
79072805 3575 *d = '\0';
95a20fc0 3576 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3577 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3578 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3579 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3580
79072805 3581 SvPOK_on(sv);
9f4817db 3582 if (PL_encoding && !has_utf8) {
d0063567
DK
3583 sv_recode_to_utf8(sv, PL_encoding);
3584 if (SvUTF8(sv))
3585 has_utf8 = TRUE;
9f4817db 3586 }
2b9d42f0 3587 if (has_utf8) {
7e2040f0 3588 SvUTF8_on(sv);
2b9d42f0 3589 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3590 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3591 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3592 }
3593 }
79072805 3594
02aa26ce 3595 /* shrink the sv if we allocated more than we used */
79072805 3596 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3597 SvPV_shrink_to_cur(sv);
79072805 3598 }
02aa26ce 3599
6154021b 3600 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3601 if (s > start) {
3602 char *s2 = start;
ffdb8b16
FC
3603 for (; s2 < s; s2++) {
3604 if (*s2 == '\n')
3605 COPLINE_INC_WITH_HERELINES;
3606 }
b899e89d 3607 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3608 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3609 && ! PL_parser->lex_re_reparsing)
3610 {
eb0d8d16
NC
3611 const char *const key = PL_lex_inpat ? "qr" : "q";
3612 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3613 const char *type;
3614 STRLEN typelen;
3615
3616 if (PL_lex_inwhat == OP_TRANS) {
3617 type = "tr";
3618 typelen = 2;
3619 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3620 type = "s";
3621 typelen = 1;
9da1dd8f
DM
3622 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3623 type = "q";
3624 typelen = 1;
eb0d8d16
NC
3625 } else {
3626 type = "qq";
3627 typelen = 2;
3628 }
3629
3630 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3631 type, typelen);
3632 }
6154021b 3633 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3634 }
3635 LEAVE_with_name("scan_const");
79072805
LW
3636 return s;
3637}
3638
ffb4593c
NT
3639/* S_intuit_more
3640 * Returns TRUE if there's more to the expression (e.g., a subscript),
3641 * FALSE otherwise.
ffb4593c
NT
3642 *
3643 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3644 *
3645 * ->[ and ->{ return TRUE
ff25e5db 3646 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3647 * { and [ outside a pattern are always subscripts, so return TRUE
3648 * if we're outside a pattern and it's not { or [, then return FALSE
3649 * if we're in a pattern and the first char is a {
3650 * {4,5} (any digits around the comma) returns FALSE
3651 * if we're in a pattern and the first char is a [
3652 * [] returns FALSE
3653 * [SOMETHING] has a funky algorithm to decide whether it's a
3654 * character class or not. It has to deal with things like
3655 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3656 * anything else returns TRUE
3657 */
3658
9cbb5ea2
GS
3659/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3660
76e3520e 3661STATIC int
5aaab254 3662S_intuit_more(pTHX_ char *s)
79072805