This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Avoid vivifying stuff when looking up barewords
[perl5.git] / toke.c
<
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
67#define PL_preambled (PL_parser->preambled)
68#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 69#define PL_linestr (PL_parser->linestr)
c2598295
DM
70#define PL_expect (PL_parser->expect)
71#define PL_copline (PL_parser->copline)
f06b5848
DM
72#define PL_bufptr (PL_parser->bufptr)
73#define PL_oldbufptr (PL_parser->oldbufptr)
74#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
75#define PL_linestart (PL_parser->linestart)
76#define PL_bufend (PL_parser->bufend)
77#define PL_last_uni (PL_parser->last_uni)
78#define PL_last_lop (PL_parser->last_lop)
79#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 80#define PL_lex_state (PL_parser->lex_state)
2f9285f8 81#define PL_rsfp (PL_parser->rsfp)
5486870f 82#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
83#define PL_in_my (PL_parser->in_my)
84#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 85#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 86#define PL_multi_end (PL_parser->multi_end)
13765c85 87#define PL_error_count (PL_parser->error_count)
199e78b7 88
fb205e7a
DM
89# define PL_nexttoke (PL_parser->nexttoke)
90# define PL_nexttype (PL_parser->nexttype)
91# define PL_nextval (PL_parser->nextval)
199e78b7 92
a1894d81 93static const char* const ident_too_long = "Identifier too long";
8903cb82 94
9ded7720 95# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 96
a7aaec61
Z
97#define XENUMMASK 0x3f
98#define XFAKEEOF 0x40
99#define XFAKEBRACK 0x80
9059aa12 100
39e02b42
JH
101#ifdef USE_UTF8_SCRIPTS
102# define UTF (!IN_BYTES)
2b9d42f0 103#else
802a15e9 104# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 105#endif
a0ed51b3 106
b1fc3636
CJ
107/* The maximum number of characters preceding the unrecognized one to display */
108#define UNRECOGNIZED_PRECEDE_COUNT 10
109
61f0cdd9 110/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
111 * 1999-02-27 mjd-perl-patch@plover.com */
112#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
113
14bd96d0 114#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 115
9ff909cf
JH
116#define HEXFP_PEEK(s) \
117 (((s[0] == '.') && \
118 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
119 isALPHA_FOLD_EQ(s[0], 'p'))
120
ffb4593c
NT
121/* LEX_* are values for PL_lex_state, the state of the lexer.
122 * They are arranged oddly so that the guard on the switch statement
79072805 123 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
124 *
125 * These values refer to the various states within a sublex parse,
126 * i.e. within a double quotish string
79072805
LW
127 */
128
fb73857a
PP
129/* #define LEX_NOTPARSING 11 is done in perl.h. */
130
b6007c36
DM
131#define LEX_NORMAL 10 /* normal code (ie not within "...") */
132#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
133#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
134#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
135#define LEX_INTERPSTART 6 /* expecting the start of a $var */
136
137 /* at end of code, eg "$x" followed by: */
138#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
139#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
140
141#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
142 string or after \E, $foo, etc */
143#define LEX_INTERPCONST 2 /* NOT USED */
144#define LEX_FORMLINE 1 /* expecting a format line */
145#define LEX_KNOWNEXT 0 /* next token known; just return it */
146
79072805 147
bbf60fe6 148#ifdef DEBUGGING
27da23d5 149static const char* const lex_state_names[] = {
bbf60fe6
DM
150 "KNOWNEXT",
151 "FORMLINE",
152 "INTERPCONST",
153 "INTERPCONCAT",
154 "INTERPENDMAYBE",
155 "INTERPEND",
156 "INTERPSTART",
157 "INTERPPUSH",
158 "INTERPCASEMOD",
159 "INTERPNORMAL",
160 "NORMAL"
161};
162#endif
163
79072805 164#include "keywords.h"
fe14fcc3 165
ffb4593c
NT
166/* CLINE is a macro that ensures PL_copline has a sane value */
167
57843af0 168#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 169
29595ff2
NC
170# define SKIPSPACE0(s) skipspace(s)
171# define SKIPSPACE1(s) skipspace(s)
172# define SKIPSPACE2(s,tsv) skipspace(s)
173# define PEEKSPACE(s) skipspace(s)
29595ff2 174
ffb4593c
NT
175/*
176 * Convenience functions to return different tokens and prime the
9cbb5ea2 177 * lexer for the next token. They all take an argument.
ffb4593c
NT
178 *
179 * TOKEN : generic token (used for '(', DOLSHARP, etc)
180 * OPERATOR : generic operator
181 * AOPERATOR : assignment operator
182 * PREBLOCK : beginning the block after an if, while, foreach, ...
183 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
184 * PREREF : *EXPR where EXPR is not a simple identifier
185 * TERM : expression term
89f35911 186 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
187 * LOOPX : loop exiting command (goto, last, dump, etc)
188 * FTST : file test operator
189 * FUN0 : zero-argument function
7eb971ee 190 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 191 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
192 * BOop : bitwise or or xor
193 * BAop : bitwise and
194 * SHop : shift operator
195 * PWop : power operator
9cbb5ea2 196 * PMop : pattern-matching operator
ffb4593c 197 * Aop : addition-level operator
e4916dd1 198 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
199 * Mop : multiplication-level operator
200 * Eop : equality-testing operator
e5edeb50 201 * Rop : relational operator <= != gt
ffb4593c
NT
202 *
203 * Also see LOP and lop() below.
204 */
205
998054bd 206#ifdef DEBUGGING /* Serve -DT. */
704d4215 207# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 208#else
bbf60fe6 209# define REPORT(retval) (retval)
998054bd
SC
210#endif
211
bbf60fe6
DM
212#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
213#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
214#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
215#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
216#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
217#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
218#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 219#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
a49203fd 220#define LOOPX(f) return (PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
7a61bf3c 221 pl_yylval.ival=f, \
a49203fd 222 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 223 REPORT((int)LOOPEX))
6154021b
RGS
224#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
225#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 226#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
227#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
228#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
229#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
230#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
231#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
232#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
233#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
e4916dd1 234#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
6154021b
RGS
235#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
236#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 238
a687059c
LW
239/* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
a687059c 243 */
d68ce4ac 244#define UNI3(f,x,have_x) { \
6154021b 245 pl_yylval.ival = f; \
d68ce4ac 246 if (have_x) PL_expect = x; \
376fcdbf
AL
247 PL_bufptr = s; \
248 PL_last_uni = PL_oldbufptr; \
249 PL_last_lop_op = f; \
250 if (*s == '(') \
251 return REPORT( (int)FUNC1 ); \
29595ff2 252 s = PEEKSPACE(s); \
376fcdbf
AL
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
254 }
d68ce4ac
FC
255#define UNI(f) UNI3(f,XTERM,1)
256#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
257#define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
259 OPERATOR(f); \
260 }
a687059c 261
d68ce4ac 262#define UNIBRACK(f) UNI3(f,0,0)
79072805 263
9f68db38 264/* grandfather return to old style */
78cdf107
Z
265#define OLDLOP(f) \
266 do { \
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
270 PL_expect = XTERM; \
271 PL_bufptr = s; \
272 return (int)LSTOP; \
273 } while(0)
79072805 274
83944c01
FC
275#define COPLINE_INC_WITH_HERELINES \
276 STMT_START { \
277 CopLINE_inc(PL_curcop); \
851b527a
FC
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
83944c01 281 } STMT_END
ffdb8b16
FC
282/* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284#define COPLINE_SET_FROM_MULTI_END \
285 STMT_START { \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
851b527a 288 PL_parser->herelines = 0; \
ffdb8b16 289 } STMT_END
83944c01
FC
290
291
8fa7f367
JH
292#ifdef DEBUGGING
293
6154021b 294/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
295enum token_type {
296 TOKENTYPE_NONE,
297 TOKENTYPE_IVAL,
6154021b 298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 299 TOKENTYPE_PVAL,
aeaef349 300 TOKENTYPE_OPVAL
bbf60fe6
DM
301};
302
6d4a66ac
NC
303static struct debug_tokens {
304 const int token;
305 enum token_type type;
306 const char *name;
307} const debug_tokens[] =
9041c2e3 308{
bbf60fe6
DM
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
380 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 382 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
383};
384
6154021b 385/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 386
bbf60fe6 387STATIC int
704d4215 388S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 389{
7918f24d
NC
390 PERL_ARGS_ASSERT_TOKEREPORT;
391
bbf60fe6 392 if (DEBUG_T_TEST) {
bd61b366 393 const char *name = NULL;
bbf60fe6 394 enum token_type type = TOKENTYPE_NONE;
f54cb97a 395 const struct debug_tokens *p;
396482e1 396 SV* const report = newSVpvs("<== ");
bbf60fe6 397
f54cb97a 398 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
399 if (p->token == (int)rv) {
400 name = p->name;
401 type = p->type;
402 break;
403 }
404 }
405 if (name)
54667de8 406 Perl_sv_catpv(aTHX_ report, name);
74736ae6 407 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 408 {
bbf60fe6 409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
410 if ((char)rv == 'p')
411 sv_catpvs(report, " (pending identifier)");
412 }
bbf60fe6 413 else if (!rv)
396482e1 414 sv_catpvs(report, "EOF");
bbf60fe6
DM
415 else
416 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
417 switch (type) {
418 case TOKENTYPE_NONE:
bbf60fe6
DM
419 break;
420 case TOKENTYPE_IVAL:
704d4215 421 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 425 PL_op_name[lvalp->ival]);
bbf60fe6
DM
426 break;
427 case TOKENTYPE_PVAL:
704d4215 428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
429 break;
430 case TOKENTYPE_OPVAL:
704d4215 431 if (lvalp->opval) {
401441c0 432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 435 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 436 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
437 }
438
439 }
401441c0 440 else
396482e1 441 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
442 break;
443 }
b6007c36 444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
445 };
446 return (int)rv;
998054bd
SC
447}
448
b6007c36
DM
449
450/* print the buffer with suitable escapes */
451
452STATIC void
15f169a1 453S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 454{
396482e1 455 SV* const tmp = newSVpvs("");
7918f24d
NC
456
457 PERL_ARGS_ASSERT_PRINTBUF;
458
5d37acd6 459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 461 GCC_DIAG_RESTORE;
b6007c36
DM
462 SvREFCNT_dec(tmp);
463}
464
8fa7f367
JH
465#endif
466
8290c323
NC
467static int
468S_deprecate_commaless_var_list(pTHX) {
469 PL_expect = XTERM;
470 deprecate("comma-less variable list");
471 return REPORT(','); /* grandfather non-comma-format format */
472}
473
ffb4593c
NT
474/*
475 * S_ao
476 *
f393a21a
FC
477 * This subroutine looks for an '=' next to the operator that has just been
478 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
479 */
480
76e3520e 481STATIC int
cea2e8a9 482S_ao(pTHX_ int toketype)
a0d0e21e 483{
3280af22
NIS
484 if (*PL_bufptr == '=') {
485 PL_bufptr++;
a0d0e21e 486 if (toketype == ANDAND)
6154021b 487 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 488 else if (toketype == OROR)
6154021b 489 pl_yylval.ival = OP_ORASSIGN;
c963b151 490 else if (toketype == DORDOR)
6154021b 491 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
492 toketype = ASSIGNOP;
493 }
494 return toketype;
495}
496
ffb4593c
NT
497/*
498 * S_no_op
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
508 */
509
76e3520e 510STATIC void
15f169a1 511S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 512{
9d4ba2ae
AL
513 char * const oldbp = PL_bufptr;
514 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 515
7918f24d
NC
516 PERL_ARGS_ASSERT_NO_OP;
517
1189a94a
GS
518 if (!s)
519 s = oldbp;
07c798fb 520 else
1189a94a 521 PL_bufptr = s;
734ab321 522 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
523 if (ckWARN_d(WARN_SYNTAX)) {
524 if (is_first)
525 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
526 "\t(Missing semicolon on previous line?)\n");
527 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 528 const char *t;
8a2bca12 529 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 530 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 531 NOOP;
56da5a46
RGS
532 if (t < PL_bufptr && isSPACE(*t))
533 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
534 "\t(Do you need to predeclare %"UTF8f"?)\n",
535 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
536 }
537 else {
538 assert(s >= oldbp);
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
540 "\t(Missing operator before %"UTF8f"?)\n",
541 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 542 }
07c798fb 543 }
3280af22 544 PL_bufptr = oldbp;
8990e307
LW
545}
546
ffb4593c
NT
547/*
548 * S_missingterm
549 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 550 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
551 * If we're in a delimited string and the delimiter is a control
552 * character, it's reformatted into a two-char sequence like ^C.
553 * This is fatal.
554 */
555
76e3520e 556STATIC void
cea2e8a9 557S_missingterm(pTHX_ char *s)
8990e307
LW
558{
559 char tmpbuf[3];
560 char q;
561 if (s) {
9d4ba2ae 562 char * const nl = strrchr(s,'\n');
d2719217 563 if (nl)
8990e307
LW
564 *nl = '\0';
565 }
ca8b19a7 566 else if ((U8) PL_multi_close < 32) {
8990e307 567 *tmpbuf = '^';
585ec06d 568 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
569 tmpbuf[2] = '\0';
570 s = tmpbuf;
571 }
572 else {
eb160463 573 *tmpbuf = (char)PL_multi_close;
8990e307
LW
574 tmpbuf[1] = '\0';
575 s = tmpbuf;
576 }
577 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 578 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 579}
79072805 580
dd0ac2b9
FC
581#include "feature.h"
582
0d863452 583/*
0d863452
RH
584 * Check whether the named feature is enabled.
585 */
26ea9e12 586bool
3fff3427 587Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 588{
4a731d7b 589 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
590
591 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
592
593 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 594
26ea9e12
NC
595 if (namelen > MAX_FEATURE_LEN)
596 return FALSE;
3fff3427 597 memcpy(&he_name[8], name, namelen);
7d69d4a6 598
c8ca97b0
NC
599 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
600 REFCOUNTED_HE_EXISTS));
0d863452
RH
601}
602
ffb4593c 603/*
9cbb5ea2
GS
604 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
605 * utf16-to-utf8-reversed.
ffb4593c
NT
606 */
607
c39cd008
GS
608#ifdef PERL_CR_FILTER
609static void
610strip_return(SV *sv)
611{
eb578fdb
KW
612 const char *s = SvPVX_const(sv);
613 const char * const e = s + SvCUR(sv);
7918f24d
NC
614
615 PERL_ARGS_ASSERT_STRIP_RETURN;
616
c39cd008
GS
617 /* outer loop optimized to do nothing if there are no CR-LFs */
618 while (s < e) {
619 if (*s++ == '\r' && *s == '\n') {
620 /* hit a CR-LF, need to copy the rest */
eb578fdb 621 char *d = s - 1;
c39cd008
GS
622 *d++ = *s++;
623 while (s < e) {
624 if (*s == '\r' && s[1] == '\n')
625 s++;
626 *d++ = *s++;
627 }
628 SvCUR(sv) -= s - d;
629 return;
630 }
631 }
632}
a868473f 633
76e3520e 634STATIC I32
c39cd008 635S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 636{
f54cb97a 637 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
638 if (count > 0 && !maxlen)
639 strip_return(sv);
640 return count;
a868473f
NIS
641}
642#endif
643
ffb4593c 644/*
8eaa0acf
Z
645=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
646
647Creates and initialises a new lexer/parser state object, supplying
648a context in which to lex and parse from a new source of Perl code.
649A pointer to the new state object is placed in L</PL_parser>. An entry
650is made on the save stack so that upon unwinding the new state object
651will be destroyed and the former value of L</PL_parser> will be restored.
652Nothing else need be done to clean up the parsing context.
653
654The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
655non-null, provides a string (in SV form) containing code to be parsed.
656A copy of the string is made, so subsequent modification of I<line>
657does not affect parsing. I<rsfp>, if non-null, provides an input stream
658from which code will be read to be parsed. If both are non-null, the
659code in I<line> comes first and must consist of complete lines of input,
660and I<rsfp> supplies the remainder of the source.
661
e368b3bd
FC
662The I<flags> parameter is reserved for future use. Currently it is only
663used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
664
665=cut
666*/
ffb4593c 667
27fcb6ee 668/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
669 can share filters with the current parser.
670 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
671 caller, hence isn't owned by the parser, so shouldn't be closed on parser
672 destruction. This is used to handle the case of defaulting to reading the
673 script from the standard input because no filename was given on the command
674 line (without getting confused by situation where STDIN has been closed, so
675 the script handle is opened on fd 0) */
27fcb6ee 676
a0d0e21e 677void
8eaa0acf 678Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 679{
6ef55633 680 const char *s = NULL;
5486870f 681 yy_parser *parser, *oparser;
60d63348 682 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 683 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
684
685 /* create and initialise a parser */
686
199e78b7 687 Newxz(parser, 1, yy_parser);
5486870f 688 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
689 PL_parser = parser;
690
28ac2b49
Z
691 parser->stack = NULL;
692 parser->ps = NULL;
693 parser->stack_size = 0;
acdf0a21 694
e3abe207
DM
695 /* on scope exit, free this parser and restore any outer one */
696 SAVEPARSER(parser);
7c4baf47 697 parser->saved_curcop = PL_curcop;
e3abe207 698
acdf0a21 699 /* initialise lexer state */
8990e307 700
fb205e7a 701 parser->nexttoke = 0;
ca4cfd28 702 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 703 parser->copline = parser->preambling = NOLINE;
5afb0a62 704 parser->lex_state = LEX_NORMAL;
c2598295 705 parser->expect = XSTATE;
2f9285f8 706 parser->rsfp = rsfp;
27fcb6ee
FC
707 parser->rsfp_filters =
708 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
709 ? NULL
710 : MUTABLE_AV(SvREFCNT_inc(
711 oparser->rsfp_filters
712 ? oparser->rsfp_filters
713 : (oparser->rsfp_filters = newAV())
714 ));
2f9285f8 715
199e78b7
DM
716 Newx(parser->lex_brackstack, 120, char);
717 Newx(parser->lex_casestack, 12, char);
718 *parser->lex_casestack = '\0';
d794b522 719 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 720
10efb74f 721 if (line) {
0528fd32 722 STRLEN len;
10efb74f 723 s = SvPV_const(line, len);
0abcdfa4
FC
724 parser->linestr = flags & LEX_START_COPIED
725 ? SvREFCNT_inc_simple_NN(line)
726 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 727 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 728 } else {
bf1b738b 729 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 730 }
f06b5848
DM
731 parser->oldoldbufptr =
732 parser->oldbufptr =
733 parser->bufptr =
734 parser->linestart = SvPVX(parser->linestr);
735 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
736 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
737
738 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
739 |LEX_DONT_CLOSE_RSFP));
740 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
741 |LEX_DONT_CLOSE_RSFP));
737c24fc 742
60d63348 743 parser->in_pod = parser->filtered = 0;
79072805 744}
a687059c 745
e3abe207
DM
746
747/* delete a parser object */
748
749void
750Perl_parser_free(pTHX_ const yy_parser *parser)
751{
7918f24d
NC
752 PERL_ARGS_ASSERT_PARSER_FREE;
753
7c4baf47 754 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
755 SvREFCNT_dec(parser->linestr);
756
87606032 757 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 758 PerlIO_clearerr(parser->rsfp);
799361c3
SH
759 else if (parser->rsfp && (!parser->old_parser ||
760 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 761 PerlIO_close(parser->rsfp);
5486870f 762 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
763 SvREFCNT_dec(parser->lex_stuff);
764 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
765
766 Safefree(parser->lex_brackstack);
767 Safefree(parser->lex_casestack);
768 Safefree(parser->lex_shared);
769 PL_parser = parser->old_parser;
770 Safefree(parser);
771}
772
773void
774Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
775{
3ac7ff8f 776 I32 nexttoke = parser->nexttoke;
3ac7ff8f 777 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 778 while (nexttoke--) {
3ac7ff8f
FC
779 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
780 && parser->nextval[nexttoke].opval
781 && parser->nextval[nexttoke].opval->op_slabbed
782 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 783 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
784 parser->nextval[nexttoke].opval = NULL;
785 }
3ce3dcd9 786 }
e3abe207
DM
787}
788
789
ffb4593c 790/*
f0e67a1d
Z
791=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
792
793Buffer scalar containing the chunk currently under consideration of the
794text currently being lexed. This is always a plain string scalar (for
795which C<SvPOK> is true). It is not intended to be used as a scalar by
796normal scalar means; instead refer to the buffer directly by the pointer
797variables described below.
798
799The lexer maintains various C<char*> pointers to things in the
800C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
801reallocated, all of these pointers must be updated. Don't attempt to
802do this manually, but rather use L</lex_grow_linestr> if you need to
803reallocate the buffer.
804
805The content of the text chunk in the buffer is commonly exactly one
806complete line of input, up to and including a newline terminator,
807but there are situations where it is otherwise. The octets of the
808buffer may be intended to be interpreted as either UTF-8 or Latin-1.
809The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
810flag on this scalar, which may disagree with it.
811
812For direct examination of the buffer, the variable
813L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
814lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
815of these pointers is usually preferable to examination of the scalar
816through normal scalar means.
817
818=for apidoc AmxU|char *|PL_parser-E<gt>bufend
819
820Direct pointer to the end of the chunk of text currently being lexed, the
821end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 822+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
823always located at the end of the buffer, and does not count as part of
824the buffer's contents.
825
826=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
827
828Points to the current position of lexing inside the lexer buffer.
829Characters around this point may be freely examined, within
830the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
831L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
832interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
833
834Lexing code (whether in the Perl core or not) moves this pointer past
835the characters that it consumes. It is also expected to perform some
836bookkeeping whenever a newline character is consumed. This movement
837can be more conveniently performed by the function L</lex_read_to>,
838which handles newlines appropriately.
839
840Interpretation of the buffer's octets can be abstracted out by
841using the slightly higher-level functions L</lex_peek_unichar> and
842L</lex_read_unichar>.
843
844=for apidoc AmxU|char *|PL_parser-E<gt>linestart
845
846Points to the start of the current line inside the lexer buffer.
847This is useful for indicating at which column an error occurred, and
848not much else. This must be updated by any lexing code that consumes
849a newline; the function L</lex_read_to> handles this detail.
850
851=cut
852*/
853
854/*
855=for apidoc Amx|bool|lex_bufutf8
856
857Indicates whether the octets in the lexer buffer
858(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
859of Unicode characters. If not, they should be interpreted as Latin-1
860characters. This is analogous to the C<SvUTF8> flag for scalars.
861
862In UTF-8 mode, it is not guaranteed that the lexer buffer actually
863contains valid UTF-8. Lexing code must be robust in the face of invalid
864encoding.
865
866The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
867is significant, but not the whole story regarding the input character
868encoding. Normally, when a file is being read, the scalar contains octets
869and its C<SvUTF8> flag is off, but the octets should be interpreted as
870UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
871however, the scalar may have the C<SvUTF8> flag on, and in this case its
872octets should be interpreted as UTF-8 unless the C<use bytes> pragma
873is in effect. This logic may change in the future; use this function
874instead of implementing the logic yourself.
875
876=cut
877*/
878
879bool
880Perl_lex_bufutf8(pTHX)
881{
882 return UTF;
883}
884
885/*
886=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
887
888Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 889at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
890pointer to the reallocated buffer. This is necessary before making
891any direct modification of the buffer that would increase its length.
892L</lex_stuff_pvn> provides a more convenient way to insert text into
893the buffer.
894
895Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
896this function updates all of the lexer's variables that point directly
897into the buffer.
898
899=cut
900*/
901
902char *
903Perl_lex_grow_linestr(pTHX_ STRLEN len)
904{
905 SV *linestr;
906 char *buf;
907 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 908 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
909 linestr = PL_parser->linestr;
910 buf = SvPVX(linestr);
911 if (len <= SvLEN(linestr))
912 return buf;
913 bufend_pos = PL_parser->bufend - buf;
914 bufptr_pos = PL_parser->bufptr - buf;
915 oldbufptr_pos = PL_parser->oldbufptr - buf;
916 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
917 linestart_pos = PL_parser->linestart - buf;
918 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
919 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
920 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
921 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 922
f0e67a1d 923 buf = sv_grow(linestr, len);
c7641931 924
f0e67a1d
Z
925 PL_parser->bufend = buf + bufend_pos;
926 PL_parser->bufptr = buf + bufptr_pos;
927 PL_parser->oldbufptr = buf + oldbufptr_pos;
928 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
929 PL_parser->linestart = buf + linestart_pos;
930 if (PL_parser->last_uni)
931 PL_parser->last_uni = buf + last_uni_pos;
932 if (PL_parser->last_lop)
933 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
934 if (PL_parser->lex_shared->re_eval_start)
935 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
936 return buf;
937}
938
939/*
83aa740e 940=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
941
942Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
943immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
944reallocating the buffer if necessary. This means that lexing code that
945runs later will see the characters as if they had appeared in the input.
946It is not recommended to do this as part of normal parsing, and most
947uses of this facility run the risk of the inserted characters being
948interpreted in an unintended manner.
949
950The string to be inserted is represented by I<len> octets starting
951at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
952according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
953The characters are recoded for the lexer buffer, according to how the
954buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 955to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
956function is more convenient.
957
958=cut
959*/
960
961void
83aa740e 962Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 963{
749123ff 964 dVAR;
f0e67a1d
Z
965 char *bufptr;
966 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
967 if (flags & ~(LEX_STUFF_UTF8))
968 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
969 if (UTF) {
970 if (flags & LEX_STUFF_UTF8) {
971 goto plain_copy;
972 } else {
54d004e8 973 STRLEN highhalf = 0; /* Count of variants */
83aa740e 974 const char *p, *e = pv+len;
54d004e8
KW
975 for (p = pv; p != e; p++) {
976 if (! UTF8_IS_INVARIANT(*p)) {
977 highhalf++;
978 }
979 }
f0e67a1d
Z
980 if (!highhalf)
981 goto plain_copy;
982 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
983 bufptr = PL_parser->bufptr;
984 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
985 SvCUR_set(PL_parser->linestr,
986 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
987 PL_parser->bufend += len+highhalf;
988 for (p = pv; p != e; p++) {
989 U8 c = (U8)*p;
54d004e8
KW
990 if (! UTF8_IS_INVARIANT(c)) {
991 *bufptr++ = UTF8_TWO_BYTE_HI(c);
992 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
993 } else {
994 *bufptr++ = (char)c;
995 }
996 }
997 }
998 } else {
999 if (flags & LEX_STUFF_UTF8) {
1000 STRLEN highhalf = 0;
83aa740e 1001 const char *p, *e = pv+len;
f0e67a1d
Z
1002 for (p = pv; p != e; p++) {
1003 U8 c = (U8)*p;
54d004e8 1004 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1005 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1006 "non-Latin-1 character into Latin-1 input");
54d004e8 1007 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1008 p++;
1009 highhalf++;
54d004e8 1010 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1011 /* malformed UTF-8 */
1012 ENTER;
1013 SAVESPTR(PL_warnhook);
1014 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1015 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1016 LEAVE;
1017 }
1018 }
1019 if (!highhalf)
1020 goto plain_copy;
1021 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1022 bufptr = PL_parser->bufptr;
1023 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1024 SvCUR_set(PL_parser->linestr,
1025 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1026 PL_parser->bufend += len-highhalf;
54d004e8
KW
1027 p = pv;
1028 while (p < e) {
1029 if (UTF8_IS_INVARIANT(*p)) {
1030 *bufptr++ = *p;
1031 p++;
f0e67a1d 1032 }
54d004e8
KW
1033 else {
1034 assert(p < e -1 );
94bb8c36 1035 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1036 p += 2;
1037 }
f0e67a1d
Z
1038 }
1039 } else {
54d004e8 1040 plain_copy:
f0e67a1d
Z
1041 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1042 bufptr = PL_parser->bufptr;
1043 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1044 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1045 PL_parser->bufend += len;
1046 Copy(pv, bufptr, len, char);
1047 }
1048 }
1049}
1050
1051/*
9dcc53ea
Z
1052=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1053
1054Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1055immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1056reallocating the buffer if necessary. This means that lexing code that
1057runs later will see the characters as if they had appeared in the input.
1058It is not recommended to do this as part of normal parsing, and most
1059uses of this facility run the risk of the inserted characters being
1060interpreted in an unintended manner.
1061
1062The string to be inserted is represented by octets starting at I<pv>
1063and continuing to the first nul. These octets are interpreted as either
1064UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1065in I<flags>. The characters are recoded for the lexer buffer, according
1066to how the buffer is currently being interpreted (L</lex_bufutf8>).
1067If it is not convenient to nul-terminate a string to be inserted, the
1068L</lex_stuff_pvn> function is more appropriate.
1069
1070=cut
1071*/
1072
1073void
1074Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1075{
1076 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1077 lex_stuff_pvn(pv, strlen(pv), flags);
1078}
1079
1080/*
f0e67a1d
Z
1081=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1082
1083Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1084immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1085reallocating the buffer if necessary. This means that lexing code that
1086runs later will see the characters as if they had appeared in the input.
1087It is not recommended to do this as part of normal parsing, and most
1088uses of this facility run the risk of the inserted characters being
1089interpreted in an unintended manner.
1090
1091The string to be inserted is the string value of I<sv>. The characters
1092are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1093being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1094not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1095need to construct a scalar.
1096
1097=cut
1098*/
1099
1100void
1101Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1102{
1103 char *pv;
1104 STRLEN len;
1105 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1106 if (flags)
1107 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1108 pv = SvPV(sv, len);
1109 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1110}
1111
1112/*
1113=for apidoc Amx|void|lex_unstuff|char *ptr
1114
1115Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1116I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1117This hides the discarded text from any lexing code that runs later,
1118as if the text had never appeared.
1119
1120This is not the normal way to consume lexed text. For that, use
1121L</lex_read_to>.
1122
1123=cut
1124*/
1125
1126void
1127Perl_lex_unstuff(pTHX_ char *ptr)
1128{
1129 char *buf, *bufend;
1130 STRLEN unstuff_len;
1131 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1132 buf = PL_parser->bufptr;
1133 if (ptr < buf)
1134 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1135 if (ptr == buf)
1136 return;
1137 bufend = PL_parser->bufend;
1138 if (ptr > bufend)
1139 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1140 unstuff_len = ptr - buf;
1141 Move(ptr, buf, bufend+1-ptr, char);
1142 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1143 PL_parser->bufend = bufend - unstuff_len;
1144}
1145
1146/*
1147=for apidoc Amx|void|lex_read_to|char *ptr
1148
1149Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1150to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1151performing the correct bookkeeping whenever a newline character is passed.
1152This is the normal way to consume lexed text.
1153
1154Interpretation of the buffer's octets can be abstracted out by
1155using the slightly higher-level functions L</lex_peek_unichar> and
1156L</lex_read_unichar>.
1157
1158=cut
1159*/
1160
1161void
1162Perl_lex_read_to(pTHX_ char *ptr)
1163{
1164 char *s;
1165 PERL_ARGS_ASSERT_LEX_READ_TO;
1166 s = PL_parser->bufptr;
1167 if (ptr < s || ptr > PL_parser->bufend)
1168 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1169 for (; s != ptr; s++)
1170 if (*s == '\n') {
83944c01 1171 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1172 PL_parser->linestart = s+1;
1173 }
1174 PL_parser->bufptr = ptr;
1175}
1176
1177/*
1178=for apidoc Amx|void|lex_discard_to|char *ptr
1179
1180Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1181up to I<ptr>. The remaining content of the buffer will be moved, and
1182all pointers into the buffer updated appropriately. I<ptr> must not
1183be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1184it is not permitted to discard text that has yet to be lexed.
1185
1186Normally it is not necessarily to do this directly, because it suffices to
1187use the implicit discarding behaviour of L</lex_next_chunk> and things
1188based on it. However, if a token stretches across multiple lines,
1f317c95 1189and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1190that purpose, then after completion of the token it would be wise to
1191explicitly discard the now-unneeded earlier lines, to avoid future
1192multi-line tokens growing the buffer without bound.
1193
1194=cut
1195*/
1196
1197void
1198Perl_lex_discard_to(pTHX_ char *ptr)
1199{
1200 char *buf;
1201 STRLEN discard_len;
1202 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1203 buf = SvPVX(PL_parser->linestr);
1204 if (ptr < buf)
1205 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1206 if (ptr == buf)
1207 return;
1208 if (ptr > PL_parser->bufptr)
1209 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1210 discard_len = ptr - buf;
1211 if (PL_parser->oldbufptr < ptr)
1212 PL_parser->oldbufptr = ptr;
1213 if (PL_parser->oldoldbufptr < ptr)
1214 PL_parser->oldoldbufptr = ptr;
1215 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1216 PL_parser->last_uni = NULL;
1217 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1218 PL_parser->last_lop = NULL;
1219 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1220 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1221 PL_parser->bufend -= discard_len;
1222 PL_parser->bufptr -= discard_len;
1223 PL_parser->oldbufptr -= discard_len;
1224 PL_parser->oldoldbufptr -= discard_len;
1225 if (PL_parser->last_uni)
1226 PL_parser->last_uni -= discard_len;
1227 if (PL_parser->last_lop)
1228 PL_parser->last_lop -= discard_len;
1229}
1230
1231/*
1232=for apidoc Amx|bool|lex_next_chunk|U32 flags
1233
1234Reads in the next chunk of text to be lexed, appending it to
1235L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1236looked to the end of the current chunk and wants to know more. It is
1237usual, but not necessary, for lexing to have consumed the entirety of
1238the current chunk at this time.
1239
1240If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1241chunk (i.e., the current chunk has been entirely consumed), normally the
1242current chunk will be discarded at the same time that the new chunk is
1243read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1244will not be discarded. If the current chunk has not been entirely
1245consumed, then it will not be discarded regardless of the flag.
1246
1247Returns true if some new text was added to the buffer, or false if the
1248buffer has reached the end of the input text.
1249
1250=cut
1251*/
1252
1253#define LEX_FAKE_EOF 0x80000000
112d1284 1254#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1255
1256bool
1257Perl_lex_next_chunk(pTHX_ U32 flags)
1258{
1259 SV *linestr;
1260 char *buf;
1261 STRLEN old_bufend_pos, new_bufend_pos;
1262 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1263 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1264 bool got_some_for_debugger = 0;
f0e67a1d 1265 bool got_some;
112d1284 1266 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1267 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1268 linestr = PL_parser->linestr;
1269 buf = SvPVX(linestr);
1270 if (!(flags & LEX_KEEP_PREVIOUS) &&
1271 PL_parser->bufptr == PL_parser->bufend) {
1272 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1273 linestart_pos = 0;
1274 if (PL_parser->last_uni != PL_parser->bufend)
1275 PL_parser->last_uni = NULL;
1276 if (PL_parser->last_lop != PL_parser->bufend)
1277 PL_parser->last_lop = NULL;
1278 last_uni_pos = last_lop_pos = 0;
1279 *buf = 0;
1280 SvCUR(linestr) = 0;
1281 } else {
1282 old_bufend_pos = PL_parser->bufend - buf;
1283 bufptr_pos = PL_parser->bufptr - buf;
1284 oldbufptr_pos = PL_parser->oldbufptr - buf;
1285 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1286 linestart_pos = PL_parser->linestart - buf;
1287 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1288 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1289 }
1290 if (flags & LEX_FAKE_EOF) {
1291 goto eof;
60d63348 1292 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1293 got_some = 0;
1294 } else if (filter_gets(linestr, old_bufend_pos)) {
1295 got_some = 1;
17cc9359 1296 got_some_for_debugger = 1;
112d1284
FC
1297 } else if (flags & LEX_NO_TERM) {
1298 got_some = 0;
f0e67a1d 1299 } else {
580561a3
Z
1300 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1301 sv_setpvs(linestr, "");
f0e67a1d
Z
1302 eof:
1303 /* End of real input. Close filehandle (unless it was STDIN),
1304 * then add implicit termination.
1305 */
87606032 1306 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1307 PerlIO_clearerr(PL_parser->rsfp);
1308 else if (PL_parser->rsfp)
1309 (void)PerlIO_close(PL_parser->rsfp);
1310 PL_parser->rsfp = NULL;
60d63348 1311 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1312 if (!PL_in_eval && PL_minus_p) {
1313 sv_catpvs(linestr,
1314 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1315 PL_minus_n = PL_minus_p = 0;
1316 } else if (!PL_in_eval && PL_minus_n) {
1317 sv_catpvs(linestr, /*{*/";}");
1318 PL_minus_n = 0;
1319 } else
1320 sv_catpvs(linestr, ";");
1321 got_some = 1;
1322 }
1323 buf = SvPVX(linestr);
1324 new_bufend_pos = SvCUR(linestr);
1325 PL_parser->bufend = buf + new_bufend_pos;
1326 PL_parser->bufptr = buf + bufptr_pos;
1327 PL_parser->oldbufptr = buf + oldbufptr_pos;
1328 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1329 PL_parser->linestart = buf + linestart_pos;
1330 if (PL_parser->last_uni)
1331 PL_parser->last_uni = buf + last_uni_pos;
1332 if (PL_parser->last_lop)
1333 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1334 if (PL_parser->preambling != NOLINE) {
1335 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1336 PL_parser->preambling = NOLINE;
1337 }
17cc9359 1338 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1339 PL_curstash != PL_debstash) {
1340 /* debugger active and we're not compiling the debugger code,
1341 * so store the line into the debugger's array of lines
1342 */
1343 update_debugger_info(NULL, buf+old_bufend_pos,
1344 new_bufend_pos-old_bufend_pos);
1345 }
1346 return got_some;
1347}
1348
1349/*
1350=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1351
1352Looks ahead one (Unicode) character in the text currently being lexed.
1353Returns the codepoint (unsigned integer value) of the next character,
1354or -1 if lexing has reached the end of the input text. To consume the
1355peeked character, use L</lex_read_unichar>.
1356
1357If the next character is in (or extends into) the next chunk of input
1358text, the next chunk will be read in. Normally the current chunk will be
1359discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1360then the current chunk will not be discarded.
1361
1362If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1363is encountered, an exception is generated.
1364
1365=cut
1366*/
1367
1368I32
1369Perl_lex_peek_unichar(pTHX_ U32 flags)
1370{
749123ff 1371 dVAR;
f0e67a1d
Z
1372 char *s, *bufend;
1373 if (flags & ~(LEX_KEEP_PREVIOUS))
1374 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1375 s = PL_parser->bufptr;
1376 bufend = PL_parser->bufend;
1377 if (UTF) {
1378 U8 head;
1379 I32 unichar;
1380 STRLEN len, retlen;
1381 if (s == bufend) {
1382 if (!lex_next_chunk(flags))
1383 return -1;
1384 s = PL_parser->bufptr;
1385 bufend = PL_parser->bufend;
1386 }
1387 head = (U8)*s;
54d004e8 1388 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1389 return head;
54d004e8
KW
1390 if (UTF8_IS_START(head)) {
1391 len = UTF8SKIP(&head);
f0e67a1d
Z
1392 while ((STRLEN)(bufend-s) < len) {
1393 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1394 break;
1395 s = PL_parser->bufptr;
1396 bufend = PL_parser->bufend;
1397 }
1398 }
c80e42f3 1399 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1400 if (retlen == (STRLEN)-1) {
1401 /* malformed UTF-8 */
1402 ENTER;
1403 SAVESPTR(PL_warnhook);
1404 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1405 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1406 LEAVE;
1407 }
1408 return unichar;
1409 } else {
1410 if (s == bufend) {
1411 if (!lex_next_chunk(flags))
1412 return -1;
1413 s = PL_parser->bufptr;
1414 }
1415 return (U8)*s;
1416 }
1417}
1418
1419/*
1420=for apidoc Amx|I32|lex_read_unichar|U32 flags
1421
1422Reads the next (Unicode) character in the text currently being lexed.
1423Returns the codepoint (unsigned integer value) of the character read,
1424and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1425if lexing has reached the end of the input text. To non-destructively
1426examine the next character, use L</lex_peek_unichar> instead.
1427
1428If the next character is in (or extends into) the next chunk of input
1429text, the next chunk will be read in. Normally the current chunk will be
1430discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1431then the current chunk will not be discarded.
1432
1433If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1434is encountered, an exception is generated.
1435
1436=cut
1437*/
1438
1439I32
1440Perl_lex_read_unichar(pTHX_ U32 flags)
1441{
1442 I32 c;
1443 if (flags & ~(LEX_KEEP_PREVIOUS))
1444 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1445 c = lex_peek_unichar(flags);
1446 if (c != -1) {
1447 if (c == '\n')
83944c01 1448 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1449 if (UTF)
1450 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1451 else
1452 ++(PL_parser->bufptr);
f0e67a1d
Z
1453 }
1454 return c;
1455}
1456
1457/*
1458=for apidoc Amx|void|lex_read_space|U32 flags
1459
1460Reads optional spaces, in Perl style, in the text currently being
1461lexed. The spaces may include ordinary whitespace characters and
1462Perl-style comments. C<#line> directives are processed if encountered.
1463L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1464at a non-space character (or the end of the input text).
1465
1466If spaces extend into the next chunk of input text, the next chunk will
1467be read in. Normally the current chunk will be discarded at the same
1468time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1469chunk will not be discarded.
1470
1471=cut
1472*/
1473
21791330 1474#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1475#define LEX_NO_NEXT_CHUNK 0x80000000
1476
f0e67a1d
Z
1477void
1478Perl_lex_read_space(pTHX_ U32 flags)
1479{
1480 char *s, *bufend;
21791330 1481 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1482 bool need_incline = 0;
21791330 1483 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1484 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1485 s = PL_parser->bufptr;
1486 bufend = PL_parser->bufend;
1487 while (1) {
1488 char c = *s;
1489 if (c == '#') {
1490 do {
1491 c = *++s;
1492 } while (!(c == '\n' || (c == 0 && s == bufend)));
1493 } else if (c == '\n') {
1494 s++;
21791330
FC
1495 if (can_incline) {
1496 PL_parser->linestart = s;
1497 if (s == bufend)
1498 need_incline = 1;
1499 else
1500 incline(s);
1501 }
f0e67a1d
Z
1502 } else if (isSPACE(c)) {
1503 s++;
1504 } else if (c == 0 && s == bufend) {
1505 bool got_more;
65c68e17 1506 line_t l;
f0998909
Z
1507 if (flags & LEX_NO_NEXT_CHUNK)
1508 break;
f0e67a1d 1509 PL_parser->bufptr = s;
65c68e17 1510 l = CopLINE(PL_curcop);
851b527a 1511 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1512 got_more = lex_next_chunk(flags);
65c68e17 1513 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1514 s = PL_parser->bufptr;
1515 bufend = PL_parser->bufend;
1516 if (!got_more)
1517 break;
21791330 1518 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1519 incline(s);
1520 need_incline = 0;
1521 }
1522 } else {
1523 break;
1524 }
1525 }
f0e67a1d
Z
1526 PL_parser->bufptr = s;
1527}
1528
1529/*
fe788d6b
PM
1530
1531=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1532
1533This function performs syntax checking on a prototype, C<proto>.
1534If C<warn> is true, any illegal characters or mismatched brackets
1535will trigger illegalproto warnings, declaring that they were
1536detected in the prototype for C<name>.
1537
1538The return value is C<true> if this is a valid prototype, and
1539C<false> if it is not, regardless of whether C<warn> was C<true> or
1540C<false>.
1541
1542Note that C<NULL> is a valid C<proto> and will always return C<true>.
1543
1544=cut
1545
1546 */
1547
1548bool
1549Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1550{
1551 STRLEN len, origlen;
1552 char *p = proto ? SvPV(proto, len) : NULL;
1553 bool bad_proto = FALSE;
1554 bool in_brackets = FALSE;
1555 bool after_slash = FALSE;
1556 char greedy_proto = ' ';
1557 bool proto_after_greedy_proto = FALSE;
1558 bool must_be_last = FALSE;
1559 bool underscore = FALSE;
f791a21a 1560 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1561
1562 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1563
1564 if (!proto)
1565 return TRUE;
1566
1567 origlen = len;
1568 for (; len--; p++) {
1569 if (!isSPACE(*p)) {
1570 if (must_be_last)
1571 proto_after_greedy_proto = TRUE;
f791a21a
PM
1572 if (underscore) {
1573 if (!strchr(";@%", *p))
1574 bad_proto_after_underscore = TRUE;
1575 underscore = FALSE;
1576 }
fe788d6b
PM
1577 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1578 bad_proto = TRUE;
1579 }
1580 else {
fe788d6b
PM
1581 if (*p == '[')
1582 in_brackets = TRUE;
1583 else if (*p == ']')
1584 in_brackets = FALSE;
1585 else if ((*p == '@' || *p == '%') &&
1586 !after_slash &&
1587 !in_brackets ) {
1588 must_be_last = TRUE;
1589 greedy_proto = *p;
1590 }
1591 else if (*p == '_')
f791a21a 1592 underscore = TRUE;
fe788d6b
PM
1593 }
1594 if (*p == '\\')
1595 after_slash = TRUE;
1596 else
1597 after_slash = FALSE;
1598 }
1599 }
1600
1601 if (warn) {
b54d603d 1602 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1603 p -= origlen;
b54d603d
PM
1604 p = SvUTF8(proto)
1605 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1606 origlen, UNI_DISPLAY_ISPRINT)
1607 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1608
fe788d6b
PM
1609 if (proto_after_greedy_proto)
1610 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1611 "Prototype after '%c' for %"SVf" : %s",
1612 greedy_proto, SVfARG(name), p);
50278ed0
PM
1613 if (in_brackets)
1614 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1615 "Missing ']' in prototype for %"SVf" : %s",
1616 SVfARG(name), p);
b54d603d 1617 if (bad_proto)
fe788d6b 1618 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1619 "Illegal character in prototype for %"SVf" : %s",
1620 SVfARG(name), p);
1621 if (bad_proto_after_underscore)
1622 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1623 "Illegal character after '_' in prototype for %"SVf" : %s",
1624 SVfARG(name), p);
fe788d6b
PM
1625 }
1626
1627 return (! (proto_after_greedy_proto || bad_proto) );
1628}
1629
1630/*
ffb4593c
NT
1631 * S_incline
1632 * This subroutine has nothing to do with tilting, whether at windmills
1633 * or pinball tables. Its name is short for "increment line". It
57843af0 1634 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1635 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1636 * # line 500 "foo.pm"
1637 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1638 */
1639
76e3520e 1640STATIC void
d9095cec 1641S_incline(pTHX_ const char *s)
463ee0b2 1642{
d9095cec
NC
1643 const char *t;
1644 const char *n;
1645 const char *e;
8818d409 1646 line_t line_num;
463ee0b2 1647
7918f24d
NC
1648 PERL_ARGS_ASSERT_INCLINE;
1649
83944c01 1650 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1651 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1652 && s+1 == PL_bufend && *s == ';') {
1653 /* fake newline in string eval */
1654 CopLINE_dec(PL_curcop);
1655 return;
1656 }
463ee0b2
LW
1657 if (*s++ != '#')
1658 return;
d4c19fe8
AL
1659 while (SPACE_OR_TAB(*s))
1660 s++;
73659bf1
GS
1661 if (strnEQ(s, "line", 4))
1662 s += 4;
1663 else
1664 return;
084592ab 1665 if (SPACE_OR_TAB(*s))
73659bf1 1666 s++;
4e553d73 1667 else
73659bf1 1668 return;
d4c19fe8
AL
1669 while (SPACE_OR_TAB(*s))
1670 s++;
463ee0b2
LW
1671 if (!isDIGIT(*s))
1672 return;
d4c19fe8 1673
463ee0b2
LW
1674 n = s;
1675 while (isDIGIT(*s))
1676 s++;
07714eb4 1677 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1678 return;
bf4acbe4 1679 while (SPACE_OR_TAB(*s))
463ee0b2 1680 s++;
73659bf1 1681 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1682 s++;
73659bf1
GS
1683 e = t + 1;
1684 }
463ee0b2 1685 else {
c35e046a
AL
1686 t = s;
1687 while (!isSPACE(*t))
1688 t++;
73659bf1 1689 e = t;
463ee0b2 1690 }
bf4acbe4 1691 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1692 e++;
1693 if (*e != '\n' && *e != '\0')
1694 return; /* false alarm */
1695
96e440d2 1696 line_num = grok_atou(n, &e) - 1;
8818d409 1697
f4dd75d9 1698 if (t - s > 0) {
d9095cec 1699 const STRLEN len = t - s;
3df32bda 1700
d36ee5be 1701 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1702 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1703 * to *{"::_<newfilename"} */
44867030
NC
1704 /* However, the long form of evals is only turned on by the
1705 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1706 GV * const cfgv = CopFILEGV(PL_curcop);
1707 if (cfgv) {
38bd7ad8
FC
1708 char smallbuf[128];
1709 STRLEN tmplen2 = len;
44867030 1710 char *tmpbuf2;
449dd039 1711 GV *gv2;
44867030
NC
1712
1713 if (tmplen2 + 2 <= sizeof smallbuf)
1714 tmpbuf2 = smallbuf;
1715 else
1716 Newx(tmpbuf2, tmplen2 + 2, char);
1717
38bd7ad8
FC
1718 tmpbuf2[0] = '_';
1719 tmpbuf2[1] = '<';
44867030
NC
1720
1721 memcpy(tmpbuf2 + 2, s, tmplen2);
1722 tmplen2 += 2;
1723
8a5ee598 1724 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1725 if (!isGV(gv2)) {
8a5ee598 1726 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1727 /* adjust ${"::_<newfilename"} to store the new file name */
1728 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1729 /* The line number may differ. If that is the case,
1730 alias the saved lines that are in the array.
1731 Otherwise alias the whole array. */
1732 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1733 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1734 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1735 }
38bd7ad8
FC
1736 else if (GvAV(cfgv)) {
1737 AV * const av = GvAV(cfgv);
8818d409
FC
1738 const I32 start = CopLINE(PL_curcop)+1;
1739 I32 items = AvFILLp(av) - start;
1740 if (items > 0) {
1741 AV * const av2 = GvAVn(gv2);
1742 SV **svp = AvARRAY(av) + start;
1743 I32 l = (I32)line_num+1;
1744 while (items--)
1745 av_store(av2, l++, SvREFCNT_inc(*svp++));
1746 }
1747 }
e5527e4b 1748 }
44867030
NC
1749
1750 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1751 }
e66cf94c 1752 }
05ec9bb3 1753 CopFILE_free(PL_curcop);
449dd039 1754 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1755 }
8818d409 1756 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1757}
1758
21791330
FC
1759#define skipspace(s) skipspace_flags(s, 0)
1760
29595ff2 1761
80a702cd 1762STATIC void
15f169a1 1763S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1764{
1765 AV *av = CopFILEAVx(PL_curcop);
1766 if (av) {
7f1c3e8c
FC
1767 SV * sv;
1768 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1769 else {
1770 sv = *av_fetch(av, 0, 1);
1771 SvUPGRADE(sv, SVt_PVMG);
1772 }
1773 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1774 if (orig_sv)
7f1c3e8c 1775 sv_catsv(sv, orig_sv);
5fa550fb 1776 else
7f1c3e8c
FC
1777 sv_catpvn(sv, buf, len);
1778 if (!SvIOK(sv)) {
1779 (void)SvIOK_on(sv);
1780 SvIV_set(sv, 0);
1781 }
1782 if (PL_parser->preambling == NOLINE)
1783 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1784 }
1785}
1786
ffb4593c
NT
1787/*
1788 * S_skipspace
1789 * Called to gobble the appropriate amount and type of whitespace.
1790 * Skips comments as well.
1791 */
1792
76e3520e 1793STATIC char *
21791330 1794S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1795{
21791330 1796 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1797 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1798 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1799 s++;
f0e67a1d
Z
1800 } else {
1801 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1802 PL_bufptr = s;
21791330 1803 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1804 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1805 LEX_NO_NEXT_CHUNK : 0));
3280af22 1806 s = PL_bufptr;
f0e67a1d
Z
1807 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1808 if (PL_linestart > PL_bufptr)
1809 PL_bufptr = PL_linestart;
1810 return s;
463ee0b2 1811 }
5db06880 1812 return s;
a687059c 1813}
378cc40b 1814
ffb4593c
NT
1815/*
1816 * S_check_uni
1817 * Check the unary operators to ensure there's no ambiguity in how they're
1818 * used. An ambiguous piece of code would be:
1819 * rand + 5
1820 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1821 * the +5 is its argument.
1822 */
1823
76e3520e 1824STATIC void
cea2e8a9 1825S_check_uni(pTHX)
ba106d47 1826{
d4c19fe8
AL
1827 const char *s;
1828 const char *t;
2f3197b3 1829
3280af22 1830 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1831 return;
3280af22
NIS
1832 while (isSPACE(*PL_last_uni))
1833 PL_last_uni++;
c35e046a 1834 s = PL_last_uni;
8a2bca12 1835 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1836 s++;
3280af22 1837 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1838 return;
6136c704 1839
9b387841
NC
1840 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1841 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1842 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1843}
1844
ffb4593c
NT
1845/*
1846 * LOP : macro to build a list operator. Its behaviour has been replaced
1847 * with a subroutine, S_lop() for which LOP is just another name.
1848 */
1849
a0d0e21e
LW
1850#define LOP(f,x) return lop(f,x,s)
1851
ffb4593c
NT
1852/*
1853 * S_lop
1854 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1855 * - if we have a next token, then it's a list operator (no parens) for
1856 * which the next token has already been parsed; e.g.,
1857 * sort foo @args
1858 * sort foo (@args)
ffb4593c
NT
1859 * - if the next thing is an opening paren, then it's a function
1860 * - else it's a list operator
1861 */
1862
76e3520e 1863STATIC I32
a0be28da 1864S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1865{
7918f24d
NC
1866 PERL_ARGS_ASSERT_LOP;
1867
6154021b 1868 pl_yylval.ival = f;
35c8bce7 1869 CLINE;
3280af22
NIS
1870 PL_bufptr = s;
1871 PL_last_lop = PL_oldbufptr;
eb160463 1872 PL_last_lop_op = (OPCODE)f;
3280af22 1873 if (PL_nexttoke)
78cdf107 1874 goto lstop;
19f1898a 1875 PL_expect = x;
79072805 1876 if (*s == '(')
bbf60fe6 1877 return REPORT(FUNC);
29595ff2 1878 s = PEEKSPACE(s);
79072805 1879 if (*s == '(')
bbf60fe6 1880 return REPORT(FUNC);
78cdf107
Z
1881 else {
1882 lstop:
1883 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1884 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1885 return REPORT(LSTOP);
78cdf107 1886 }
79072805
LW
1887}
1888
ffb4593c
NT
1889/*
1890 * S_force_next
9cbb5ea2 1891 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1892 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1893 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1894 * will need to set PL_nextval[] and possibly PL_expect to ensure
1895 * the lexer handles the token correctly.
ffb4593c
NT
1896 */
1897
4e553d73 1898STATIC void
cea2e8a9 1899S_force_next(pTHX_ I32 type)
79072805 1900{
704d4215
GG
1901#ifdef DEBUGGING
1902 if (DEBUG_T_TEST) {
1903 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1904 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1905 }
1906#endif
3280af22
NIS
1907 PL_nexttype[PL_nexttoke] = type;
1908 PL_nexttoke++;
1909 if (PL_lex_state != LEX_KNOWNEXT) {
1910 PL_lex_defer = PL_lex_state;
3280af22 1911 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1912 }
1913}
1914
89f35911
FC
1915/*
1916 * S_postderef
1917 *
1918 * This subroutine handles postfix deref syntax after the arrow has already
1919 * been emitted. @* $* etc. are emitted as two separate token right here.
1920 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1921 * only the first, leaving yylex to find the next.
89f35911
FC
1922 */
1923
1924static int
ff25e5db 1925S_postderef(pTHX_ int const funny, char const next)
89f35911 1926{
ff25e5db 1927 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1928 assert(strchr("*[{", next));
1929 if (next == '*') {
1930 PL_expect = XOPERATOR;
cc624add 1931 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1932 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1933 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1934 force_next(POSTJOIN);
1935 }
89f35911
FC
1936 force_next(next);
1937 PL_bufptr+=2;
1938 }
1939 else {
760ca746
FC
1940 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1941 && !PL_lex_brackets)
1942 PL_lex_dojoin = 2;
89f35911
FC
1943 PL_expect = XOPERATOR;
1944 PL_bufptr++;
1945 }
1946 return funny;
1947}
1948
28ac2b49
Z
1949void
1950Perl_yyunlex(pTHX)
1951{
a7aaec61
Z
1952 int yyc = PL_parser->yychar;
1953 if (yyc != YYEMPTY) {
1954 if (yyc) {
a7aaec61
Z
1955 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1956 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1957 PL_lex_allbrackets--;
a7aaec61 1958 PL_lex_brackets--;
78cdf107
Z
1959 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1960 } else if (yyc == '('/*)*/) {
1961 PL_lex_allbrackets--;
1962 yyc |= (2<<24);
a7aaec61
Z
1963 }
1964 force_next(yyc);
1965 }
28ac2b49
Z
1966 PL_parser->yychar = YYEMPTY;
1967 }
1968}
1969
d0a148a6 1970STATIC SV *
15f169a1 1971S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1972{
740cce10 1973 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1974 !IN_BYTES
1975 && UTF
1976 && !is_ascii_string((const U8*)start, len)
740cce10 1977 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1978 return sv;
1979}
1980
ffb4593c
NT
1981/*
1982 * S_force_word
1983 * When the lexer knows the next thing is a word (for instance, it has
1984 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1985 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1986 * lookahead.
ffb4593c
NT
1987 *
1988 * Arguments:
b1b65b59 1989 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1990 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1991 * int check_keyword : if true, Perl checks to make sure the word isn't
1992 * a keyword (do this if the word is a label, e.g. goto FOO)
1993 * int allow_pack : if true, : characters will also be allowed (require,
1994 * use, etc. do this)
9cbb5ea2 1995 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1996 */
1997
76e3520e 1998STATIC char *
345b3785 1999S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2000{
eb578fdb 2001 char *s;
463ee0b2 2002 STRLEN len;
4e553d73 2003
7918f24d
NC
2004 PERL_ARGS_ASSERT_FORCE_WORD;
2005
29595ff2 2006 start = SKIPSPACE1(start);
463ee0b2 2007 s = start;
7e2040f0 2008 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2009 (allow_pack && *s == ':') )
a0d0e21e 2010 {
3280af22 2011 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2012 if (check_keyword) {
2013 char *s2 = PL_tokenbuf;
2014 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2015 s2 += 6, len -= 6;
2016 if (keyword(s2, len, 0))
463ee0b2 2017 return start;
01b5ef50 2018 }
463ee0b2 2019 if (token == METHOD) {
29595ff2 2020 s = SKIPSPACE1(s);
463ee0b2 2021 if (*s == '(')
3280af22 2022 PL_expect = XTERM;
463ee0b2 2023 else {
3280af22 2024 PL_expect = XOPERATOR;
463ee0b2 2025 }
79072805 2026 }
9ded7720 2027 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2028 = (OP*)newSVOP(OP_CONST,0,
2029 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2030 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2031 force_next(token);
2032 }
2033 return s;
2034}
2035
ffb4593c
NT
2036/*
2037 * S_force_ident
9cbb5ea2 2038 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2039 * text only contains the "foo" portion. The first argument is a pointer
2040 * to the "foo", and the second argument is the type symbol to prefix.
2041 * Forces the next token to be a "WORD".
9cbb5ea2 2042 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2043 */
2044
76e3520e 2045STATIC void
5aaab254 2046S_force_ident(pTHX_ const char *s, int kind)
79072805 2047{
7918f24d
NC
2048 PERL_ARGS_ASSERT_FORCE_IDENT;
2049
c9b48522
DD
2050 if (s[0]) {
2051 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2052 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2053 UTF ? SVf_UTF8 : 0));
9ded7720 2054 NEXTVAL_NEXTTOKE.opval = o;
79072805 2055 force_next(WORD);
748a9306 2056 if (kind) {
11343788 2057 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2058 /* XXX see note in pp_entereval() for why we forgo typo
2059 warnings if the symbol must be introduced in an eval.
2060 GSAR 96-10-12 */
90e5519e 2061 gv_fetchpvn_flags(s, len,
728847b1
BF
2062 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2063 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2064 kind == '$' ? SVt_PV :
2065 kind == '@' ? SVt_PVAV :
2066 kind == '%' ? SVt_PVHV :
a0d0e21e 2067 SVt_PVGV
90e5519e 2068 );
748a9306 2069 }
79072805
LW
2070 }
2071}
2072
3f33d153
FC
2073static void
2074S_force_ident_maybe_lex(pTHX_ char pit)
2075{
3f33d153
FC
2076 NEXTVAL_NEXTTOKE.ival = pit;
2077 force_next('p');
2078}
2079
1571675a
GS
2080NV
2081Perl_str_to_version(pTHX_ SV *sv)
2082{
2083 NV retval = 0.0;
2084 NV nshift = 1.0;
2085 STRLEN len;
cfd0369c 2086 const char *start = SvPV_const(sv,len);
9d4ba2ae 2087 const char * const end = start + len;
504618e9 2088 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2089
2090 PERL_ARGS_ASSERT_STR_TO_VERSION;
2091
1571675a 2092 while (start < end) {
ba210ebe 2093 STRLEN skip;
1571675a
GS
2094 UV n;
2095 if (utf)
9041c2e3 2096 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2097 else {
2098 n = *(U8*)start;
2099 skip = 1;
2100 }
2101 retval += ((NV)n)/nshift;
2102 start += skip;
2103 nshift *= 1000;
2104 }
2105 return retval;
2106}
2107
4e553d73 2108/*
ffb4593c
NT
2109 * S_force_version
2110 * Forces the next token to be a version number.
e759cc13
RGS
2111 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2112 * and if "guessing" is TRUE, then no new token is created (and the caller
2113 * must use an alternative parsing method).
ffb4593c
NT
2114 */
2115
76e3520e 2116STATIC char *
e759cc13 2117S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2118{
5f66b61c 2119 OP *version = NULL;
44dcb63b 2120 char *d;
89bfa8cd 2121
7918f24d
NC
2122 PERL_ARGS_ASSERT_FORCE_VERSION;
2123
29595ff2 2124 s = SKIPSPACE1(s);
89bfa8cd 2125
44dcb63b 2126 d = s;
dd629d5b 2127 if (*d == 'v')
44dcb63b 2128 d++;
44dcb63b 2129 if (isDIGIT(*d)) {
e759cc13
RGS
2130 while (isDIGIT(*d) || *d == '_' || *d == '.')
2131 d++;
4e4da3ac 2132 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2133 SV *ver;
6154021b
RGS
2134 s = scan_num(s, &pl_yylval);
2135 version = pl_yylval.opval;
dd629d5b
GS
2136 ver = cSVOPx(version)->op_sv;
2137 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2138 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2139 SvNV_set(ver, str_to_version(ver));
1571675a 2140 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2141 }
89bfa8cd 2142 }
5db06880 2143 else if (guessing) {
e759cc13 2144 return s;
5db06880 2145 }
89bfa8cd
PP
2146 }
2147
2148 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2149 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2150 force_next(WORD);
89bfa8cd 2151
e759cc13 2152 return s;
89bfa8cd
PP
2153}
2154
ffb4593c 2155/*
91152fc1
DG
2156 * S_force_strict_version
2157 * Forces the next token to be a version number using strict syntax rules.
2158 */
2159
2160STATIC char *
2161S_force_strict_version(pTHX_ char *s)
2162{
91152fc1 2163 OP *version = NULL;
91152fc1
DG
2164 const char *errstr = NULL;
2165
2166 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2167
2168 while (isSPACE(*s)) /* leading whitespace */
2169 s++;
2170
2171 if (is_STRICT_VERSION(s,&errstr)) {
2172 SV *ver = newSV(0);
2173 s = (char *)scan_version(s, ver, 0);
2174 version = newSVOP(OP_CONST, 0, ver);
2175 }
4e4da3ac
Z
2176 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2177 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2178 {
91152fc1
DG
2179 PL_bufptr = s;
2180 if (errstr)
2181 yyerror(errstr); /* version required */
2182 return s;
2183 }
2184
91152fc1 2185 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2186 NEXTVAL_NEXTTOKE.opval = version;
2187 force_next(WORD);
2188
2189 return s;
2190}
2191
2192/*
ffb4593c
NT
2193 * S_tokeq
2194 * Tokenize a quoted string passed in as an SV. It finds the next
2195 * chunk, up to end of string or a backslash. It may make a new
2196 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2197 * turns \\ into \.
2198 */
2199
76e3520e 2200STATIC SV *
cea2e8a9 2201S_tokeq(pTHX_ SV *sv)
79072805 2202{
eb578fdb
KW
2203 char *s;
2204 char *send;
2205 char *d;
b3ac6de7 2206 SV *pv = sv;
79072805 2207
7918f24d
NC
2208 PERL_ARGS_ASSERT_TOKEQ;
2209
279b35ad
FC
2210 assert (SvPOK(sv));
2211 assert (SvLEN(sv));
2212 assert (!SvIsCOW(sv));
307ed071 2213 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2214 goto finish;
279b35ad
FC
2215 s = SvPVX(sv);
2216 send = SvEND(sv);
dcb21ed6
NC
2217 /* This is relying on the SV being "well formed" with a trailing '\0' */
2218 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2219 s++;
2220 if (s == send)
b3ac6de7 2221 goto finish;
79072805 2222 d = s;
be4731d2 2223 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2224 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2225 SVs_TEMP | SvUTF8(sv));
be4731d2 2226 }
79072805
LW
2227 while (s < send) {
2228 if (*s == '\\') {
a0d0e21e 2229 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2230 s++; /* all that, just for this */
2231 }
2232 *d++ = *s++;
2233 }
2234 *d = '\0';
95a20fc0 2235 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2236 finish:
3280af22 2237 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2238 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2239 return sv;
2240}
2241
ffb4593c
NT
2242/*
2243 * Now come three functions related to double-quote context,
2244 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2245 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2246 * interact with PL_lex_state, and create fake ( ... ) argument lists
2247 * to handle functions and concatenation.
ecd24171
DM
2248 * For example,
2249 * "foo\lbar"
2250 * is tokenised as
2251 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2252 */
2253
2254/*
2255 * S_sublex_start
6154021b 2256 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2257 *
2258 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2259 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2260 *
2261 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2262 *
2263 * Everything else becomes a FUNC.
2264 *
2265 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2266 * had an OP_CONST or OP_READLINE). This just sets us up for a
2267 * call to S_sublex_push().
2268 */
2269
76e3520e 2270STATIC I32
cea2e8a9 2271S_sublex_start(pTHX)
79072805 2272{
eb578fdb 2273 const I32 op_type = pl_yylval.ival;
79072805
LW
2274
2275 if (op_type == OP_NULL) {
6154021b 2276 pl_yylval.opval = PL_lex_op;
5f66b61c 2277 PL_lex_op = NULL;
79072805
LW
2278 return THING;
2279 }
466112bb 2280 if (op_type == OP_CONST) {
3280af22 2281 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2282
2283 if (SvTYPE(sv) == SVt_PVIV) {
2284 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2285 STRLEN len;
96a5add6 2286 const char * const p = SvPV_const(sv, len);
740cce10 2287 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2288 SvREFCNT_dec(sv);
2289 sv = nsv;
4e553d73 2290 }
6154021b 2291 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2292 PL_lex_stuff = NULL;
79072805
LW
2293 return THING;
2294 }
2295
3280af22 2296 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2297 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2298 PL_sublex_info.sub_op = PL_lex_op;
2299 PL_lex_state = LEX_INTERPPUSH;
55497cff 2300
3280af22
NIS
2301 PL_expect = XTERM;
2302 if (PL_lex_op) {
6154021b 2303 pl_yylval.opval = PL_lex_op;
5f66b61c 2304 PL_lex_op = NULL;
55497cff
PP
2305 return PMFUNC;
2306 }
2307 else
2308 return FUNC;
2309}
2310
ffb4593c
NT
2311/*
2312 * S_sublex_push
2313 * Create a new scope to save the lexing state. The scope will be
2314 * ended in S_sublex_done. Returns a '(', starting the function arguments
2315 * to the uc, lc, etc. found before.
2316 * Sets PL_lex_state to LEX_INTERPCONCAT.
2317 */
2318
76e3520e 2319STATIC I32
cea2e8a9 2320S_sublex_push(pTHX)
55497cff 2321{
78a635de 2322 LEXSHARED *shared;
801d32ac 2323 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2324 ENTER;
55497cff 2325
3280af22 2326 PL_lex_state = PL_sublex_info.super_state;
cc624add 2327 SAVEI8(PL_lex_dojoin);
3280af22 2328 SAVEI32(PL_lex_brackets);
78cdf107 2329 SAVEI32(PL_lex_allbrackets);
b27dce25 2330 SAVEI32(PL_lex_formbrack);
78cdf107 2331 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2332 SAVEI32(PL_lex_casemods);
2333 SAVEI32(PL_lex_starts);
651b5b28 2334 SAVEI8(PL_lex_state);
7cc34111 2335 SAVESPTR(PL_lex_repl);
7766f137 2336 SAVEVPTR(PL_lex_inpat);
98246f1e 2337 SAVEI16(PL_lex_inwhat);
ffdb8b16 2338 if (is_heredoc)
b42366d4 2339 {
ffdb8b16 2340 SAVECOPLINE(PL_curcop);
b42366d4 2341 SAVEI32(PL_multi_end);
851b527a
FC
2342 SAVEI32(PL_parser->herelines);
2343 PL_parser->herelines = 0;
b42366d4
FC
2344 }
2345 SAVEI8(PL_multi_close);
3280af22 2346 SAVEPPTR(PL_bufptr);
8452ff4b 2347 SAVEPPTR(PL_bufend);
3280af22
NIS
2348 SAVEPPTR(PL_oldbufptr);
2349 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2350 SAVEPPTR(PL_last_lop);
2351 SAVEPPTR(PL_last_uni);
3280af22
NIS
2352 SAVEPPTR(PL_linestart);
2353 SAVESPTR(PL_linestr);
8edd5f42
RGS
2354 SAVEGENERICPV(PL_lex_brackstack);
2355 SAVEGENERICPV(PL_lex_casestack);
78a635de 2356 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2357 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2358 SAVEI32(PL_copline);
3280af22 2359
99bd9d90 2360 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2361 scopes to find the body of the here-doc. So we put PL_linestr and
2362 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2363 */
60f40a38
FC
2364 PL_parser->lex_shared->ls_linestr = PL_linestr;
2365 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2366
3280af22 2367 PL_linestr = PL_lex_stuff;
7cc34111 2368 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2369 PL_lex_stuff = NULL;
7cc34111 2370 PL_sublex_info.repl = NULL;
3280af22 2371
9cbb5ea2
GS
2372 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2373 = SvPVX(PL_linestr);
3280af22 2374 PL_bufend += SvCUR(PL_linestr);
bd61b366 2375 PL_last_lop = PL_last_uni = NULL;
3280af22 2376 SAVEFREESV(PL_linestr);
4dc843bc 2377 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2378
2379 PL_lex_dojoin = FALSE;
b27dce25 2380 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2381 PL_lex_allbrackets = 0;
2382 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2383 Newx(PL_lex_brackstack, 120, char);
2384 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2385 PL_lex_casemods = 0;
2386 *PL_lex_casestack = '\0';
2387 PL_lex_starts = 0;
2388 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2389 if (is_heredoc)
6ddcf93b 2390 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2391 PL_copline = NOLINE;
78a635de
FC
2392
2393 Newxz(shared, 1, LEXSHARED);
2394 shared->ls_prev = PL_parser->lex_shared;
2395 PL_parser->lex_shared = shared;
3280af22
NIS
2396
2397 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2398 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2399 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2400 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2401 else
5f66b61c 2402 PL_lex_inpat = NULL;
79072805 2403
3a54fd60
DM
2404 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2405 PL_in_eval &= ~EVAL_RE_REPARSING;
2406
55497cff 2407 return '(';
79072805
LW
2408}
2409
ffb4593c
NT
2410/*
2411 * S_sublex_done
2412 * Restores lexer state after a S_sublex_push.
2413 */
2414
76e3520e 2415STATIC I32
cea2e8a9 2416S_sublex_done(pTHX)
79072805 2417{
3280af22 2418 if (!PL_lex_starts++) {
396482e1 2419 SV * const sv = newSVpvs("");
9aa983d2
JH
2420 if (SvUTF8(PL_linestr))
2421 SvUTF8_on(sv);
3280af22 2422 PL_expect = XOPERATOR;
6154021b 2423 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2424 return THING;
2425 }
2426
3280af22
NIS
2427 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2428 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2429 return yylex();
79072805
LW
2430 }
2431
ffb4593c 2432 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2433 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2434 if (PL_lex_repl) {
2435 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2436 PL_linestr = PL_lex_repl;
2437 PL_lex_inpat = 0;
2438 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2439 PL_bufend += SvCUR(PL_linestr);
bd61b366 2440 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2441 PL_lex_dojoin = FALSE;
2442 PL_lex_brackets = 0;
78cdf107
Z
2443 PL_lex_allbrackets = 0;
2444 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2445 PL_lex_casemods = 0;
2446 *PL_lex_casestack = '\0';
2447 PL_lex_starts = 0;
25da4f38 2448 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2449 PL_lex_state = LEX_INTERPNORMAL;
2450 PL_lex_starts++;
e9fa98b2
HS
2451 /* we don't clear PL_lex_repl here, so that we can check later
2452 whether this is an evalled subst; that means we rely on the
2453 logic to ensure sublex_done() is called again only via the
2454 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2455 }
e9fa98b2 2456 else {
3280af22 2457 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2458 PL_lex_repl = NULL;
e9fa98b2 2459 }
ffdb8b16
FC
2460 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2461 CopLINE(PL_curcop) +=
2462 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2463 + PL_parser->herelines;
2464 PL_parser->herelines = 0;
ffdb8b16 2465 }
79072805 2466 return ',';
ffed7fef
LW
2467 }
2468 else {
b42366d4 2469 const line_t l = CopLINE(PL_curcop);
f46d017c 2470 LEAVE;
b42366d4 2471 if (PL_multi_close == '<')
851b527a 2472 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2473 PL_bufend = SvPVX(PL_linestr);
2474 PL_bufend += SvCUR(PL_linestr);
2475 PL_expect = XOPERATOR;
09bef843 2476 PL_sublex_info.sub_inwhat = 0;
79072805 2477 return ')';
ffed7fef
LW
2478 }
2479}
2480
6f613c73
KW
2481PERL_STATIC_INLINE SV*
2482S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2483{
140b12ad
KW
2484 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2485 * interior, hence to the "}". Finds what the name resolves to, returning
2486 * an SV* containing it; NULL if no valid one found */
2487
dd2b1b72 2488 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2489
0c415a79
KW
2490 HV * table;
2491 SV **cvp;
2492 SV *cv;
2493 SV *rv;
2494 HV *stash;
2495 const U8* first_bad_char_loc;
2496 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2497
6f613c73
KW
2498 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2499
107160e2
KW
2500 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2501 e - backslash_ptr,
2502 &first_bad_char_loc))
2503 {
2504 /* If warnings are on, this will print a more detailed analysis of what
2505 * is wrong than the error message below */
c80e42f3 2506 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2507 e - ((char *) first_bad_char_loc),
2508 NULL, 0);
2509
2510 /* We deliberately don't try to print the malformed character, which
2511 * might not print very well; it also may be just the first of many
2512 * malformations, so don't print what comes after it */
2513 yyerror(Perl_form(aTHX_
2514 "Malformed UTF-8 character immediately after '%.*s'",
2515 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2516 return NULL;
2517 }
2518
2519 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2520 /* include the <}> */
2521 e - backslash_ptr + 1);
6f613c73 2522 if (! SvPOK(res)) {
b6407c49 2523 SvREFCNT_dec_NN(res);
6f613c73
KW
2524 return NULL;
2525 }
2526
0c415a79
KW
2527 /* See if the charnames handler is the Perl core's, and if so, we can skip
2528 * the validation needed for a user-supplied one, as Perl's does its own
2529 * validation. */
2530 table = GvHV(PL_hintgv); /* ^H */
2531 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2532 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2533 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2534 {
2535 const char * const name = HvNAME(stash);
6a642c21
FC
2536 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2537 && strEQ(name, "_charnames")) {
0c415a79
KW
2538 return res;
2539 }
2540 }
2541
bde9e88d
KW
2542 /* Here, it isn't Perl's charname handler. We can't rely on a
2543 * user-supplied handler to validate the input name. For non-ut8 input,
2544 * look to see that the first character is legal. Then loop through the
2545 * rest checking that each is a continuation */
6f613c73 2546
36897d64
KW
2547 /* This code makes the reasonable assumption that the only Latin1-range
2548 * characters that begin a character name alias are alphabetic, otherwise
2549 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2550
b6ba1137 2551 if (! UTF) {
bde9e88d 2552 if (! isALPHAU(*s)) {
b6ba1137
KW
2553 goto bad_charname;
2554 }
bde9e88d
KW
2555 s++;
2556 while (s < e) {
2557 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2558 goto bad_charname;
2559 }
2d8eb851
KW
2560 if (*s == ' ' && *(s-1) == ' ') {
2561 goto multi_spaces;
bd299e29 2562 }
df758df2
KW
2563 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2564 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2565 "NO-BREAK SPACE in a charnames "
2566 "alias definition is deprecated");
2567 }
bde9e88d 2568 s++;
b6ba1137
KW
2569 }
2570 }
2571 else {
bde9e88d
KW
2572 /* Similarly for utf8. For invariants can check directly; for other
2573 * Latin1, can calculate their code point and check; otherwise use a
2574 * swash */
2575 if (UTF8_IS_INVARIANT(*s)) {
2576 if (! isALPHAU(*s)) {
140b12ad
KW
2577 goto bad_charname;
2578 }
bde9e88d
KW
2579 s++;
2580 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2581 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2582 goto bad_charname;
6f613c73 2583 }
bde9e88d 2584 s += 2;
6f613c73 2585 }
bde9e88d
KW
2586 else {
2587 if (! PL_utf8_charname_begin) {
2588 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2589 PL_utf8_charname_begin = _core_swash_init("utf8",
2590 "_Perl_Charname_Begin",
2591 &PL_sv_undef,
2592 1, 0, NULL, &flags);
2593 }
2594 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2595 goto bad_charname;
2596 }
2597 s += UTF8SKIP(s);
2598 }
2599
2600 while (s < e) {
2601 if (UTF8_IS_INVARIANT(*s)) {
2602 if (! isCHARNAME_CONT(*s)) {
2603 goto bad_charname;
2604 }
2d8eb851
KW
2605 if (*s == ' ' && *(s-1) == ' ') {
2606 goto multi_spaces;
bd299e29 2607 }
bde9e88d
KW
2608 s++;
2609 }
2610 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2611 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2612 {
2613 goto bad_charname;
2614 }
df758df2
KW
2615 if (*s == *NBSP_UTF8
2616 && *(s+1) == *(NBSP_UTF8+1)
2617 && ckWARN_d(WARN_DEPRECATED))
2618 {
2619 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2620 "NO-BREAK SPACE in a charnames "
2621 "alias definition is deprecated");
2622 }
bde9e88d
KW
2623 s += 2;
2624 }
2625 else {
2626 if (! PL_utf8_charname_continue) {
2627 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2628 PL_utf8_charname_continue = _core_swash_init("utf8",
2629 "_Perl_Charname_Continue",
2630 &PL_sv_undef,
2631 1, 0, NULL, &flags);
2632 }
2633 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2634 goto bad_charname;
2635 }
2636 s += UTF8SKIP(s);
6f613c73
KW
2637 }
2638 }
2d8eb851
KW
2639 }
2640 if (*(s-1) == ' ') {
2641 yyerror_pv(
2642 Perl_form(aTHX_
2643 "charnames alias definitions may not contain trailing "
2644 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2645 (int)(s - backslash_ptr + 1), backslash_ptr,
2646 (int)(e - s + 1), s + 1
2647 ),
2648 UTF ? SVf_UTF8 : 0);
2649 return NULL;
6f613c73
KW
2650 }
2651
94ca1619 2652 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2653 const U8* first_bad_char_loc;
2654 STRLEN len;
2655 const char* const str = SvPV_const(res, len);
2656 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2657 /* If warnings are on, this will print a more detailed analysis of
2658 * what is wrong than the error message below */
c80e42f3 2659 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2660 (char *) first_bad_char_loc - str,
2661 NULL, 0);
2662
2663 /* We deliberately don't try to print the malformed character,
2664 * which might not print very well; it also may be just the first
2665 * of many malformations, so don't print what comes after it */
2666 yyerror_pv(
2667 Perl_form(aTHX_
2668 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2669 (int) (e - backslash_ptr + 1), backslash_ptr,
2670 (int) ((char *) first_bad_char_loc - str), str
2671 ),
2672 SVf_UTF8);
2673 return NULL;
2674 }
2675 }
140b12ad 2676
bde9e88d 2677 return res;
140b12ad 2678
bde9e88d 2679 bad_charname: {
bde9e88d
KW
2680
2681 /* The final %.*s makes sure that should the trailing NUL be missing
2682 * that this print won't run off the end of the string */
2683 yyerror_pv(
2684 Perl_form(aTHX_
2685 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2686 (int)(s - backslash_ptr + 1), backslash_ptr,
2687 (int)(e - s + 1), s + 1
bde9e88d
KW
2688 ),
2689 UTF ? SVf_UTF8 : 0);
2690 return NULL;
2691 }
2d8eb851
KW
2692
2693 multi_spaces:
2694 yyerror_pv(
2695 Perl_form(aTHX_
2696 "charnames alias definitions may not contain a sequence of "
2697 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2698 (int)(s - backslash_ptr + 1), backslash_ptr,
2699 (int)(e - s + 1), s + 1
2700 ),
2701 UTF ? SVf_UTF8 : 0);
2702 return NULL;
6f613c73
KW
2703}
2704
02aa26ce
NT
2705/*
2706 scan_const
2707
9da1dd8f
DM
2708 Extracts the next constant part of a pattern, double-quoted string,
2709 or transliteration. This is terrifying code.
2710
2711 For example, in parsing the double-quoted string "ab\x63$d", it would
2712 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2713
94def140 2714 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2715 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2716 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2717
94def140
ST
2718 Returns a pointer to the character scanned up to. If this is
2719 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2720 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2721 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2722 by looking at the next characters herself.
2723
02aa26ce 2724 In patterns:
9da1dd8f 2725 expand:
537124e4
KW
2726 \N{FOO} => \N{U+hex_for_character_FOO}
2727 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2728
2729 pass through:
2730 all other \-char, including \N and \N{ apart from \N{ABC}
2731
2732 stops on:
2733 @ and $ where it appears to be a var, but not for $ as tail anchor
2734 \l \L \u \U \Q \E
2735 (?{ or (??{
2736
02aa26ce
NT
2737
2738 In transliterations:
2739 characters are VERY literal, except for - not at the start or end
94def140
ST
2740 of the string, which indicates a range. If the range is in bytes,
2741 scan_const expands the range to the full set of intermediate
2742 characters. If the range is in utf8, the hyphen is replaced with
2743 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2744
2745 In double-quoted strings:
2746 backslashes:
2747 double-quoted style: \r and \n
ff3f963a 2748 constants: \x31, etc.
94def140 2749 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2750 case and quoting: \U \Q \E
2751 stops on @ and $
2752
2753 scan_const does *not* construct ops to handle interpolated strings.
2754 It stops processing as soon as it finds an embedded $ or @ variable
2755 and leaves it to the caller to work out what's going on.
2756
94def140
ST
2757 embedded arrays (whether in pattern or not) could be:
2758 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2759
2760 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2761
2762 $ in pattern could be $foo or could be tail anchor. Assumption:
2763 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2764 followed by one of "()| \r\n\t"
02aa26ce 2765
9da1dd8f 2766 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2767
2768 The structure of the code is
2769 while (there's a character to process) {
94def140
ST
2770 handle transliteration ranges
2771 skip regexp comments /(?#comment)/ and codes /(?{code})/
2772 skip #-initiated comments in //x patterns
2773 check for embedded arrays
02aa26ce
NT
2774 check for embedded scalars
2775 if (backslash) {
94def140 2776 deprecate \1 in substitution replacements
02aa26ce
NT
2777 handle string-changing backslashes \l \U \Q \E, etc.
2778 switch (what was escaped) {
94def140 2779 handle \- in a transliteration (becomes a literal -)
ff3f963a 2780 if a pattern and not \N{, go treat as regular character
94def140
ST
2781 handle \132 (octal characters)
2782 handle \x15 and \x{1234} (hex characters)
ff3f963a 2783 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2784 handle \cV (control characters)
2785 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2786 } (end switch)
77a135fe 2787 continue
02aa26ce 2788 } (end if backslash)
77a135fe 2789 handle regular character
02aa26ce 2790 } (end while character to read)
4e553d73 2791
02aa26ce
NT
2792*/
2793
76e3520e 2794STATIC char *
cea2e8a9 2795S_scan_const(pTHX_ char *start)
79072805 2796{
eb578fdb 2797 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2798 SV *sv = newSV(send - start); /* sv for the constant. See note below
2799 on sizing. */
eb578fdb
KW
2800 char *s = start; /* start of the constant */
2801 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2802 bool dorange = FALSE; /* are we in a translit range? */
2803 bool didrange = FALSE; /* did we just finish a range? */
2804 bool in_charclass = FALSE; /* within /[...]/ */
2805 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2806 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2807 UTF8? But, this can show as true
2808 when the source isn't utf8, as for
2809 example when it is entirely composed
2810 of hex constants */
6f613c73 2811 SV *res; /* result from charnames */
77a135fe
KW
2812
2813 /* Note on sizing: The scanned constant is placed into sv, which is
2814 * initialized by newSV() assuming one byte of output for every byte of
2815 * input. This routine expects newSV() to allocate an extra byte for a
2816 * trailing NUL, which this routine will append if it gets to the end of
2817 * the input. There may be more bytes of input than output (eg., \N{LATIN
2818 * CAPITAL LETTER A}), or more output than input if the constant ends up
2819 * recoded to utf8, but each time a construct is found that might increase
2820 * the needed size, SvGROW() is called. Its size parameter each time is
2821 * based on the best guess estimate at the time, namely the length used so
2822 * far, plus the length the current construct will occupy, plus room for
2823 * the trailing NUL, plus one byte for every input byte still unscanned */
2824
c3320c2a
KW
2825 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2826 before set */
4c3a8340
ST
2827#ifdef EBCDIC
2828 UV literal_endpoint = 0;
e294cc5d 2829 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2830#endif
012bcf8d 2831
7918f24d
NC
2832 PERL_ARGS_ASSERT_SCAN_CONST;
2833
bb16bae8 2834 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2835 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2836 /* If we are doing a trans and we know we want UTF8 set expectation */
2837 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2838 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2839 }
2840
b899e89d
FC
2841 /* Protect sv from errors and fatal warnings. */
2842 ENTER_with_name("scan_const");
2843 SAVEFREESV(sv);
2b9d42f0 2844
79072805 2845 while (s < send || dorange) {
ff3f963a 2846
02aa26ce 2847 /* get transliterations out of the way (they're most literal) */
3280af22 2848 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2849 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2850 if (dorange) {
1ba5c669
JH
2851 I32 i; /* current expanded character */
2852 I32 min; /* first character in range */
2853 I32 max; /* last character in range */
02aa26ce 2854
e294cc5d
JH
2855#ifdef EBCDIC
2856 UV uvmax = 0;
2857#endif
2858
2859 if (has_utf8
2860#ifdef EBCDIC
2861 && !native_range
2862#endif
1953db30 2863 ) {
9d4ba2ae 2864 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2865 char *e = d++;
2866 while (e-- > c)
2867 *(e + 1) = *e;
e7214ce8 2868 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2869 /* mark the range as done, and continue */
2870 dorange = FALSE;
2871 didrange = TRUE;
2872 continue;
2873 }
2b9d42f0 2874
95a20fc0 2875 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2876#ifdef EBCDIC
2877 SvGROW(sv,
dc023dbb
KW
2878 SvLEN(sv) + ((has_utf8)
2879 ? (512 - UTF_CONTINUATION_MARK
2880 + UNISKIP(0x100))
e294cc5d
JH
2881 : 256));
2882 /* How many two-byte within 0..255: 128 in UTF-8,
2883 * 96 in UTF-8-mod. */
2884#else
9cbb5ea2 2885 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2886#endif
9cbb5ea2 2887 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2888#ifdef EBCDIC
2889 if (has_utf8) {
2890 int j;
2891 for (j = 0; j <= 1; j++) {
2892 char * const c = (char*)utf8_hop((U8*)d, -1);
2893 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2894 if (j)
2895 min = (U8)uv;
2896 else if (uv < 256)
2897 max = (U8)uv;
2898 else {
2899 max = (U8)0xff; /* only to \xff */
2900 uvmax = uv; /* \x{100} to uvmax */
2901 }
2902 d = c; /* eat endpoint chars */
2903 }
2904 }
2905 else {
2906#endif
2907 d -= 2; /* eat the first char and the - */
2908 min = (U8)*d; /* first char in range */
2909 max = (U8)d[1]; /* last char in range */
2910#ifdef EBCDIC
2911 }
2912#endif
8ada0baa 2913
c2e66d9e 2914 if (min > max) {
01ec43d0 2915 Perl_croak(aTHX_
d1573ac7 2916 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2917 (char)min, (char)max);
c2e66d9e
GS
2918 }
2919
c7f1f016 2920#ifdef EBCDIC
dc023dbb
KW
2921 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2922 * any subsets of these ranges into individual characters */
4c3a8340 2923 if (literal_endpoint == 2 &&
c286a9a6 2924 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2925 (isUPPER_A(min) && isUPPER_A(max))))
2926 {
2927 for (i = min; i <= max; i++) {
2928 if (isALPHA_A(i))
2929 *d++ = i;
8ada0baa
JH
2930 }
2931 }
2932 else
2933#endif
2934 for (i = min; i <= max; i++)
e294cc5d
JH
2935#ifdef EBCDIC
2936 if (has_utf8) {
55d09dc8 2937 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2938 }
2939 else
2940#endif
2941 *d++ = (char)i;
2942
2943#ifdef EBCDIC
2944 if (uvmax) {
2945 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2946 if (uvmax > 0x101)
e7214ce8 2947 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2948 if (uvmax > 0x100)
2949 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2950 }
2951#endif
02aa26ce
NT
2952
2953 /* mark the range as done, and continue */
79072805 2954 dorange = FALSE;
01ec43d0 2955 didrange = TRUE;
4c3a8340
ST
2956#ifdef EBCDIC
2957 literal_endpoint = 0;
2958#endif
79072805 2959 continue;
4e553d73 2960 }
02aa26ce
NT
2961
2962 /* range begins (ignore - as first or last char) */
79072805 2963 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2964 if (didrange) {
1fafa243 2965 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2966 }
e294cc5d
JH
2967 if (has_utf8
2968#ifdef EBCDIC
2969 && !native_range
2970#endif
2971 ) {
e7214ce8 2972 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2973 s++;
2974 continue;
2975 }
79072805
LW
2976 dorange = TRUE;
2977 s++;
01ec43d0
GS
2978 }
2979 else {
2980 didrange = FALSE;
4c3a8340
ST
2981#ifdef EBCDIC
2982 literal_endpoint = 0;
e294cc5d 2983 native_range = TRUE;
4c3a8340 2984#endif
01ec43d0 2985 }
79072805 2986 }
02aa26ce
NT
2987
2988 /* if we get here, we're not doing a transliteration */
2989
e4a2df84
DM
2990 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2991 char *s1 = s-1;
2992 int esc = 0;
2993 while (s1 >= start && *s1-- == '\\')
2994 esc = !esc;
2995 if (!esc)
2996 in_charclass = TRUE;
2997 }
2866decb 2998
e4a2df84
DM
2999 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3000 char *s1 = s-1;
3001 int esc = 0;
3002 while (s1 >= start && *s1-- == '\\')
3003 esc = !esc;
3004 if (!esc)
3005 in_charclass = FALSE;
3006 }
2866decb 3007
9da1dd8f
DM
3008 /* skip for regexp comments /(?#comment)/, except for the last
3009 * char, which will be done separately.
3010 * Stop on (?{..}) and friends */
3011
c30fc27b 3012 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3013 if (s[2] == '#') {
e994fd66 3014 while (s+1 < send && *s != ')')
5ff03569 3015 *d++ = *s++;
155aba94 3016 }
c30fc27b 3017 else if (!PL_lex_casemods &&
d3cec5e5
DM
3018 ( s[2] == '{' /* This should match regcomp.c */
3019 || (s[2] == '?' && s[3] == '{')))
155aba94 3020 {
9da1dd8f 3021 break;
cc6b7395 3022 }
748a9306 3023 }
02aa26ce
NT
3024
3025 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3026 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3027 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3028 while (s+1 < send && *s != '\n')
5ff03569 3029 *d++ = *s++;
748a9306 3030 }
02aa26ce 3031
9da1dd8f
DM
3032 /* no further processing of single-quoted regex */
3033 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3034 goto default_action;
3035
5d1d4326 3036 /* check for embedded arrays
da6eedaa 3037 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3038 */
1749ea0d 3039 else if (*s == '@' && s[1]) {
8a2bca12 3040 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3041 break;
3042 if (strchr(":'{$", s[1]))
3043 break;
3044 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3045 break; /* in regexp, neither @+ nor @- are interpolated */
3046 }
02aa26ce
NT
3047
3048 /* check for embedded scalars. only stop if we're sure it's a
3049 variable.
3050 */
79072805 3051 else if (*s == '$') {
3280af22 3052 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3053 break;
77772344 3054 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3055 if (s[1] == '\\') {
3056 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3057 "Possible unintended interpolation of $\\ in regex");
77772344 3058 }
79072805 3059 break; /* in regexp, $ might be tail anchor */
77772344 3060 }
79072805 3061 }
02aa26ce 3062
2b9d42f0
NIS
3063 /* End of else if chain - OP_TRANS rejoin rest */
3064
02aa26ce 3065 /* backslashes */
79072805 3066 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3067 char* e; /* Can be used for ending '}', etc. */
3068
79072805 3069 s++;
02aa26ce 3070
7d0fc23c
KW
3071 /* warn on \1 - \9 in substitution replacements, but note that \11
3072 * is an octal; and \19 is \1 followed by '9' */
3280af22 3073 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3074 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3075 {
c782d7ee 3076 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3077 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3078 *--s = '$';
3079 break;
3080 }
02aa26ce
NT
3081
3082 /* string-change backslash escapes */
838f2281 3083 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3084 --s;
3085 break;
3086 }
ff3f963a
KW
3087 /* In a pattern, process \N, but skip any other backslash escapes.
3088 * This is because we don't want to translate an escape sequence
3089 * into a meta symbol and have the regex compiler use the meta
3090 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3091 * in spite of this, we do have to process \N here while the proper
3092 * charnames handler is in scope. See bugs #56444 and #62056.
3093 * There is a complication because \N in a pattern may also stand
3094 * for 'match a non-nl', and not mean a charname, in which case its
3095 * processing should be deferred to the regex compiler. To be a
3096 * charname it must be followed immediately by a '{', and not look
3097 * like \N followed by a curly quantifier, i.e., not something like
3098 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3099 * quantifier */
3100 else if (PL_lex_inpat
3101 && (*s != 'N'
3102 || s[1] != '{'
412f55bb 3103 || regcurly(s + 1)))
ff3f963a 3104 {
4d73d076 3105 *d++ = '\\';
cc74c5bd
ST
3106 goto default_action;
3107 }
02aa26ce 3108
79072805 3109 switch (*s) {
02aa26ce
NT
3110
3111 /* quoted - in transliterations */
79072805 3112 case '-':
3280af22 3113 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3114 *d++ = *s++;
3115 continue;
3116 }
924ba076 3117 /* FALLTHROUGH */
79072805 3118 default:
11b8faa4 3119 {
15861f94 3120 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3121 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3122 "Unrecognized escape \\%c passed through",
3123 *s);
11b8faa4 3124 /* default action is to copy the quoted character */
f9a63242 3125 goto default_action;
11b8faa4 3126 }
02aa26ce 3127
632403cc 3128 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3129 case '0': case '1': case '2': case '3':
3130 case '4': case '5': case '6': case '7':
ba210ebe 3131 {
5e0a247b 3132 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3133 STRLEN len = 3;
06972766 3134 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3135 s += len;
5e0a247b
KW
3136 if (len < 3 && s < send && isDIGIT(*s)
3137 && ckWARN(WARN_MISC))
3138 {
3139 Perl_warner(aTHX_ packWARN(WARN_MISC),
3140 "%s", form_short_octal_warning(s, len));
3141 }
ba210ebe 3142 }
012bcf8d 3143 goto NUM_ESCAPE_INSERT;
02aa26ce 3144
f0a2b745
KW
3145 /* eg. \o{24} indicates the octal constant \024 */
3146 case 'o':
3147 {
454155d9 3148 const char* error;
f0a2b745 3149
00ce5563 3150 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3151 TRUE, /* Output warning */
3152 FALSE, /* Not strict */
17896a85
KW
3153 TRUE, /* Output warnings for
3154 non-portables */
80f4111b 3155 UTF);
454155d9 3156 if (! valid) {
f0a2b745
KW
3157 yyerror(error);
3158 continue;
3159 }
3160 goto NUM_ESCAPE_INSERT;
3161 }
3162
77a135fe 3163 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3164 case 'x':
a0481293 3165 {
a0481293 3166 const char* error;
355860ce 3167
00ce5563 3168 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3169 TRUE, /* Output warning */
3170 FALSE, /* Not strict */
17896a85
KW
3171 TRUE, /* Output warnings for
3172 non-portables */
80f4111b 3173 UTF);
a0481293
KW
3174 if (! valid) {
3175 yyerror(error);
355860ce 3176 continue;
ba210ebe 3177 }
012bcf8d
GS
3178 }
3179
3180 NUM_ESCAPE_INSERT:
ff3f963a
KW
3181 /* Insert oct or hex escaped character. There will always be
3182 * enough room in sv since such escapes will be longer than any
3183 * UTF-8 sequence they can end up as, except if they force us
3184 * to recode the rest of the string into utf8 */
ba7cea30 3185
06972766 3186 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3187 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3188 if (!has_utf8 && uv > 255) {
77a135fe
KW
3189 /* Might need to recode whatever we have accumulated so
3190 * far if it contains any chars variant in utf8 or
3191 * utf-ebcdic. */
3192
3193 SvCUR_set(sv, d - SvPVX_const(sv));
3194 SvPOK_on(sv);
3195 *d = '\0';
77a135fe 3196 /* See Note on sizing above. */
7bf79863
KW
3197 sv_utf8_upgrade_flags_grow(sv,
3198 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3199 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3200 d = SvPVX(sv) + SvCUR(sv);
3201 has_utf8 = TRUE;
012bcf8d
GS
3202 }
3203
77a135fe 3204 if (has_utf8) {
c80e42f3 3205 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3206 if (PL_lex_inwhat == OP_TRANS &&
3207 PL_sublex_info.sub_op) {
3208 PL_sublex_info.sub_op->op_private |=
3209 (PL_lex_repl ? OPpTRANS_FROM_UTF
3210 : OPpTRANS_TO_UTF);
f9a63242 3211 }
e294cc5d
JH
3212#ifdef EBCDIC
3213 if (uv > 255 && !dorange)
3214 native_range = FALSE;
3215#endif
012bcf8d 3216 }
a0ed51b3 3217 else {
012bcf8d 3218 *d++ = (char)uv;
a0ed51b3 3219 }
012bcf8d
GS
3220 }
3221 else {
c4d5f83a 3222 *d++ = (char) uv;
a0ed51b3 3223 }
79072805 3224 continue;
02aa26ce 3225
4a2d328f 3226 case 'N':
ff3f963a
KW
3227 /* In a non-pattern \N must be a named character, like \N{LATIN
3228 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3229 * mean to match a non-newline. For non-patterns, named
3230 * characters are converted to their string equivalents. In
3231 * patterns, named characters are not converted to their
3232 * ultimate forms for the same reasons that other escapes
3233 * aren't. Instead, they are converted to the \N{U+...} form
3234 * to get the value from the charnames that is in effect right
3235 * now, while preserving the fact that it was a named character
3236 * so that the regex compiler knows this */
3237
ff3f963a
KW
3238 /* The structure of this section of code (besides checking for
3239 * errors and upgrading to utf8) is:
3240 * Further disambiguate between the two meanings of \N, and if
3241 * not a charname, go process it elsewhere
0a96133f
KW
3242 * If of form \N{U+...}, pass it through if a pattern;
3243 * otherwise convert to utf8
3244 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3245 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3246
3247 /* Here, s points to the 'N'; the test below is guaranteed to
3248 * succeed if we are being called on a pattern as we already
3249 * know from a test above that the next character is a '{'.
3250 * On a non-pattern \N must mean 'named sequence, which
3251 * requires braces */
3252 s++;
3253 if (*s != '{') {
3254 yyerror("Missing braces on \\N{}");
3255 continue;
3256 }
3257 s++;
3258
0a96133f 3259 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3260 if (! (e = strchr(s, '}'))) {
3261 if (! PL_lex_inpat) {
5777a3f7 3262 yyerror("Missing right brace on \\N{}");
0a96133f 3263 } else {
4407f1b8 3264 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3265 }
0a96133f 3266 continue;
ff3f963a 3267 }
cddc7ef4 3268
ff3f963a 3269 /* Here it looks like a named character */
cddc7ef4 3270
ff3f963a
KW
3271 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3272 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3273 | PERL_SCAN_DISALLOW_PREFIX;
3274 STRLEN len;
3275
3276 /* For \N{U+...}, the '...' is a unicode value even on
3277 * EBCDIC machines */
3278 s += 2; /* Skip to next char after the 'U+' */
3279 len = e - s;
3280 uv = grok_hex(s, &len, &flags, NULL);
3281 if (len == 0 || len != (STRLEN)(e - s)) {
3282 yyerror("Invalid hexadecimal number in \\N{U+...}");
3283 s = e + 1;
3284 continue;
3285 }
3286
3287 if (PL_lex_inpat) {
3288
e2a7e165
KW
3289 /* On non-EBCDIC platforms, pass through to the regex
3290 * compiler unchanged. The reason we evaluated the
3291 * number above is to make sure there wasn't a syntax
3292 * error. But on EBCDIC we convert to native so
3293 * downstream code can continue to assume it's native
3294 */
ff3f963a 3295 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3296#ifdef EBCDIC
3297 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3298 and the \0 */
3299 "\\N{U+%X}",
3300 (unsigned int) UNI_TO_NATIVE(uv));
3301#else
ff3f963a
KW
3302 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3303 d += e - s + 1;
e2a7e165 3304#endif
ff3f963a
KW
3305 }
3306 else { /* Not a pattern: convert the hex to string */
3307
3308 /* If destination is not in utf8, unconditionally
3309 * recode it to be so. This is because \N{} implies
3310 * Unicode semantics, and scalars have to be in utf8
3311 * to guarantee those semantics */
3312 if (! has_utf8) {
3313 SvCUR_set(sv, d - SvPVX_const(sv));
3314 SvPOK_on(sv);
3315 *d = '\0';
3316 /* See Note on sizing above. */
3317 sv_utf8_upgrade_flags_grow(
3318 sv,
3319 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3320 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3321 d = SvPVX(sv) + SvCUR(sv);
3322 has_utf8 = TRUE;
3323 }
3324
a46469e6 3325 /* Add the (Unicode) code point to the output. */
ff3f963a 3326 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3327 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3328 }
a46469e6
KW
3329 else {
3330 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3331 }
ff3f963a
KW
3332 }
3333 }
6f613c73
KW
3334 else /* Here is \N{NAME} but not \N{U+...}. */
3335 if ((res = get_and_check_backslash_N_name(s, e)))
3336 {
3337 STRLEN len;
3338 const char *str = SvPV_const(res, len);
3339 if (PL_lex_inpat) {
ff3f963a
KW
3340
3341 if (! len) { /* The name resolved to an empty string */
3342 Copy("\\N{}", d, 4, char);
3343 d += 4;
3344 }
3345 else {
3346 /* In order to not lose information for the regex
3347 * compiler, pass the result in the specially made
3348 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3349 * the code points in hex of each character
3350 * returned by charnames */
3351
3352 const char *str_end = str + len;
3b721c4f 3353 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3354
3355 if (! SvUTF8(res)) {
3356 /* For the non-UTF-8 case, we can determine the
3357 * exact length needed without having to parse
3358 * through the string. Each character takes up
3359 * 2 hex digits plus either a trailing dot or
3360 * the "}" */
3361 d = off + SvGROW(sv, off
3362 + 3 * len
3363 + 6 /* For the "\N{U+", and
3364 trailing NUL */
3365 + (STRLEN)(send - e));
3366 Copy("\\N{U+", d, 5, char);
3367 d += 5;
3368 while (str < str_end) {
3369 char hex_string[4];
e8549682 3370 int len =
51f14a05
JH
3371 my_snprintf(hex_string,
3372 sizeof(hex_string),
e8549682
JH
3373 "%02X.", (U8) *str);
3374 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3375 Copy(hex_string, d, 3, char);
3376 d += 3;
3377 str++;
3378 }
3379 d--; /* We will overwrite below the final
3380 dot with a right brace */
3381 }
3382 else {
1953db30
KW
3383 STRLEN char_length; /* cur char's byte length */
3384
3385 /* and the number of bytes after this is
3386 * translated into hex digits */
3387 STRLEN output_length;
3388
3389 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3390 * for max('U+', '.'); and 1 for NUL */
3391 char hex_string[2 * UTF8_MAXBYTES + 5];
3392
3393 /* Get the first character of the result. */
a46469e6 3394 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3395 len,
3396 &char_length,
3397 UTF8_ALLOW_ANYUV);
3398 /* Convert first code point to hex, including
a46469e6 3399 * the boiler plate before it. */
1953db30
KW
3400 output_length =
3401 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3402 "\\N{U+%X",
3403 (unsigned int) uv);
1953db30
KW
3404
3405 /* Make sure there is enough space to hold it */
3406 d = off + SvGROW(sv, off
3407 + output_length
3408 + (STRLEN)(send - e)
3409 + 2); /* '}' + NUL */
3410 /* And output it */
3411 Copy(hex_string, d, output_length, char);
3412 d += output_length;
3413
3414 /* For each subsequent character, append dot and
3415 * its ordinal in hex */
3416 while ((str += char_length) < str_end) {
3417 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3418 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3419 str_end - str,
3420 &char_length,
3421 UTF8_ALLOW_ANYUV);
3422 output_length =
3423 my_snprintf(hex_string,
a46469e6
KW
3424 sizeof(hex_string),
3425 ".%X",
3426 (unsigned int) uv);
1953db30
KW
3427
3428 d = off + SvGROW(sv, off
3429 + output_length
3430 + (STRLEN)(send - e)
3431 + 2); /* '}' + NUL */
3432 Copy(hex_string, d, output_length, char);
3433 d += output_length;
3434 }
94ca1619 3435 }
ff3f963a
KW
3436
3437 *d++ = '}'; /* Done. Add the trailing brace */
3438 }
3439 }
3440 else { /* Here, not in a pattern. Convert the name to a
3441 * string. */
3442
3443 /* If destination is not in utf8, unconditionally
3444 * recode it to be so. This is because \N{} implies
3445 * Unicode semantics, and scalars have to be in utf8
3446 * to guarantee those semantics */
3447 if (! has_utf8) {
3448 SvCUR_set(sv, d - SvPVX_const(sv));
3449 SvPOK_on(sv);
3450 *d = '\0';
3451 /* See Note on sizing above. */
3452 sv_utf8_upgrade_flags_grow(sv,
3453 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3454 len + (STRLEN)(send - s) + 1);
3455 d = SvPVX(sv) + SvCUR(sv);
3456 has_utf8 = TRUE;
3457 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3458
3459 /* See Note on sizing above. (NOTE: SvCUR() is not
3460 * set correctly here). */
3461 const STRLEN off = d - SvPVX_const(sv);
3462 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3463 }
7fc82458
KW
3464 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3465 sv_utf8_upgrade(res);
3466 str = SvPV_const(res, len);
3467 }
ff3f963a
KW
3468 Copy(str, d, len, char);
3469 d += len;
423cee85 3470 }
6f613c73 3471
423cee85 3472 SvREFCNT_dec(res);
cb233ae3 3473
cb233ae3 3474 } /* End \N{NAME} */
ff3f963a
KW
3475#ifdef EBCDIC
3476 if (!dorange)
3477 native_range = FALSE; /* \N{} is defined to be Unicode */
3478#endif
3479 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3480 continue;
3481
02aa26ce 3482 /* \c is a control character */
79072805
LW
3483 case 'c':
3484 s++;
961ce445 3485 if (s < send) {
421e43ba 3486 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3487 }
961ce445
RGS
3488 else {
3489 yyerror("Missing control char name in \\c");
3490 }
79072805 3491 continue;
02aa26ce
NT
3492
3493 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3494 case 'b':
4d73d076 3495 *d++ = '\b';
79072805
LW
3496 break;
3497 case 'n':
4d73d076 3498 *d++ = '\n';
79072805
LW
3499 break;
3500 case 'r':
4d73d076 3501 *d++ = '\r';
79072805
LW
3502 break;
3503 case 'f':
4d73d076 3504 *d++ = '\f';
79072805
LW
3505 break;
3506 case 't':
4d73d076 3507 *d++ = '\t';
79072805 3508 break;
34a3fe2a 3509 case 'e':
da0334e8 3510 *d++ = ESC_NATIVE;
34a3fe2a
PP
3511 break;
3512 case 'a':
4d73d076 3513 *d++ = '\a';
79072805 3514 break;
02aa26ce
NT
3515 } /* end switch */
3516
79072805
LW
3517 s++;
3518 continue;
02aa26ce 3519 } /* end if (backslash) */
4c3a8340
ST
3520#ifdef EBCDIC
3521 else
3522 literal_endpoint++;
3523#endif
02aa26ce 3524
f9a63242 3525 default_action:
77a135fe
KW
3526 /* If we started with encoded form, or already know we want it,
3527 then encode the next character */
6f2d5cbc 3528 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3529 STRLEN len = 1;
77a135fe
KW
3530
3531
3532 /* One might think that it is wasted effort in the case of the
3533 * source being utf8 (this_utf8 == TRUE) to take the next character
3534 * in the source, convert it to an unsigned value, and then convert
3535 * it back again. But the source has not been validated here. The
3536 * routine that does the conversion checks for errors like
3537 * malformed utf8 */
3538
233ca360
KW
3539 const UV nextuv = (this_utf8)
3540 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3541 : (UV) ((U8) *s);
5aaebcb3 3542 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3543 if (!has_utf8) {
3544 SvCUR_set(sv, d - SvPVX_const(sv));
3545 SvPOK_on(sv);
3546 *d = '\0';
77a135fe 3547 /* See Note on sizing above. */
7bf79863
KW
3548 sv_utf8_upgrade_flags_grow(sv,
3549 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3550 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3551 d = SvPVX(sv) + SvCUR(sv);
3552 has_utf8 = TRUE;
3553 } else if (need > len) {
3554 /* encoded value larger than old, may need extra space (NOTE:
3555 * SvCUR() is not set correctly here). See Note on sizing
3556 * above. */
9d4ba2ae 3557 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3558 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3559 }
77a135fe
KW
3560 s += len;
3561
5f66b61c 3562 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3563#ifdef EBCDIC
3564 if (uv > 255 && !dorange)
3565 native_range = FALSE;
3566#endif
2b9d42f0
NIS
3567 }
3568 else {
5ff03569 3569 *d++ = *s++;
2b9d42f0 3570 }
02aa26ce
NT
3571 } /* while loop to process each character */
3572
3573 /* terminate the string and set up the sv */
79072805 3574 *d = '\0';
95a20fc0 3575 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3576 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3577 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3578 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3579
79072805 3580 SvPOK_on(sv);
9f4817db 3581 if (PL_encoding && !has_utf8) {
d0063567
DK
3582 sv_recode_to_utf8(sv, PL_encoding);
3583 if (SvUTF8(sv))
3584 has_utf8 = TRUE;
9f4817db 3585 }
2b9d42f0 3586 if (has_utf8) {
7e2040f0 3587 SvUTF8_on(sv);
2b9d42f0 3588 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3589 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3590 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3591 }
3592 }
79072805 3593
02aa26ce 3594 /* shrink the sv if we allocated more than we used */
79072805 3595 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3596 SvPV_shrink_to_cur(sv);
79072805 3597 }
02aa26ce 3598
6154021b 3599 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3600 if (s > start) {
3601 char *s2 = start;
ffdb8b16
FC
3602 for (; s2 < s; s2++) {
3603 if (*s2 == '\n')
3604 COPLINE_INC_WITH_HERELINES;
3605 }
b899e89d 3606 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3607 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3608 && ! PL_parser->lex_re_reparsing)
3609 {
eb0d8d16
NC
3610 const char *const key = PL_lex_inpat ? "qr" : "q";
3611 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3612 const char *type;
3613 STRLEN typelen;
3614
3615 if (PL_lex_inwhat == OP_TRANS) {
3616 type = "tr";
3617 typelen = 2;
3618 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3619 type = "s";
3620 typelen = 1;
9da1dd8f
DM
3621 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3622 type = "q";
3623 typelen = 1;
eb0d8d16
NC
3624 } else {
3625 type = "qq";
3626 typelen = 2;
3627 }
3628
3629 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3630 type, typelen);
3631 }
6154021b 3632 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3633 }
3634 LEAVE_with_name("scan_const");
79072805
LW
3635 return s;
3636}
3637
ffb4593c
NT
3638/* S_intuit_more
3639 * Returns TRUE if there's more to the expression (e.g., a subscript),
3640 * FALSE otherwise.
ffb4593c
NT
3641 *
3642 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3643 *
3644 * ->[ and ->{ return TRUE
ff25e5db 3645 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3646 * { and [ outside a pattern are always subscripts, so return TRUE
3647 * if we're outside a pattern and it's not { or [, then return FALSE
3648 * if we're in a pattern and the first char is a {
3649 * {4,5} (any digits around the comma) returns FALSE
3650 * if we're in a pattern and the first char is a [
3651 * [] returns FALSE
3652 * [SOMETHING] has a funky algorithm to decide whether it's a
3653 * character class or not. It has to deal with things like
3654 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3655 * anything else returns TRUE
3656 */
3657