This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #123893] Fix hang with "@{"
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
67#define PL_preambled (PL_parser->preambled)
68#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 69#define PL_linestr (PL_parser->linestr)
c2598295
DM
70#define PL_expect (PL_parser->expect)
71#define PL_copline (PL_parser->copline)
f06b5848
DM
72#define PL_bufptr (PL_parser->bufptr)
73#define PL_oldbufptr (PL_parser->oldbufptr)
74#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
75#define PL_linestart (PL_parser->linestart)
76#define PL_bufend (PL_parser->bufend)
77#define PL_last_uni (PL_parser->last_uni)
78#define PL_last_lop (PL_parser->last_lop)
79#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 80#define PL_lex_state (PL_parser->lex_state)
2f9285f8 81#define PL_rsfp (PL_parser->rsfp)
5486870f 82#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
83#define PL_in_my (PL_parser->in_my)
84#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 85#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 86#define PL_multi_end (PL_parser->multi_end)
13765c85 87#define PL_error_count (PL_parser->error_count)
199e78b7 88
fb205e7a
DM
89# define PL_nexttoke (PL_parser->nexttoke)
90# define PL_nexttype (PL_parser->nexttype)
91# define PL_nextval (PL_parser->nextval)
199e78b7 92
a1894d81 93static const char* const ident_too_long = "Identifier too long";
8903cb82 94
9ded7720 95# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 96
a7aaec61
Z
97#define XENUMMASK 0x3f
98#define XFAKEEOF 0x40
99#define XFAKEBRACK 0x80
9059aa12 100
39e02b42 101#ifdef USE_UTF8_SCRIPTS
b3041197 102# define UTF cBOOL(!IN_BYTES)
2b9d42f0 103#else
b3041197 104# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 105#endif
a0ed51b3 106
b1fc3636
CJ
107/* The maximum number of characters preceding the unrecognized one to display */
108#define UNRECOGNIZED_PRECEDE_COUNT 10
109
61f0cdd9 110/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
111 * 1999-02-27 mjd-perl-patch@plover.com */
112#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
113
14bd96d0 114#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 115
9ff909cf
JH
116#define HEXFP_PEEK(s) \
117 (((s[0] == '.') && \
118 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
119 isALPHA_FOLD_EQ(s[0], 'p'))
120
ffb4593c
NT
121/* LEX_* are values for PL_lex_state, the state of the lexer.
122 * They are arranged oddly so that the guard on the switch statement
79072805 123 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
124 *
125 * These values refer to the various states within a sublex parse,
126 * i.e. within a double quotish string
79072805
LW
127 */
128
fb73857a
PP
129/* #define LEX_NOTPARSING 11 is done in perl.h. */
130
b6007c36
DM
131#define LEX_NORMAL 10 /* normal code (ie not within "...") */
132#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
133#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
134#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
135#define LEX_INTERPSTART 6 /* expecting the start of a $var */
136
137 /* at end of code, eg "$x" followed by: */
138#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
139#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
140
141#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
142 string or after \E, $foo, etc */
143#define LEX_INTERPCONST 2 /* NOT USED */
144#define LEX_FORMLINE 1 /* expecting a format line */
145#define LEX_KNOWNEXT 0 /* next token known; just return it */
146
79072805 147
bbf60fe6 148#ifdef DEBUGGING
27da23d5 149static const char* const lex_state_names[] = {
bbf60fe6
DM
150 "KNOWNEXT",
151 "FORMLINE",
152 "INTERPCONST",
153 "INTERPCONCAT",
154 "INTERPENDMAYBE",
155 "INTERPEND",
156 "INTERPSTART",
157 "INTERPPUSH",
158 "INTERPCASEMOD",
159 "INTERPNORMAL",
160 "NORMAL"
161};
162#endif
163
79072805 164#include "keywords.h"
fe14fcc3 165
ffb4593c
NT
166/* CLINE is a macro that ensures PL_copline has a sane value */
167
57843af0 168#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 169
ffb4593c
NT
170/*
171 * Convenience functions to return different tokens and prime the
9cbb5ea2 172 * lexer for the next token. They all take an argument.
ffb4593c
NT
173 *
174 * TOKEN : generic token (used for '(', DOLSHARP, etc)
175 * OPERATOR : generic operator
176 * AOPERATOR : assignment operator
177 * PREBLOCK : beginning the block after an if, while, foreach, ...
178 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
179 * PREREF : *EXPR where EXPR is not a simple identifier
180 * TERM : expression term
89f35911 181 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
182 * LOOPX : loop exiting command (goto, last, dump, etc)
183 * FTST : file test operator
184 * FUN0 : zero-argument function
7eb971ee 185 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 186 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
187 * BOop : bitwise or or xor
188 * BAop : bitwise and
8823cb89 189 * BCop : bitwise complement
ffb4593c
NT
190 * SHop : shift operator
191 * PWop : power operator
9cbb5ea2 192 * PMop : pattern-matching operator
ffb4593c 193 * Aop : addition-level operator
e4916dd1 194 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
195 * Mop : multiplication-level operator
196 * Eop : equality-testing operator
e5edeb50 197 * Rop : relational operator <= != gt
ffb4593c
NT
198 *
199 * Also see LOP and lop() below.
200 */
201
998054bd 202#ifdef DEBUGGING /* Serve -DT. */
704d4215 203# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 204#else
bbf60fe6 205# define REPORT(retval) (retval)
998054bd
SC
206#endif
207
bbf60fe6
DM
208#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
209#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 210#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
211#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
212#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
213#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
214#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 215#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
a49203fd 216#define LOOPX(f) return (PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
7a61bf3c 217 pl_yylval.ival=f, \
a49203fd 218 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 219 REPORT((int)LOOPEX))
6154021b
RGS
220#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
221#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 222#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 223#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
224#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
225#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
226#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
227 REPORT('~')
b1764551
FC
228#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
229#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 230#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 231#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 232#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 233#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
6154021b
RGS
234#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
235#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 236
a687059c
LW
237/* This bit of chicanery makes a unary function followed by
238 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
239 * The UNIDOR macro is for unary functions that can be followed by the //
240 * operator (such as C<shift // 0>).
a687059c 241 */
d68ce4ac 242#define UNI3(f,x,have_x) { \
6154021b 243 pl_yylval.ival = f; \
d68ce4ac 244 if (have_x) PL_expect = x; \
376fcdbf
AL
245 PL_bufptr = s; \
246 PL_last_uni = PL_oldbufptr; \
247 PL_last_lop_op = f; \
248 if (*s == '(') \
249 return REPORT( (int)FUNC1 ); \
294a536f 250 s = skipspace(s); \
376fcdbf
AL
251 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
252 }
d68ce4ac
FC
253#define UNI(f) UNI3(f,XTERM,1)
254#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
255#define UNIPROTO(f,optional) { \
256 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
257 OPERATOR(f); \
258 }
a687059c 259
d68ce4ac 260#define UNIBRACK(f) UNI3(f,0,0)
79072805 261
9f68db38 262/* grandfather return to old style */
78cdf107
Z
263#define OLDLOP(f) \
264 do { \
265 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
266 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
267 pl_yylval.ival = (f); \
268 PL_expect = XTERM; \
269 PL_bufptr = s; \
270 return (int)LSTOP; \
271 } while(0)
79072805 272
83944c01
FC
273#define COPLINE_INC_WITH_HERELINES \
274 STMT_START { \
275 CopLINE_inc(PL_curcop); \
851b527a
FC
276 if (PL_parser->herelines) \
277 CopLINE(PL_curcop) += PL_parser->herelines, \
278 PL_parser->herelines = 0; \
83944c01 279 } STMT_END
ffdb8b16
FC
280/* Called after scan_str to update CopLINE(PL_curcop), but only when there
281 * is no sublex_push to follow. */
282#define COPLINE_SET_FROM_MULTI_END \
283 STMT_START { \
284 CopLINE_set(PL_curcop, PL_multi_end); \
285 if (PL_multi_end != PL_multi_start) \
851b527a 286 PL_parser->herelines = 0; \
ffdb8b16 287 } STMT_END
83944c01
FC
288
289
8fa7f367
JH
290#ifdef DEBUGGING
291
6154021b 292/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
293enum token_type {
294 TOKENTYPE_NONE,
295 TOKENTYPE_IVAL,
6154021b 296 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 297 TOKENTYPE_PVAL,
aeaef349 298 TOKENTYPE_OPVAL
bbf60fe6
DM
299};
300
6d4a66ac
NC
301static struct debug_tokens {
302 const int token;
303 enum token_type type;
304 const char *name;
305} const debug_tokens[] =
9041c2e3 306{
bbf60fe6
DM
307 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
308 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
309 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
310 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
311 { ARROW, TOKENTYPE_NONE, "ARROW" },
312 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
313 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
314 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
315 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
316 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 317 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
318 { DO, TOKENTYPE_NONE, "DO" },
319 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
320 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
321 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
322 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
323 { ELSE, TOKENTYPE_NONE, "ELSE" },
324 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
325 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
326 { FOR, TOKENTYPE_IVAL, "FOR" },
327 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
328 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
329 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
330 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
331 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 332 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
333 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
334 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
335 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 336 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
337 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
338 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 339 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
340 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
341 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
342 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
343 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
344 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
345 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
346 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
347 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
348 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
349 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
350 { OROP, TOKENTYPE_IVAL, "OROP" },
351 { OROR, TOKENTYPE_NONE, "OROR" },
352 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
353 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
354 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 355 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 356 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
357 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
358 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
359 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
360 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
361 { PREINC, TOKENTYPE_NONE, "PREINC" },
362 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 363 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
364 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
365 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 366 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
367 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
368 { SUB, TOKENTYPE_NONE, "SUB" },
369 { THING, TOKENTYPE_OPVAL, "THING" },
370 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
371 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
372 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
373 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
374 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
375 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 376 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
377 { WHILE, TOKENTYPE_IVAL, "WHILE" },
378 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 379 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 380 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
381};
382
6154021b 383/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 384
bbf60fe6 385STATIC int
704d4215 386S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 387{
7918f24d
NC
388 PERL_ARGS_ASSERT_TOKEREPORT;
389
bbf60fe6 390 if (DEBUG_T_TEST) {
bd61b366 391 const char *name = NULL;
bbf60fe6 392 enum token_type type = TOKENTYPE_NONE;
f54cb97a 393 const struct debug_tokens *p;
396482e1 394 SV* const report = newSVpvs("<== ");
bbf60fe6 395
f54cb97a 396 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
397 if (p->token == (int)rv) {
398 name = p->name;
399 type = p->type;
400 break;
401 }
402 }
403 if (name)
54667de8 404 Perl_sv_catpv(aTHX_ report, name);
239f83d5 405 else if (isGRAPH(rv))
4ebc7986 406 {
bbf60fe6 407 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
408 if ((char)rv == 'p')
409 sv_catpvs(report, " (pending identifier)");
410 }
bbf60fe6 411 else if (!rv)
396482e1 412 sv_catpvs(report, "EOF");
bbf60fe6
DM
413 else
414 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
415 switch (type) {
416 case TOKENTYPE_NONE:
bbf60fe6
DM
417 break;
418 case TOKENTYPE_IVAL:
704d4215 419 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
420 break;
421 case TOKENTYPE_OPNUM:
422 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 423 PL_op_name[lvalp->ival]);
bbf60fe6
DM
424 break;
425 case TOKENTYPE_PVAL:
704d4215 426 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
427 break;
428 case TOKENTYPE_OPVAL:
704d4215 429 if (lvalp->opval) {
401441c0 430 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
431 PL_op_name[lvalp->opval->op_type]);
432 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 433 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 434 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
435 }
436
437 }
401441c0 438 else
396482e1 439 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
440 break;
441 }
b6007c36 442 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
443 };
444 return (int)rv;
998054bd
SC
445}
446
b6007c36
DM
447
448/* print the buffer with suitable escapes */
449
450STATIC void
15f169a1 451S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 452{
396482e1 453 SV* const tmp = newSVpvs("");
7918f24d
NC
454
455 PERL_ARGS_ASSERT_PRINTBUF;
456
5d37acd6 457 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 458 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 459 GCC_DIAG_RESTORE;
b6007c36
DM
460 SvREFCNT_dec(tmp);
461}
462
8fa7f367
JH
463#endif
464
8290c323
NC
465static int
466S_deprecate_commaless_var_list(pTHX) {
467 PL_expect = XTERM;
468 deprecate("comma-less variable list");
469 return REPORT(','); /* grandfather non-comma-format format */
470}
471
ffb4593c
NT
472/*
473 * S_ao
474 *
f393a21a
FC
475 * This subroutine looks for an '=' next to the operator that has just been
476 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
477 */
478
76e3520e 479STATIC int
cea2e8a9 480S_ao(pTHX_ int toketype)
a0d0e21e 481{
3280af22
NIS
482 if (*PL_bufptr == '=') {
483 PL_bufptr++;
a0d0e21e 484 if (toketype == ANDAND)
6154021b 485 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 486 else if (toketype == OROR)
6154021b 487 pl_yylval.ival = OP_ORASSIGN;
c963b151 488 else if (toketype == DORDOR)
6154021b 489 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
490 toketype = ASSIGNOP;
491 }
b1764551 492 return REPORT(toketype);
a0d0e21e
LW
493}
494
ffb4593c
NT
495/*
496 * S_no_op
497 * When Perl expects an operator and finds something else, no_op
498 * prints the warning. It always prints "<something> found where
499 * operator expected. It prints "Missing semicolon on previous line?"
500 * if the surprise occurs at the start of the line. "do you need to
501 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
502 * where the compiler doesn't know if foo is a method call or a function.
503 * It prints "Missing operator before end of line" if there's nothing
504 * after the missing operator, or "... before <...>" if there is something
505 * after the missing operator.
488bc579
FC
506 *
507 * PL_bufptr is expected to point to the start of the thing that was found,
508 * and s after the next token or partial token.
ffb4593c
NT
509 */
510
76e3520e 511STATIC void
15f169a1 512S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 513{
9d4ba2ae
AL
514 char * const oldbp = PL_bufptr;
515 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 516
7918f24d
NC
517 PERL_ARGS_ASSERT_NO_OP;
518
1189a94a
GS
519 if (!s)
520 s = oldbp;
07c798fb 521 else
1189a94a 522 PL_bufptr = s;
734ab321 523 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
524 if (ckWARN_d(WARN_SYNTAX)) {
525 if (is_first)
526 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
527 "\t(Missing semicolon on previous line?)\n");
528 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 529 const char *t;
8a2bca12 530 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 531 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 532 NOOP;
56da5a46
RGS
533 if (t < PL_bufptr && isSPACE(*t))
534 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
535 "\t(Do you need to predeclare %"UTF8f"?)\n",
536 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
537 }
538 else {
539 assert(s >= oldbp);
540 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
541 "\t(Missing operator before %"UTF8f"?)\n",
542 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 543 }
07c798fb 544 }
3280af22 545 PL_bufptr = oldbp;
8990e307
LW
546}
547
ffb4593c
NT
548/*
549 * S_missingterm
550 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 551 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
552 * If we're in a delimited string and the delimiter is a control
553 * character, it's reformatted into a two-char sequence like ^C.
554 * This is fatal.
555 */
556
76e3520e 557STATIC void
cea2e8a9 558S_missingterm(pTHX_ char *s)
8990e307
LW
559{
560 char tmpbuf[3];
561 char q;
562 if (s) {
9d4ba2ae 563 char * const nl = strrchr(s,'\n');
d2719217 564 if (nl)
8990e307
LW
565 *nl = '\0';
566 }
ca8b19a7 567 else if ((U8) PL_multi_close < 32) {
8990e307 568 *tmpbuf = '^';
585ec06d 569 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
570 tmpbuf[2] = '\0';
571 s = tmpbuf;
572 }
573 else {
eb160463 574 *tmpbuf = (char)PL_multi_close;
8990e307
LW
575 tmpbuf[1] = '\0';
576 s = tmpbuf;
577 }
578 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 579 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 580}
79072805 581
dd0ac2b9
FC
582#include "feature.h"
583
0d863452 584/*
0d863452
RH
585 * Check whether the named feature is enabled.
586 */
26ea9e12 587bool
3fff3427 588Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 589{
4a731d7b 590 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
591
592 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
593
594 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 595
26ea9e12
NC
596 if (namelen > MAX_FEATURE_LEN)
597 return FALSE;
3fff3427 598 memcpy(&he_name[8], name, namelen);
7d69d4a6 599
c8ca97b0
NC
600 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
601 REFCOUNTED_HE_EXISTS));
0d863452
RH
602}
603
ffb4593c 604/*
9cbb5ea2
GS
605 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
606 * utf16-to-utf8-reversed.
ffb4593c
NT
607 */
608
c39cd008
GS
609#ifdef PERL_CR_FILTER
610static void
611strip_return(SV *sv)
612{
eb578fdb
KW
613 const char *s = SvPVX_const(sv);
614 const char * const e = s + SvCUR(sv);
7918f24d
NC
615
616 PERL_ARGS_ASSERT_STRIP_RETURN;
617
c39cd008
GS
618 /* outer loop optimized to do nothing if there are no CR-LFs */
619 while (s < e) {
620 if (*s++ == '\r' && *s == '\n') {
621 /* hit a CR-LF, need to copy the rest */
eb578fdb 622 char *d = s - 1;
c39cd008
GS
623 *d++ = *s++;
624 while (s < e) {
625 if (*s == '\r' && s[1] == '\n')
626 s++;
627 *d++ = *s++;
628 }
629 SvCUR(sv) -= s - d;
630 return;
631 }
632 }
633}
a868473f 634
76e3520e 635STATIC I32
c39cd008 636S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 637{
f54cb97a 638 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
639 if (count > 0 && !maxlen)
640 strip_return(sv);
641 return count;
a868473f
NIS
642}
643#endif
644
ffb4593c 645/*
8eaa0acf
Z
646=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
647
648Creates and initialises a new lexer/parser state object, supplying
649a context in which to lex and parse from a new source of Perl code.
650A pointer to the new state object is placed in L</PL_parser>. An entry
651is made on the save stack so that upon unwinding the new state object
652will be destroyed and the former value of L</PL_parser> will be restored.
653Nothing else need be done to clean up the parsing context.
654
655The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
656non-null, provides a string (in SV form) containing code to be parsed.
657A copy of the string is made, so subsequent modification of I<line>
658does not affect parsing. I<rsfp>, if non-null, provides an input stream
659from which code will be read to be parsed. If both are non-null, the
660code in I<line> comes first and must consist of complete lines of input,
661and I<rsfp> supplies the remainder of the source.
662
e368b3bd
FC
663The I<flags> parameter is reserved for future use. Currently it is only
664used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
665
666=cut
667*/
ffb4593c 668
27fcb6ee 669/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
670 can share filters with the current parser.
671 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
672 caller, hence isn't owned by the parser, so shouldn't be closed on parser
673 destruction. This is used to handle the case of defaulting to reading the
674 script from the standard input because no filename was given on the command
675 line (without getting confused by situation where STDIN has been closed, so
676 the script handle is opened on fd 0) */
27fcb6ee 677
a0d0e21e 678void
8eaa0acf 679Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 680{
6ef55633 681 const char *s = NULL;
5486870f 682 yy_parser *parser, *oparser;
60d63348 683 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 684 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
685
686 /* create and initialise a parser */
687
199e78b7 688 Newxz(parser, 1, yy_parser);
5486870f 689 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
690 PL_parser = parser;
691
28ac2b49
Z
692 parser->stack = NULL;
693 parser->ps = NULL;
694 parser->stack_size = 0;
acdf0a21 695
e3abe207
DM
696 /* on scope exit, free this parser and restore any outer one */
697 SAVEPARSER(parser);
7c4baf47 698 parser->saved_curcop = PL_curcop;
e3abe207 699
acdf0a21 700 /* initialise lexer state */
8990e307 701
fb205e7a 702 parser->nexttoke = 0;
ca4cfd28 703 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 704 parser->copline = parser->preambling = NOLINE;
5afb0a62 705 parser->lex_state = LEX_NORMAL;
c2598295 706 parser->expect = XSTATE;
2f9285f8 707 parser->rsfp = rsfp;
27fcb6ee
FC
708 parser->rsfp_filters =
709 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
710 ? NULL
711 : MUTABLE_AV(SvREFCNT_inc(
712 oparser->rsfp_filters
713 ? oparser->rsfp_filters
714 : (oparser->rsfp_filters = newAV())
715 ));
2f9285f8 716
199e78b7
DM
717 Newx(parser->lex_brackstack, 120, char);
718 Newx(parser->lex_casestack, 12, char);
719 *parser->lex_casestack = '\0';
d794b522 720 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 721
10efb74f 722 if (line) {
0528fd32 723 STRLEN len;
10efb74f 724 s = SvPV_const(line, len);
0abcdfa4
FC
725 parser->linestr = flags & LEX_START_COPIED
726 ? SvREFCNT_inc_simple_NN(line)
727 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 728 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 729 } else {
bf1b738b 730 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 731 }
f06b5848
DM
732 parser->oldoldbufptr =
733 parser->oldbufptr =
734 parser->bufptr =
735 parser->linestart = SvPVX(parser->linestr);
736 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
737 parser->last_lop = parser->last_uni = NULL;
b54f893d 738
6d59e610 739 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
740 |LEX_DONT_CLOSE_RSFP));
741 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
742 |LEX_DONT_CLOSE_RSFP));
737c24fc 743
60d63348 744 parser->in_pod = parser->filtered = 0;
79072805 745}
a687059c 746
e3abe207
DM
747
748/* delete a parser object */
749
750void
751Perl_parser_free(pTHX_ const yy_parser *parser)
752{
7918f24d
NC
753 PERL_ARGS_ASSERT_PARSER_FREE;
754
7c4baf47 755 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
756 SvREFCNT_dec(parser->linestr);
757
87606032 758 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 759 PerlIO_clearerr(parser->rsfp);
799361c3
SH
760 else if (parser->rsfp && (!parser->old_parser ||
761 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 762 PerlIO_close(parser->rsfp);
5486870f 763 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
764 SvREFCNT_dec(parser->lex_stuff);
765 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
766
767 Safefree(parser->lex_brackstack);
768 Safefree(parser->lex_casestack);
769 Safefree(parser->lex_shared);
770 PL_parser = parser->old_parser;
771 Safefree(parser);
772}
773
774void
775Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
776{
3ac7ff8f 777 I32 nexttoke = parser->nexttoke;
3ac7ff8f 778 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 779 while (nexttoke--) {
3ac7ff8f
FC
780 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
781 && parser->nextval[nexttoke].opval
782 && parser->nextval[nexttoke].opval->op_slabbed
783 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 784 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
785 parser->nextval[nexttoke].opval = NULL;
786 }
3ce3dcd9 787 }
e3abe207
DM
788}
789
790
ffb4593c 791/*
f0e67a1d
Z
792=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
793
794Buffer scalar containing the chunk currently under consideration of the
795text currently being lexed. This is always a plain string scalar (for
796which C<SvPOK> is true). It is not intended to be used as a scalar by
797normal scalar means; instead refer to the buffer directly by the pointer
798variables described below.
799
800The lexer maintains various C<char*> pointers to things in the
801C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
802reallocated, all of these pointers must be updated. Don't attempt to
803do this manually, but rather use L</lex_grow_linestr> if you need to
804reallocate the buffer.
805
806The content of the text chunk in the buffer is commonly exactly one
807complete line of input, up to and including a newline terminator,
808but there are situations where it is otherwise. The octets of the
809buffer may be intended to be interpreted as either UTF-8 or Latin-1.
810The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
811flag on this scalar, which may disagree with it.
812
813For direct examination of the buffer, the variable
814L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
815lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
816of these pointers is usually preferable to examination of the scalar
817through normal scalar means.
818
819=for apidoc AmxU|char *|PL_parser-E<gt>bufend
820
821Direct pointer to the end of the chunk of text currently being lexed, the
822end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 823+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
824always located at the end of the buffer, and does not count as part of
825the buffer's contents.
826
827=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
828
829Points to the current position of lexing inside the lexer buffer.
830Characters around this point may be freely examined, within
831the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
832L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
833interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
834
835Lexing code (whether in the Perl core or not) moves this pointer past
836the characters that it consumes. It is also expected to perform some
837bookkeeping whenever a newline character is consumed. This movement
838can be more conveniently performed by the function L</lex_read_to>,
839which handles newlines appropriately.
840
841Interpretation of the buffer's octets can be abstracted out by
842using the slightly higher-level functions L</lex_peek_unichar> and
843L</lex_read_unichar>.
844
845=for apidoc AmxU|char *|PL_parser-E<gt>linestart
846
847Points to the start of the current line inside the lexer buffer.
848This is useful for indicating at which column an error occurred, and
849not much else. This must be updated by any lexing code that consumes
850a newline; the function L</lex_read_to> handles this detail.
851
852=cut
853*/
854
855/*
856=for apidoc Amx|bool|lex_bufutf8
857
858Indicates whether the octets in the lexer buffer
859(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
860of Unicode characters. If not, they should be interpreted as Latin-1
861characters. This is analogous to the C<SvUTF8> flag for scalars.
862
863In UTF-8 mode, it is not guaranteed that the lexer buffer actually
864contains valid UTF-8. Lexing code must be robust in the face of invalid
865encoding.
866
867The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
868is significant, but not the whole story regarding the input character
869encoding. Normally, when a file is being read, the scalar contains octets
870and its C<SvUTF8> flag is off, but the octets should be interpreted as
871UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
872however, the scalar may have the C<SvUTF8> flag on, and in this case its
873octets should be interpreted as UTF-8 unless the C<use bytes> pragma
874is in effect. This logic may change in the future; use this function
875instead of implementing the logic yourself.
876
877=cut
878*/
879
880bool
881Perl_lex_bufutf8(pTHX)
882{
883 return UTF;
884}
885
886/*
887=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
888
889Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 890at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
891pointer to the reallocated buffer. This is necessary before making
892any direct modification of the buffer that would increase its length.
893L</lex_stuff_pvn> provides a more convenient way to insert text into
894the buffer.
895
896Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
897this function updates all of the lexer's variables that point directly
898into the buffer.
899
900=cut
901*/
902
903char *
904Perl_lex_grow_linestr(pTHX_ STRLEN len)
905{
906 SV *linestr;
907 char *buf;
908 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 909 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
910 linestr = PL_parser->linestr;
911 buf = SvPVX(linestr);
912 if (len <= SvLEN(linestr))
913 return buf;
914 bufend_pos = PL_parser->bufend - buf;
915 bufptr_pos = PL_parser->bufptr - buf;
916 oldbufptr_pos = PL_parser->oldbufptr - buf;
917 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
918 linestart_pos = PL_parser->linestart - buf;
919 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
920 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
921 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
922 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 923
f0e67a1d 924 buf = sv_grow(linestr, len);
c7641931 925
f0e67a1d
Z
926 PL_parser->bufend = buf + bufend_pos;
927 PL_parser->bufptr = buf + bufptr_pos;
928 PL_parser->oldbufptr = buf + oldbufptr_pos;
929 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
930 PL_parser->linestart = buf + linestart_pos;
931 if (PL_parser->last_uni)
932 PL_parser->last_uni = buf + last_uni_pos;
933 if (PL_parser->last_lop)
934 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
935 if (PL_parser->lex_shared->re_eval_start)
936 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
937 return buf;
938}
939
940/*
83aa740e 941=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
942
943Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
944immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
945reallocating the buffer if necessary. This means that lexing code that
946runs later will see the characters as if they had appeared in the input.
947It is not recommended to do this as part of normal parsing, and most
948uses of this facility run the risk of the inserted characters being
949interpreted in an unintended manner.
950
951The string to be inserted is represented by I<len> octets starting
952at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
953according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
954The characters are recoded for the lexer buffer, according to how the
955buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 956to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
957function is more convenient.
958
959=cut
960*/
961
962void
83aa740e 963Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 964{
749123ff 965 dVAR;
f0e67a1d
Z
966 char *bufptr;
967 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
968 if (flags & ~(LEX_STUFF_UTF8))
969 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
970 if (UTF) {
971 if (flags & LEX_STUFF_UTF8) {
972 goto plain_copy;
973 } else {
54d004e8 974 STRLEN highhalf = 0; /* Count of variants */
83aa740e 975 const char *p, *e = pv+len;
54d004e8
KW
976 for (p = pv; p != e; p++) {
977 if (! UTF8_IS_INVARIANT(*p)) {
978 highhalf++;
979 }
980 }
f0e67a1d
Z
981 if (!highhalf)
982 goto plain_copy;
983 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
984 bufptr = PL_parser->bufptr;
985 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
986 SvCUR_set(PL_parser->linestr,
987 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
988 PL_parser->bufend += len+highhalf;
989 for (p = pv; p != e; p++) {
990 U8 c = (U8)*p;
54d004e8
KW
991 if (! UTF8_IS_INVARIANT(c)) {
992 *bufptr++ = UTF8_TWO_BYTE_HI(c);
993 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
994 } else {
995 *bufptr++ = (char)c;
996 }
997 }
998 }
999 } else {
1000 if (flags & LEX_STUFF_UTF8) {
1001 STRLEN highhalf = 0;
83aa740e 1002 const char *p, *e = pv+len;
f0e67a1d
Z
1003 for (p = pv; p != e; p++) {
1004 U8 c = (U8)*p;
54d004e8 1005 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1006 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1007 "non-Latin-1 character into Latin-1 input");
54d004e8 1008 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1009 p++;
1010 highhalf++;
54d004e8 1011 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1012 /* malformed UTF-8 */
1013 ENTER;
1014 SAVESPTR(PL_warnhook);
1015 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1016 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1017 LEAVE;
1018 }
1019 }
1020 if (!highhalf)
1021 goto plain_copy;
1022 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1023 bufptr = PL_parser->bufptr;
1024 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1025 SvCUR_set(PL_parser->linestr,
1026 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1027 PL_parser->bufend += len-highhalf;
54d004e8
KW
1028 p = pv;
1029 while (p < e) {
1030 if (UTF8_IS_INVARIANT(*p)) {
1031 *bufptr++ = *p;
1032 p++;
f0e67a1d 1033 }
54d004e8
KW
1034 else {
1035 assert(p < e -1 );
94bb8c36 1036 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1037 p += 2;
1038 }
f0e67a1d
Z
1039 }
1040 } else {
54d004e8 1041 plain_copy:
f0e67a1d
Z
1042 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1043 bufptr = PL_parser->bufptr;
1044 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1045 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1046 PL_parser->bufend += len;
1047 Copy(pv, bufptr, len, char);
1048 }
1049 }
1050}
1051
1052/*
9dcc53ea
Z
1053=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1054
1055Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1056immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1057reallocating the buffer if necessary. This means that lexing code that
1058runs later will see the characters as if they had appeared in the input.
1059It is not recommended to do this as part of normal parsing, and most
1060uses of this facility run the risk of the inserted characters being
1061interpreted in an unintended manner.
1062
1063The string to be inserted is represented by octets starting at I<pv>
1064and continuing to the first nul. These octets are interpreted as either
1065UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1066in I<flags>. The characters are recoded for the lexer buffer, according
1067to how the buffer is currently being interpreted (L</lex_bufutf8>).
1068If it is not convenient to nul-terminate a string to be inserted, the
1069L</lex_stuff_pvn> function is more appropriate.
1070
1071=cut
1072*/
1073
1074void
1075Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1076{
1077 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1078 lex_stuff_pvn(pv, strlen(pv), flags);
1079}
1080
1081/*
f0e67a1d
Z
1082=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1083
1084Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1085immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1086reallocating the buffer if necessary. This means that lexing code that
1087runs later will see the characters as if they had appeared in the input.
1088It is not recommended to do this as part of normal parsing, and most
1089uses of this facility run the risk of the inserted characters being
1090interpreted in an unintended manner.
1091
1092The string to be inserted is the string value of I<sv>. The characters
1093are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1094being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1095not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1096need to construct a scalar.
1097
1098=cut
1099*/
1100
1101void
1102Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1103{
1104 char *pv;
1105 STRLEN len;
1106 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1107 if (flags)
1108 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1109 pv = SvPV(sv, len);
1110 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1111}
1112
1113/*
1114=for apidoc Amx|void|lex_unstuff|char *ptr
1115
1116Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1117I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1118This hides the discarded text from any lexing code that runs later,
1119as if the text had never appeared.
1120
1121This is not the normal way to consume lexed text. For that, use
1122L</lex_read_to>.
1123
1124=cut
1125*/
1126
1127void
1128Perl_lex_unstuff(pTHX_ char *ptr)
1129{
1130 char *buf, *bufend;
1131 STRLEN unstuff_len;
1132 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1133 buf = PL_parser->bufptr;
1134 if (ptr < buf)
1135 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1136 if (ptr == buf)
1137 return;
1138 bufend = PL_parser->bufend;
1139 if (ptr > bufend)
1140 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1141 unstuff_len = ptr - buf;
1142 Move(ptr, buf, bufend+1-ptr, char);
1143 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1144 PL_parser->bufend = bufend - unstuff_len;
1145}
1146
1147/*
1148=for apidoc Amx|void|lex_read_to|char *ptr
1149
1150Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1151to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1152performing the correct bookkeeping whenever a newline character is passed.
1153This is the normal way to consume lexed text.
1154
1155Interpretation of the buffer's octets can be abstracted out by
1156using the slightly higher-level functions L</lex_peek_unichar> and
1157L</lex_read_unichar>.
1158
1159=cut
1160*/
1161
1162void
1163Perl_lex_read_to(pTHX_ char *ptr)
1164{
1165 char *s;
1166 PERL_ARGS_ASSERT_LEX_READ_TO;
1167 s = PL_parser->bufptr;
1168 if (ptr < s || ptr > PL_parser->bufend)
1169 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1170 for (; s != ptr; s++)
1171 if (*s == '\n') {
83944c01 1172 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1173 PL_parser->linestart = s+1;
1174 }
1175 PL_parser->bufptr = ptr;
1176}
1177
1178/*
1179=for apidoc Amx|void|lex_discard_to|char *ptr
1180
1181Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1182up to I<ptr>. The remaining content of the buffer will be moved, and
1183all pointers into the buffer updated appropriately. I<ptr> must not
1184be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1185it is not permitted to discard text that has yet to be lexed.
1186
1187Normally it is not necessarily to do this directly, because it suffices to
1188use the implicit discarding behaviour of L</lex_next_chunk> and things
1189based on it. However, if a token stretches across multiple lines,
1f317c95 1190and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1191that purpose, then after completion of the token it would be wise to
1192explicitly discard the now-unneeded earlier lines, to avoid future
1193multi-line tokens growing the buffer without bound.
1194
1195=cut
1196*/
1197
1198void
1199Perl_lex_discard_to(pTHX_ char *ptr)
1200{
1201 char *buf;
1202 STRLEN discard_len;
1203 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1204 buf = SvPVX(PL_parser->linestr);
1205 if (ptr < buf)
1206 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1207 if (ptr == buf)
1208 return;
1209 if (ptr > PL_parser->bufptr)
1210 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1211 discard_len = ptr - buf;
1212 if (PL_parser->oldbufptr < ptr)
1213 PL_parser->oldbufptr = ptr;
1214 if (PL_parser->oldoldbufptr < ptr)
1215 PL_parser->oldoldbufptr = ptr;
1216 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1217 PL_parser->last_uni = NULL;
1218 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1219 PL_parser->last_lop = NULL;
1220 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1221 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1222 PL_parser->bufend -= discard_len;
1223 PL_parser->bufptr -= discard_len;
1224 PL_parser->oldbufptr -= discard_len;
1225 PL_parser->oldoldbufptr -= discard_len;
1226 if (PL_parser->last_uni)
1227 PL_parser->last_uni -= discard_len;
1228 if (PL_parser->last_lop)
1229 PL_parser->last_lop -= discard_len;
1230}
1231
1232/*
1233=for apidoc Amx|bool|lex_next_chunk|U32 flags
1234
1235Reads in the next chunk of text to be lexed, appending it to
1236L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1237looked to the end of the current chunk and wants to know more. It is
1238usual, but not necessary, for lexing to have consumed the entirety of
1239the current chunk at this time.
1240
1241If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1242chunk (i.e., the current chunk has been entirely consumed), normally the
1243current chunk will be discarded at the same time that the new chunk is
1244read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1245will not be discarded. If the current chunk has not been entirely
1246consumed, then it will not be discarded regardless of the flag.
1247
1248Returns true if some new text was added to the buffer, or false if the
1249buffer has reached the end of the input text.
1250
1251=cut
1252*/
1253
1254#define LEX_FAKE_EOF 0x80000000
e47d32dc 1255#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1256
1257bool
1258Perl_lex_next_chunk(pTHX_ U32 flags)
1259{
1260 SV *linestr;
1261 char *buf;
1262 STRLEN old_bufend_pos, new_bufend_pos;
1263 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1264 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1265 bool got_some_for_debugger = 0;
f0e67a1d 1266 bool got_some;
112d1284 1267 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1268 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1269 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1270 return FALSE;
f0e67a1d
Z
1271 linestr = PL_parser->linestr;
1272 buf = SvPVX(linestr);
1273 if (!(flags & LEX_KEEP_PREVIOUS) &&
1274 PL_parser->bufptr == PL_parser->bufend) {
1275 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1276 linestart_pos = 0;
1277 if (PL_parser->last_uni != PL_parser->bufend)
1278 PL_parser->last_uni = NULL;
1279 if (PL_parser->last_lop != PL_parser->bufend)
1280 PL_parser->last_lop = NULL;
1281 last_uni_pos = last_lop_pos = 0;
1282 *buf = 0;
1283 SvCUR(linestr) = 0;
1284 } else {
1285 old_bufend_pos = PL_parser->bufend - buf;
1286 bufptr_pos = PL_parser->bufptr - buf;
1287 oldbufptr_pos = PL_parser->oldbufptr - buf;
1288 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1289 linestart_pos = PL_parser->linestart - buf;
1290 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1291 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1292 }
1293 if (flags & LEX_FAKE_EOF) {
1294 goto eof;
60d63348 1295 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1296 got_some = 0;
1297 } else if (filter_gets(linestr, old_bufend_pos)) {
1298 got_some = 1;
17cc9359 1299 got_some_for_debugger = 1;
112d1284
FC
1300 } else if (flags & LEX_NO_TERM) {
1301 got_some = 0;
f0e67a1d 1302 } else {
580561a3
Z
1303 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1304 sv_setpvs(linestr, "");
f0e67a1d
Z
1305 eof:
1306 /* End of real input. Close filehandle (unless it was STDIN),
1307 * then add implicit termination.
1308 */
87606032 1309 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1310 PerlIO_clearerr(PL_parser->rsfp);
1311 else if (PL_parser->rsfp)
1312 (void)PerlIO_close(PL_parser->rsfp);
1313 PL_parser->rsfp = NULL;
60d63348 1314 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1315 if (!PL_in_eval && PL_minus_p) {
1316 sv_catpvs(linestr,
1317 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1318 PL_minus_n = PL_minus_p = 0;
1319 } else if (!PL_in_eval && PL_minus_n) {
1320 sv_catpvs(linestr, /*{*/";}");
1321 PL_minus_n = 0;
1322 } else
1323 sv_catpvs(linestr, ";");
1324 got_some = 1;
1325 }
1326 buf = SvPVX(linestr);
1327 new_bufend_pos = SvCUR(linestr);
1328 PL_parser->bufend = buf + new_bufend_pos;
1329 PL_parser->bufptr = buf + bufptr_pos;
1330 PL_parser->oldbufptr = buf + oldbufptr_pos;
1331 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1332 PL_parser->linestart = buf + linestart_pos;
1333 if (PL_parser->last_uni)
1334 PL_parser->last_uni = buf + last_uni_pos;
1335 if (PL_parser->last_lop)
1336 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1337 if (PL_parser->preambling != NOLINE) {
1338 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1339 PL_parser->preambling = NOLINE;
1340 }
17cc9359 1341 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1342 PL_curstash != PL_debstash) {
1343 /* debugger active and we're not compiling the debugger code,
1344 * so store the line into the debugger's array of lines
1345 */
1346 update_debugger_info(NULL, buf+old_bufend_pos,
1347 new_bufend_pos-old_bufend_pos);
1348 }
1349 return got_some;
1350}
1351
1352/*
1353=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1354
1355Looks ahead one (Unicode) character in the text currently being lexed.
1356Returns the codepoint (unsigned integer value) of the next character,
1357or -1 if lexing has reached the end of the input text. To consume the
1358peeked character, use L</lex_read_unichar>.
1359
1360If the next character is in (or extends into) the next chunk of input
1361text, the next chunk will be read in. Normally the current chunk will be
1362discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1363then the current chunk will not be discarded.
1364
1365If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1366is encountered, an exception is generated.
1367
1368=cut
1369*/
1370
1371I32
1372Perl_lex_peek_unichar(pTHX_ U32 flags)
1373{
749123ff 1374 dVAR;
f0e67a1d
Z
1375 char *s, *bufend;
1376 if (flags & ~(LEX_KEEP_PREVIOUS))
1377 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1378 s = PL_parser->bufptr;
1379 bufend = PL_parser->bufend;
1380 if (UTF) {
1381 U8 head;
1382 I32 unichar;
1383 STRLEN len, retlen;
1384 if (s == bufend) {
1385 if (!lex_next_chunk(flags))
1386 return -1;
1387 s = PL_parser->bufptr;
1388 bufend = PL_parser->bufend;
1389 }
1390 head = (U8)*s;
54d004e8 1391 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1392 return head;
54d004e8
KW
1393 if (UTF8_IS_START(head)) {
1394 len = UTF8SKIP(&head);
f0e67a1d
Z
1395 while ((STRLEN)(bufend-s) < len) {
1396 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1397 break;
1398 s = PL_parser->bufptr;
1399 bufend = PL_parser->bufend;
1400 }
1401 }
c80e42f3 1402 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1403 if (retlen == (STRLEN)-1) {
1404 /* malformed UTF-8 */
1405 ENTER;
1406 SAVESPTR(PL_warnhook);
1407 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1408 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1409 LEAVE;
1410 }
1411 return unichar;
1412 } else {
1413 if (s == bufend) {
1414 if (!lex_next_chunk(flags))
1415 return -1;
1416 s = PL_parser->bufptr;
1417 }
1418 return (U8)*s;
1419 }
1420}
1421
1422/*
1423=for apidoc Amx|I32|lex_read_unichar|U32 flags
1424
1425Reads the next (Unicode) character in the text currently being lexed.
1426Returns the codepoint (unsigned integer value) of the character read,
1427and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1428if lexing has reached the end of the input text. To non-destructively
1429examine the next character, use L</lex_peek_unichar> instead.
1430
1431If the next character is in (or extends into) the next chunk of input
1432text, the next chunk will be read in. Normally the current chunk will be
1433discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1434then the current chunk will not be discarded.
1435
1436If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1437is encountered, an exception is generated.
1438
1439=cut
1440*/
1441
1442I32
1443Perl_lex_read_unichar(pTHX_ U32 flags)
1444{
1445 I32 c;
1446 if (flags & ~(LEX_KEEP_PREVIOUS))
1447 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1448 c = lex_peek_unichar(flags);
1449 if (c != -1) {
1450 if (c == '\n')
83944c01 1451 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1452 if (UTF)
1453 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1454 else
1455 ++(PL_parser->bufptr);
f0e67a1d
Z
1456 }
1457 return c;
1458}
1459
1460/*
1461=for apidoc Amx|void|lex_read_space|U32 flags
1462
1463Reads optional spaces, in Perl style, in the text currently being
1464lexed. The spaces may include ordinary whitespace characters and
1465Perl-style comments. C<#line> directives are processed if encountered.
1466L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1467at a non-space character (or the end of the input text).
1468
1469If spaces extend into the next chunk of input text, the next chunk will
1470be read in. Normally the current chunk will be discarded at the same
1471time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1472chunk will not be discarded.
1473
1474=cut
1475*/
1476
21791330 1477#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1478#define LEX_NO_NEXT_CHUNK 0x80000000
1479
f0e67a1d
Z
1480void
1481Perl_lex_read_space(pTHX_ U32 flags)
1482{
1483 char *s, *bufend;
21791330 1484 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1485 bool need_incline = 0;
21791330 1486 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1487 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1488 s = PL_parser->bufptr;
1489 bufend = PL_parser->bufend;
1490 while (1) {
1491 char c = *s;
1492 if (c == '#') {
1493 do {
1494 c = *++s;
1495 } while (!(c == '\n' || (c == 0 && s == bufend)));
1496 } else if (c == '\n') {
1497 s++;
21791330
FC
1498 if (can_incline) {
1499 PL_parser->linestart = s;
1500 if (s == bufend)
1501 need_incline = 1;
1502 else
1503 incline(s);
1504 }
f0e67a1d
Z
1505 } else if (isSPACE(c)) {
1506 s++;
1507 } else if (c == 0 && s == bufend) {
1508 bool got_more;
65c68e17 1509 line_t l;
f0998909
Z
1510 if (flags & LEX_NO_NEXT_CHUNK)
1511 break;
f0e67a1d 1512 PL_parser->bufptr = s;
65c68e17 1513 l = CopLINE(PL_curcop);
851b527a 1514 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1515 got_more = lex_next_chunk(flags);
65c68e17 1516 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1517 s = PL_parser->bufptr;
1518 bufend = PL_parser->bufend;
1519 if (!got_more)
1520 break;
21791330 1521 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1522 incline(s);
1523 need_incline = 0;
1524 }
3c47da3c
FC
1525 } else if (!c) {
1526 s++;
f0e67a1d
Z
1527 } else {
1528 break;
1529 }
1530 }
f0e67a1d
Z
1531 PL_parser->bufptr = s;
1532}
1533
1534/*
fe788d6b
PM
1535
1536=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1537
1538This function performs syntax checking on a prototype, C<proto>.
1539If C<warn> is true, any illegal characters or mismatched brackets
1540will trigger illegalproto warnings, declaring that they were
1541detected in the prototype for C<name>.
1542
1543The return value is C<true> if this is a valid prototype, and
1544C<false> if it is not, regardless of whether C<warn> was C<true> or
1545C<false>.
1546
1547Note that C<NULL> is a valid C<proto> and will always return C<true>.
1548
1549=cut
1550
1551 */
1552
1553bool
1554Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1555{
1556 STRLEN len, origlen;
1557 char *p = proto ? SvPV(proto, len) : NULL;
1558 bool bad_proto = FALSE;
1559 bool in_brackets = FALSE;
1560 bool after_slash = FALSE;
1561 char greedy_proto = ' ';
1562 bool proto_after_greedy_proto = FALSE;
1563 bool must_be_last = FALSE;
1564 bool underscore = FALSE;
f791a21a 1565 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1566
1567 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1568
1569 if (!proto)
1570 return TRUE;
1571
1572 origlen = len;
1573 for (; len--; p++) {
1574 if (!isSPACE(*p)) {
1575 if (must_be_last)
1576 proto_after_greedy_proto = TRUE;
f791a21a
PM
1577 if (underscore) {
1578 if (!strchr(";@%", *p))
1579 bad_proto_after_underscore = TRUE;
1580 underscore = FALSE;
1581 }
fe788d6b
PM
1582 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1583 bad_proto = TRUE;
1584 }
1585 else {
fe788d6b
PM
1586 if (*p == '[')
1587 in_brackets = TRUE;
1588 else if (*p == ']')
1589 in_brackets = FALSE;
1590 else if ((*p == '@' || *p == '%') &&
1591 !after_slash &&
1592 !in_brackets ) {
1593 must_be_last = TRUE;
1594 greedy_proto = *p;
1595 }
1596 else if (*p == '_')
f791a21a 1597 underscore = TRUE;
fe788d6b
PM
1598 }
1599 if (*p == '\\')
1600 after_slash = TRUE;
1601 else
1602 after_slash = FALSE;
1603 }
1604 }
1605
1606 if (warn) {
b54d603d 1607 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1608 p -= origlen;
b54d603d
PM
1609 p = SvUTF8(proto)
1610 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1611 origlen, UNI_DISPLAY_ISPRINT)
1612 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1613
fe788d6b
PM
1614 if (proto_after_greedy_proto)
1615 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1616 "Prototype after '%c' for %"SVf" : %s",
1617 greedy_proto, SVfARG(name), p);
50278ed0
PM
1618 if (in_brackets)
1619 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1620 "Missing ']' in prototype for %"SVf" : %s",
1621 SVfARG(name), p);
b54d603d 1622 if (bad_proto)
fe788d6b 1623 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1624 "Illegal character in prototype for %"SVf" : %s",
1625 SVfARG(name), p);
1626 if (bad_proto_after_underscore)
1627 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1628 "Illegal character after '_' in prototype for %"SVf" : %s",
1629 SVfARG(name), p);
fe788d6b
PM
1630 }
1631
1632 return (! (proto_after_greedy_proto || bad_proto) );
1633}
1634
1635/*
ffb4593c
NT
1636 * S_incline
1637 * This subroutine has nothing to do with tilting, whether at windmills
1638 * or pinball tables. Its name is short for "increment line". It
57843af0 1639 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1640 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1641 * # line 500 "foo.pm"
1642 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1643 */
1644
76e3520e 1645STATIC void
d9095cec 1646S_incline(pTHX_ const char *s)
463ee0b2 1647{
d9095cec
NC
1648 const char *t;
1649 const char *n;
1650 const char *e;
8818d409 1651 line_t line_num;
463ee0b2 1652
7918f24d
NC
1653 PERL_ARGS_ASSERT_INCLINE;
1654
83944c01 1655 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1656 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1657 && s+1 == PL_bufend && *s == ';') {
1658 /* fake newline in string eval */
1659 CopLINE_dec(PL_curcop);
1660 return;
1661 }
463ee0b2
LW
1662 if (*s++ != '#')
1663 return;
d4c19fe8
AL
1664 while (SPACE_OR_TAB(*s))
1665 s++;
73659bf1
GS
1666 if (strnEQ(s, "line", 4))
1667 s += 4;
1668 else
1669 return;
084592ab 1670 if (SPACE_OR_TAB(*s))
73659bf1 1671 s++;
4e553d73 1672 else
73659bf1 1673 return;
d4c19fe8
AL
1674 while (SPACE_OR_TAB(*s))
1675 s++;
463ee0b2
LW
1676 if (!isDIGIT(*s))
1677 return;
d4c19fe8 1678
463ee0b2
LW
1679 n = s;
1680 while (isDIGIT(*s))
1681 s++;
07714eb4 1682 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1683 return;
bf4acbe4 1684 while (SPACE_OR_TAB(*s))
463ee0b2 1685 s++;
73659bf1 1686 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1687 s++;
73659bf1
GS
1688 e = t + 1;
1689 }
463ee0b2 1690 else {
c35e046a
AL
1691 t = s;
1692 while (!isSPACE(*t))
1693 t++;
73659bf1 1694 e = t;
463ee0b2 1695 }
bf4acbe4 1696 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1697 e++;
1698 if (*e != '\n' && *e != '\0')
1699 return; /* false alarm */
1700
96e440d2 1701 line_num = grok_atou(n, &e) - 1;
8818d409 1702
f4dd75d9 1703 if (t - s > 0) {
d9095cec 1704 const STRLEN len = t - s;
3df32bda 1705
d36ee5be 1706 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1707 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1708 * to *{"::_<newfilename"} */
44867030
NC
1709 /* However, the long form of evals is only turned on by the
1710 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1711 GV * const cfgv = CopFILEGV(PL_curcop);
1712 if (cfgv) {
38bd7ad8
FC
1713 char smallbuf[128];
1714 STRLEN tmplen2 = len;
44867030 1715 char *tmpbuf2;
449dd039 1716 GV *gv2;
44867030
NC
1717
1718 if (tmplen2 + 2 <= sizeof smallbuf)
1719 tmpbuf2 = smallbuf;
1720 else
1721 Newx(tmpbuf2, tmplen2 + 2, char);
1722
38bd7ad8
FC
1723 tmpbuf2[0] = '_';
1724 tmpbuf2[1] = '<';
44867030
NC
1725
1726 memcpy(tmpbuf2 + 2, s, tmplen2);
1727 tmplen2 += 2;
1728
8a5ee598 1729 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1730 if (!isGV(gv2)) {
8a5ee598 1731 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1732 /* adjust ${"::_<newfilename"} to store the new file name */
1733 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1734 /* The line number may differ. If that is the case,
1735 alias the saved lines that are in the array.
1736 Otherwise alias the whole array. */
1737 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1738 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1739 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1740 }
38bd7ad8
FC
1741 else if (GvAV(cfgv)) {
1742 AV * const av = GvAV(cfgv);
8818d409
FC
1743 const I32 start = CopLINE(PL_curcop)+1;
1744 I32 items = AvFILLp(av) - start;
1745 if (items > 0) {
1746 AV * const av2 = GvAVn(gv2);
1747 SV **svp = AvARRAY(av) + start;
1748 I32 l = (I32)line_num+1;
1749 while (items--)
1750 av_store(av2, l++, SvREFCNT_inc(*svp++));
1751 }
1752 }
e5527e4b 1753 }
44867030
NC
1754
1755 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1756 }
e66cf94c 1757 }
05ec9bb3 1758 CopFILE_free(PL_curcop);
449dd039 1759 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1760 }
8818d409 1761 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1762}
1763
21791330
FC
1764#define skipspace(s) skipspace_flags(s, 0)
1765
29595ff2 1766
80a702cd 1767STATIC void
15f169a1 1768S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1769{
1770 AV *av = CopFILEAVx(PL_curcop);
1771 if (av) {
7f1c3e8c
FC
1772 SV * sv;
1773 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1774 else {
1775 sv = *av_fetch(av, 0, 1);
1776 SvUPGRADE(sv, SVt_PVMG);
1777 }
1778 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1779 if (orig_sv)
7f1c3e8c 1780 sv_catsv(sv, orig_sv);
5fa550fb 1781 else
7f1c3e8c
FC
1782 sv_catpvn(sv, buf, len);
1783 if (!SvIOK(sv)) {
1784 (void)SvIOK_on(sv);
1785 SvIV_set(sv, 0);
1786 }
1787 if (PL_parser->preambling == NOLINE)
1788 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1789 }
1790}
1791
ffb4593c
NT
1792/*
1793 * S_skipspace
1794 * Called to gobble the appropriate amount and type of whitespace.
1795 * Skips comments as well.
1796 */
1797
76e3520e 1798STATIC char *
21791330 1799S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1800{
21791330 1801 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1802 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1803 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1804 s++;
f0e67a1d
Z
1805 } else {
1806 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1807 PL_bufptr = s;
21791330 1808 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1809 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1810 LEX_NO_NEXT_CHUNK : 0));
3280af22 1811 s = PL_bufptr;
f0e67a1d
Z
1812 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1813 if (PL_linestart > PL_bufptr)
1814 PL_bufptr = PL_linestart;
1815 return s;
463ee0b2 1816 }
5db06880 1817 return s;
a687059c 1818}
378cc40b 1819
ffb4593c
NT
1820/*
1821 * S_check_uni
1822 * Check the unary operators to ensure there's no ambiguity in how they're
1823 * used. An ambiguous piece of code would be:
1824 * rand + 5
1825 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1826 * the +5 is its argument.
1827 */
1828
76e3520e 1829STATIC void
cea2e8a9 1830S_check_uni(pTHX)
ba106d47 1831{
d4c19fe8
AL
1832 const char *s;
1833 const char *t;
2f3197b3 1834
3280af22 1835 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1836 return;
3280af22
NIS
1837 while (isSPACE(*PL_last_uni))
1838 PL_last_uni++;
c35e046a 1839 s = PL_last_uni;
8a2bca12 1840 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1841 s++;
3280af22 1842 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1843 return;
6136c704 1844
9b387841
NC
1845 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1846 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1847 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1848}
1849
ffb4593c
NT
1850/*
1851 * LOP : macro to build a list operator. Its behaviour has been replaced
1852 * with a subroutine, S_lop() for which LOP is just another name.
1853 */
1854
a0d0e21e
LW
1855#define LOP(f,x) return lop(f,x,s)
1856
ffb4593c
NT
1857/*
1858 * S_lop
1859 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1860 * - if we have a next token, then it's a list operator (no parens) for
1861 * which the next token has already been parsed; e.g.,
1862 * sort foo @args
1863 * sort foo (@args)
ffb4593c
NT
1864 * - if the next thing is an opening paren, then it's a function
1865 * - else it's a list operator
1866 */
1867
76e3520e 1868STATIC I32
a0be28da 1869S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1870{
7918f24d
NC
1871 PERL_ARGS_ASSERT_LOP;
1872
6154021b 1873 pl_yylval.ival = f;
35c8bce7 1874 CLINE;
3280af22
NIS
1875 PL_bufptr = s;
1876 PL_last_lop = PL_oldbufptr;
eb160463 1877 PL_last_lop_op = (OPCODE)f;
3280af22 1878 if (PL_nexttoke)
78cdf107 1879 goto lstop;
19f1898a 1880 PL_expect = x;
79072805 1881 if (*s == '(')
bbf60fe6 1882 return REPORT(FUNC);
294a536f 1883 s = skipspace(s);
79072805 1884 if (*s == '(')
bbf60fe6 1885 return REPORT(FUNC);
78cdf107
Z
1886 else {
1887 lstop:
1888 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1889 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1890 return REPORT(LSTOP);
78cdf107 1891 }
79072805
LW
1892}
1893
ffb4593c
NT
1894/*
1895 * S_force_next
9cbb5ea2 1896 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1897 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1898 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1899 * will need to set PL_nextval[] and possibly PL_expect to ensure
1900 * the lexer handles the token correctly.
ffb4593c
NT
1901 */
1902
4e553d73 1903STATIC void
cea2e8a9 1904S_force_next(pTHX_ I32 type)
79072805 1905{
704d4215
GG
1906#ifdef DEBUGGING
1907 if (DEBUG_T_TEST) {
1908 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1909 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1910 }
1911#endif
1f7c3e7c 1912 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
1913 PL_nexttype[PL_nexttoke] = type;
1914 PL_nexttoke++;
1915 if (PL_lex_state != LEX_KNOWNEXT) {
1916 PL_lex_defer = PL_lex_state;
3280af22 1917 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1918 }
1919}
1920
89f35911
FC
1921/*
1922 * S_postderef
1923 *
1924 * This subroutine handles postfix deref syntax after the arrow has already
1925 * been emitted. @* $* etc. are emitted as two separate token right here.
1926 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1927 * only the first, leaving yylex to find the next.
89f35911
FC
1928 */
1929
1930static int
ff25e5db 1931S_postderef(pTHX_ int const funny, char const next)
89f35911 1932{
ff25e5db 1933 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1934 assert(strchr("*[{", next));
1935 if (next == '*') {
1936 PL_expect = XOPERATOR;
cc624add 1937 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1938 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1939 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1940 force_next(POSTJOIN);
1941 }
89f35911
FC
1942 force_next(next);
1943 PL_bufptr+=2;
1944 }
1945 else {
760ca746
FC
1946 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1947 && !PL_lex_brackets)
1948 PL_lex_dojoin = 2;
89f35911
FC
1949 PL_expect = XOPERATOR;
1950 PL_bufptr++;
1951 }
1952 return funny;
1953}
1954
28ac2b49
Z
1955void
1956Perl_yyunlex(pTHX)
1957{
a7aaec61
Z
1958 int yyc = PL_parser->yychar;
1959 if (yyc != YYEMPTY) {
1960 if (yyc) {
a7aaec61
Z
1961 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1962 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1963 PL_lex_allbrackets--;
a7aaec61 1964 PL_lex_brackets--;
78cdf107
Z
1965 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1966 } else if (yyc == '('/*)*/) {
1967 PL_lex_allbrackets--;
1968 yyc |= (2<<24);
a7aaec61
Z
1969 }
1970 force_next(yyc);
1971 }
28ac2b49
Z
1972 PL_parser->yychar = YYEMPTY;
1973 }
1974}
1975
d0a148a6 1976STATIC SV *
15f169a1 1977S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1978{
740cce10 1979 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1980 !IN_BYTES
1981 && UTF
9f10db87 1982 && !is_invariant_string((const U8*)start, len)
740cce10 1983 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1984 return sv;
1985}
1986
ffb4593c
NT
1987/*
1988 * S_force_word
1989 * When the lexer knows the next thing is a word (for instance, it has
1990 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1991 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1992 * lookahead.
ffb4593c
NT
1993 *
1994 * Arguments:
b1b65b59 1995 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1996 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1997 * int check_keyword : if true, Perl checks to make sure the word isn't
1998 * a keyword (do this if the word is a label, e.g. goto FOO)
1999 * int allow_pack : if true, : characters will also be allowed (require,
2000 * use, etc. do this)
ffb4593c
NT
2001 */
2002
76e3520e 2003STATIC char *
345b3785 2004S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2005{
eb578fdb 2006 char *s;
463ee0b2 2007 STRLEN len;
4e553d73 2008
7918f24d
NC
2009 PERL_ARGS_ASSERT_FORCE_WORD;
2010
294a536f 2011 start = skipspace(start);
463ee0b2 2012 s = start;
7e2040f0 2013 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2014 (allow_pack && *s == ':') )
a0d0e21e 2015 {
3280af22 2016 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2017 if (check_keyword) {
2018 char *s2 = PL_tokenbuf;
487e470d 2019 STRLEN len2 = len;
01b5ef50 2020 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
487e470d
FC
2021 s2 += 6, len2 -= 6;
2022 if (keyword(s2, len2, 0))
463ee0b2 2023 return start;
01b5ef50 2024 }
463ee0b2 2025 if (token == METHOD) {
294a536f 2026 s = skipspace(s);
463ee0b2 2027 if (*s == '(')
3280af22 2028 PL_expect = XTERM;
463ee0b2 2029 else {
3280af22 2030 PL_expect = XOPERATOR;
463ee0b2 2031 }
79072805 2032 }
9ded7720 2033 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2034 = (OP*)newSVOP(OP_CONST,0,
2035 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2036 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2037 force_next(token);
2038 }
2039 return s;
2040}
2041
ffb4593c
NT
2042/*
2043 * S_force_ident
9cbb5ea2 2044 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2045 * text only contains the "foo" portion. The first argument is a pointer
2046 * to the "foo", and the second argument is the type symbol to prefix.
2047 * Forces the next token to be a "WORD".
9cbb5ea2 2048 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2049 */
2050
76e3520e 2051STATIC void
5aaab254 2052S_force_ident(pTHX_ const char *s, int kind)
79072805 2053{
7918f24d
NC
2054 PERL_ARGS_ASSERT_FORCE_IDENT;
2055
c9b48522
DD
2056 if (s[0]) {
2057 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2058 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2059 UTF ? SVf_UTF8 : 0));
9ded7720 2060 NEXTVAL_NEXTTOKE.opval = o;
79072805 2061 force_next(WORD);
748a9306 2062 if (kind) {
11343788 2063 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2064 /* XXX see note in pp_entereval() for why we forgo typo
2065 warnings if the symbol must be introduced in an eval.
2066 GSAR 96-10-12 */
90e5519e 2067 gv_fetchpvn_flags(s, len,
4bff32c5 2068 (PL_in_eval ? GV_ADDMULTI
728847b1 2069 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2070 kind == '$' ? SVt_PV :
2071 kind == '@' ? SVt_PVAV :
2072 kind == '%' ? SVt_PVHV :
a0d0e21e 2073 SVt_PVGV
90e5519e 2074 );
748a9306 2075 }
79072805
LW
2076 }
2077}
2078
3f33d153
FC
2079static void
2080S_force_ident_maybe_lex(pTHX_ char pit)
2081{
3f33d153
FC
2082 NEXTVAL_NEXTTOKE.ival = pit;
2083 force_next('p');
2084}
2085
1571675a
GS
2086NV
2087Perl_str_to_version(pTHX_ SV *sv)
2088{
2089 NV retval = 0.0;
2090 NV nshift = 1.0;
2091 STRLEN len;
cfd0369c 2092 const char *start = SvPV_const(sv,len);
9d4ba2ae 2093 const char * const end = start + len;
504618e9 2094 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2095
2096 PERL_ARGS_ASSERT_STR_TO_VERSION;
2097
1571675a 2098 while (start < end) {
ba210ebe 2099 STRLEN skip;
1571675a
GS
2100 UV n;
2101 if (utf)
9041c2e3 2102 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2103 else {
2104 n = *(U8*)start;
2105 skip = 1;
2106 }
2107 retval += ((NV)n)/nshift;
2108 start += skip;
2109 nshift *= 1000;
2110 }
2111 return retval;
2112}
2113
4e553d73 2114/*
ffb4593c
NT
2115 * S_force_version
2116 * Forces the next token to be a version number.
e759cc13
RGS
2117 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2118 * and if "guessing" is TRUE, then no new token is created (and the caller
2119 * must use an alternative parsing method).
ffb4593c
NT
2120 */
2121
76e3520e 2122STATIC char *
e759cc13 2123S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2124{
5f66b61c 2125 OP *version = NULL;
44dcb63b 2126 char *d;
89bfa8cd 2127
7918f24d
NC
2128 PERL_ARGS_ASSERT_FORCE_VERSION;
2129
294a536f 2130 s = skipspace(s);
89bfa8cd 2131
44dcb63b 2132 d = s;
dd629d5b 2133 if (*d == 'v')
44dcb63b 2134 d++;
44dcb63b 2135 if (isDIGIT(*d)) {
e759cc13
RGS
2136 while (isDIGIT(*d) || *d == '_' || *d == '.')
2137 d++;
4e4da3ac 2138 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2139 SV *ver;
6154021b
RGS
2140 s = scan_num(s, &pl_yylval);
2141 version = pl_yylval.opval;
dd629d5b
GS
2142 ver = cSVOPx(version)->op_sv;
2143 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2144 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2145 SvNV_set(ver, str_to_version(ver));
1571675a 2146 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2147 }
89bfa8cd 2148 }
5db06880 2149 else if (guessing) {
e759cc13 2150 return s;
5db06880 2151 }
89bfa8cd
PP
2152 }
2153
2154 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2155 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2156 force_next(WORD);
89bfa8cd 2157
e759cc13 2158 return s;
89bfa8cd
PP
2159}
2160
ffb4593c 2161/*
91152fc1
DG
2162 * S_force_strict_version
2163 * Forces the next token to be a version number using strict syntax rules.
2164 */
2165
2166STATIC char *
2167S_force_strict_version(pTHX_ char *s)
2168{
91152fc1 2169 OP *version = NULL;
91152fc1
DG
2170 const char *errstr = NULL;
2171
2172 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2173
2174 while (isSPACE(*s)) /* leading whitespace */
2175 s++;
2176
2177 if (is_STRICT_VERSION(s,&errstr)) {
2178 SV *ver = newSV(0);
2179 s = (char *)scan_version(s, ver, 0);
2180 version = newSVOP(OP_CONST, 0, ver);
2181 }
4e4da3ac 2182 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
294a536f 2183 (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2184 {
91152fc1
DG
2185 PL_bufptr = s;
2186 if (errstr)
2187 yyerror(errstr); /* version required */
2188 return s;
2189 }
2190
91152fc1 2191 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2192 NEXTVAL_NEXTTOKE.opval = version;
2193 force_next(WORD);
2194
2195 return s;
2196}
2197
2198/*
ffb4593c
NT
2199 * S_tokeq
2200 * Tokenize a quoted string passed in as an SV. It finds the next
2201 * chunk, up to end of string or a backslash. It may make a new
2202 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2203 * turns \\ into \.
2204 */
2205
76e3520e 2206STATIC SV *
cea2e8a9 2207S_tokeq(pTHX_ SV *sv)
79072805 2208{
eb578fdb
KW
2209 char *s;
2210 char *send;
2211 char *d;
b3ac6de7 2212 SV *pv = sv;
79072805 2213
7918f24d
NC
2214 PERL_ARGS_ASSERT_TOKEQ;
2215
279b35ad
FC
2216 assert (SvPOK(sv));
2217 assert (SvLEN(sv));
2218 assert (!SvIsCOW(sv));
307ed071 2219 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2220 goto finish;
279b35ad
FC
2221 s = SvPVX(sv);
2222 send = SvEND(sv);
dcb21ed6
NC
2223 /* This is relying on the SV being "well formed" with a trailing '\0' */
2224 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2225 s++;
2226 if (s == send)
b3ac6de7 2227 goto finish;
79072805 2228 d = s;
be4731d2 2229 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2230 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2231 SVs_TEMP | SvUTF8(sv));
be4731d2 2232 }
79072805
LW
2233 while (s < send) {
2234 if (*s == '\\') {
a0d0e21e 2235 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2236 s++; /* all that, just for this */
2237 }
2238 *d++ = *s++;
2239 }
2240 *d = '\0';
95a20fc0 2241 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2242 finish:
3280af22 2243 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2244 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2245 return sv;
2246}
2247
ffb4593c
NT
2248/*
2249 * Now come three functions related to double-quote context,
2250 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2251 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2252 * interact with PL_lex_state, and create fake ( ... ) argument lists
2253 * to handle functions and concatenation.
ecd24171
DM
2254 * For example,
2255 * "foo\lbar"
2256 * is tokenised as
2257 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2258 */
2259
2260/*
2261 * S_sublex_start
6154021b 2262 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2263 *
2264 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2265 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2266 *
2267 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2268 *
2269 * Everything else becomes a FUNC.
2270 *
2271 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2272 * had an OP_CONST or OP_READLINE). This just sets us up for a
2273 * call to S_sublex_push().
2274 */
2275
76e3520e 2276STATIC I32
cea2e8a9 2277S_sublex_start(pTHX)
79072805 2278{
eb578fdb 2279 const I32 op_type = pl_yylval.ival;
79072805
LW
2280
2281 if (op_type == OP_NULL) {
6154021b 2282 pl_yylval.opval = PL_lex_op;
5f66b61c 2283 PL_lex_op = NULL;
79072805
LW
2284 return THING;
2285 }
466112bb 2286 if (op_type == OP_CONST) {
67c71cbb
FC
2287 SV *sv = PL_lex_stuff;
2288 PL_lex_stuff = NULL;
2289 sv = tokeq(sv);
b3ac6de7
IZ
2290
2291 if (SvTYPE(sv) == SVt_PVIV) {
2292 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2293 STRLEN len;
96a5add6 2294 const char * const p = SvPV_const(sv, len);
740cce10 2295 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2296 SvREFCNT_dec(sv);
2297 sv = nsv;
4e553d73 2298 }
6154021b 2299 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
79072805
LW
2300 return THING;
2301 }
2302
3280af22 2303 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2304 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2305 PL_sublex_info.sub_op = PL_lex_op;
2306 PL_lex_state = LEX_INTERPPUSH;
55497cff 2307
3280af22
NIS
2308 PL_expect = XTERM;
2309 if (PL_lex_op) {
6154021b 2310 pl_yylval.opval = PL_lex_op;
5f66b61c 2311 PL_lex_op = NULL;
55497cff
PP
2312 return PMFUNC;
2313 }
2314 else
2315 return FUNC;
2316}
2317
ffb4593c
NT
2318/*
2319 * S_sublex_push
2320 * Create a new scope to save the lexing state. The scope will be
2321 * ended in S_sublex_done. Returns a '(', starting the function arguments
2322 * to the uc, lc, etc. found before.
2323 * Sets PL_lex_state to LEX_INTERPCONCAT.
2324 */
2325
76e3520e 2326STATIC I32
cea2e8a9 2327S_sublex_push(pTHX)
55497cff 2328{
78a635de 2329 LEXSHARED *shared;
801d32ac 2330 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2331 ENTER;
55497cff 2332
3280af22 2333 PL_lex_state = PL_sublex_info.super_state;
cc624add 2334 SAVEI8(PL_lex_dojoin);
3280af22 2335 SAVEI32(PL_lex_brackets);
78cdf107 2336 SAVEI32(PL_lex_allbrackets);
b27dce25 2337 SAVEI32(PL_lex_formbrack);
78cdf107 2338 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2339 SAVEI32(PL_lex_casemods);
2340 SAVEI32(PL_lex_starts);
651b5b28 2341 SAVEI8(PL_lex_state);
7cc34111 2342 SAVESPTR(PL_lex_repl);
7766f137 2343 SAVEVPTR(PL_lex_inpat);
98246f1e 2344 SAVEI16(PL_lex_inwhat);
ffdb8b16 2345 if (is_heredoc)
b42366d4 2346 {
ffdb8b16 2347 SAVECOPLINE(PL_curcop);
b42366d4 2348 SAVEI32(PL_multi_end);
851b527a
FC
2349 SAVEI32(PL_parser->herelines);
2350 PL_parser->herelines = 0;
b42366d4
FC
2351 }
2352 SAVEI8(PL_multi_close);
3280af22 2353 SAVEPPTR(PL_bufptr);
8452ff4b 2354 SAVEPPTR(PL_bufend);
3280af22
NIS
2355 SAVEPPTR(PL_oldbufptr);
2356 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2357 SAVEPPTR(PL_last_lop);
2358 SAVEPPTR(PL_last_uni);
3280af22
NIS
2359 SAVEPPTR(PL_linestart);
2360 SAVESPTR(PL_linestr);
8edd5f42
RGS
2361 SAVEGENERICPV(PL_lex_brackstack);
2362 SAVEGENERICPV(PL_lex_casestack);
78a635de 2363 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2364 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2365 SAVEI32(PL_copline);
3280af22 2366
99bd9d90 2367 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2368 scopes to find the body of the here-doc. So we put PL_linestr and
2369 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2370 */
60f40a38
FC
2371 PL_parser->lex_shared->ls_linestr = PL_linestr;
2372 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2373
3280af22 2374 PL_linestr = PL_lex_stuff;
7cc34111 2375 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2376 PL_lex_stuff = NULL;
7cc34111 2377 PL_sublex_info.repl = NULL;
3280af22 2378
eabab8bc
FC
2379 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2380 set for an inner quote-like operator and then an error causes scope-
2381 popping. We must not have a PL_lex_stuff value left dangling, as
2382 that breaks assumptions elsewhere. See bug #123617. */
2383 SAVEGENERICSV(PL_lex_stuff);
2384
9cbb5ea2
GS
2385 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2386 = SvPVX(PL_linestr);
3280af22 2387 PL_bufend += SvCUR(PL_linestr);
bd61b366 2388 PL_last_lop = PL_last_uni = NULL;
3280af22 2389 SAVEFREESV(PL_linestr);
4dc843bc 2390 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2391
2392 PL_lex_dojoin = FALSE;
b27dce25 2393 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2394 PL_lex_allbrackets = 0;
2395 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2396 Newx(PL_lex_brackstack, 120, char);
2397 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2398 PL_lex_casemods = 0;
2399 *PL_lex_casestack = '\0';
2400 PL_lex_starts = 0;
2401 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2402 if (is_heredoc)
6ddcf93b 2403 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2404 PL_copline = NOLINE;
78a635de
FC
2405
2406 Newxz(shared, 1, LEXSHARED);
2407 shared->ls_prev = PL_parser->lex_shared;
2408 PL_parser->lex_shared = shared;
3280af22
NIS
2409
2410 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2411 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2412 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2413 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2414 else
5f66b61c 2415 PL_lex_inpat = NULL;
79072805 2416
3a54fd60
DM
2417 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2418 PL_in_eval &= ~EVAL_RE_REPARSING;
2419
55497cff 2420 return '(';
79072805
LW
2421}
2422
ffb4593c
NT
2423/*
2424 * S_sublex_done
2425 * Restores lexer state after a S_sublex_push.
2426 */
2427
76e3520e 2428STATIC I32
cea2e8a9 2429S_sublex_done(pTHX)
79072805 2430{
3280af22 2431 if (!PL_lex_starts++) {
396482e1 2432 SV * const sv = newSVpvs("");
9aa983d2
JH
2433 if (SvUTF8(PL_linestr))
2434 SvUTF8_on(sv);
3280af22 2435 PL_expect = XOPERATOR;
6154021b 2436 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2437 return THING;
2438 }
2439
3280af22
NIS
2440 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2441 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2442 return yylex();
79072805
LW
2443 }
2444
ffb4593c 2445 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2446 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2447 if (PL_lex_repl) {
2448 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2449 PL_linestr = PL_lex_repl;
2450 PL_lex_inpat = 0;
2451 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2452 PL_bufend += SvCUR(PL_linestr);
bd61b366 2453 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2454 PL_lex_dojoin = FALSE;
2455 PL_lex_brackets = 0;
78cdf107
Z
2456 PL_lex_allbrackets = 0;
2457 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2458 PL_lex_casemods = 0;
2459 *PL_lex_casestack = '\0';
2460 PL_lex_starts = 0;
25da4f38 2461 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2462 PL_lex_state = LEX_INTERPNORMAL;
2463 PL_lex_starts++;
e9fa98b2
HS
2464 /* we don't clear PL_lex_repl here, so that we can check later
2465 whether this is an evalled subst; that means we rely on the
2466 logic to ensure sublex_done() is called again only via the
2467 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2468 }
e9fa98b2 2469 else {
3280af22 2470 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2471 PL_lex_repl = NULL;
e9fa98b2 2472 }
ffdb8b16
FC
2473 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2474 CopLINE(PL_curcop) +=
2475 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2476 + PL_parser->herelines;
2477 PL_parser->herelines = 0;
ffdb8b16 2478 }
9b6b7be8 2479 return '/';
ffed7fef
LW
2480 }
2481 else {
b42366d4 2482 const line_t l = CopLINE(PL_curcop);
f46d017c 2483 LEAVE;
b42366d4 2484 if (PL_multi_close == '<')
851b527a 2485 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2486 PL_bufend = SvPVX(PL_linestr);
2487 PL_bufend += SvCUR(PL_linestr);
2488 PL_expect = XOPERATOR;
79072805 2489 return ')';
ffed7fef
LW
2490 }
2491}
2492
6f613c73
KW
2493PERL_STATIC_INLINE SV*
2494S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2495{
140b12ad
KW
2496 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2497 * interior, hence to the "}". Finds what the name resolves to, returning
2498 * an SV* containing it; NULL if no valid one found */
2499
dd2b1b72 2500 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2501
0c415a79
KW
2502 HV * table;
2503 SV **cvp;
2504 SV *cv;
2505 SV *rv;
2506 HV *stash;
2507 const U8* first_bad_char_loc;
2508 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2509
6f613c73
KW
2510 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2511
b7e6151c
FC
2512 if (!SvCUR(res))
2513 return res;
2514
107160e2
KW
2515 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2516 e - backslash_ptr,
2517 &first_bad_char_loc))
2518 {
2519 /* If warnings are on, this will print a more detailed analysis of what
2520 * is wrong than the error message below */
c80e42f3 2521 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2522 e - ((char *) first_bad_char_loc),
2523 NULL, 0);
2524
2525 /* We deliberately don't try to print the malformed character, which
2526 * might not print very well; it also may be just the first of many
2527 * malformations, so don't print what comes after it */
2528 yyerror(Perl_form(aTHX_
2529 "Malformed UTF-8 character immediately after '%.*s'",
2530 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2531 return NULL;
2532 }
2533
2534 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2535 /* include the <}> */
2536 e - backslash_ptr + 1);
6f613c73 2537 if (! SvPOK(res)) {
b6407c49 2538 SvREFCNT_dec_NN(res);
6f613c73
KW
2539 return NULL;
2540 }
2541
0c415a79
KW
2542 /* See if the charnames handler is the Perl core's, and if so, we can skip
2543 * the validation needed for a user-supplied one, as Perl's does its own
2544 * validation. */
2545 table = GvHV(PL_hintgv); /* ^H */
2546 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2547 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2548 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2549 {
2550 const char * const name = HvNAME(stash);
6a642c21
FC
2551 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2552 && strEQ(name, "_charnames")) {
0c415a79
KW
2553 return res;
2554 }
2555 }
2556
bde9e88d
KW
2557 /* Here, it isn't Perl's charname handler. We can't rely on a
2558 * user-supplied handler to validate the input name. For non-ut8 input,
2559 * look to see that the first character is legal. Then loop through the
2560 * rest checking that each is a continuation */
6f613c73 2561
36897d64
KW
2562 /* This code makes the reasonable assumption that the only Latin1-range
2563 * characters that begin a character name alias are alphabetic, otherwise
2564 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2565
b6ba1137 2566 if (! UTF) {
bde9e88d 2567 if (! isALPHAU(*s)) {
b6ba1137
KW
2568 goto bad_charname;
2569 }
bde9e88d
KW
2570 s++;
2571 while (s < e) {
2572 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2573 goto bad_charname;
2574 }
2d8eb851
KW
2575 if (*s == ' ' && *(s-1) == ' ') {
2576 goto multi_spaces;
bd299e29 2577 }
df758df2
KW
2578 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2579 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2580 "NO-BREAK SPACE in a charnames "
2581 "alias definition is deprecated");
2582 }
bde9e88d 2583 s++;
b6ba1137
KW
2584 }
2585 }
2586 else {
bde9e88d
KW
2587 /* Similarly for utf8. For invariants can check directly; for other
2588 * Latin1, can calculate their code point and check; otherwise use a
2589 * swash */
2590 if (UTF8_IS_INVARIANT(*s)) {
2591 if (! isALPHAU(*s)) {
140b12ad
KW
2592 goto bad_charname;
2593 }
bde9e88d
KW
2594 s++;
2595 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2596 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2597 goto bad_charname;
6f613c73 2598 }
bde9e88d 2599 s += 2;
6f613c73 2600 }
bde9e88d
KW
2601 else {
2602 if (! PL_utf8_charname_begin) {
2603 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2604 PL_utf8_charname_begin = _core_swash_init("utf8",
2605 "_Perl_Charname_Begin",
2606 &PL_sv_undef,
2607 1, 0, NULL, &flags);
2608 }
2609 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2610 goto bad_charname;
2611 }
2612 s += UTF8SKIP(s);
2613 }
2614
2615 while (s < e) {
2616 if (UTF8_IS_INVARIANT(*s)) {
2617 if (! isCHARNAME_CONT(*s)) {
2618 goto bad_charname;
2619 }
2d8eb851
KW
2620 if (*s == ' ' && *(s-1) == ' ') {
2621 goto multi_spaces;
bd299e29 2622 }
bde9e88d
KW
2623 s++;
2624 }
2625 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2626 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2627 {
2628 goto bad_charname;
2629 }
df758df2
KW
2630 if (*s == *NBSP_UTF8
2631 && *(s+1) == *(NBSP_UTF8+1)
2632 && ckWARN_d(WARN_DEPRECATED))
2633 {
2634 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2635 "NO-BREAK SPACE in a charnames "
2636 "alias definition is deprecated");
2637 }
bde9e88d
KW
2638 s += 2;
2639 }
2640 else {
2641 if (! PL_utf8_charname_continue) {
2642 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2643 PL_utf8_charname_continue = _core_swash_init("utf8",
2644 "_Perl_Charname_Continue",
2645 &PL_sv_undef,
2646 1, 0, NULL, &flags);
2647 }
2648 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2649 goto bad_charname;
2650 }
2651 s += UTF8SKIP(s);
6f613c73
KW
2652 }
2653 }
2d8eb851
KW
2654 }
2655 if (*(s-1) == ' ') {
2656 yyerror_pv(
2657 Perl_form(aTHX_
2658 "charnames alias definitions may not contain trailing "
2659 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2660 (int)(s - backslash_ptr + 1), backslash_ptr,
2661 (int)(e - s + 1), s + 1
2662 ),
2663 UTF ? SVf_UTF8 : 0);
2664 return NULL;
6f613c73
KW
2665 }
2666
94ca1619 2667 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2668 const U8* first_bad_char_loc;
2669 STRLEN len;
2670 const char* const str = SvPV_const(res, len);
2671 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2672 /* If warnings are on, this will print a more detailed analysis of
2673 * what is wrong than the error message below */
c80e42f3 2674 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2675 (char *) first_bad_char_loc - str,
2676 NULL, 0);
2677
2678 /* We deliberately don't try to print the malformed character,
2679 * which might not print very well; it also may be just the first
2680 * of many malformations, so don't print what comes after it */
2681 yyerror_pv(
2682 Perl_form(aTHX_
2683 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2684 (int) (e - backslash_ptr + 1), backslash_ptr,
2685 (int) ((char *) first_bad_char_loc - str), str
2686 ),
2687 SVf_UTF8);
2688 return NULL;
2689 }
2690 }
140b12ad 2691
bde9e88d 2692 return res;
140b12ad 2693
bde9e88d 2694 bad_charname: {
bde9e88d
KW
2695
2696 /* The final %.*s makes sure that should the trailing NUL be missing
2697 * that this print won't run off the end of the string */
2698 yyerror_pv(
2699 Perl_form(aTHX_
2700 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2701 (int)(s - backslash_ptr + 1), backslash_ptr,
2702 (int)(e - s + 1), s + 1
bde9e88d
KW
2703 ),
2704 UTF ? SVf_UTF8 : 0);
2705 return NULL;
2706 }
2d8eb851
KW
2707
2708 multi_spaces:
2709 yyerror_pv(
2710 Perl_form(aTHX_
2711 "charnames alias definitions may not contain a sequence of "
2712 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2713 (int)(s - backslash_ptr + 1), backslash_ptr,
2714 (int)(e - s + 1), s + 1
2715 ),
2716 UTF ? SVf_UTF8 : 0);
2717 return NULL;
6f613c73
KW
2718}
2719
02aa26ce
NT
2720/*
2721 scan_const
2722
9da1dd8f
DM
2723 Extracts the next constant part of a pattern, double-quoted string,
2724 or transliteration. This is terrifying code.
2725
2726 For example, in parsing the double-quoted string "ab\x63$d", it would
2727 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2728
94def140 2729 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2730 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2731 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2732
94def140
ST
2733 Returns a pointer to the character scanned up to. If this is
2734 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2735 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2736 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2737 by looking at the next characters herself.
2738
02aa26ce 2739 In patterns:
9da1dd8f 2740 expand:
537124e4
KW
2741 \N{FOO} => \N{U+hex_for_character_FOO}
2742 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2743
2744 pass through:
2745 all other \-char, including \N and \N{ apart from \N{ABC}
2746
2747 stops on:
2748 @ and $ where it appears to be a var, but not for $ as tail anchor
2749 \l \L \u \U \Q \E
2750 (?{ or (??{
2751
02aa26ce
NT
2752
2753 In transliterations:
2754 characters are VERY literal, except for - not at the start or end
94def140
ST
2755 of the string, which indicates a range. If the range is in bytes,
2756 scan_const expands the range to the full set of intermediate
2757 characters. If the range is in utf8, the hyphen is replaced with
2758 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2759
2760 In double-quoted strings:
2761 backslashes:
2762 double-quoted style: \r and \n
ff3f963a 2763 constants: \x31, etc.
94def140 2764 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2765 case and quoting: \U \Q \E
2766 stops on @ and $
2767
2768 scan_const does *not* construct ops to handle interpolated strings.
2769 It stops processing as soon as it finds an embedded $ or @ variable
2770 and leaves it to the caller to work out what's going on.
2771
94def140
ST
2772 embedded arrays (whether in pattern or not) could be:
2773 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2774
2775 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2776
2777 $ in pattern could be $foo or could be tail anchor. Assumption:
2778 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2779 followed by one of "()| \r\n\t"
02aa26ce 2780
9da1dd8f 2781 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2782
2783 The structure of the code is
2784 while (there's a character to process) {
94def140
ST
2785 handle transliteration ranges
2786 skip regexp comments /(?#comment)/ and codes /(?{code})/
2787 skip #-initiated comments in //x patterns
2788 check for embedded arrays
02aa26ce
NT
2789 check for embedded scalars
2790 if (backslash) {
94def140 2791 deprecate \1 in substitution replacements
02aa26ce
NT
2792 handle string-changing backslashes \l \U \Q \E, etc.
2793 switch (what was escaped) {
94def140 2794 handle \- in a transliteration (becomes a literal -)
ff3f963a 2795 if a pattern and not \N{, go treat as regular character
94def140
ST
2796 handle \132 (octal characters)
2797 handle \x15 and \x{1234} (hex characters)
ff3f963a 2798 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2799 handle \cV (control characters)
2800 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2801 } (end switch)
77a135fe 2802 continue
02aa26ce 2803 } (end if backslash)
77a135fe 2804 handle regular character
02aa26ce 2805 } (end while character to read)
4e553d73 2806
02aa26ce
NT
2807*/
2808
76e3520e 2809STATIC char *
cea2e8a9 2810S_scan_const(pTHX_ char *start)
79072805 2811{
eb578fdb 2812 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2813 SV *sv = newSV(send - start); /* sv for the constant. See note below
2814 on sizing. */
eb578fdb
KW
2815 char *s = start; /* start of the constant */
2816 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2817 bool dorange = FALSE; /* are we in a translit range? */
2818 bool didrange = FALSE; /* did we just finish a range? */
2819 bool in_charclass = FALSE; /* within /[...]/ */
2820 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2821 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2822 UTF8? But, this can show as true
2823 when the source isn't utf8, as for
2824 example when it is entirely composed
2825 of hex constants */
6f613c73 2826 SV *res; /* result from charnames */
77a135fe
KW
2827
2828 /* Note on sizing: The scanned constant is placed into sv, which is
2829 * initialized by newSV() assuming one byte of output for every byte of
2830 * input. This routine expects newSV() to allocate an extra byte for a
2831 * trailing NUL, which this routine will append if it gets to the end of
2832 * the input. There may be more bytes of input than output (eg., \N{LATIN
2833 * CAPITAL LETTER A}), or more output than input if the constant ends up
2834 * recoded to utf8, but each time a construct is found that might increase
2835 * the needed size, SvGROW() is called. Its size parameter each time is
2836 * based on the best guess estimate at the time, namely the length used so
2837 * far, plus the length the current construct will occupy, plus room for
2838 * the trailing NUL, plus one byte for every input byte still unscanned */
2839
c3320c2a
KW
2840 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2841 before set */
4c3a8340
ST
2842#ifdef EBCDIC
2843 UV literal_endpoint = 0;
e294cc5d 2844 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2845#endif
012bcf8d 2846
7918f24d
NC
2847 PERL_ARGS_ASSERT_SCAN_CONST;
2848
bb16bae8 2849 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2850 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2851 /* If we are doing a trans and we know we want UTF8 set expectation */
2852 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2853 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2854 }
2855
b899e89d
FC
2856 /* Protect sv from errors and fatal warnings. */
2857 ENTER_with_name("scan_const");
2858 SAVEFREESV(sv);
2b9d42f0 2859
79072805 2860 while (s < send || dorange) {
ff3f963a 2861
02aa26ce 2862 /* get transliterations out of the way (they're most literal) */
3280af22 2863 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2864 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2865 if (dorange) {
1ba5c669
JH
2866 I32 i; /* current expanded character */
2867 I32 min; /* first character in range */
2868 I32 max; /* last character in range */
02aa26ce 2869
e294cc5d
JH
2870#ifdef EBCDIC
2871 UV uvmax = 0;
2872#endif
2873
2874 if (has_utf8
2875#ifdef EBCDIC
2876 && !native_range
2877#endif
1953db30 2878 ) {
9d4ba2ae 2879 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2880 char *e = d++;
2881 while (e-- > c)
2882 *(e + 1) = *e;
e7214ce8 2883 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2884 /* mark the range as done, and continue */
2885 dorange = FALSE;
2886 didrange = TRUE;
2887 continue;
2888 }
2b9d42f0 2889
95a20fc0 2890 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2891#ifdef EBCDIC
2892 SvGROW(sv,
dc023dbb
KW
2893 SvLEN(sv) + ((has_utf8)
2894 ? (512 - UTF_CONTINUATION_MARK
2895 + UNISKIP(0x100))
e294cc5d
JH
2896 : 256));
2897 /* How many two-byte within 0..255: 128 in UTF-8,
2898 * 96 in UTF-8-mod. */
2899#else
9cbb5ea2 2900 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2901#endif
9cbb5ea2 2902 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2903#ifdef EBCDIC
2904 if (has_utf8) {
2905 int j;
2906 for (j = 0; j <= 1; j++) {
2907 char * const c = (char*)utf8_hop((U8*)d, -1);
2908 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2909 if (j)
2910 min = (U8)uv;
2911 else if (uv < 256)
2912 max = (U8)uv;
2913 else {
2914 max = (U8)0xff; /* only to \xff */
2915 uvmax = uv; /* \x{100} to uvmax */
2916 }
2917 d = c; /* eat endpoint chars */
2918 }
2919 }
2920 else {
2921#endif
2922 d -= 2; /* eat the first char and the - */
2923 min = (U8)*d; /* first char in range */
2924 max = (U8)d[1]; /* last char in range */
2925#ifdef EBCDIC
2926 }
2927#endif
8ada0baa 2928
c2e66d9e 2929 if (min > max) {
01ec43d0 2930 Perl_croak(aTHX_
d1573ac7 2931 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2932 (char)min, (char)max);
c2e66d9e
GS
2933 }
2934
c7f1f016 2935#ifdef EBCDIC
dc023dbb
KW
2936 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2937 * any subsets of these ranges into individual characters */
4c3a8340 2938 if (literal_endpoint == 2 &&
c286a9a6 2939 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2940 (isUPPER_A(min) && isUPPER_A(max))))
2941 {
2942 for (i = min; i <= max; i++) {
2943 if (isALPHA_A(i))
2944 *d++ = i;
8ada0baa
JH
2945 }
2946 }
2947 else
2948#endif
2949 for (i = min; i <= max; i++)
e294cc5d
JH
2950#ifdef EBCDIC
2951 if (has_utf8) {
55d09dc8 2952 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2953 }
2954 else
2955#endif
2956 *d++ = (char)i;
2957
2958#ifdef EBCDIC
2959 if (uvmax) {
2960 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2961 if (uvmax > 0x101)
e7214ce8 2962 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2963 if (uvmax > 0x100)
2964 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2965 }
2966#endif
02aa26ce
NT
2967
2968 /* mark the range as done, and continue */
79072805 2969 dorange = FALSE;
01ec43d0 2970 didrange = TRUE;
4c3a8340
ST
2971#ifdef EBCDIC
2972 literal_endpoint = 0;
2973#endif
79072805 2974 continue;
4e553d73 2975 }
02aa26ce
NT
2976
2977 /* range begins (ignore - as first or last char) */
79072805 2978 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2979 if (didrange) {
1fafa243 2980 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2981 }
e294cc5d
JH
2982 if (has_utf8
2983#ifdef EBCDIC
2984 && !native_range
2985#endif
2986 ) {
e7214ce8 2987 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2988 s++;
2989 continue;
2990 }
79072805
LW
2991 dorange = TRUE;
2992 s++;
01ec43d0
GS
2993 }
2994 else {
2995 didrange = FALSE;
4c3a8340
ST
2996#ifdef EBCDIC
2997 literal_endpoint = 0;
e294cc5d 2998 native_range = TRUE;
4c3a8340 2999#endif
01ec43d0 3000 }
79072805 3001 }
02aa26ce
NT
3002
3003 /* if we get here, we're not doing a transliteration */
3004
e4a2df84
DM
3005 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3006 char *s1 = s-1;
3007 int esc = 0;
3008 while (s1 >= start && *s1-- == '\\')
3009 esc = !esc;
3010 if (!esc)
3011 in_charclass = TRUE;
3012 }
2866decb 3013
e4a2df84
DM
3014 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3015 char *s1 = s-1;
3016 int esc = 0;
3017 while (s1 >= start && *s1-- == '\\')
3018 esc = !esc;
3019 if (!esc)
3020 in_charclass = FALSE;
3021 }
2866decb 3022
9da1dd8f
DM
3023 /* skip for regexp comments /(?#comment)/, except for the last
3024 * char, which will be done separately.
3025 * Stop on (?{..}) and friends */
3026
c30fc27b 3027 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3028 if (s[2] == '#') {
e994fd66 3029 while (s+1 < send && *s != ')')
5ff03569 3030 *d++ = *s++;
155aba94 3031 }
c30fc27b 3032 else if (!PL_lex_casemods &&
d3cec5e5
DM
3033 ( s[2] == '{' /* This should match regcomp.c */
3034 || (s[2] == '?' && s[3] == '{')))
155aba94 3035 {
9da1dd8f 3036 break;
cc6b7395 3037 }
748a9306 3038 }
02aa26ce
NT
3039
3040 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3041 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3042 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3043 while (s+1 < send && *s != '\n')
5ff03569 3044 *d++ = *s++;
748a9306 3045 }
02aa26ce 3046
9da1dd8f
DM
3047 /* no further processing of single-quoted regex */
3048 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3049 goto default_action;
3050
5d1d4326 3051 /* check for embedded arrays
da6eedaa 3052 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3053 */
1749ea0d 3054 else if (*s == '@' && s[1]) {
8a2bca12 3055 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3056 break;
3057 if (strchr(":'{$", s[1]))
3058 break;
3059 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3060 break; /* in regexp, neither @+ nor @- are interpolated */
3061 }
02aa26ce
NT
3062
3063 /* check for embedded scalars. only stop if we're sure it's a
3064 variable.
3065 */
79072805 3066 else if (*s == '$') {
3280af22 3067 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3068 break;
77772344 3069 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3070 if (s[1] == '\\') {
3071 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3072 "Possible unintended interpolation of $\\ in regex");
77772344 3073 }
79072805 3074 break; /* in regexp, $ might be tail anchor */
77772344 3075 }
79072805 3076 }
02aa26ce 3077
2b9d42f0
NIS
3078 /* End of else if chain - OP_TRANS rejoin rest */
3079
02aa26ce 3080 /* backslashes */
79072805 3081 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3082 char* e; /* Can be used for ending '}', etc. */
3083
79072805 3084 s++;
02aa26ce 3085
7d0fc23c
KW
3086 /* warn on \1 - \9 in substitution replacements, but note that \11
3087 * is an octal; and \19 is \1 followed by '9' */
3280af22 3088 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3089 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3090 {
c782d7ee 3091 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3092 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3093 *--s = '$';
3094 break;
3095 }
02aa26ce
NT
3096
3097 /* string-change backslash escapes */
838f2281 3098 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3099 --s;
3100 break;
3101 }
ff3f963a
KW
3102 /* In a pattern, process \N, but skip any other backslash escapes.
3103 * This is because we don't want to translate an escape sequence
3104 * into a meta symbol and have the regex compiler use the meta
3105 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3106 * in spite of this, we do have to process \N here while the proper
3107 * charnames handler is in scope. See bugs #56444 and #62056.
85fba779 3108 *
ff3f963a
KW
3109 * There is a complication because \N in a pattern may also stand
3110 * for 'match a non-nl', and not mean a charname, in which case its
3111 * processing should be deferred to the regex compiler. To be a
3112 * charname it must be followed immediately by a '{', and not look
3113 * like \N followed by a curly quantifier, i.e., not something like
3114 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3115 * quantifier */
3116 else if (PL_lex_inpat
3117 && (*s != 'N'
3118 || s[1] != '{'
412f55bb 3119 || regcurly(s + 1)))
ff3f963a 3120 {
4d73d076 3121 *d++ = '\\';
cc74c5bd
ST
3122 goto default_action;
3123 }
02aa26ce 3124
79072805 3125 switch (*s) {
02aa26ce
NT
3126
3127 /* quoted - in transliterations */
79072805 3128 case '-':
3280af22 3129 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3130 *d++ = *s++;
3131 continue;
3132 }
924ba076 3133 /* FALLTHROUGH */
79072805 3134 default:
11b8faa4 3135 {
15861f94 3136 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3137 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3138 "Unrecognized escape \\%c passed through",
3139 *s);
11b8faa4 3140 /* default action is to copy the quoted character */
f9a63242 3141 goto default_action;
11b8faa4 3142 }
02aa26ce 3143
632403cc 3144 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3145 case '0': case '1': case '2': case '3':
3146 case '4': case '5': case '6': case '7':
ba210ebe 3147 {
5e0a247b 3148 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3149 STRLEN len = 3;
06972766 3150 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3151 s += len;
5e0a247b
KW
3152 if (len < 3 && s < send && isDIGIT(*s)
3153 && ckWARN(WARN_MISC))
3154 {
3155 Perl_warner(aTHX_ packWARN(WARN_MISC),
3156 "%s", form_short_octal_warning(s, len));
3157 }
ba210ebe 3158 }
012bcf8d 3159 goto NUM_ESCAPE_INSERT;
02aa26ce 3160
f0a2b745
KW
3161 /* eg. \o{24} indicates the octal constant \024 */
3162 case 'o':
3163 {
454155d9 3164 const char* error;
f0a2b745 3165
00ce5563 3166 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3167 TRUE, /* Output warning */
3168 FALSE, /* Not strict */
17896a85
KW
3169 TRUE, /* Output warnings for
3170 non-portables */
80f4111b 3171 UTF);
454155d9 3172 if (! valid) {
f0a2b745
KW
3173 yyerror(error);
3174 continue;
3175 }
3176 goto NUM_ESCAPE_INSERT;
3177 }
3178
77a135fe 3179 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3180 case 'x':
a0481293 3181 {
a0481293 3182 const char* error;
355860ce 3183
00ce5563 3184 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3185 TRUE, /* Output warning */
3186 FALSE, /* Not strict */
17896a85
KW
3187 TRUE, /* Output warnings for
3188 non-portables */
80f4111b 3189 UTF);
a0481293
KW
3190 if (! valid) {
3191 yyerror(error);
355860ce 3192 continue;
ba210ebe 3193 }
012bcf8d
GS
3194 }
3195
3196 NUM_ESCAPE_INSERT:
ff3f963a
KW
3197 /* Insert oct or hex escaped character. There will always be
3198 * enough room in sv since such escapes will be longer than any
3199 * UTF-8 sequence they can end up as, except if they force us
3200 * to recode the rest of the string into utf8 */
ba7cea30 3201
06972766 3202 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3203 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3204 if (!has_utf8 && uv > 255) {
77a135fe
KW
3205 /* Might need to recode whatever we have accumulated so
3206 * far if it contains any chars variant in utf8 or
3207 * utf-ebcdic. */
3208
3209 SvCUR_set(sv, d - SvPVX_const(sv));
3210 SvPOK_on(sv);
3211 *d = '\0';
77a135fe 3212 /* See Note on sizing above. */
5eef447b
KW
3213 sv_utf8_upgrade_flags_grow(
3214 sv,
3215 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
3216 /* Above-latin1 in string
3217 * implies no encoding */
3218 |SV_UTF8_NO_ENCODING,
3219 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3220 d = SvPVX(sv) + SvCUR(sv);
3221 has_utf8 = TRUE;
012bcf8d
GS
3222 }
3223
77a135fe 3224 if (has_utf8) {
c80e42f3 3225 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3226 if (PL_lex_inwhat == OP_TRANS &&
3227 PL_sublex_info.sub_op) {
3228 PL_sublex_info.sub_op->op_private |=
3229 (PL_lex_repl ? OPpTRANS_FROM_UTF
3230 : OPpTRANS_TO_UTF);
f9a63242 3231 }
e294cc5d
JH
3232#ifdef EBCDIC
3233 if (uv > 255 && !dorange)
3234 native_range = FALSE;
3235#endif
012bcf8d 3236 }
a0ed51b3 3237 else {
012bcf8d 3238 *d++ = (char)uv;
a0ed51b3 3239 }
012bcf8d
GS
3240 }
3241 else {
c4d5f83a 3242 *d++ = (char) uv;
a0ed51b3 3243 }
79072805 3244 continue;
02aa26ce 3245
4a2d328f 3246 case 'N':
85fba779
KW
3247 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3248 * named character, like \N{LATIN SMALL LETTER A}, or a named
3249 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3250 * GRAVE}. For convenience all three forms are referred to as
3251 * "named characters" below.
3252 *
3253 * For patterns, \N also can mean to match a non-newline. Code
3254 * before this 'switch' statement should already have handled
3255 * this situation, and hence this code only has to deal with
3256 * the named character cases.
3257 *
3258 * For non-patterns, the named characters are converted to
3259 * their string equivalents. In patterns, named characters are
3260 * not converted to their ultimate forms for the same reasons
3261 * that other escapes aren't. Instead, they are converted to
3262 * the \N{U+...} form to get the value from the charnames that
3263 * is in effect right now, while preserving the fact that it
3264 * was a named character, so that the regex compiler knows
3265 * this.
3266 *
3267 * The structure of this section of code (besides checking for
ff3f963a 3268 * errors and upgrading to utf8) is:
85fba779
KW
3269 * If the named character is of the form \N{U+...}, pass it
3270 * through if a pattern; otherwise convert the code point
3271 * to utf8
3272 * Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
3273 * if a pattern; otherwise convert to utf8
3274 *
3275 * If the regex compiler should ever need to differentiate
3276 * between the \N{U+...} and \N{name} forms, that could easily
3277 * be done here by stripping any leading zeros from the
3278 * \N{U+...} case, and adding them to the other one. */
3279
3280 /* Here, 's' points to the 'N'; the test below is guaranteed to
3281 * succeed if we are being called on a pattern, as we already
3282 * know from a test above that the next character is a '{'. A
3283 * non-pattern \N must mean 'named character', which requires
3284 * braces */
ff3f963a
KW
3285 s++;
3286 if (*s != '{') {
3287 yyerror("Missing braces on \\N{}");
3288 continue;
3289 }
3290 s++;
3291
0a96133f 3292 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3293 if (! (e = strchr(s, '}'))) {
3294 if (! PL_lex_inpat) {
5777a3f7 3295 yyerror("Missing right brace on \\N{}");
0a96133f 3296 } else {
4407f1b8 3297 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3298 }
0a96133f 3299 continue;
ff3f963a 3300 }
cddc7ef4 3301
ff3f963a 3302 /* Here it looks like a named character */
cddc7ef4 3303
ff3f963a 3304 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
ff3f963a 3305 s += 2; /* Skip to next char after the 'U+' */
ff3f963a 3306 if (PL_lex_inpat) {
af352bf2
KW
3307
3308 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
fb2eed93
FC
3309 /* Check the syntax. */
3310 const char *orig_s;
3311 orig_s = s - 5;
3312 if (!isXDIGIT(*s)) {
3313 bad_NU:
3314 yyerror(
3315 "Invalid hexadecimal number in \\N{U+...}"
3316 );
3317 s = e + 1;
3318 continue;
3319 }
3320 while (++s < e) {
3321 if (isXDIGIT(*s))
3322 continue;
3323 else if ((*s == '.' || *s == '_')
3324 && isXDIGIT(s[1]))
3325 continue;
3326 goto bad_NU;
4cbd7e22 3327 }
af352bf2 3328
fb2eed93
FC
3329 /* Pass everything through unchanged.
3330 * +1 is for the '}' */
4cbd7e22
FC
3331 Copy(orig_s, d, e - orig_s + 1, char);
3332 d += e - orig_s + 1;
ff3f963a
KW
3333 }
3334 else { /* Not a pattern: convert the hex to string */
fb2eed93
FC
3335 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3336 | PERL_SCAN_SILENT_ILLDIGIT
3337 | PERL_SCAN_DISALLOW_PREFIX;
3338 STRLEN len = e - s;
3339 uv = grok_hex(s, &len, &flags, NULL);
3340 if (len == 0 || (len != (STRLEN)(e - s)))
3341 goto bad_NU;
ff3f963a 3342
85fba779 3343 /* If the destination is not in utf8, unconditionally
ff3f963a
KW
3344 * recode it to be so. This is because \N{} implies
3345 * Unicode semantics, and scalars have to be in utf8
3346 * to guarantee those semantics */
3347 if (! has_utf8) {
3348 SvCUR_set(sv, d - SvPVX_const(sv));
3349 SvPOK_on(sv);
3350 *d = '\0';
3351 /* See Note on sizing above. */
3352 sv_utf8_upgrade_flags_grow(
3353 sv,
3354 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3355 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3356 d = SvPVX(sv) + SvCUR(sv);
3357 has_utf8 = TRUE;
3358 }
3359
a46469e6 3360 /* Add the (Unicode) code point to the output. */
ff3f963a 3361 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3362 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3363 }
a46469e6
KW
3364 else {
3365 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3366 }
ff3f963a
KW
3367 }
3368 }
6f613c73
KW
3369 else /* Here is \N{NAME} but not \N{U+...}. */
3370 if ((res = get_and_check_backslash_N_name(s, e)))
3371 {
3372 STRLEN len;
3373 const char *str = SvPV_const(res, len);
3374 if (PL_lex_inpat) {
ff3f963a
KW
3375
3376 if (! len) { /* The name resolved to an empty string */
3377 Copy("\\N{}", d, 4, char);
3378 d += 4;
3379 }
3380 else {
3381 /* In order to not lose information for the regex
3382 * compiler, pass the result in the specially made
3383 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3384 * the code points in hex of each character
3385 * returned by charnames */
3386
3387 const char *str_end = str + len;
3b721c4f 3388 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3389
3390 if (! SvUTF8(res)) {
3391 /* For the non-UTF-8 case, we can determine the
3392 * exact length needed without having to parse
3393 * through the string. Each character takes up
3394 * 2 hex digits plus either a trailing dot or
3395 * the "}" */
97651d61
KW
3396 const char initial_text[] = "\\N{U+";
3397 const STRLEN initial_len = sizeof(initial_text)
3398 - 1;
94ca1619
KW
3399 d = off + SvGROW(sv, off
3400 + 3 * len
97651d61
KW
3401
3402 /* +1 for trailing NUL */
3403 + initial_len + 1
3404
94ca1619 3405 + (STRLEN)(send - e));
97651d61
KW
3406 Copy(initial_text, d, initial_len, char);
3407 d += initial_len;
94ca1619
KW
3408 while (str < str_end) {
3409 char hex_string[4];
e8549682 3410 int len =
51f14a05
JH
3411 my_snprintf(hex_string,
3412 sizeof(hex_string),
e8549682
JH
3413 "%02X.", (U8) *str);
3414 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3415 Copy(hex_string, d, 3, char);
3416 d += 3;
3417 str++;
3418 }
85fba779 3419 d--; /* Below, we will overwrite the final
94ca1619
KW
3420 dot with a right brace */
3421 }
3422 else {
1953db30
KW
3423 STRLEN char_length; /* cur char's byte length */
3424
3425 /* and the number of bytes after this is
3426 * translated into hex digits */
3427 STRLEN output_length;
3428
3429 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3430 * for max('U+', '.'); and 1 for NUL */
3431 char hex_string[2 * UTF8_MAXBYTES + 5];
3432
3433 /* Get the first character of the result. */
a46469e6 3434 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3435 len,
3436 &char_length,
3437 UTF8_ALLOW_ANYUV);
3438 /* Convert first code point to hex, including
a46469e6 3439 * the boiler plate before it. */
1953db30
KW
3440 output_length =
3441 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3442 "\\N{U+%X",
3443 (unsigned int) uv);
1953db30
KW
3444
3445 /* Make sure there is enough space to hold it */
3446 d = off + SvGROW(sv, off
3447 + output_length
3448 + (STRLEN)(send - e)
3449 + 2); /* '}' + NUL */
3450 /* And output it */
3451 Copy(hex_string, d, output_length, char);
3452 d += output_length;
3453
3454 /* For each subsequent character, append dot and
3455 * its ordinal in hex */
3456 while ((str += char_length) < str_end) {
3457 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3458 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3459 str_end - str,
3460 &char_length,
3461 UTF8_ALLOW_ANYUV);
3462 output_length =
3463 my_snprintf(hex_string,
a46469e6
KW
3464 sizeof(hex_string),
3465 ".%X",
3466 (unsigned int) uv);
1953db30
KW
3467
3468 d = off + SvGROW(sv, off
3469 + output_length
3470 + (STRLEN)(send - e)
3471 + 2); /* '}' + NUL */
3472 Copy(hex_string, d, output_length, char);
3473 d += output_length;
3474 }
94ca1619 3475 }
ff3f963a
KW
3476
3477 *d++ = '}'; /* Done. Add the trailing brace */
3478 }
3479 }
3480 else { /* Here, not in a pattern. Convert the name to a
3481 * string. */
3482
3483 /* If destination is not in utf8, unconditionally
3484 * recode it to be so. This is because \N{} implies
3485 * Unicode semantics, and scalars have to be in utf8
3486 * to guarantee those semantics */
3487 if (! has_utf8) {
3488 SvCUR_set(sv, d - SvPVX_const(sv));
3489 SvPOK_on(sv);
3490 *d = '\0';
3491 /* See Note on sizing above. */
3492 sv_utf8_upgrade_flags_grow(sv,
3493 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3494 len + (STRLEN)(send - s) + 1);
3495 d = SvPVX(sv) + SvCUR(sv);
3496 has_utf8 = TRUE;
3497 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3498
3499 /* See Note on sizing above. (NOTE: SvCUR() is not
3500 * set correctly here). */
3501 const STRLEN off = d - SvPVX_const(sv);
3502 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3503 }
716d5ef3 3504 if (! SvUTF8(res)) { /* Make sure \N{} return is UTF-8 */
896eff3c 3505 sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
7fc82458
KW
3506 str = SvPV_const(res, len);
3507 }
ff3f963a
KW
3508 Copy(str, d, len, char);
3509 d += len;
423cee85 3510 }
6f613c73 3511
423cee85 3512 SvREFCNT_dec(res);
cb233ae3 3513
cb233ae3 3514 } /* End \N{NAME} */
ff3f963a
KW
3515#ifdef EBCDIC
3516 if (!dorange)
3517 native_range = FALSE; /* \N{} is defined to be Unicode */
3518#endif
3519 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3520 continue;
3521
02aa26ce 3522 /* \c is a control character */
79072805
LW
3523 case 'c':
3524 s++;
961ce445 3525 if (s < send) {
421e43ba 3526 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3527 }
961ce445
RGS
3528 else {
3529 yyerror("Missing control char name in \\c");
3530 }
79072805 3531 continue;
02aa26ce
NT
3532
3533 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3534 case 'b':
4d73d076 3535 *d++ = '\b';
79072805
LW
3536 break;
3537 case 'n':
4d73d076 3538 *d++ = '\n';
79072805
LW
3539 break;
3540 case 'r':
4d73d076 3541 *d++ = '\r';
79072805
LW
3542 break;
3543 case 'f':
4d73d076 3544 *d++ = '\f';
79072805
LW
3545 break;
3546 case 't':
4d73d076 3547 *d++ = '\t';
79072805 3548 break;
34a3fe2a 3549 case 'e':
da0334e8 3550 *d++ = ESC_NATIVE;
34a3fe2a
PP
3551 break;
3552 case 'a':
4d73d076 3553 *d++ = '\a';
79072805 3554 break;
02aa26ce
NT
3555 } /* end switch */
3556
79072805
LW
3557 s++;
3558 continue;
02aa26ce 3559 } /* end if (backslash) */
4c3a8340
ST
3560#ifdef EBCDIC
3561 else
3562 literal_endpoint++;
3563#endif
02aa26ce 3564
f9a63242 3565 default_action:
77a135fe
KW
3566 /* If we started with encoded form, or already know we want it,
3567 then encode the next character */
6f2d5cbc 3568 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3569 STRLEN len = 1;
77a135fe
KW
3570
3571
3572 /* One might think that it is wasted effort in the case of the
3573 * source being utf8 (this_utf8 == TRUE) to take the next character
3574 * in the source, convert it to an unsigned value, and then convert
3575 * it back again. But the source has not been validated here. The
3576 * routine that does the conversion checks for errors like
3577 * malformed utf8 */
3578
233ca360
KW
3579 const UV nextuv = (this_utf8)
3580 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3581 : (UV) ((U8) *s);
5aaebcb3 3582 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3583 if (!has_utf8) {
3584 SvCUR_set(sv, d - SvPVX_const(sv));
3585 SvPOK_on(sv);
3586 *d = '\0';
77a135fe 3587 /* See Note on sizing above. */
7bf79863
KW
3588 sv_utf8_upgrade_flags_grow(sv,
3589 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3590 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3591 d = SvPVX(sv) + SvCUR(sv);
3592 has_utf8 = TRUE;
3593 } else if (need > len) {
3594 /* encoded value larger than old, may need extra space (NOTE:
3595 * SvCUR() is not set correctly here). See Note on sizing
3596 * above. */
9d4ba2ae 3597 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3598 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3599 }
77a135fe
KW
3600 s += len;
3601
5f66b61c 3602 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3603#ifdef EBCDIC
3604 if (uv > 255 && !dorange)
3605 native_range = FALSE;
3606#endif
2b9d42f0
NIS
3607 }
3608 else {
5ff03569 3609 *d++ = *s++;
2b9d42f0 3610 }
02aa26ce
NT
3611 } /* while loop to process each character */
3612
3613 /* terminate the string and set up the sv */
79072805 3614 *d = '\0';
95a20fc0 3615 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3616 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3617 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3618 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3619
79072805 3620 SvPOK_on(sv);
47e13f24 3621 if (IN_ENCODING && !has_utf8) {
ad2de1b2 3622 sv_recode_to_utf8(sv, _get_encoding());
d0063567
DK
3623 if (SvUTF8(sv))
3624 has_utf8 = TRUE;
9f4817db 3625 }
2b9d42f0 3626 if (has_utf8) {
7e2040f0 3627 SvUTF8_on(sv);
2b9d42f0 3628 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3629 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3630 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);