This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
add $B::OP::does_parent variable
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
67#define PL_preambled (PL_parser->preambled)
68#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 69#define PL_linestr (PL_parser->linestr)
c2598295
DM
70#define PL_expect (PL_parser->expect)
71#define PL_copline (PL_parser->copline)
f06b5848
DM
72#define PL_bufptr (PL_parser->bufptr)
73#define PL_oldbufptr (PL_parser->oldbufptr)
74#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
75#define PL_linestart (PL_parser->linestart)
76#define PL_bufend (PL_parser->bufend)
77#define PL_last_uni (PL_parser->last_uni)
78#define PL_last_lop (PL_parser->last_lop)
79#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 80#define PL_lex_state (PL_parser->lex_state)
2f9285f8 81#define PL_rsfp (PL_parser->rsfp)
5486870f 82#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
83#define PL_in_my (PL_parser->in_my)
84#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 85#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 86#define PL_multi_end (PL_parser->multi_end)
13765c85 87#define PL_error_count (PL_parser->error_count)
199e78b7 88
fb205e7a
DM
89# define PL_nexttoke (PL_parser->nexttoke)
90# define PL_nexttype (PL_parser->nexttype)
91# define PL_nextval (PL_parser->nextval)
199e78b7 92
a1894d81 93static const char* const ident_too_long = "Identifier too long";
8903cb82 94
9ded7720 95# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 96
a7aaec61
Z
97#define XENUMMASK 0x3f
98#define XFAKEEOF 0x40
99#define XFAKEBRACK 0x80
9059aa12 100
39e02b42 101#ifdef USE_UTF8_SCRIPTS
b3041197 102# define UTF cBOOL(!IN_BYTES)
2b9d42f0 103#else
b3041197 104# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 105#endif
a0ed51b3 106
b1fc3636
CJ
107/* The maximum number of characters preceding the unrecognized one to display */
108#define UNRECOGNIZED_PRECEDE_COUNT 10
109
61f0cdd9 110/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
111 * 1999-02-27 mjd-perl-patch@plover.com */
112#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
113
14bd96d0 114#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 115
9ff909cf
JH
116#define HEXFP_PEEK(s) \
117 (((s[0] == '.') && \
118 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
119 isALPHA_FOLD_EQ(s[0], 'p'))
120
ffb4593c
NT
121/* LEX_* are values for PL_lex_state, the state of the lexer.
122 * They are arranged oddly so that the guard on the switch statement
79072805 123 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
124 *
125 * These values refer to the various states within a sublex parse,
126 * i.e. within a double quotish string
79072805
LW
127 */
128
fb73857a 129/* #define LEX_NOTPARSING 11 is done in perl.h. */
130
b6007c36
DM
131#define LEX_NORMAL 10 /* normal code (ie not within "...") */
132#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
133#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
134#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
135#define LEX_INTERPSTART 6 /* expecting the start of a $var */
136
137 /* at end of code, eg "$x" followed by: */
138#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
139#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
140
141#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
142 string or after \E, $foo, etc */
143#define LEX_INTERPCONST 2 /* NOT USED */
144#define LEX_FORMLINE 1 /* expecting a format line */
145#define LEX_KNOWNEXT 0 /* next token known; just return it */
146
79072805 147
bbf60fe6 148#ifdef DEBUGGING
27da23d5 149static const char* const lex_state_names[] = {
bbf60fe6
DM
150 "KNOWNEXT",
151 "FORMLINE",
152 "INTERPCONST",
153 "INTERPCONCAT",
154 "INTERPENDMAYBE",
155 "INTERPEND",
156 "INTERPSTART",
157 "INTERPPUSH",
158 "INTERPCASEMOD",
159 "INTERPNORMAL",
160 "NORMAL"
161};
162#endif
163
79072805 164#include "keywords.h"
fe14fcc3 165
ffb4593c
NT
166/* CLINE is a macro that ensures PL_copline has a sane value */
167
57843af0 168#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 169
ffb4593c
NT
170/*
171 * Convenience functions to return different tokens and prime the
9cbb5ea2 172 * lexer for the next token. They all take an argument.
ffb4593c
NT
173 *
174 * TOKEN : generic token (used for '(', DOLSHARP, etc)
175 * OPERATOR : generic operator
176 * AOPERATOR : assignment operator
177 * PREBLOCK : beginning the block after an if, while, foreach, ...
178 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
179 * PREREF : *EXPR where EXPR is not a simple identifier
180 * TERM : expression term
89f35911 181 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
182 * LOOPX : loop exiting command (goto, last, dump, etc)
183 * FTST : file test operator
184 * FUN0 : zero-argument function
7eb971ee 185 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 186 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
187 * BOop : bitwise or or xor
188 * BAop : bitwise and
8823cb89 189 * BCop : bitwise complement
ffb4593c
NT
190 * SHop : shift operator
191 * PWop : power operator
9cbb5ea2 192 * PMop : pattern-matching operator
ffb4593c 193 * Aop : addition-level operator
e4916dd1 194 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
195 * Mop : multiplication-level operator
196 * Eop : equality-testing operator
e5edeb50 197 * Rop : relational operator <= != gt
ffb4593c
NT
198 *
199 * Also see LOP and lop() below.
200 */
201
998054bd 202#ifdef DEBUGGING /* Serve -DT. */
704d4215 203# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 204#else
bbf60fe6 205# define REPORT(retval) (retval)
998054bd
SC
206#endif
207
bbf60fe6
DM
208#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
209#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 210#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
211#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
212#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
213#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
214#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 215#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
a49203fd 216#define LOOPX(f) return (PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
7a61bf3c 217 pl_yylval.ival=f, \
a49203fd 218 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 219 REPORT((int)LOOPEX))
6154021b
RGS
220#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
221#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 222#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 223#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
224#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
225#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
226#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
227 REPORT('~')
b1764551
FC
228#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
229#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 230#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 231#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 232#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 233#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
6154021b
RGS
234#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
235#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 236
a687059c
LW
237/* This bit of chicanery makes a unary function followed by
238 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
239 * The UNIDOR macro is for unary functions that can be followed by the //
240 * operator (such as C<shift // 0>).
a687059c 241 */
d68ce4ac 242#define UNI3(f,x,have_x) { \
6154021b 243 pl_yylval.ival = f; \
d68ce4ac 244 if (have_x) PL_expect = x; \
376fcdbf
AL
245 PL_bufptr = s; \
246 PL_last_uni = PL_oldbufptr; \
247 PL_last_lop_op = f; \
248 if (*s == '(') \
249 return REPORT( (int)FUNC1 ); \
294a536f 250 s = skipspace(s); \
376fcdbf
AL
251 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
252 }
d68ce4ac
FC
253#define UNI(f) UNI3(f,XTERM,1)
254#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
255#define UNIPROTO(f,optional) { \
256 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
257 OPERATOR(f); \
258 }
a687059c 259
d68ce4ac 260#define UNIBRACK(f) UNI3(f,0,0)
79072805 261
9f68db38 262/* grandfather return to old style */
78cdf107
Z
263#define OLDLOP(f) \
264 do { \
265 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
266 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
267 pl_yylval.ival = (f); \
268 PL_expect = XTERM; \
269 PL_bufptr = s; \
270 return (int)LSTOP; \
271 } while(0)
79072805 272
83944c01
FC
273#define COPLINE_INC_WITH_HERELINES \
274 STMT_START { \
275 CopLINE_inc(PL_curcop); \
851b527a
FC
276 if (PL_parser->herelines) \
277 CopLINE(PL_curcop) += PL_parser->herelines, \
278 PL_parser->herelines = 0; \
83944c01 279 } STMT_END
ffdb8b16
FC
280/* Called after scan_str to update CopLINE(PL_curcop), but only when there
281 * is no sublex_push to follow. */
282#define COPLINE_SET_FROM_MULTI_END \
283 STMT_START { \
284 CopLINE_set(PL_curcop, PL_multi_end); \
285 if (PL_multi_end != PL_multi_start) \
851b527a 286 PL_parser->herelines = 0; \
ffdb8b16 287 } STMT_END
83944c01
FC
288
289
8fa7f367
JH
290#ifdef DEBUGGING
291
6154021b 292/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
293enum token_type {
294 TOKENTYPE_NONE,
295 TOKENTYPE_IVAL,
6154021b 296 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 297 TOKENTYPE_PVAL,
aeaef349 298 TOKENTYPE_OPVAL
bbf60fe6
DM
299};
300
6d4a66ac
NC
301static struct debug_tokens {
302 const int token;
303 enum token_type type;
304 const char *name;
305} const debug_tokens[] =
9041c2e3 306{
bbf60fe6
DM
307 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
308 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
309 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
310 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
311 { ARROW, TOKENTYPE_NONE, "ARROW" },
312 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
313 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
314 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
315 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
316 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 317 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
318 { DO, TOKENTYPE_NONE, "DO" },
319 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
320 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
321 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
322 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
323 { ELSE, TOKENTYPE_NONE, "ELSE" },
324 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
325 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
326 { FOR, TOKENTYPE_IVAL, "FOR" },
327 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
328 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
329 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
330 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
331 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 332 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
333 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
334 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
335 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 336 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
337 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
338 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 339 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
340 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
341 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
342 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
343 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
344 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
345 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
346 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
347 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
348 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
349 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
350 { OROP, TOKENTYPE_IVAL, "OROP" },
351 { OROR, TOKENTYPE_NONE, "OROR" },
352 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
353 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
354 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 355 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 356 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
357 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
358 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
359 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
360 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
361 { PREINC, TOKENTYPE_NONE, "PREINC" },
362 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 363 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
364 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
365 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 366 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
367 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
368 { SUB, TOKENTYPE_NONE, "SUB" },
369 { THING, TOKENTYPE_OPVAL, "THING" },
370 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
371 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
372 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
373 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
374 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
375 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 376 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
377 { WHILE, TOKENTYPE_IVAL, "WHILE" },
378 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 379 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 380 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
381};
382
6154021b 383/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 384
bbf60fe6 385STATIC int
704d4215 386S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 387{
7918f24d
NC
388 PERL_ARGS_ASSERT_TOKEREPORT;
389
bbf60fe6 390 if (DEBUG_T_TEST) {
bd61b366 391 const char *name = NULL;
bbf60fe6 392 enum token_type type = TOKENTYPE_NONE;
f54cb97a 393 const struct debug_tokens *p;
396482e1 394 SV* const report = newSVpvs("<== ");
bbf60fe6 395
f54cb97a 396 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
397 if (p->token == (int)rv) {
398 name = p->name;
399 type = p->type;
400 break;
401 }
402 }
403 if (name)
54667de8 404 Perl_sv_catpv(aTHX_ report, name);
239f83d5 405 else if (isGRAPH(rv))
4ebc7986 406 {
bbf60fe6 407 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
408 if ((char)rv == 'p')
409 sv_catpvs(report, " (pending identifier)");
410 }
bbf60fe6 411 else if (!rv)
396482e1 412 sv_catpvs(report, "EOF");
bbf60fe6
DM
413 else
414 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
415 switch (type) {
416 case TOKENTYPE_NONE:
bbf60fe6
DM
417 break;
418 case TOKENTYPE_IVAL:
704d4215 419 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
420 break;
421 case TOKENTYPE_OPNUM:
422 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 423 PL_op_name[lvalp->ival]);
bbf60fe6
DM
424 break;
425 case TOKENTYPE_PVAL:
704d4215 426 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
427 break;
428 case TOKENTYPE_OPVAL:
704d4215 429 if (lvalp->opval) {
401441c0 430 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
431 PL_op_name[lvalp->opval->op_type]);
432 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 433 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 434 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
435 }
436
437 }
401441c0 438 else
396482e1 439 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
440 break;
441 }
b6007c36 442 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
443 };
444 return (int)rv;
998054bd
SC
445}
446
b6007c36
DM
447
448/* print the buffer with suitable escapes */
449
450STATIC void
15f169a1 451S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 452{
396482e1 453 SV* const tmp = newSVpvs("");
7918f24d
NC
454
455 PERL_ARGS_ASSERT_PRINTBUF;
456
5d37acd6 457 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 458 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 459 GCC_DIAG_RESTORE;
b6007c36
DM
460 SvREFCNT_dec(tmp);
461}
462
8fa7f367
JH
463#endif
464
8290c323
NC
465static int
466S_deprecate_commaless_var_list(pTHX) {
467 PL_expect = XTERM;
468 deprecate("comma-less variable list");
469 return REPORT(','); /* grandfather non-comma-format format */
470}
471
ffb4593c
NT
472/*
473 * S_ao
474 *
f393a21a
FC
475 * This subroutine looks for an '=' next to the operator that has just been
476 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
477 */
478
76e3520e 479STATIC int
cea2e8a9 480S_ao(pTHX_ int toketype)
a0d0e21e 481{
3280af22
NIS
482 if (*PL_bufptr == '=') {
483 PL_bufptr++;
a0d0e21e 484 if (toketype == ANDAND)
6154021b 485 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 486 else if (toketype == OROR)
6154021b 487 pl_yylval.ival = OP_ORASSIGN;
c963b151 488 else if (toketype == DORDOR)
6154021b 489 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
490 toketype = ASSIGNOP;
491 }
b1764551 492 return REPORT(toketype);
a0d0e21e
LW
493}
494
ffb4593c
NT
495/*
496 * S_no_op
497 * When Perl expects an operator and finds something else, no_op
498 * prints the warning. It always prints "<something> found where
499 * operator expected. It prints "Missing semicolon on previous line?"
500 * if the surprise occurs at the start of the line. "do you need to
501 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
502 * where the compiler doesn't know if foo is a method call or a function.
503 * It prints "Missing operator before end of line" if there's nothing
504 * after the missing operator, or "... before <...>" if there is something
505 * after the missing operator.
506 */
507
76e3520e 508STATIC void
15f169a1 509S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 510{
9d4ba2ae
AL
511 char * const oldbp = PL_bufptr;
512 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 513
7918f24d
NC
514 PERL_ARGS_ASSERT_NO_OP;
515
1189a94a
GS
516 if (!s)
517 s = oldbp;
07c798fb 518 else
1189a94a 519 PL_bufptr = s;
734ab321 520 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
521 if (ckWARN_d(WARN_SYNTAX)) {
522 if (is_first)
523 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
524 "\t(Missing semicolon on previous line?)\n");
525 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 526 const char *t;
8a2bca12 527 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 528 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 529 NOOP;
56da5a46
RGS
530 if (t < PL_bufptr && isSPACE(*t))
531 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
532 "\t(Do you need to predeclare %"UTF8f"?)\n",
533 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
534 }
535 else {
536 assert(s >= oldbp);
537 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
538 "\t(Missing operator before %"UTF8f"?)\n",
539 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 540 }
07c798fb 541 }
3280af22 542 PL_bufptr = oldbp;
8990e307
LW
543}
544
ffb4593c
NT
545/*
546 * S_missingterm
547 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 548 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
549 * If we're in a delimited string and the delimiter is a control
550 * character, it's reformatted into a two-char sequence like ^C.
551 * This is fatal.
552 */
553
76e3520e 554STATIC void
cea2e8a9 555S_missingterm(pTHX_ char *s)
8990e307
LW
556{
557 char tmpbuf[3];
558 char q;
559 if (s) {
9d4ba2ae 560 char * const nl = strrchr(s,'\n');
d2719217 561 if (nl)
8990e307
LW
562 *nl = '\0';
563 }
ca8b19a7 564 else if ((U8) PL_multi_close < 32) {
8990e307 565 *tmpbuf = '^';
585ec06d 566 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
567 tmpbuf[2] = '\0';
568 s = tmpbuf;
569 }
570 else {
eb160463 571 *tmpbuf = (char)PL_multi_close;
8990e307
LW
572 tmpbuf[1] = '\0';
573 s = tmpbuf;
574 }
575 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 576 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 577}
79072805 578
dd0ac2b9
FC
579#include "feature.h"
580
0d863452 581/*
0d863452
RH
582 * Check whether the named feature is enabled.
583 */
26ea9e12 584bool
3fff3427 585Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 586{
4a731d7b 587 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
588
589 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
590
591 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 592
26ea9e12
NC
593 if (namelen > MAX_FEATURE_LEN)
594 return FALSE;
3fff3427 595 memcpy(&he_name[8], name, namelen);
7d69d4a6 596
c8ca97b0
NC
597 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
598 REFCOUNTED_HE_EXISTS));
0d863452
RH
599}
600
ffb4593c 601/*
9cbb5ea2
GS
602 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
603 * utf16-to-utf8-reversed.
ffb4593c
NT
604 */
605
c39cd008
GS
606#ifdef PERL_CR_FILTER
607static void
608strip_return(SV *sv)
609{
eb578fdb
KW
610 const char *s = SvPVX_const(sv);
611 const char * const e = s + SvCUR(sv);
7918f24d
NC
612
613 PERL_ARGS_ASSERT_STRIP_RETURN;
614
c39cd008
GS
615 /* outer loop optimized to do nothing if there are no CR-LFs */
616 while (s < e) {
617 if (*s++ == '\r' && *s == '\n') {
618 /* hit a CR-LF, need to copy the rest */
eb578fdb 619 char *d = s - 1;
c39cd008
GS
620 *d++ = *s++;
621 while (s < e) {
622 if (*s == '\r' && s[1] == '\n')
623 s++;
624 *d++ = *s++;
625 }
626 SvCUR(sv) -= s - d;
627 return;
628 }
629 }
630}
a868473f 631
76e3520e 632STATIC I32
c39cd008 633S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 634{
f54cb97a 635 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
636 if (count > 0 && !maxlen)
637 strip_return(sv);
638 return count;
a868473f
NIS
639}
640#endif
641
ffb4593c 642/*
8eaa0acf
Z
643=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
644
645Creates and initialises a new lexer/parser state object, supplying
646a context in which to lex and parse from a new source of Perl code.
647A pointer to the new state object is placed in L</PL_parser>. An entry
648is made on the save stack so that upon unwinding the new state object
649will be destroyed and the former value of L</PL_parser> will be restored.
650Nothing else need be done to clean up the parsing context.
651
652The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
653non-null, provides a string (in SV form) containing code to be parsed.
654A copy of the string is made, so subsequent modification of I<line>
655does not affect parsing. I<rsfp>, if non-null, provides an input stream
656from which code will be read to be parsed. If both are non-null, the
657code in I<line> comes first and must consist of complete lines of input,
658and I<rsfp> supplies the remainder of the source.
659
e368b3bd
FC
660The I<flags> parameter is reserved for future use. Currently it is only
661used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
662
663=cut
664*/
ffb4593c 665
27fcb6ee 666/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
667 can share filters with the current parser.
668 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
669 caller, hence isn't owned by the parser, so shouldn't be closed on parser
670 destruction. This is used to handle the case of defaulting to reading the
671 script from the standard input because no filename was given on the command
672 line (without getting confused by situation where STDIN has been closed, so
673 the script handle is opened on fd 0) */
27fcb6ee 674
a0d0e21e 675void
8eaa0acf 676Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 677{
6ef55633 678 const char *s = NULL;
5486870f 679 yy_parser *parser, *oparser;
60d63348 680 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 681 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
682
683 /* create and initialise a parser */
684
199e78b7 685 Newxz(parser, 1, yy_parser);
5486870f 686 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
687 PL_parser = parser;
688
28ac2b49
Z
689 parser->stack = NULL;
690 parser->ps = NULL;
691 parser->stack_size = 0;
acdf0a21 692
e3abe207
DM
693 /* on scope exit, free this parser and restore any outer one */
694 SAVEPARSER(parser);
7c4baf47 695 parser->saved_curcop = PL_curcop;
e3abe207 696
acdf0a21 697 /* initialise lexer state */
8990e307 698
fb205e7a 699 parser->nexttoke = 0;
ca4cfd28 700 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 701 parser->copline = parser->preambling = NOLINE;
5afb0a62 702 parser->lex_state = LEX_NORMAL;
c2598295 703 parser->expect = XSTATE;
2f9285f8 704 parser->rsfp = rsfp;
27fcb6ee
FC
705 parser->rsfp_filters =
706 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
707 ? NULL
708 : MUTABLE_AV(SvREFCNT_inc(
709 oparser->rsfp_filters
710 ? oparser->rsfp_filters
711 : (oparser->rsfp_filters = newAV())
712 ));
2f9285f8 713
199e78b7
DM
714 Newx(parser->lex_brackstack, 120, char);
715 Newx(parser->lex_casestack, 12, char);
716 *parser->lex_casestack = '\0';
d794b522 717 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 718
10efb74f 719 if (line) {
0528fd32 720 STRLEN len;
10efb74f 721 s = SvPV_const(line, len);
0abcdfa4
FC
722 parser->linestr = flags & LEX_START_COPIED
723 ? SvREFCNT_inc_simple_NN(line)
724 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 725 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 726 } else {
bf1b738b 727 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 728 }
f06b5848
DM
729 parser->oldoldbufptr =
730 parser->oldbufptr =
731 parser->bufptr =
732 parser->linestart = SvPVX(parser->linestr);
733 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
734 parser->last_lop = parser->last_uni = NULL;
b54f893d 735
6d59e610 736 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
737 |LEX_DONT_CLOSE_RSFP));
738 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
739 |LEX_DONT_CLOSE_RSFP));
737c24fc 740
60d63348 741 parser->in_pod = parser->filtered = 0;
79072805 742}
a687059c 743
e3abe207
DM
744
745/* delete a parser object */
746
747void
748Perl_parser_free(pTHX_ const yy_parser *parser)
749{
7918f24d
NC
750 PERL_ARGS_ASSERT_PARSER_FREE;
751
7c4baf47 752 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
753 SvREFCNT_dec(parser->linestr);
754
87606032 755 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 756 PerlIO_clearerr(parser->rsfp);
799361c3
SH
757 else if (parser->rsfp && (!parser->old_parser ||
758 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 759 PerlIO_close(parser->rsfp);
5486870f 760 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
761 SvREFCNT_dec(parser->lex_stuff);
762 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
763
764 Safefree(parser->lex_brackstack);
765 Safefree(parser->lex_casestack);
766 Safefree(parser->lex_shared);
767 PL_parser = parser->old_parser;
768 Safefree(parser);
769}
770
771void
772Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
773{
3ac7ff8f 774 I32 nexttoke = parser->nexttoke;
3ac7ff8f 775 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 776 while (nexttoke--) {
3ac7ff8f
FC
777 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
778 && parser->nextval[nexttoke].opval
779 && parser->nextval[nexttoke].opval->op_slabbed
780 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 781 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
782 parser->nextval[nexttoke].opval = NULL;
783 }
3ce3dcd9 784 }
e3abe207
DM
785}
786
787
ffb4593c 788/*
f0e67a1d
Z
789=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
790
791Buffer scalar containing the chunk currently under consideration of the
792text currently being lexed. This is always a plain string scalar (for
793which C<SvPOK> is true). It is not intended to be used as a scalar by
794normal scalar means; instead refer to the buffer directly by the pointer
795variables described below.
796
797The lexer maintains various C<char*> pointers to things in the
798C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
799reallocated, all of these pointers must be updated. Don't attempt to
800do this manually, but rather use L</lex_grow_linestr> if you need to
801reallocate the buffer.
802
803The content of the text chunk in the buffer is commonly exactly one
804complete line of input, up to and including a newline terminator,
805but there are situations where it is otherwise. The octets of the
806buffer may be intended to be interpreted as either UTF-8 or Latin-1.
807The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
808flag on this scalar, which may disagree with it.
809
810For direct examination of the buffer, the variable
811L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
812lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
813of these pointers is usually preferable to examination of the scalar
814through normal scalar means.
815
816=for apidoc AmxU|char *|PL_parser-E<gt>bufend
817
818Direct pointer to the end of the chunk of text currently being lexed, the
819end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 820+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
821always located at the end of the buffer, and does not count as part of
822the buffer's contents.
823
824=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
825
826Points to the current position of lexing inside the lexer buffer.
827Characters around this point may be freely examined, within
828the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
829L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
830interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
831
832Lexing code (whether in the Perl core or not) moves this pointer past
833the characters that it consumes. It is also expected to perform some
834bookkeeping whenever a newline character is consumed. This movement
835can be more conveniently performed by the function L</lex_read_to>,
836which handles newlines appropriately.
837
838Interpretation of the buffer's octets can be abstracted out by
839using the slightly higher-level functions L</lex_peek_unichar> and
840L</lex_read_unichar>.
841
842=for apidoc AmxU|char *|PL_parser-E<gt>linestart
843
844Points to the start of the current line inside the lexer buffer.
845This is useful for indicating at which column an error occurred, and
846not much else. This must be updated by any lexing code that consumes
847a newline; the function L</lex_read_to> handles this detail.
848
849=cut
850*/
851
852/*
853=for apidoc Amx|bool|lex_bufutf8
854
855Indicates whether the octets in the lexer buffer
856(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
857of Unicode characters. If not, they should be interpreted as Latin-1
858characters. This is analogous to the C<SvUTF8> flag for scalars.
859
860In UTF-8 mode, it is not guaranteed that the lexer buffer actually
861contains valid UTF-8. Lexing code must be robust in the face of invalid
862encoding.
863
864The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
865is significant, but not the whole story regarding the input character
866encoding. Normally, when a file is being read, the scalar contains octets
867and its C<SvUTF8> flag is off, but the octets should be interpreted as
868UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
869however, the scalar may have the C<SvUTF8> flag on, and in this case its
870octets should be interpreted as UTF-8 unless the C<use bytes> pragma
871is in effect. This logic may change in the future; use this function
872instead of implementing the logic yourself.
873
874=cut
875*/
876
877bool
878Perl_lex_bufutf8(pTHX)
879{
880 return UTF;
881}
882
883/*
884=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
885
886Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 887at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
888pointer to the reallocated buffer. This is necessary before making
889any direct modification of the buffer that would increase its length.
890L</lex_stuff_pvn> provides a more convenient way to insert text into
891the buffer.
892
893Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
894this function updates all of the lexer's variables that point directly
895into the buffer.
896
897=cut
898*/
899
900char *
901Perl_lex_grow_linestr(pTHX_ STRLEN len)
902{
903 SV *linestr;
904 char *buf;
905 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 906 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
907 linestr = PL_parser->linestr;
908 buf = SvPVX(linestr);
909 if (len <= SvLEN(linestr))
910 return buf;
911 bufend_pos = PL_parser->bufend - buf;
912 bufptr_pos = PL_parser->bufptr - buf;
913 oldbufptr_pos = PL_parser->oldbufptr - buf;
914 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
915 linestart_pos = PL_parser->linestart - buf;
916 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
917 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
918 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
919 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 920
f0e67a1d 921 buf = sv_grow(linestr, len);
c7641931 922
f0e67a1d
Z
923 PL_parser->bufend = buf + bufend_pos;
924 PL_parser->bufptr = buf + bufptr_pos;
925 PL_parser->oldbufptr = buf + oldbufptr_pos;
926 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
927 PL_parser->linestart = buf + linestart_pos;
928 if (PL_parser->last_uni)
929 PL_parser->last_uni = buf + last_uni_pos;
930 if (PL_parser->last_lop)
931 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
932 if (PL_parser->lex_shared->re_eval_start)
933 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
934 return buf;
935}
936
937/*
83aa740e 938=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
939
940Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
941immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
942reallocating the buffer if necessary. This means that lexing code that
943runs later will see the characters as if they had appeared in the input.
944It is not recommended to do this as part of normal parsing, and most
945uses of this facility run the risk of the inserted characters being
946interpreted in an unintended manner.
947
948The string to be inserted is represented by I<len> octets starting
949at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
950according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
951The characters are recoded for the lexer buffer, according to how the
952buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 953to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
954function is more convenient.
955
956=cut
957*/
958
959void
83aa740e 960Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 961{
749123ff 962 dVAR;
f0e67a1d
Z
963 char *bufptr;
964 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
965 if (flags & ~(LEX_STUFF_UTF8))
966 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
967 if (UTF) {
968 if (flags & LEX_STUFF_UTF8) {
969 goto plain_copy;
970 } else {
54d004e8 971 STRLEN highhalf = 0; /* Count of variants */
83aa740e 972 const char *p, *e = pv+len;
54d004e8
KW
973 for (p = pv; p != e; p++) {
974 if (! UTF8_IS_INVARIANT(*p)) {
975 highhalf++;
976 }
977 }
f0e67a1d
Z
978 if (!highhalf)
979 goto plain_copy;
980 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
981 bufptr = PL_parser->bufptr;
982 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
983 SvCUR_set(PL_parser->linestr,
984 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
985 PL_parser->bufend += len+highhalf;
986 for (p = pv; p != e; p++) {
987 U8 c = (U8)*p;
54d004e8
KW
988 if (! UTF8_IS_INVARIANT(c)) {
989 *bufptr++ = UTF8_TWO_BYTE_HI(c);
990 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
991 } else {
992 *bufptr++ = (char)c;
993 }
994 }
995 }
996 } else {
997 if (flags & LEX_STUFF_UTF8) {
998 STRLEN highhalf = 0;
83aa740e 999 const char *p, *e = pv+len;
f0e67a1d
Z
1000 for (p = pv; p != e; p++) {
1001 U8 c = (U8)*p;
54d004e8 1002 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1003 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1004 "non-Latin-1 character into Latin-1 input");
54d004e8 1005 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1006 p++;
1007 highhalf++;
54d004e8 1008 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1009 /* malformed UTF-8 */
1010 ENTER;
1011 SAVESPTR(PL_warnhook);
1012 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1013 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1014 LEAVE;
1015 }
1016 }
1017 if (!highhalf)
1018 goto plain_copy;
1019 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1020 bufptr = PL_parser->bufptr;
1021 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1022 SvCUR_set(PL_parser->linestr,
1023 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1024 PL_parser->bufend += len-highhalf;
54d004e8
KW
1025 p = pv;
1026 while (p < e) {
1027 if (UTF8_IS_INVARIANT(*p)) {
1028 *bufptr++ = *p;
1029 p++;
f0e67a1d 1030 }
54d004e8
KW
1031 else {
1032 assert(p < e -1 );
94bb8c36 1033 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1034 p += 2;
1035 }
f0e67a1d
Z
1036 }
1037 } else {
54d004e8 1038 plain_copy:
f0e67a1d
Z
1039 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1040 bufptr = PL_parser->bufptr;
1041 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1042 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1043 PL_parser->bufend += len;
1044 Copy(pv, bufptr, len, char);
1045 }
1046 }
1047}
1048
1049/*
9dcc53ea
Z
1050=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1051
1052Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1053immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1054reallocating the buffer if necessary. This means that lexing code that
1055runs later will see the characters as if they had appeared in the input.
1056It is not recommended to do this as part of normal parsing, and most
1057uses of this facility run the risk of the inserted characters being
1058interpreted in an unintended manner.
1059
1060The string to be inserted is represented by octets starting at I<pv>
1061and continuing to the first nul. These octets are interpreted as either
1062UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1063in I<flags>. The characters are recoded for the lexer buffer, according
1064to how the buffer is currently being interpreted (L</lex_bufutf8>).
1065If it is not convenient to nul-terminate a string to be inserted, the
1066L</lex_stuff_pvn> function is more appropriate.
1067
1068=cut
1069*/
1070
1071void
1072Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1073{
1074 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1075 lex_stuff_pvn(pv, strlen(pv), flags);
1076}
1077
1078/*
f0e67a1d
Z
1079=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1080
1081Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1082immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1083reallocating the buffer if necessary. This means that lexing code that
1084runs later will see the characters as if they had appeared in the input.
1085It is not recommended to do this as part of normal parsing, and most
1086uses of this facility run the risk of the inserted characters being
1087interpreted in an unintended manner.
1088
1089The string to be inserted is the string value of I<sv>. The characters
1090are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1091being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1092not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1093need to construct a scalar.
1094
1095=cut
1096*/
1097
1098void
1099Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1100{
1101 char *pv;
1102 STRLEN len;
1103 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1104 if (flags)
1105 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1106 pv = SvPV(sv, len);
1107 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1108}
1109
1110/*
1111=for apidoc Amx|void|lex_unstuff|char *ptr
1112
1113Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1114I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1115This hides the discarded text from any lexing code that runs later,
1116as if the text had never appeared.
1117
1118This is not the normal way to consume lexed text. For that, use
1119L</lex_read_to>.
1120
1121=cut
1122*/
1123
1124void
1125Perl_lex_unstuff(pTHX_ char *ptr)
1126{
1127 char *buf, *bufend;
1128 STRLEN unstuff_len;
1129 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1130 buf = PL_parser->bufptr;
1131 if (ptr < buf)
1132 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1133 if (ptr == buf)
1134 return;
1135 bufend = PL_parser->bufend;
1136 if (ptr > bufend)
1137 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1138 unstuff_len = ptr - buf;
1139 Move(ptr, buf, bufend+1-ptr, char);
1140 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1141 PL_parser->bufend = bufend - unstuff_len;
1142}
1143
1144/*
1145=for apidoc Amx|void|lex_read_to|char *ptr
1146
1147Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1148to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1149performing the correct bookkeeping whenever a newline character is passed.
1150This is the normal way to consume lexed text.
1151
1152Interpretation of the buffer's octets can be abstracted out by
1153using the slightly higher-level functions L</lex_peek_unichar> and
1154L</lex_read_unichar>.
1155
1156=cut
1157*/
1158
1159void
1160Perl_lex_read_to(pTHX_ char *ptr)
1161{
1162 char *s;
1163 PERL_ARGS_ASSERT_LEX_READ_TO;
1164 s = PL_parser->bufptr;
1165 if (ptr < s || ptr > PL_parser->bufend)
1166 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1167 for (; s != ptr; s++)
1168 if (*s == '\n') {
83944c01 1169 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1170 PL_parser->linestart = s+1;
1171 }
1172 PL_parser->bufptr = ptr;
1173}
1174
1175/*
1176=for apidoc Amx|void|lex_discard_to|char *ptr
1177
1178Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1179up to I<ptr>. The remaining content of the buffer will be moved, and
1180all pointers into the buffer updated appropriately. I<ptr> must not
1181be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1182it is not permitted to discard text that has yet to be lexed.
1183
1184Normally it is not necessarily to do this directly, because it suffices to
1185use the implicit discarding behaviour of L</lex_next_chunk> and things
1186based on it. However, if a token stretches across multiple lines,
1f317c95 1187and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1188that purpose, then after completion of the token it would be wise to
1189explicitly discard the now-unneeded earlier lines, to avoid future
1190multi-line tokens growing the buffer without bound.
1191
1192=cut
1193*/
1194
1195void
1196Perl_lex_discard_to(pTHX_ char *ptr)
1197{
1198 char *buf;
1199 STRLEN discard_len;
1200 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1201 buf = SvPVX(PL_parser->linestr);
1202 if (ptr < buf)
1203 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1204 if (ptr == buf)
1205 return;
1206 if (ptr > PL_parser->bufptr)
1207 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1208 discard_len = ptr - buf;
1209 if (PL_parser->oldbufptr < ptr)
1210 PL_parser->oldbufptr = ptr;
1211 if (PL_parser->oldoldbufptr < ptr)
1212 PL_parser->oldoldbufptr = ptr;
1213 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1214 PL_parser->last_uni = NULL;
1215 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1216 PL_parser->last_lop = NULL;
1217 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1218 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1219 PL_parser->bufend -= discard_len;
1220 PL_parser->bufptr -= discard_len;
1221 PL_parser->oldbufptr -= discard_len;
1222 PL_parser->oldoldbufptr -= discard_len;
1223 if (PL_parser->last_uni)
1224 PL_parser->last_uni -= discard_len;
1225 if (PL_parser->last_lop)
1226 PL_parser->last_lop -= discard_len;
1227}
1228
1229/*
1230=for apidoc Amx|bool|lex_next_chunk|U32 flags
1231
1232Reads in the next chunk of text to be lexed, appending it to
1233L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1234looked to the end of the current chunk and wants to know more. It is
1235usual, but not necessary, for lexing to have consumed the entirety of
1236the current chunk at this time.
1237
1238If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1239chunk (i.e., the current chunk has been entirely consumed), normally the
1240current chunk will be discarded at the same time that the new chunk is
1241read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1242will not be discarded. If the current chunk has not been entirely
1243consumed, then it will not be discarded regardless of the flag.
1244
1245Returns true if some new text was added to the buffer, or false if the
1246buffer has reached the end of the input text.
1247
1248=cut
1249*/
1250
1251#define LEX_FAKE_EOF 0x80000000
112d1284 1252#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1253
1254bool
1255Perl_lex_next_chunk(pTHX_ U32 flags)
1256{
1257 SV *linestr;
1258 char *buf;
1259 STRLEN old_bufend_pos, new_bufend_pos;
1260 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1261 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1262 bool got_some_for_debugger = 0;
f0e67a1d 1263 bool got_some;
112d1284 1264 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1265 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1266 linestr = PL_parser->linestr;
1267 buf = SvPVX(linestr);
1268 if (!(flags & LEX_KEEP_PREVIOUS) &&
1269 PL_parser->bufptr == PL_parser->bufend) {
1270 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1271 linestart_pos = 0;
1272 if (PL_parser->last_uni != PL_parser->bufend)
1273 PL_parser->last_uni = NULL;
1274 if (PL_parser->last_lop != PL_parser->bufend)
1275 PL_parser->last_lop = NULL;
1276 last_uni_pos = last_lop_pos = 0;
1277 *buf = 0;
1278 SvCUR(linestr) = 0;
1279 } else {
1280 old_bufend_pos = PL_parser->bufend - buf;
1281 bufptr_pos = PL_parser->bufptr - buf;
1282 oldbufptr_pos = PL_parser->oldbufptr - buf;
1283 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1284 linestart_pos = PL_parser->linestart - buf;
1285 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1286 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1287 }
1288 if (flags & LEX_FAKE_EOF) {
1289 goto eof;
60d63348 1290 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1291 got_some = 0;
1292 } else if (filter_gets(linestr, old_bufend_pos)) {
1293 got_some = 1;
17cc9359 1294 got_some_for_debugger = 1;
112d1284
FC
1295 } else if (flags & LEX_NO_TERM) {
1296 got_some = 0;
f0e67a1d 1297 } else {
580561a3
Z
1298 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1299 sv_setpvs(linestr, "");
f0e67a1d
Z
1300 eof:
1301 /* End of real input. Close filehandle (unless it was STDIN),
1302 * then add implicit termination.
1303 */
87606032 1304 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1305 PerlIO_clearerr(PL_parser->rsfp);
1306 else if (PL_parser->rsfp)
1307 (void)PerlIO_close(PL_parser->rsfp);
1308 PL_parser->rsfp = NULL;
60d63348 1309 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1310 if (!PL_in_eval && PL_minus_p) {
1311 sv_catpvs(linestr,
1312 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1313 PL_minus_n = PL_minus_p = 0;
1314 } else if (!PL_in_eval && PL_minus_n) {
1315 sv_catpvs(linestr, /*{*/";}");
1316 PL_minus_n = 0;
1317 } else
1318 sv_catpvs(linestr, ";");
1319 got_some = 1;
1320 }
1321 buf = SvPVX(linestr);
1322 new_bufend_pos = SvCUR(linestr);
1323 PL_parser->bufend = buf + new_bufend_pos;
1324 PL_parser->bufptr = buf + bufptr_pos;
1325 PL_parser->oldbufptr = buf + oldbufptr_pos;
1326 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1327 PL_parser->linestart = buf + linestart_pos;
1328 if (PL_parser->last_uni)
1329 PL_parser->last_uni = buf + last_uni_pos;
1330 if (PL_parser->last_lop)
1331 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1332 if (PL_parser->preambling != NOLINE) {
1333 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1334 PL_parser->preambling = NOLINE;
1335 }
17cc9359 1336 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1337 PL_curstash != PL_debstash) {
1338 /* debugger active and we're not compiling the debugger code,
1339 * so store the line into the debugger's array of lines
1340 */
1341 update_debugger_info(NULL, buf+old_bufend_pos,
1342 new_bufend_pos-old_bufend_pos);
1343 }
1344 return got_some;
1345}
1346
1347/*
1348=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1349
1350Looks ahead one (Unicode) character in the text currently being lexed.
1351Returns the codepoint (unsigned integer value) of the next character,
1352or -1 if lexing has reached the end of the input text. To consume the
1353peeked character, use L</lex_read_unichar>.
1354
1355If the next character is in (or extends into) the next chunk of input
1356text, the next chunk will be read in. Normally the current chunk will be
1357discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1358then the current chunk will not be discarded.
1359
1360If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1361is encountered, an exception is generated.
1362
1363=cut
1364*/
1365
1366I32
1367Perl_lex_peek_unichar(pTHX_ U32 flags)
1368{
749123ff 1369 dVAR;
f0e67a1d
Z
1370 char *s, *bufend;
1371 if (flags & ~(LEX_KEEP_PREVIOUS))
1372 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1373 s = PL_parser->bufptr;
1374 bufend = PL_parser->bufend;
1375 if (UTF) {
1376 U8 head;
1377 I32 unichar;
1378 STRLEN len, retlen;
1379 if (s == bufend) {
1380 if (!lex_next_chunk(flags))
1381 return -1;
1382 s = PL_parser->bufptr;
1383 bufend = PL_parser->bufend;
1384 }
1385 head = (U8)*s;
54d004e8 1386 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1387 return head;
54d004e8
KW
1388 if (UTF8_IS_START(head)) {
1389 len = UTF8SKIP(&head);
f0e67a1d
Z
1390 while ((STRLEN)(bufend-s) < len) {
1391 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1392 break;
1393 s = PL_parser->bufptr;
1394 bufend = PL_parser->bufend;
1395 }
1396 }
c80e42f3 1397 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1398 if (retlen == (STRLEN)-1) {
1399 /* malformed UTF-8 */
1400 ENTER;
1401 SAVESPTR(PL_warnhook);
1402 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1403 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1404 LEAVE;
1405 }
1406 return unichar;
1407 } else {
1408 if (s == bufend) {
1409 if (!lex_next_chunk(flags))
1410 return -1;
1411 s = PL_parser->bufptr;
1412 }
1413 return (U8)*s;
1414 }
1415}
1416
1417/*
1418=for apidoc Amx|I32|lex_read_unichar|U32 flags
1419
1420Reads the next (Unicode) character in the text currently being lexed.
1421Returns the codepoint (unsigned integer value) of the character read,
1422and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1423if lexing has reached the end of the input text. To non-destructively
1424examine the next character, use L</lex_peek_unichar> instead.
1425
1426If the next character is in (or extends into) the next chunk of input
1427text, the next chunk will be read in. Normally the current chunk will be
1428discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1429then the current chunk will not be discarded.
1430
1431If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1432is encountered, an exception is generated.
1433
1434=cut
1435*/
1436
1437I32
1438Perl_lex_read_unichar(pTHX_ U32 flags)
1439{
1440 I32 c;
1441 if (flags & ~(LEX_KEEP_PREVIOUS))
1442 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1443 c = lex_peek_unichar(flags);
1444 if (c != -1) {
1445 if (c == '\n')
83944c01 1446 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1447 if (UTF)
1448 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1449 else
1450 ++(PL_parser->bufptr);
f0e67a1d
Z
1451 }
1452 return c;
1453}
1454
1455/*
1456=for apidoc Amx|void|lex_read_space|U32 flags
1457
1458Reads optional spaces, in Perl style, in the text currently being
1459lexed. The spaces may include ordinary whitespace characters and
1460Perl-style comments. C<#line> directives are processed if encountered.
1461L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1462at a non-space character (or the end of the input text).
1463
1464If spaces extend into the next chunk of input text, the next chunk will
1465be read in. Normally the current chunk will be discarded at the same
1466time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1467chunk will not be discarded.
1468
1469=cut
1470*/
1471
21791330 1472#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1473#define LEX_NO_NEXT_CHUNK 0x80000000
1474
f0e67a1d
Z
1475void
1476Perl_lex_read_space(pTHX_ U32 flags)
1477{
1478 char *s, *bufend;
21791330 1479 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1480 bool need_incline = 0;
21791330 1481 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1482 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1483 s = PL_parser->bufptr;
1484 bufend = PL_parser->bufend;
1485 while (1) {
1486 char c = *s;
1487 if (c == '#') {
1488 do {
1489 c = *++s;
1490 } while (!(c == '\n' || (c == 0 && s == bufend)));
1491 } else if (c == '\n') {
1492 s++;
21791330
FC
1493 if (can_incline) {
1494 PL_parser->linestart = s;
1495 if (s == bufend)
1496 need_incline = 1;
1497 else
1498 incline(s);
1499 }
f0e67a1d
Z
1500 } else if (isSPACE(c)) {
1501 s++;
1502 } else if (c == 0 && s == bufend) {
1503 bool got_more;
65c68e17 1504 line_t l;
f0998909
Z
1505 if (flags & LEX_NO_NEXT_CHUNK)
1506 break;
f0e67a1d 1507 PL_parser->bufptr = s;
65c68e17 1508 l = CopLINE(PL_curcop);
851b527a 1509 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1510 got_more = lex_next_chunk(flags);
65c68e17 1511 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1512 s = PL_parser->bufptr;
1513 bufend = PL_parser->bufend;
1514 if (!got_more)
1515 break;
21791330 1516 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1517 incline(s);
1518 need_incline = 0;
1519 }
1520 } else {
1521 break;
1522 }
1523 }
f0e67a1d
Z
1524 PL_parser->bufptr = s;
1525}
1526
1527/*
fe788d6b
PM
1528
1529=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1530
1531This function performs syntax checking on a prototype, C<proto>.
1532If C<warn> is true, any illegal characters or mismatched brackets
1533will trigger illegalproto warnings, declaring that they were
1534detected in the prototype for C<name>.
1535
1536The return value is C<true> if this is a valid prototype, and
1537C<false> if it is not, regardless of whether C<warn> was C<true> or
1538C<false>.
1539
1540Note that C<NULL> is a valid C<proto> and will always return C<true>.
1541
1542=cut
1543
1544 */
1545
1546bool
1547Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1548{
1549 STRLEN len, origlen;
1550 char *p = proto ? SvPV(proto, len) : NULL;
1551 bool bad_proto = FALSE;
1552 bool in_brackets = FALSE;
1553 bool after_slash = FALSE;
1554 char greedy_proto = ' ';
1555 bool proto_after_greedy_proto = FALSE;
1556 bool must_be_last = FALSE;
1557 bool underscore = FALSE;
f791a21a 1558 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1559
1560 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1561
1562 if (!proto)
1563 return TRUE;
1564
1565 origlen = len;
1566 for (; len--; p++) {
1567 if (!isSPACE(*p)) {
1568 if (must_be_last)
1569 proto_after_greedy_proto = TRUE;
f791a21a
PM
1570 if (underscore) {
1571 if (!strchr(";@%", *p))
1572 bad_proto_after_underscore = TRUE;
1573 underscore = FALSE;
1574 }
fe788d6b
PM
1575 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1576 bad_proto = TRUE;
1577 }
1578 else {
fe788d6b
PM
1579 if (*p == '[')
1580 in_brackets = TRUE;
1581 else if (*p == ']')
1582 in_brackets = FALSE;
1583 else if ((*p == '@' || *p == '%') &&
1584 !after_slash &&
1585 !in_brackets ) {
1586 must_be_last = TRUE;
1587 greedy_proto = *p;
1588 }
1589 else if (*p == '_')
f791a21a 1590 underscore = TRUE;
fe788d6b
PM
1591 }
1592 if (*p == '\\')
1593 after_slash = TRUE;
1594 else
1595 after_slash = FALSE;
1596 }
1597 }
1598
1599 if (warn) {
b54d603d 1600 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1601 p -= origlen;
b54d603d
PM
1602 p = SvUTF8(proto)
1603 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1604 origlen, UNI_DISPLAY_ISPRINT)
1605 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1606
fe788d6b
PM
1607 if (proto_after_greedy_proto)
1608 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1609 "Prototype after '%c' for %"SVf" : %s",
1610 greedy_proto, SVfARG(name), p);
50278ed0
PM
1611 if (in_brackets)
1612 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1613 "Missing ']' in prototype for %"SVf" : %s",
1614 SVfARG(name), p);
b54d603d 1615 if (bad_proto)
fe788d6b 1616 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1617 "Illegal character in prototype for %"SVf" : %s",
1618 SVfARG(name), p);
1619 if (bad_proto_after_underscore)
1620 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1621 "Illegal character after '_' in prototype for %"SVf" : %s",
1622 SVfARG(name), p);
fe788d6b
PM
1623 }
1624
1625 return (! (proto_after_greedy_proto || bad_proto) );
1626}
1627
1628/*
ffb4593c
NT
1629 * S_incline
1630 * This subroutine has nothing to do with tilting, whether at windmills
1631 * or pinball tables. Its name is short for "increment line". It
57843af0 1632 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1633 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1634 * # line 500 "foo.pm"
1635 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1636 */
1637
76e3520e 1638STATIC void
d9095cec 1639S_incline(pTHX_ const char *s)
463ee0b2 1640{
d9095cec
NC
1641 const char *t;
1642 const char *n;
1643 const char *e;
8818d409 1644 line_t line_num;
463ee0b2 1645
7918f24d
NC
1646 PERL_ARGS_ASSERT_INCLINE;
1647
83944c01 1648 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1649 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1650 && s+1 == PL_bufend && *s == ';') {
1651 /* fake newline in string eval */
1652 CopLINE_dec(PL_curcop);
1653 return;
1654 }
463ee0b2
LW
1655 if (*s++ != '#')
1656 return;
d4c19fe8
AL
1657 while (SPACE_OR_TAB(*s))
1658 s++;
73659bf1
GS
1659 if (strnEQ(s, "line", 4))
1660 s += 4;
1661 else
1662 return;
084592ab 1663 if (SPACE_OR_TAB(*s))
73659bf1 1664 s++;
4e553d73 1665 else
73659bf1 1666 return;
d4c19fe8
AL
1667 while (SPACE_OR_TAB(*s))
1668 s++;
463ee0b2
LW
1669 if (!isDIGIT(*s))
1670 return;
d4c19fe8 1671
463ee0b2
LW
1672 n = s;
1673 while (isDIGIT(*s))
1674 s++;
07714eb4 1675 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1676 return;
bf4acbe4 1677 while (SPACE_OR_TAB(*s))
463ee0b2 1678 s++;
73659bf1 1679 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1680 s++;
73659bf1
GS
1681 e = t + 1;
1682 }
463ee0b2 1683 else {
c35e046a
AL
1684 t = s;
1685 while (!isSPACE(*t))
1686 t++;
73659bf1 1687 e = t;
463ee0b2 1688 }
bf4acbe4 1689 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1690 e++;
1691 if (*e != '\n' && *e != '\0')
1692 return; /* false alarm */
1693
96e440d2 1694 line_num = grok_atou(n, &e) - 1;
8818d409 1695
f4dd75d9 1696 if (t - s > 0) {
d9095cec 1697 const STRLEN len = t - s;
3df32bda 1698
d36ee5be 1699 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1700 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1701 * to *{"::_<newfilename"} */
44867030
NC
1702 /* However, the long form of evals is only turned on by the
1703 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1704 GV * const cfgv = CopFILEGV(PL_curcop);
1705 if (cfgv) {
38bd7ad8
FC
1706 char smallbuf[128];
1707 STRLEN tmplen2 = len;
44867030 1708 char *tmpbuf2;
449dd039 1709 GV *gv2;
44867030
NC
1710
1711 if (tmplen2 + 2 <= sizeof smallbuf)
1712 tmpbuf2 = smallbuf;
1713 else
1714 Newx(tmpbuf2, tmplen2 + 2, char);
1715
38bd7ad8
FC
1716 tmpbuf2[0] = '_';
1717 tmpbuf2[1] = '<';
44867030
NC
1718
1719 memcpy(tmpbuf2 + 2, s, tmplen2);
1720 tmplen2 += 2;
1721
8a5ee598 1722 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1723 if (!isGV(gv2)) {
8a5ee598 1724 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1725 /* adjust ${"::_<newfilename"} to store the new file name */
1726 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1727 /* The line number may differ. If that is the case,
1728 alias the saved lines that are in the array.
1729 Otherwise alias the whole array. */
1730 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1731 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1732 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1733 }
38bd7ad8
FC
1734 else if (GvAV(cfgv)) {
1735 AV * const av = GvAV(cfgv);
8818d409
FC
1736 const I32 start = CopLINE(PL_curcop)+1;
1737 I32 items = AvFILLp(av) - start;
1738 if (items > 0) {
1739 AV * const av2 = GvAVn(gv2);
1740 SV **svp = AvARRAY(av) + start;
1741 I32 l = (I32)line_num+1;
1742 while (items--)
1743 av_store(av2, l++, SvREFCNT_inc(*svp++));
1744 }
1745 }
e5527e4b 1746 }
44867030
NC
1747
1748 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1749 }
e66cf94c 1750 }
05ec9bb3 1751 CopFILE_free(PL_curcop);
449dd039 1752 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1753 }
8818d409 1754 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1755}
1756
21791330
FC
1757#define skipspace(s) skipspace_flags(s, 0)
1758
29595ff2 1759
80a702cd 1760STATIC void
15f169a1 1761S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1762{
1763 AV *av = CopFILEAVx(PL_curcop);
1764 if (av) {
7f1c3e8c
FC
1765 SV * sv;
1766 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1767 else {
1768 sv = *av_fetch(av, 0, 1);
1769 SvUPGRADE(sv, SVt_PVMG);
1770 }
1771 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1772 if (orig_sv)
7f1c3e8c 1773 sv_catsv(sv, orig_sv);
5fa550fb 1774 else
7f1c3e8c
FC
1775 sv_catpvn(sv, buf, len);
1776 if (!SvIOK(sv)) {
1777 (void)SvIOK_on(sv);
1778 SvIV_set(sv, 0);
1779 }
1780 if (PL_parser->preambling == NOLINE)
1781 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1782 }
1783}
1784
ffb4593c
NT
1785/*
1786 * S_skipspace
1787 * Called to gobble the appropriate amount and type of whitespace.
1788 * Skips comments as well.
1789 */
1790
76e3520e 1791STATIC char *
21791330 1792S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1793{
21791330 1794 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1795 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1796 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1797 s++;
f0e67a1d
Z
1798 } else {
1799 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1800 PL_bufptr = s;
21791330 1801 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1802 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1803 LEX_NO_NEXT_CHUNK : 0));
3280af22 1804 s = PL_bufptr;
f0e67a1d
Z
1805 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1806 if (PL_linestart > PL_bufptr)
1807 PL_bufptr = PL_linestart;
1808 return s;
463ee0b2 1809 }
5db06880 1810 return s;
a687059c 1811}
378cc40b 1812
ffb4593c
NT
1813/*
1814 * S_check_uni
1815 * Check the unary operators to ensure there's no ambiguity in how they're
1816 * used. An ambiguous piece of code would be:
1817 * rand + 5
1818 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1819 * the +5 is its argument.
1820 */
1821
76e3520e 1822STATIC void
cea2e8a9 1823S_check_uni(pTHX)
ba106d47 1824{
d4c19fe8
AL
1825 const char *s;
1826 const char *t;
2f3197b3 1827
3280af22 1828 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1829 return;
3280af22
NIS
1830 while (isSPACE(*PL_last_uni))
1831 PL_last_uni++;
c35e046a 1832 s = PL_last_uni;
8a2bca12 1833 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1834 s++;
3280af22 1835 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1836 return;
6136c704 1837
9b387841
NC
1838 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1839 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1840 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1841}
1842
ffb4593c
NT
1843/*
1844 * LOP : macro to build a list operator. Its behaviour has been replaced
1845 * with a subroutine, S_lop() for which LOP is just another name.
1846 */
1847
a0d0e21e
LW
1848#define LOP(f,x) return lop(f,x,s)
1849
ffb4593c
NT
1850/*
1851 * S_lop
1852 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1853 * - if we have a next token, then it's a list operator (no parens) for
1854 * which the next token has already been parsed; e.g.,
1855 * sort foo @args
1856 * sort foo (@args)
ffb4593c
NT
1857 * - if the next thing is an opening paren, then it's a function
1858 * - else it's a list operator
1859 */
1860
76e3520e 1861STATIC I32
a0be28da 1862S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1863{
7918f24d
NC
1864 PERL_ARGS_ASSERT_LOP;
1865
6154021b 1866 pl_yylval.ival = f;
35c8bce7 1867 CLINE;
3280af22
NIS
1868 PL_bufptr = s;
1869 PL_last_lop = PL_oldbufptr;
eb160463 1870 PL_last_lop_op = (OPCODE)f;
3280af22 1871 if (PL_nexttoke)
78cdf107 1872 goto lstop;
19f1898a 1873 PL_expect = x;
79072805 1874 if (*s == '(')
bbf60fe6 1875 return REPORT(FUNC);
294a536f 1876 s = skipspace(s);
79072805 1877 if (*s == '(')
bbf60fe6 1878 return REPORT(FUNC);
78cdf107
Z
1879 else {
1880 lstop:
1881 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1882 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1883 return REPORT(LSTOP);
78cdf107 1884 }
79072805
LW
1885}
1886
ffb4593c
NT
1887/*
1888 * S_force_next
9cbb5ea2 1889 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1890 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1891 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1892 * will need to set PL_nextval[] and possibly PL_expect to ensure
1893 * the lexer handles the token correctly.
ffb4593c
NT
1894 */
1895
4e553d73 1896STATIC void
cea2e8a9 1897S_force_next(pTHX_ I32 type)
79072805 1898{
704d4215
GG
1899#ifdef DEBUGGING
1900 if (DEBUG_T_TEST) {
1901 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1902 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1903 }
1904#endif
3280af22
NIS
1905 PL_nexttype[PL_nexttoke] = type;
1906 PL_nexttoke++;
1907 if (PL_lex_state != LEX_KNOWNEXT) {
1908 PL_lex_defer = PL_lex_state;
3280af22 1909 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1910 }
1911}
1912
89f35911
FC
1913/*
1914 * S_postderef
1915 *
1916 * This subroutine handles postfix deref syntax after the arrow has already
1917 * been emitted. @* $* etc. are emitted as two separate token right here.
1918 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1919 * only the first, leaving yylex to find the next.
89f35911
FC
1920 */
1921
1922static int
ff25e5db 1923S_postderef(pTHX_ int const funny, char const next)
89f35911 1924{
ff25e5db 1925 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1926 assert(strchr("*[{", next));
1927 if (next == '*') {
1928 PL_expect = XOPERATOR;
cc624add 1929 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1930 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1931 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1932 force_next(POSTJOIN);
1933 }
89f35911
FC
1934 force_next(next);
1935 PL_bufptr+=2;
1936 }
1937 else {
760ca746
FC
1938 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1939 && !PL_lex_brackets)
1940 PL_lex_dojoin = 2;
89f35911
FC
1941 PL_expect = XOPERATOR;
1942 PL_bufptr++;
1943 }
1944 return funny;
1945}
1946
28ac2b49
Z
1947void
1948Perl_yyunlex(pTHX)
1949{
a7aaec61
Z
1950 int yyc = PL_parser->yychar;
1951 if (yyc != YYEMPTY) {
1952 if (yyc) {
a7aaec61
Z
1953 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1954 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1955 PL_lex_allbrackets--;
a7aaec61 1956 PL_lex_brackets--;
78cdf107
Z
1957 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1958 } else if (yyc == '('/*)*/) {
1959 PL_lex_allbrackets--;
1960 yyc |= (2<<24);
a7aaec61
Z
1961 }
1962 force_next(yyc);
1963 }
28ac2b49
Z
1964 PL_parser->yychar = YYEMPTY;
1965 }
1966}
1967
d0a148a6 1968STATIC SV *
15f169a1 1969S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1970{
740cce10 1971 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1972 !IN_BYTES
1973 && UTF
9f10db87 1974 && !is_invariant_string((const U8*)start, len)
740cce10 1975 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1976 return sv;
1977}
1978
ffb4593c
NT
1979/*
1980 * S_force_word
1981 * When the lexer knows the next thing is a word (for instance, it has
1982 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1983 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1984 * lookahead.
ffb4593c
NT
1985 *
1986 * Arguments:
b1b65b59 1987 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1988 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1989 * int check_keyword : if true, Perl checks to make sure the word isn't
1990 * a keyword (do this if the word is a label, e.g. goto FOO)
1991 * int allow_pack : if true, : characters will also be allowed (require,
1992 * use, etc. do this)
ffb4593c
NT
1993 */
1994
76e3520e 1995STATIC char *
345b3785 1996S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 1997{
eb578fdb 1998 char *s;
463ee0b2 1999 STRLEN len;
4e553d73 2000
7918f24d
NC
2001 PERL_ARGS_ASSERT_FORCE_WORD;
2002
294a536f 2003 start = skipspace(start);
463ee0b2 2004 s = start;
7e2040f0 2005 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2006 (allow_pack && *s == ':') )
a0d0e21e 2007 {
3280af22 2008 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2009 if (check_keyword) {
2010 char *s2 = PL_tokenbuf;
487e470d 2011 STRLEN len2 = len;
01b5ef50 2012 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
487e470d
FC
2013 s2 += 6, len2 -= 6;
2014 if (keyword(s2, len2, 0))
463ee0b2 2015 return start;
01b5ef50 2016 }
463ee0b2 2017 if (token == METHOD) {
294a536f 2018 s = skipspace(s);
463ee0b2 2019 if (*s == '(')
3280af22 2020 PL_expect = XTERM;
463ee0b2 2021 else {
3280af22 2022 PL_expect = XOPERATOR;
463ee0b2 2023 }
79072805 2024 }
9ded7720 2025 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2026 = (OP*)newSVOP(OP_CONST,0,
2027 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2028 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2029 force_next(token);
2030 }
2031 return s;
2032}
2033
ffb4593c
NT
2034/*
2035 * S_force_ident
9cbb5ea2 2036 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2037 * text only contains the "foo" portion. The first argument is a pointer
2038 * to the "foo", and the second argument is the type symbol to prefix.
2039 * Forces the next token to be a "WORD".
9cbb5ea2 2040 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2041 */
2042
76e3520e 2043STATIC void
5aaab254 2044S_force_ident(pTHX_ const char *s, int kind)
79072805 2045{
7918f24d
NC
2046 PERL_ARGS_ASSERT_FORCE_IDENT;
2047
c9b48522
DD
2048 if (s[0]) {
2049 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2050 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2051 UTF ? SVf_UTF8 : 0));
9ded7720 2052 NEXTVAL_NEXTTOKE.opval = o;
79072805 2053 force_next(WORD);
748a9306 2054 if (kind) {
11343788 2055 o->op_private = OPpCONST_ENTERED;
55497cff 2056 /* XXX see note in pp_entereval() for why we forgo typo
2057 warnings if the symbol must be introduced in an eval.
2058 GSAR 96-10-12 */
90e5519e 2059 gv_fetchpvn_flags(s, len,
4bff32c5 2060 (PL_in_eval ? GV_ADDMULTI
728847b1 2061 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2062 kind == '$' ? SVt_PV :
2063 kind == '@' ? SVt_PVAV :
2064 kind == '%' ? SVt_PVHV :
a0d0e21e 2065 SVt_PVGV
90e5519e 2066 );
748a9306 2067 }
79072805
LW
2068 }
2069}
2070
3f33d153
FC
2071static void
2072S_force_ident_maybe_lex(pTHX_ char pit)
2073{
3f33d153
FC
2074 NEXTVAL_NEXTTOKE.ival = pit;
2075 force_next('p');
2076}
2077
1571675a
GS
2078NV
2079Perl_str_to_version(pTHX_ SV *sv)
2080{
2081 NV retval = 0.0;
2082 NV nshift = 1.0;
2083 STRLEN len;
cfd0369c 2084 const char *start = SvPV_const(sv,len);
9d4ba2ae 2085 const char * const end = start + len;
504618e9 2086 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2087
2088 PERL_ARGS_ASSERT_STR_TO_VERSION;
2089
1571675a 2090 while (start < end) {
ba210ebe 2091 STRLEN skip;
1571675a
GS
2092 UV n;
2093 if (utf)
9041c2e3 2094 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2095 else {
2096 n = *(U8*)start;
2097 skip = 1;
2098 }
2099 retval += ((NV)n)/nshift;
2100 start += skip;
2101 nshift *= 1000;
2102 }
2103 return retval;
2104}
2105
4e553d73 2106/*
ffb4593c
NT
2107 * S_force_version
2108 * Forces the next token to be a version number.
e759cc13
RGS
2109 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2110 * and if "guessing" is TRUE, then no new token is created (and the caller
2111 * must use an alternative parsing method).
ffb4593c
NT
2112 */
2113
76e3520e 2114STATIC char *
e759cc13 2115S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2116{
5f66b61c 2117 OP *version = NULL;
44dcb63b 2118 char *d;
89bfa8cd 2119
7918f24d
NC
2120 PERL_ARGS_ASSERT_FORCE_VERSION;
2121
294a536f 2122 s = skipspace(s);
89bfa8cd 2123
44dcb63b 2124 d = s;
dd629d5b 2125 if (*d == 'v')
44dcb63b 2126 d++;
44dcb63b 2127 if (isDIGIT(*d)) {
e759cc13
RGS
2128 while (isDIGIT(*d) || *d == '_' || *d == '.')
2129 d++;
4e4da3ac 2130 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2131 SV *ver;
6154021b
RGS
2132 s = scan_num(s, &pl_yylval);
2133 version = pl_yylval.opval;
dd629d5b
GS
2134 ver = cSVOPx(version)->op_sv;
2135 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2136 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2137 SvNV_set(ver, str_to_version(ver));
1571675a 2138 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2139 }
89bfa8cd 2140 }
5db06880 2141 else if (guessing) {
e759cc13 2142 return s;
5db06880 2143 }
89bfa8cd 2144 }
2145
2146 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2147 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2148 force_next(WORD);
89bfa8cd 2149
e759cc13 2150 return s;
89bfa8cd 2151}
2152
ffb4593c 2153/*
91152fc1
DG
2154 * S_force_strict_version
2155 * Forces the next token to be a version number using strict syntax rules.
2156 */
2157
2158STATIC char *
2159S_force_strict_version(pTHX_ char *s)
2160{
91152fc1 2161 OP *version = NULL;
91152fc1
DG
2162 const char *errstr = NULL;
2163
2164 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2165
2166 while (isSPACE(*s)) /* leading whitespace */
2167 s++;
2168
2169 if (is_STRICT_VERSION(s,&errstr)) {
2170 SV *ver = newSV(0);
2171 s = (char *)scan_version(s, ver, 0);
2172 version = newSVOP(OP_CONST, 0, ver);
2173 }
4e4da3ac 2174 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
294a536f 2175 (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2176 {
91152fc1
DG
2177 PL_bufptr = s;
2178 if (errstr)
2179 yyerror(errstr); /* version required */
2180 return s;
2181 }
2182
91152fc1 2183 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2184 NEXTVAL_NEXTTOKE.opval = version;
2185 force_next(WORD);
2186
2187 return s;
2188}
2189
2190/*
ffb4593c
NT
2191 * S_tokeq
2192 * Tokenize a quoted string passed in as an SV. It finds the next
2193 * chunk, up to end of string or a backslash. It may make a new
2194 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2195 * turns \\ into \.
2196 */
2197
76e3520e 2198STATIC SV *
cea2e8a9 2199S_tokeq(pTHX_ SV *sv)
79072805 2200{
eb578fdb
KW
2201 char *s;
2202 char *send;
2203 char *d;
b3ac6de7 2204 SV *pv = sv;
79072805 2205
7918f24d
NC
2206 PERL_ARGS_ASSERT_TOKEQ;
2207
279b35ad
FC
2208 assert (SvPOK(sv));
2209 assert (SvLEN(sv));
2210 assert (!SvIsCOW(sv));
307ed071 2211 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2212 goto finish;
279b35ad
FC
2213 s = SvPVX(sv);
2214 send = SvEND(sv);
dcb21ed6
NC
2215 /* This is relying on the SV being "well formed" with a trailing '\0' */
2216 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2217 s++;
2218 if (s == send)
b3ac6de7 2219 goto finish;
79072805 2220 d = s;
be4731d2 2221 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2222 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2223 SVs_TEMP | SvUTF8(sv));
be4731d2 2224 }
79072805
LW
2225 while (s < send) {
2226 if (*s == '\\') {
a0d0e21e 2227 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2228 s++; /* all that, just for this */
2229 }
2230 *d++ = *s++;
2231 }
2232 *d = '\0';
95a20fc0 2233 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2234 finish:
3280af22 2235 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2236 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2237 return sv;
2238}
2239
ffb4593c
NT
2240/*
2241 * Now come three functions related to double-quote context,
2242 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2243 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2244 * interact with PL_lex_state, and create fake ( ... ) argument lists
2245 * to handle functions and concatenation.
ecd24171
DM
2246 * For example,
2247 * "foo\lbar"
2248 * is tokenised as
2249 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2250 */
2251
2252/*
2253 * S_sublex_start
6154021b 2254 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2255 *
2256 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2257 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2258 *
2259 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2260 *
2261 * Everything else becomes a FUNC.
2262 *
2263 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2264 * had an OP_CONST or OP_READLINE). This just sets us up for a
2265 * call to S_sublex_push().
2266 */
2267
76e3520e 2268STATIC I32
cea2e8a9 2269S_sublex_start(pTHX)
79072805 2270{
eb578fdb 2271 const I32 op_type = pl_yylval.ival;
79072805
LW
2272
2273 if (op_type == OP_NULL) {
6154021b 2274 pl_yylval.opval = PL_lex_op;
5f66b61c 2275 PL_lex_op = NULL;
79072805
LW
2276 return THING;
2277 }
466112bb 2278 if (op_type == OP_CONST) {
3280af22 2279 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2280
2281 if (SvTYPE(sv) == SVt_PVIV) {
2282 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2283 STRLEN len;
96a5add6 2284 const char * const p = SvPV_const(sv, len);
740cce10 2285 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2286 SvREFCNT_dec(sv);
2287 sv = nsv;
4e553d73 2288 }
6154021b 2289 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2290 PL_lex_stuff = NULL;
79072805
LW
2291 return THING;
2292 }
2293
3280af22 2294 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2295 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2296 PL_sublex_info.sub_op = PL_lex_op;
2297 PL_lex_state = LEX_INTERPPUSH;
55497cff 2298
3280af22
NIS
2299 PL_expect = XTERM;
2300 if (PL_lex_op) {
6154021b 2301 pl_yylval.opval = PL_lex_op;
5f66b61c 2302 PL_lex_op = NULL;
55497cff 2303 return PMFUNC;
2304 }
2305 else
2306 return FUNC;
2307}
2308
ffb4593c
NT
2309/*
2310 * S_sublex_push
2311 * Create a new scope to save the lexing state. The scope will be
2312 * ended in S_sublex_done. Returns a '(', starting the function arguments
2313 * to the uc, lc, etc. found before.
2314 * Sets PL_lex_state to LEX_INTERPCONCAT.
2315 */
2316
76e3520e 2317STATIC I32
cea2e8a9 2318S_sublex_push(pTHX)
55497cff 2319{
78a635de 2320 LEXSHARED *shared;
801d32ac 2321 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2322 ENTER;
55497cff 2323
3280af22 2324 PL_lex_state = PL_sublex_info.super_state;
cc624add 2325 SAVEI8(PL_lex_dojoin);
3280af22 2326 SAVEI32(PL_lex_brackets);
78cdf107 2327 SAVEI32(PL_lex_allbrackets);
b27dce25 2328 SAVEI32(PL_lex_formbrack);
78cdf107 2329 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2330 SAVEI32(PL_lex_casemods);
2331 SAVEI32(PL_lex_starts);
651b5b28 2332 SAVEI8(PL_lex_state);
7cc34111 2333 SAVESPTR(PL_lex_repl);
7766f137 2334 SAVEVPTR(PL_lex_inpat);
98246f1e 2335 SAVEI16(PL_lex_inwhat);
ffdb8b16 2336 if (is_heredoc)
b42366d4 2337 {
ffdb8b16 2338 SAVECOPLINE(PL_curcop);
b42366d4 2339 SAVEI32(PL_multi_end);
851b527a
FC
2340 SAVEI32(PL_parser->herelines);
2341 PL_parser->herelines = 0;
b42366d4
FC
2342 }
2343 SAVEI8(PL_multi_close);
3280af22 2344 SAVEPPTR(PL_bufptr);
8452ff4b 2345 SAVEPPTR(PL_bufend);
3280af22
NIS
2346 SAVEPPTR(PL_oldbufptr);
2347 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2348 SAVEPPTR(PL_last_lop);
2349 SAVEPPTR(PL_last_uni);
3280af22
NIS
2350 SAVEPPTR(PL_linestart);
2351 SAVESPTR(PL_linestr);
8edd5f42
RGS
2352 SAVEGENERICPV(PL_lex_brackstack);
2353 SAVEGENERICPV(PL_lex_casestack);
78a635de 2354 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2355 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2356 SAVEI32(PL_copline);
3280af22 2357
99bd9d90 2358 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2359 scopes to find the body of the here-doc. So we put PL_linestr and
2360 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2361 */
60f40a38
FC
2362 PL_parser->lex_shared->ls_linestr = PL_linestr;
2363 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2364
3280af22 2365 PL_linestr = PL_lex_stuff;
7cc34111 2366 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2367 PL_lex_stuff = NULL;
7cc34111 2368 PL_sublex_info.repl = NULL;
3280af22 2369
9cbb5ea2
GS
2370 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2371 = SvPVX(PL_linestr);
3280af22 2372 PL_bufend += SvCUR(PL_linestr);
bd61b366 2373 PL_last_lop = PL_last_uni = NULL;
3280af22 2374 SAVEFREESV(PL_linestr);
4dc843bc 2375 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2376
2377 PL_lex_dojoin = FALSE;
b27dce25 2378 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2379 PL_lex_allbrackets = 0;
2380 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2381 Newx(PL_lex_brackstack, 120, char);
2382 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2383 PL_lex_casemods = 0;
2384 *PL_lex_casestack = '\0';
2385 PL_lex_starts = 0;
2386 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2387 if (is_heredoc)
6ddcf93b 2388 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2389 PL_copline = NOLINE;
78a635de
FC
2390
2391 Newxz(shared, 1, LEXSHARED);
2392 shared->ls_prev = PL_parser->lex_shared;
2393 PL_parser->lex_shared = shared;
3280af22
NIS
2394
2395 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2396 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2397 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2398 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2399 else
5f66b61c 2400 PL_lex_inpat = NULL;
79072805 2401
3a54fd60
DM
2402 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2403 PL_in_eval &= ~EVAL_RE_REPARSING;
2404
55497cff 2405 return '(';
79072805
LW
2406}
2407
ffb4593c
NT
2408/*
2409 * S_sublex_done
2410 * Restores lexer state after a S_sublex_push.
2411 */
2412
76e3520e 2413STATIC I32
cea2e8a9 2414S_sublex_done(pTHX)
79072805 2415{
3280af22 2416 if (!PL_lex_starts++) {
396482e1 2417 SV * const sv = newSVpvs("");
9aa983d2
JH
2418 if (SvUTF8(PL_linestr))
2419 SvUTF8_on(sv);
3280af22 2420 PL_expect = XOPERATOR;
6154021b 2421 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2422 return THING;
2423 }
2424
3280af22
NIS
2425 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2426 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2427 return yylex();
79072805
LW
2428 }
2429
ffb4593c 2430 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2431 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2432 if (PL_lex_repl) {
2433 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2434 PL_linestr = PL_lex_repl;
2435 PL_lex_inpat = 0;
2436 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2437 PL_bufend += SvCUR(PL_linestr);
bd61b366 2438 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2439 PL_lex_dojoin = FALSE;
2440 PL_lex_brackets = 0;
78cdf107
Z
2441 PL_lex_allbrackets = 0;
2442 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2443 PL_lex_casemods = 0;
2444 *PL_lex_casestack = '\0';
2445 PL_lex_starts = 0;
25da4f38 2446 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2447 PL_lex_state = LEX_INTERPNORMAL;
2448 PL_lex_starts++;
e9fa98b2
HS
2449 /* we don't clear PL_lex_repl here, so that we can check later
2450 whether this is an evalled subst; that means we rely on the
2451 logic to ensure sublex_done() is called again only via the
2452 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2453 }
e9fa98b2 2454 else {
3280af22 2455 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2456 PL_lex_repl = NULL;
e9fa98b2 2457 }
ffdb8b16
FC
2458 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2459 CopLINE(PL_curcop) +=
2460 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2461 + PL_parser->herelines;
2462 PL_parser->herelines = 0;
ffdb8b16 2463 }
9b6b7be8 2464 return '/';
ffed7fef
LW
2465 }
2466 else {
b42366d4 2467 const line_t l = CopLINE(PL_curcop);
f46d017c 2468 LEAVE;
b42366d4 2469 if (PL_multi_close == '<')
851b527a 2470 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2471 PL_bufend = SvPVX(PL_linestr);
2472 PL_bufend += SvCUR(PL_linestr);
2473 PL_expect = XOPERATOR;
09bef843 2474 PL_sublex_info.sub_inwhat = 0;
79072805 2475 return ')';
ffed7fef
LW
2476 }
2477}
2478
6f613c73
KW
2479PERL_STATIC_INLINE SV*
2480S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2481{
140b12ad
KW
2482 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2483 * interior, hence to the "}". Finds what the name resolves to, returning
2484 * an SV* containing it; NULL if no valid one found */
2485
dd2b1b72 2486 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2487
0c415a79
KW
2488 HV * table;
2489 SV **cvp;
2490 SV *cv;
2491 SV *rv;
2492 HV *stash;
2493 const U8* first_bad_char_loc;
2494 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2495
6f613c73
KW
2496 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2497
b7e6151c
FC
2498 if (!SvCUR(res))
2499 return res;
2500
107160e2
KW
2501 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2502 e - backslash_ptr,
2503 &first_bad_char_loc))
2504 {
2505 /* If warnings are on, this will print a more detailed analysis of what
2506 * is wrong than the error message below */
c80e42f3 2507 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2508 e - ((char *) first_bad_char_loc),
2509 NULL, 0);
2510
2511 /* We deliberately don't try to print the malformed character, which
2512 * might not print very well; it also may be just the first of many
2513 * malformations, so don't print what comes after it */
2514 yyerror(Perl_form(aTHX_
2515 "Malformed UTF-8 character immediately after '%.*s'",
2516 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2517 return NULL;
2518 }
2519
2520 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2521 /* include the <}> */
2522 e - backslash_ptr + 1);
6f613c73 2523 if (! SvPOK(res)) {
b6407c49 2524 SvREFCNT_dec_NN(res);
6f613c73
KW
2525 return NULL;
2526 }
2527
0c415a79
KW
2528 /* See if the charnames handler is the Perl core's, and if so, we can skip
2529 * the validation needed for a user-supplied one, as Perl's does its own
2530 * validation. */
2531 table = GvHV(PL_hintgv); /* ^H */
2532 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2533 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2534 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2535 {
2536 const char * const name = HvNAME(stash);
6a642c21
FC
2537 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2538 && strEQ(name, "_charnames")) {
0c415a79
KW
2539 return res;
2540 }
2541 }
2542
bde9e88d
KW
2543 /* Here, it isn't Perl's charname handler. We can't rely on a
2544 * user-supplied handler to validate the input name. For non-ut8 input,
2545 * look to see that the first character is legal. Then loop through the
2546 * rest checking that each is a continuation */
6f613c73 2547
36897d64
KW
2548 /* This code makes the reasonable assumption that the only Latin1-range
2549 * characters that begin a character name alias are alphabetic, otherwise
2550 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2551
b6ba1137 2552 if (! UTF) {
bde9e88d 2553 if (! isALPHAU(*s)) {
b6ba1137
KW
2554 goto bad_charname;
2555 }
bde9e88d
KW
2556 s++;
2557 while (s < e) {
2558 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2559 goto bad_charname;
2560 }
2d8eb851
KW
2561 if (*s == ' ' && *(s-1) == ' ') {
2562 goto multi_spaces;
bd299e29 2563 }
df758df2
KW
2564 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2565 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2566 "NO-BREAK SPACE in a charnames "
2567 "alias definition is deprecated");
2568 }
bde9e88d 2569 s++;
b6ba1137
KW
2570 }
2571 }
2572 else {
bde9e88d
KW
2573 /* Similarly for utf8. For invariants can check directly; for other
2574 * Latin1, can calculate their code point and check; otherwise use a
2575 * swash */
2576 if (UTF8_IS_INVARIANT(*s)) {
2577 if (! isALPHAU(*s)) {
140b12ad
KW
2578 goto bad_charname;
2579 }
bde9e88d
KW
2580 s++;
2581 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2582 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2583 goto bad_charname;
6f613c73 2584 }
bde9e88d 2585 s += 2;
6f613c73 2586 }
bde9e88d
KW
2587 else {
2588 if (! PL_utf8_charname_begin) {
2589 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2590 PL_utf8_charname_begin = _core_swash_init("utf8",
2591 "_Perl_Charname_Begin",
2592 &PL_sv_undef,
2593 1, 0, NULL, &flags);
2594 }
2595 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2596 goto bad_charname;
2597 }
2598 s += UTF8SKIP(s);
2599 }
2600
2601 while (s < e) {
2602 if (UTF8_IS_INVARIANT(*s)) {
2603 if (! isCHARNAME_CONT(*s)) {
2604 goto bad_charname;
2605 }
2d8eb851
KW
2606 if (*s == ' ' && *(s-1) == ' ') {
2607 goto multi_spaces;
bd299e29 2608 }
bde9e88d
KW
2609 s++;
2610 }
2611 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2612 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2613 {
2614 goto bad_charname;
2615 }
df758df2
KW
2616 if (*s == *NBSP_UTF8
2617 && *(s+1) == *(NBSP_UTF8+1)
2618 && ckWARN_d(WARN_DEPRECATED))
2619 {
2620 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2621 "NO-BREAK SPACE in a charnames "
2622 "alias definition is deprecated");
2623 }
bde9e88d
KW
2624 s += 2;
2625 }
2626 else {
2627 if (! PL_utf8_charname_continue) {
2628 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2629 PL_utf8_charname_continue = _core_swash_init("utf8",
2630 "_Perl_Charname_Continue",
2631 &PL_sv_undef,
2632 1, 0, NULL, &flags);
2633 }
2634 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2635 goto bad_charname;
2636 }
2637 s += UTF8SKIP(s);
6f613c73
KW
2638 }
2639 }
2d8eb851
KW
2640 }
2641 if (*(s-1) == ' ') {
2642 yyerror_pv(
2643 Perl_form(aTHX_
2644 "charnames alias definitions may not contain trailing "
2645 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2646 (int)(s - backslash_ptr + 1), backslash_ptr,
2647 (int)(e - s + 1), s + 1
2648 ),
2649 UTF ? SVf_UTF8 : 0);
2650 return NULL;
6f613c73
KW
2651 }
2652
94ca1619 2653 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2654 const U8* first_bad_char_loc;
2655 STRLEN len;
2656 const char* const str = SvPV_const(res, len);
2657 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2658 /* If warnings are on, this will print a more detailed analysis of
2659 * what is wrong than the error message below */
c80e42f3 2660 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2661 (char *) first_bad_char_loc - str,
2662 NULL, 0);
2663
2664 /* We deliberately don't try to print the malformed character,
2665 * which might not print very well; it also may be just the first
2666 * of many malformations, so don't print what comes after it */
2667 yyerror_pv(
2668 Perl_form(aTHX_
2669 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2670 (int) (e - backslash_ptr + 1), backslash_ptr,
2671 (int) ((char *) first_bad_char_loc - str), str
2672 ),
2673 SVf_UTF8);
2674 return NULL;
2675 }
2676 }
140b12ad 2677
bde9e88d 2678 return res;
140b12ad 2679
bde9e88d 2680 bad_charname: {
bde9e88d
KW
2681
2682 /* The final %.*s makes sure that should the trailing NUL be missing
2683 * that this print won't run off the end of the string */
2684 yyerror_pv(
2685 Perl_form(aTHX_
2686 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2687 (int)(s - backslash_ptr + 1), backslash_ptr,
2688 (int)(e - s + 1), s + 1
bde9e88d
KW
2689 ),
2690 UTF ? SVf_UTF8 : 0);
2691 return NULL;
2692 }
2d8eb851
KW
2693
2694 multi_spaces:
2695 yyerror_pv(
2696 Perl_form(aTHX_
2697 "charnames alias definitions may not contain a sequence of "
2698 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2699 (int)(s - backslash_ptr + 1), backslash_ptr,
2700 (int)(e - s + 1), s + 1
2701 ),
2702 UTF ? SVf_UTF8 : 0);
2703 return NULL;
6f613c73
KW
2704}
2705
02aa26ce
NT
2706/*
2707 scan_const
2708
9da1dd8f
DM
2709 Extracts the next constant part of a pattern, double-quoted string,
2710 or transliteration. This is terrifying code.
2711
2712 For example, in parsing the double-quoted string "ab\x63$d", it would
2713 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2714
94def140 2715 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2716 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2717 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2718
94def140
TS
2719 Returns a pointer to the character scanned up to. If this is
2720 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2721 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2722 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2723 by looking at the next characters herself.
2724
02aa26ce 2725 In patterns:
9da1dd8f 2726 expand:
537124e4
KW
2727 \N{FOO} => \N{U+hex_for_character_FOO}
2728 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2729
2730 pass through:
2731 all other \-char, including \N and \N{ apart from \N{ABC}
2732
2733 stops on:
2734 @ and $ where it appears to be a var, but not for $ as tail anchor
2735 \l \L \u \U \Q \E
2736 (?{ or (??{
2737
02aa26ce
NT
2738
2739 In transliterations:
2740 characters are VERY literal, except for - not at the start or end
94def140
TS
2741 of the string, which indicates a range. If the range is in bytes,
2742 scan_const expands the range to the full set of intermediate
2743 characters. If the range is in utf8, the hyphen is replaced with
2744 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2745
2746 In double-quoted strings:
2747 backslashes:
2748 double-quoted style: \r and \n
ff3f963a 2749 constants: \x31, etc.
94def140 2750 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2751 case and quoting: \U \Q \E
2752 stops on @ and $
2753
2754 scan_const does *not* construct ops to handle interpolated strings.
2755 It stops processing as soon as it finds an embedded $ or @ variable
2756 and leaves it to the caller to work out what's going on.
2757
94def140
TS
2758 embedded arrays (whether in pattern or not) could be:
2759 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2760
2761 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2762
2763 $ in pattern could be $foo or could be tail anchor. Assumption:
2764 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2765 followed by one of "()| \r\n\t"
02aa26ce 2766
9da1dd8f 2767 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2768
2769 The structure of the code is
2770 while (there's a character to process) {
94def140
TS
2771 handle transliteration ranges
2772 skip regexp comments /(?#comment)/ and codes /(?{code})/
2773 skip #-initiated comments in //x patterns
2774 check for embedded arrays
02aa26ce
NT
2775 check for embedded scalars
2776 if (backslash) {
94def140 2777 deprecate \1 in substitution replacements
02aa26ce
NT
2778 handle string-changing backslashes \l \U \Q \E, etc.
2779 switch (what was escaped) {
94def140 2780 handle \- in a transliteration (becomes a literal -)
ff3f963a 2781 if a pattern and not \N{, go treat as regular character
94def140
TS
2782 handle \132 (octal characters)
2783 handle \x15 and \x{1234} (hex characters)
ff3f963a 2784 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2785 handle \cV (control characters)
2786 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2787 } (end switch)
77a135fe 2788 continue
02aa26ce 2789 } (end if backslash)
77a135fe 2790 handle regular character
02aa26ce 2791 } (end while character to read)
4e553d73 2792
02aa26ce
NT
2793*/
2794
76e3520e 2795STATIC char *
cea2e8a9 2796S_scan_const(pTHX_ char *start)
79072805 2797{
eb578fdb 2798 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2799 SV *sv = newSV(send - start); /* sv for the constant. See note below
2800 on sizing. */
eb578fdb
KW
2801 char *s = start; /* start of the constant */
2802 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2803 bool dorange = FALSE; /* are we in a translit range? */
2804 bool didrange = FALSE; /* did we just finish a range? */
2805 bool in_charclass = FALSE; /* within /[...]/ */
2806 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2807 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2808 UTF8? But, this can show as true
2809 when the source isn't utf8, as for
2810 example when it is entirely composed
2811 of hex constants */
6f613c73 2812 SV *res; /* result from charnames */
77a135fe
KW
2813
2814 /* Note on sizing: The scanned constant is placed into sv, which is
2815 * initialized by newSV() assuming one byte of output for every byte of
2816 * input. This routine expects newSV() to allocate an extra byte for a
2817 * trailing NUL, which this routine will append if it gets to the end of
2818 * the input. There may be more bytes of input than output (eg., \N{LATIN
2819 * CAPITAL LETTER A}), or more output than input if the constant ends up
2820 * recoded to utf8, but each time a construct is found that might increase
2821 * the needed size, SvGROW() is called. Its size parameter each time is
2822 * based on the best guess estimate at the time, namely the length used so
2823 * far, plus the length the current construct will occupy, plus room for
2824 * the trailing NUL, plus one byte for every input byte still unscanned */
2825
c3320c2a
KW
2826 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2827 before set */
4c3a8340
TS
2828#ifdef EBCDIC
2829 UV literal_endpoint = 0;
e294cc5d 2830 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2831#endif
012bcf8d 2832
7918f24d
NC
2833 PERL_ARGS_ASSERT_SCAN_CONST;
2834
bb16bae8 2835 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2836 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2837 /* If we are doing a trans and we know we want UTF8 set expectation */
2838 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2839 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2840 }
2841
b899e89d
FC
2842 /* Protect sv from errors and fatal warnings. */
2843 ENTER_with_name("scan_const");
2844 SAVEFREESV(sv);
2b9d42f0 2845
79072805 2846 while (s < send || dorange) {
ff3f963a 2847
02aa26ce 2848 /* get transliterations out of the way (they're most literal) */
3280af22 2849 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2850 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2851 if (dorange) {
1ba5c669
JH
2852 I32 i; /* current expanded character */
2853 I32 min; /* first character in range */
2854 I32 max; /* last character in range */
02aa26ce 2855
e294cc5d
JH
2856#ifdef EBCDIC
2857 UV uvmax = 0;
2858#endif
2859
2860 if (has_utf8
2861#ifdef EBCDIC
2862 && !native_range
2863#endif
1953db30 2864 ) {
9d4ba2ae 2865 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2866 char *e = d++;
2867 while (e-- > c)
2868 *(e + 1) = *e;
e7214ce8 2869 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2870 /* mark the range as done, and continue */
2871 dorange = FALSE;
2872 didrange = TRUE;
2873 continue;
2874 }
2b9d42f0 2875
95a20fc0 2876 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2877#ifdef EBCDIC
2878 SvGROW(sv,
dc023dbb
KW
2879 SvLEN(sv) + ((has_utf8)
2880 ? (512 - UTF_CONTINUATION_MARK
2881 + UNISKIP(0x100))
e294cc5d
JH
2882 : 256));
2883 /* How many two-byte within 0..255: 128 in UTF-8,
2884 * 96 in UTF-8-mod. */
2885#else
9cbb5ea2 2886 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2887#endif
9cbb5ea2 2888 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2889#ifdef EBCDIC
2890 if (has_utf8) {
2891 int j;
2892 for (j = 0; j <= 1; j++) {
2893 char * const c = (char*)utf8_hop((U8*)d, -1);
2894 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2895 if (j)
2896 min = (U8)uv;
2897 else if (uv < 256)
2898 max = (U8)uv;
2899 else {
2900 max = (U8)0xff; /* only to \xff */
2901 uvmax = uv; /* \x{100} to uvmax */
2902 }
2903 d = c; /* eat endpoint chars */
2904 }
2905 }
2906 else {
2907#endif
2908 d -= 2; /* eat the first char and the - */
2909 min = (U8)*d; /* first char in range */
2910 max = (U8)d[1]; /* last char in range */
2911#ifdef EBCDIC
2912 }
2913#endif
8ada0baa 2914
c2e66d9e 2915 if (min > max) {
01ec43d0 2916 Perl_croak(aTHX_
d1573ac7 2917 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2918 (char)min, (char)max);
c2e66d9e
GS
2919 }
2920
c7f1f016 2921#ifdef EBCDIC
dc023dbb
KW
2922 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2923 * any subsets of these ranges into individual characters */
4c3a8340 2924 if (literal_endpoint == 2 &&
c286a9a6 2925 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2926 (isUPPER_A(min) && isUPPER_A(max))))
2927 {
2928 for (i = min; i <= max; i++) {
2929 if (isALPHA_A(i))
2930 *d++ = i;
8ada0baa
JH
2931 }
2932 }
2933 else
2934#endif
2935 for (i = min; i <= max; i++)
e294cc5d
JH
2936#ifdef EBCDIC
2937 if (has_utf8) {
55d09dc8 2938 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2939 }
2940 else
2941#endif
2942 *d++ = (char)i;
2943
2944#ifdef EBCDIC
2945 if (uvmax) {
2946 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2947 if (uvmax > 0x101)
e7214ce8 2948 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2949 if (uvmax > 0x100)
2950 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2951 }
2952#endif
02aa26ce
NT
2953
2954 /* mark the range as done, and continue */
79072805 2955 dorange = FALSE;
01ec43d0 2956 didrange = TRUE;
4c3a8340
TS
2957#ifdef EBCDIC
2958 literal_endpoint = 0;
2959#endif
79072805 2960 continue;
4e553d73 2961 }
02aa26ce
NT
2962
2963 /* range begins (ignore - as first or last char) */
79072805 2964 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2965 if (didrange) {
1fafa243 2966 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2967 }
e294cc5d
JH
2968 if (has_utf8
2969#ifdef EBCDIC
2970 && !native_range
2971#endif
2972 ) {
e7214ce8 2973 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2974 s++;
2975 continue;
2976 }
79072805
LW
2977 dorange = TRUE;
2978 s++;
01ec43d0
GS
2979 }
2980 else {
2981 didrange = FALSE;
4c3a8340
TS
2982#ifdef EBCDIC
2983 literal_endpoint = 0;
e294cc5d 2984 native_range = TRUE;
4c3a8340 2985#endif
01ec43d0 2986 }
79072805 2987 }
02aa26ce
NT
2988
2989 /* if we get here, we're not doing a transliteration */
2990
e4a2df84
DM
2991 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2992 char *s1 = s-1;
2993 int esc = 0;
2994 while (s1 >= start && *s1-- == '\\')
2995 esc = !esc;
2996 if (!esc)
2997 in_charclass = TRUE;
2998 }
2866decb 2999
e4a2df84
DM
3000 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3001 char *s1 = s-1;
3002 int esc = 0;
3003 while (s1 >= start && *s1-- == '\\')
3004 esc = !esc;
3005 if (!esc)
3006 in_charclass = FALSE;
3007 }
2866decb 3008
9da1dd8f
DM
3009 /* skip for regexp comments /(?#comment)/, except for the last
3010 * char, which will be done separately.
3011 * Stop on (?{..}) and friends */
3012
c30fc27b 3013 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3014 if (s[2] == '#') {
e994fd66 3015 while (s+1 < send && *s != ')')
5ff03569 3016 *d++ = *s++;
155aba94 3017 }
c30fc27b 3018 else if (!PL_lex_casemods &&
d3cec5e5
DM
3019 ( s[2] == '{' /* This should match regcomp.c */
3020 || (s[2] == '?' && s[3] == '{')))
155aba94 3021 {
9da1dd8f 3022 break;
cc6b7395 3023 }
748a9306 3024 }
02aa26ce
NT
3025
3026 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3027 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3028 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3029 while (s+1 < send && *s != '\n')
5ff03569 3030 *d++ = *s++;
748a9306 3031 }
02aa26ce 3032
9da1dd8f
DM
3033 /* no further processing of single-quoted regex */
3034 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3035 goto default_action;
3036
5d1d4326 3037 /* check for embedded arrays
da6eedaa 3038 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3039 */
1749ea0d 3040 else if (*s == '@' && s[1]) {
8a2bca12 3041 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
TS
3042 break;
3043 if (strchr(":'{$", s[1]))
3044 break;
3045 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3046 break; /* in regexp, neither @+ nor @- are interpolated */
3047 }
02aa26ce
NT
3048
3049 /* check for embedded scalars. only stop if we're sure it's a
3050 variable.
3051 */
79072805 3052 else if (*s == '$') {
3280af22 3053 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3054 break;
77772344 3055 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3056 if (s[1] == '\\') {
3057 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3058 "Possible unintended interpolation of $\\ in regex");
77772344 3059 }
79072805 3060 break; /* in regexp, $ might be tail anchor */
77772344 3061 }
79072805 3062 }
02aa26ce 3063
2b9d42f0
NIS
3064 /* End of else if chain - OP_TRANS rejoin rest */
3065
02aa26ce 3066 /* backslashes */
79072805 3067 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3068 char* e; /* Can be used for ending '}', etc. */
3069
79072805 3070 s++;
02aa26ce 3071
7d0fc23c
KW
3072 /* warn on \1 - \9 in substitution replacements, but note that \11
3073 * is an octal; and \19 is \1 followed by '9' */
3280af22 3074 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3075 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3076 {
c782d7ee 3077 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3078 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3079 *--s = '$';
3080 break;
3081 }
02aa26ce
NT
3082
3083 /* string-change backslash escapes */
838f2281 3084 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3085 --s;
3086 break;
3087 }
ff3f963a
KW
3088 /* In a pattern, process \N, but skip any other backslash escapes.
3089 * This is because we don't want to translate an escape sequence
3090 * into a meta symbol and have the regex compiler use the meta
3091 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3092 * in spite of this, we do have to process \N here while the proper
3093 * charnames handler is in scope. See bugs #56444 and #62056.
85fba779 3094 *
ff3f963a
KW
3095 * There is a complication because \N in a pattern may also stand
3096 * for 'match a non-nl', and not mean a charname, in which case its
3097 * processing should be deferred to the regex compiler. To be a
3098 * charname it must be followed immediately by a '{', and not look
3099 * like \N followed by a curly quantifier, i.e., not something like
3100 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3101 * quantifier */
3102 else if (PL_lex_inpat
3103 && (*s != 'N'
3104 || s[1] != '{'
412f55bb 3105 || regcurly(s + 1)))
ff3f963a 3106 {
4d73d076 3107 *d++ = '\\';
cc74c5bd
TS
3108 goto default_action;
3109 }
02aa26ce 3110
79072805 3111 switch (*s) {
02aa26ce
NT
3112
3113 /* quoted - in transliterations */
79072805 3114 case '-':
3280af22 3115 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3116 *d++ = *s++;
3117 continue;
3118 }
924ba076 3119 /* FALLTHROUGH */
79072805 3120 default:
11b8faa4 3121 {
15861f94 3122 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3123 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3124 "Unrecognized escape \\%c passed through",
3125 *s);
11b8faa4 3126 /* default action is to copy the quoted character */
f9a63242 3127 goto default_action;
11b8faa4 3128 }
02aa26ce 3129
632403cc 3130 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3131 case '0': case '1': case '2': case '3':
3132 case '4': case '5': case '6': case '7':
ba210ebe 3133 {
5e0a247b 3134 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3135 STRLEN len = 3;
06972766 3136 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3137 s += len;
5e0a247b
KW
3138 if (len < 3 && s < send && isDIGIT(*s)
3139 && ckWARN(WARN_MISC))
3140 {
3141 Perl_warner(aTHX_ packWARN(WARN_MISC),
3142 "%s", form_short_octal_warning(s, len));
3143 }
ba210ebe 3144 }
012bcf8d 3145 goto NUM_ESCAPE_INSERT;
02aa26ce 3146
f0a2b745
KW
3147 /* eg. \o{24} indicates the octal constant \024 */
3148 case 'o':
3149 {
454155d9 3150 const char* error;
f0a2b745 3151
00ce5563 3152 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3153 TRUE, /* Output warning */
3154 FALSE, /* Not strict */
17896a85
KW
3155 TRUE, /* Output warnings for
3156 non-portables */
80f4111b 3157 UTF);
454155d9 3158 if (! valid) {
f0a2b745
KW
3159 yyerror(error);
3160 continue;
3161 }
3162 goto NUM_ESCAPE_INSERT;
3163 }
3164
77a135fe 3165 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3166 case 'x':
a0481293 3167 {
a0481293 3168 const char* error;
355860ce 3169
00ce5563 3170 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3171 TRUE, /* Output warning */
3172 FALSE, /* Not strict */
17896a85
KW
3173 TRUE, /* Output warnings for
3174 non-portables */
80f4111b 3175 UTF);
a0481293
KW
3176 if (! valid) {
3177 yyerror(error);
355860ce 3178 continue;
ba210ebe 3179 }
012bcf8d
GS
3180 }
3181
3182 NUM_ESCAPE_INSERT:
ff3f963a
KW
3183 /* Insert oct or hex escaped character. There will always be
3184 * enough room in sv since such escapes will be longer than any
3185 * UTF-8 sequence they can end up as, except if they force us
3186 * to recode the rest of the string into utf8 */
ba7cea30 3187
06972766 3188 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3189 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3190 if (!has_utf8 && uv > 255) {
77a135fe
KW
3191 /* Might need to recode whatever we have accumulated so
3192 * far if it contains any chars variant in utf8 or
3193 * utf-ebcdic. */
3194
3195 SvCUR_set(sv, d - SvPVX_const(sv));
3196 SvPOK_on(sv);
3197 *d = '\0';
77a135fe 3198 /* See Note on sizing above. */
5eef447b
KW
3199 sv_utf8_upgrade_flags_grow(
3200 sv,
3201 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
3202 /* Above-latin1 in string
3203 * implies no encoding */
3204 |SV_UTF8_NO_ENCODING,
3205 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3206 d = SvPVX(sv) + SvCUR(sv);
3207 has_utf8 = TRUE;
012bcf8d
GS
3208 }
3209
77a135fe 3210 if (has_utf8) {
c80e42f3 3211 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3212 if (PL_lex_inwhat == OP_TRANS &&
3213 PL_sublex_info.sub_op) {
3214 PL_sublex_info.sub_op->op_private |=
3215 (PL_lex_repl ? OPpTRANS_FROM_UTF
3216 : OPpTRANS_TO_UTF);
f9a63242 3217 }
e294cc5d
JH
3218#ifdef EBCDIC
3219 if (uv > 255 && !dorange)
3220 native_range = FALSE;
3221#endif
012bcf8d 3222 }
a0ed51b3 3223 else {
012bcf8d 3224 *d++ = (char)uv;
a0ed51b3 3225 }
012bcf8d
GS
3226 }
3227 else {
c4d5f83a 3228 *d++ = (char) uv;
a0ed51b3 3229 }
79072805 3230 continue;
02aa26ce 3231
4a2d328f 3232 case 'N':
85fba779
KW
3233 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3234 * named character, like \N{LATIN SMALL LETTER A}, or a named
3235 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3236 * GRAVE}. For convenience all three forms are referred to as
3237 * "named characters" below.
3238 *
3239 * For patterns, \N also can mean to match a non-newline. Code
3240 * before this 'switch' statement should already have handled
3241 * this situation, and hence this code only has to deal with
3242 * the named character cases.
3243 *
3244 * For non-patterns, the named characters are converted to
3245 * their string equivalents. In patterns, named characters are
3246 * not converted to their ultimate forms for the same reasons
3247 * that other escapes aren't. Instead, they are converted to
3248 * the \N{U+...} form to get the value from the charnames that
3249 * is in effect right now, while preserving the fact that it
3250 * was a named character, so that the regex compiler knows
3251 * this.
3252 *
3253 * The structure of this section of code (besides checking for
ff3f963a 3254 * errors and upgrading to utf8) is:
85fba779
KW
3255 * If the named character is of the form \N{U+...}, pass it
3256 * through if a pattern; otherwise convert the code point
3257 * to utf8
3258 * Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
3259 * if a pattern; otherwise convert to utf8
3260 *
3261 * If the regex compiler should ever need to differentiate
3262 * between the \N{U+...} and \N{name} forms, that could easily
3263 * be done here by stripping any leading zeros from the
3264 * \N{U+...} case, and adding them to the other one. */
3265
3266 /* Here, 's' points to the 'N'; the test below is guaranteed to
3267 * succeed if we are being called on a pattern, as we already
3268 * know from a test above that the next character is a '{'. A
3269 * non-pattern \N must mean 'named character', which requires
3270 * braces */
ff3f963a
KW
3271 s++;
3272 if (*s != '{') {
3273 yyerror("Missing braces on \\N{}");
3274 continue;
3275 }
3276 s++;
3277
0a96133f 3278 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3279 if (! (e = strchr(s, '}'))) {
3280 if (! PL_lex_inpat) {
5777a3f7 3281 yyerror("Missing right brace on \\N{}");
0a96133f 3282 } else {
4407f1b8 3283 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3284 }
0a96133f 3285 continue;
ff3f963a 3286 }
cddc7ef4 3287
ff3f963a 3288 /* Here it looks like a named character */
cddc7ef4 3289
ff3f963a
KW
3290 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3291 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4cbd7e22 3292 | PERL_SCAN_SILENT_ILLDIGIT
ff3f963a
KW
3293 | PERL_SCAN_DISALLOW_PREFIX;
3294 STRLEN len;
3295
ff3f963a
KW
3296 s += 2; /* Skip to next char after the 'U+' */
3297 len = e - s;
3298 uv = grok_hex(s, &len, &flags, NULL);
4cbd7e22
FC
3299 if (len == 0
3300 || ( len != (STRLEN)(e - s) && s[len] != '.'
3301 && PL_lex_inpat))
3302 {
3303 bad_NU:
ff3f963a
KW
3304 yyerror("Invalid hexadecimal number in \\N{U+...}");
3305 s = e + 1;
3306 continue;
3307 }
3308
3309 if (PL_lex_inpat) {
af352bf2
KW
3310
3311 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
4cbd7e22
FC
3312 const char * const orig_s = s - 5;
3313 while (*s == '.') {
3314 s++;
3315 len = e - s;
3316 uv = grok_hex(s, &len, &flags, NULL);
3317 if (!len
3318 || (len != (STRLEN)(e - s) && s[len] != '.'))
3319 goto bad_NU;
3320 }
af352bf2
KW
3321
3322 /* Pass everything through unchanged. The reason we
3323 * evaluate the numbers is to make sure there wasn't a
3324 * syntax error. +1 is for the '}' */
4cbd7e22
FC
3325 Copy(orig_s, d, e - orig_s + 1, char);
3326 d += e - orig_s + 1;
ff3f963a
KW
3327 }
3328 else { /* Not a pattern: convert the hex to string */
3329
85fba779 3330 /* If the destination is not in utf8, unconditionally
ff3f963a
KW
3331 * recode it to be so. This is because \N{} implies
3332 * Unicode semantics, and scalars have to be in utf8
3333 * to guarantee those semantics */
3334 if (! has_utf8) {
3335 SvCUR_set(sv, d - SvPVX_const(sv));
3336 SvPOK_on(sv);
3337 *d = '\0';
3338 /* See Note on sizing above. */
3339 sv_utf8_upgrade_flags_grow(
3340 sv,
3341 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3342 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3343 d = SvPVX(sv) + SvCUR(sv);
3344 has_utf8 = TRUE;
3345 }
3346
a46469e6 3347 /* Add the (Unicode) code point to the output. */
ff3f963a 3348 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3349 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3350 }
a46469e6
KW
3351 else {
3352 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3353 }
ff3f963a
KW
3354 }
3355 }
6f613c73
KW
3356 else /* Here is \N{NAME} but not \N{U+...}. */
3357 if ((res = get_and_check_backslash_N_name(s, e)))
3358 {
3359 STRLEN len;
3360 const char *str = SvPV_const(res, len);
3361 if (PL_lex_inpat) {
ff3f963a
KW
3362
3363 if (! len) { /* The name resolved to an empty string */
3364 Copy("\\N{}", d, 4, char);
3365 d += 4;
3366 }
3367 else {
3368 /* In order to not lose information for the regex
3369 * compiler, pass the result in the specially made
3370 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3371 * the code points in hex of each character
3372 * returned by charnames */
3373
3374 const char *str_end = str + len;
3b721c4f 3375 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3376
3377 if (! SvUTF8(res)) {
3378 /* For the non-UTF-8 case, we can determine the
3379 * exact length needed without having to parse
3380 * through the string. Each character takes up
3381 * 2 hex digits plus either a trailing dot or
3382 * the "}" */
97651d61
KW
3383 const char initial_text[] = "\\N{U+";
3384 const STRLEN initial_len = sizeof(initial_text)
3385 - 1;
94ca1619
KW
3386 d = off + SvGROW(sv, off
3387 + 3 * len
97651d61
KW
3388
3389 /* +1 for trailing NUL */
3390 + initial_len + 1
3391
94ca1619 3392 + (STRLEN)(send - e));
97651d61
KW
3393 Copy(initial_text, d, initial_len, char);
3394 d += initial_len;
94ca1619
KW
3395 while (str < str_end) {
3396 char hex_string[4];
e8549682 3397 int len =
51f14a05
JH
3398 my_snprintf(hex_string,
3399 sizeof(hex_string),
e8549682
JH
3400 "%02X.", (U8) *str);
3401 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3402 Copy(hex_string, d, 3, char);
3403 d += 3;
3404 str++;
3405 }
85fba779 3406 d--; /* Below, we will overwrite the final
94ca1619
KW
3407 dot with a right brace */
3408 }
3409 else {
1953db30
KW
3410 STRLEN char_length; /* cur char's byte length */
3411
3412 /* and the number of bytes after this is
3413 * translated into hex digits */
3414 STRLEN output_length;
3415
3416 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3417 * for max('U+', '.'); and 1 for NUL */
3418 char hex_string[2 * UTF8_MAXBYTES + 5];
3419
3420 /* Get the first character of the result. */
a46469e6 3421 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3422 len,
3423 &char_length,
3424 UTF8_ALLOW_ANYUV);
3425 /* Convert first code point to hex, including
a46469e6 3426 * the boiler plate before it. */
1953db30
KW
3427 output_length =
3428 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3429 "\\N{U+%X",
3430 (unsigned int) uv);
1953db30
KW
3431
3432 /* Make sure there is enough space to hold it */
3433 d = off + SvGROW(sv, off
3434 + output_length
3435 + (STRLEN)(send - e)
3436 + 2); /* '}' + NUL */
3437 /* And output it */
3438 Copy(hex_string, d, output_length, char);
3439 d += output_length;
3440
3441 /* For each subsequent character, append dot and
3442 * its ordinal in hex */
3443 while ((str += char_length) < str_end) {
3444 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3445 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3446 str_end - str,
3447 &char_length,
3448 UTF8_ALLOW_ANYUV);
3449 output_length =
3450 my_snprintf(hex_string,
a46469e6
KW
3451 sizeof(hex_string),
3452 ".%X",
3453 (unsigned int) uv);
1953db30
KW
3454
3455 d = off + SvGROW(sv, off
3456 + output_length
3457 + (STRLEN)(send - e)
3458 + 2); /* '}' + NUL */
3459 Copy(hex_string, d, output_length, char);
3460 d += output_length;
3461 }
94ca1619 3462 }
ff3f963a
KW
3463
3464 *d++ = '}'; /* Done. Add the trailing brace */
3465 }
3466 }
3467 else { /* Here, not in a pattern. Convert the name to a
3468 * string. */
3469
3470 /* If destination is not in utf8, unconditionally
3471 * recode it to be so. This is because \N{} implies
3472 * Unicode semantics, and scalars have to be in utf8
3473 * to guarantee those semantics */
3474 if (! has_utf8) {
3475 SvCUR_set(sv, d - SvPVX_const(sv));
3476 SvPOK_on(sv);
3477 *d = '\0';
3478 /* See Note on sizing above. */
3479 sv_utf8_upgrade_flags_grow(sv,
3480 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3481 len + (STRLEN)(send - s) + 1);
3482 d = SvPVX(sv) + SvCUR(sv);
3483 has_utf8 = TRUE;
3484 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3485
3486 /* See Note on sizing above. (NOTE: SvCUR() is not
3487 * set correctly here). */
3488 const STRLEN off = d - SvPVX_const(sv);
3489 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3490 }
716d5ef3 3491 if (! SvUTF8(res)) { /* Make sure \N{} return is UTF-8 */
896eff3c 3492 sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
7fc82458
KW
3493 str = SvPV_const(res, len);
3494 }
ff3f963a
KW
3495 Copy(str, d, len, char);
3496 d += len;
423cee85 3497 }
6f613c73 3498
423cee85 3499 SvREFCNT_dec(res);
cb233ae3 3500
cb233ae3 3501 } /* End \N{NAME} */
ff3f963a
KW
3502#ifdef EBCDIC
3503 if (!dorange)
3504 native_range = FALSE; /* \N{} is defined to be Unicode */
3505#endif
3506 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3507 continue;
3508
02aa26ce 3509 /* \c is a control character */
79072805
LW
3510 case 'c':
3511 s++;
961ce445 3512 if (s < send) {
421e43ba 3513 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3514 }
961ce445
RGS
3515 else {
3516 yyerror("Missing control char name in \\c");
3517 }
79072805 3518 continue;
02aa26ce
NT
3519
3520 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3521 case 'b':
4d73d076 3522 *d++ = '\b';
79072805
LW
3523 break;
3524 case 'n':
4d73d076 3525 *d++ = '\n';
79072805
LW
3526 break;
3527 case 'r':
4d73d076 3528 *d++ = '\r';
79072805
LW
3529 break;
3530 case 'f':
4d73d076 3531 *d++ = '\f';
79072805
LW
3532 break;
3533 case 't':
4d73d076 3534 *d++ = '\t';
79072805 3535 break;
34a3fe2a 3536 case 'e':
da0334e8 3537 *d++ = ESC_NATIVE;
34a3fe2a
PP
3538 break;
3539 case 'a':
4d73d076 3540 *d++ = '\a';
79072805 3541 break;
02aa26ce
NT
3542 } /* end switch */
3543
79072805
LW
3544 s++;
3545 continue;
02aa26ce 3546 } /* end if (backslash) */
4c3a8340
TS
3547#ifdef EBCDIC
3548 else
3549 literal_endpoint++;
3550#endif
02aa26ce 3551
f9a63242 3552 default_action:
77a135fe
KW
3553 /* If we started with encoded form, or already know we want it,
3554 then encode the next character */
6f2d5cbc 3555 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3556 STRLEN len = 1;
77a135fe
KW
3557
3558
3559 /* One might think that it is wasted effort in the case of the
3560 * source being utf8 (this_utf8 == TRUE) to take the next character
3561 * in the source, convert it to an unsigned value, and then convert
3562 * it back again. But the source has not been validated here. The
3563 * routine that does the conversion checks for errors like
3564 * malformed utf8 */
3565
233ca360
KW
3566 const UV nextuv = (this_utf8)
3567 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3568 : (UV) ((U8) *s);
5aaebcb3 3569 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3570 if (!has_utf8) {
3571 SvCUR_set(sv, d - SvPVX_const(sv));
3572 SvPOK_on(sv);
3573 *d = '\0';
77a135fe 3574 /* See Note on sizing above. */
7bf79863
KW
3575 sv_utf8_upgrade_flags_grow(sv,
3576 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3577 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3578 d = SvPVX(sv) + SvCUR(sv);
3579 has_utf8 = TRUE;
3580 } else if (need > len) {
3581 /* encoded value larger than old, may need extra space (NOTE:
3582 * SvCUR() is not set correctly here). See Note on sizing
3583 * above. */
9d4ba2ae 3584 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3585 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3586 }
77a135fe
KW
3587 s += len;
3588
5f66b61c 3589 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3590#ifdef EBCDIC
3591 if (uv > 255 && !dorange)
3592 native_range = FALSE;
3593#endif
2b9d42f0
NIS
3594 }
3595 else {
5ff03569 3596 *d++ = *s++;
2b9d42f0 3597 }
02aa26ce
NT
3598 } /* while loop to process each character */
3599
3600 /* terminate the string and set up the sv */
79072805 3601 *d = '\0';
95a20fc0 3602 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3603 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3604 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3605 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3606
79072805 3607 SvPOK_on(sv);
47e13f24 3608 if (IN_ENCODING && !has_utf8) {
ad2de1b2 3609 sv_recode_to_utf8(sv, _get_encoding());
d0063567
DK
3610 if (SvUTF8(sv))
3611 has_utf8 = TRUE;
9f4817db 3612 }
2b9d42f0 3613 if (has_utf8) {
7e2040f0 3614 SvUTF8_on(sv);
2b9d42f0 3615 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3616 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3617 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3618 }
3619 }
79072805 3620
02aa26ce 3621 /* shrink the sv if we allocated more than we used */
79072805 3622 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3623 SvPV_shrink_to_cur(sv);
79072805 3624 }
02aa26ce 3625
6154021b 3626 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3627 if (s > start) {
3628 char *s2 = start;
ffdb8b16
FC
3629 for (; s2 < s; s2++) {
3630 if (*s2 == '\n')
3631 COPLINE_INC_WITH_HERELINES;
3632 }