This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
PATCH: [perl #126310] single quote UTF-8 malformation detection
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f7e03a10 41#include "dquote_inline.h"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
8903cb82 97
9ded7720 98# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 99
a7aaec61
Z
100#define XENUMMASK 0x3f
101#define XFAKEEOF 0x40
102#define XFAKEBRACK 0x80
9059aa12 103
39e02b42 104#ifdef USE_UTF8_SCRIPTS
b3041197 105# define UTF cBOOL(!IN_BYTES)
2b9d42f0 106#else
b3041197 107# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 108#endif
a0ed51b3 109
b1fc3636
CJ
110/* The maximum number of characters preceding the unrecognized one to display */
111#define UNRECOGNIZED_PRECEDE_COUNT 10
112
61f0cdd9 113/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
114 * 1999-02-27 mjd-perl-patch@plover.com */
115#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
116
14bd96d0 117#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 118
9ff909cf
JH
119#define HEXFP_PEEK(s) \
120 (((s[0] == '.') && \
121 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
122 isALPHA_FOLD_EQ(s[0], 'p'))
123
ffb4593c
NT
124/* LEX_* are values for PL_lex_state, the state of the lexer.
125 * They are arranged oddly so that the guard on the switch statement
79072805 126 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
127 *
128 * These values refer to the various states within a sublex parse,
129 * i.e. within a double quotish string
79072805
LW
130 */
131
fb73857a
PP
132/* #define LEX_NOTPARSING 11 is done in perl.h. */
133
b6007c36
DM
134#define LEX_NORMAL 10 /* normal code (ie not within "...") */
135#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
136#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
137#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
138#define LEX_INTERPSTART 6 /* expecting the start of a $var */
139
140 /* at end of code, eg "$x" followed by: */
141#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
142#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
143
144#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
145 string or after \E, $foo, etc */
146#define LEX_INTERPCONST 2 /* NOT USED */
147#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 148
79072805 149
bbf60fe6 150#ifdef DEBUGGING
27da23d5 151static const char* const lex_state_names[] = {
bbf60fe6
DM
152 "KNOWNEXT",
153 "FORMLINE",
154 "INTERPCONST",
155 "INTERPCONCAT",
156 "INTERPENDMAYBE",
157 "INTERPEND",
158 "INTERPSTART",
159 "INTERPPUSH",
160 "INTERPCASEMOD",
161 "INTERPNORMAL",
162 "NORMAL"
163};
164#endif
165
79072805 166#include "keywords.h"
fe14fcc3 167
ffb4593c
NT
168/* CLINE is a macro that ensures PL_copline has a sane value */
169
57843af0 170#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 171
ffb4593c
NT
172/*
173 * Convenience functions to return different tokens and prime the
9cbb5ea2 174 * lexer for the next token. They all take an argument.
ffb4593c
NT
175 *
176 * TOKEN : generic token (used for '(', DOLSHARP, etc)
177 * OPERATOR : generic operator
178 * AOPERATOR : assignment operator
179 * PREBLOCK : beginning the block after an if, while, foreach, ...
180 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
181 * PREREF : *EXPR where EXPR is not a simple identifier
182 * TERM : expression term
89f35911 183 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
184 * LOOPX : loop exiting command (goto, last, dump, etc)
185 * FTST : file test operator
186 * FUN0 : zero-argument function
7eb971ee 187 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 188 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
189 * BOop : bitwise or or xor
190 * BAop : bitwise and
8823cb89 191 * BCop : bitwise complement
ffb4593c
NT
192 * SHop : shift operator
193 * PWop : power operator
9cbb5ea2 194 * PMop : pattern-matching operator
ffb4593c 195 * Aop : addition-level operator
e4916dd1 196 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
197 * Mop : multiplication-level operator
198 * Eop : equality-testing operator
e5edeb50 199 * Rop : relational operator <= != gt
ffb4593c
NT
200 *
201 * Also see LOP and lop() below.
202 */
203
998054bd 204#ifdef DEBUGGING /* Serve -DT. */
704d4215 205# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 206#else
bbf60fe6 207# define REPORT(retval) (retval)
998054bd
SC
208#endif
209
bbf60fe6
DM
210#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
211#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 212#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
213#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
214#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
215#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
216#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 217#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 218#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 219 pl_yylval.ival=f, \
a49203fd 220 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 221 REPORT((int)LOOPEX))
6154021b
RGS
222#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
223#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 224#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 225#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
226#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
227#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89
FC
228#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
229 REPORT('~')
b1764551
FC
230#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
231#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 232#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 233#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 234#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 235#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
6154021b
RGS
236#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
237#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 238
a687059c
LW
239/* This bit of chicanery makes a unary function followed by
240 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
241 * The UNIDOR macro is for unary functions that can be followed by the //
242 * operator (such as C<shift // 0>).
a687059c 243 */
d68ce4ac 244#define UNI3(f,x,have_x) { \
6154021b 245 pl_yylval.ival = f; \
d68ce4ac 246 if (have_x) PL_expect = x; \
376fcdbf
AL
247 PL_bufptr = s; \
248 PL_last_uni = PL_oldbufptr; \
0af40c75 249 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
250 if (*s == '(') \
251 return REPORT( (int)FUNC1 ); \
294a536f 252 s = skipspace(s); \
376fcdbf
AL
253 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
254 }
d68ce4ac
FC
255#define UNI(f) UNI3(f,XTERM,1)
256#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
257#define UNIPROTO(f,optional) { \
258 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
259 OPERATOR(f); \
260 }
a687059c 261
d68ce4ac 262#define UNIBRACK(f) UNI3(f,0,0)
79072805 263
9f68db38 264/* grandfather return to old style */
78cdf107
Z
265#define OLDLOP(f) \
266 do { \
267 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
268 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
269 pl_yylval.ival = (f); \
270 PL_expect = XTERM; \
271 PL_bufptr = s; \
272 return (int)LSTOP; \
273 } while(0)
79072805 274
83944c01
FC
275#define COPLINE_INC_WITH_HERELINES \
276 STMT_START { \
277 CopLINE_inc(PL_curcop); \
851b527a
FC
278 if (PL_parser->herelines) \
279 CopLINE(PL_curcop) += PL_parser->herelines, \
280 PL_parser->herelines = 0; \
83944c01 281 } STMT_END
ffdb8b16
FC
282/* Called after scan_str to update CopLINE(PL_curcop), but only when there
283 * is no sublex_push to follow. */
284#define COPLINE_SET_FROM_MULTI_END \
285 STMT_START { \
286 CopLINE_set(PL_curcop, PL_multi_end); \
287 if (PL_multi_end != PL_multi_start) \
851b527a 288 PL_parser->herelines = 0; \
ffdb8b16 289 } STMT_END
83944c01
FC
290
291
8fa7f367
JH
292#ifdef DEBUGGING
293
6154021b 294/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
295enum token_type {
296 TOKENTYPE_NONE,
297 TOKENTYPE_IVAL,
6154021b 298 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 299 TOKENTYPE_PVAL,
aeaef349 300 TOKENTYPE_OPVAL
bbf60fe6
DM
301};
302
6d4a66ac
NC
303static struct debug_tokens {
304 const int token;
305 enum token_type type;
306 const char *name;
307} const debug_tokens[] =
9041c2e3 308{
bbf60fe6
DM
309 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
310 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
311 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
312 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
313 { ARROW, TOKENTYPE_NONE, "ARROW" },
314 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
315 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
316 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
317 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
318 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 319 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
320 { DO, TOKENTYPE_NONE, "DO" },
321 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
322 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
323 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
324 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
325 { ELSE, TOKENTYPE_NONE, "ELSE" },
326 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
327 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
328 { FOR, TOKENTYPE_IVAL, "FOR" },
329 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
330 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
331 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
332 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
333 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 334 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
335 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
336 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
337 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 338 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
339 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
340 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 341 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
342 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
343 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
344 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
345 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
346 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
347 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
348 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
349 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
350 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
351 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
352 { OROP, TOKENTYPE_IVAL, "OROP" },
353 { OROR, TOKENTYPE_NONE, "OROR" },
354 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
355 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
356 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 357 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 358 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
359 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
360 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
361 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
362 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
363 { PREINC, TOKENTYPE_NONE, "PREINC" },
364 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 365 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
366 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
367 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 368 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
369 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
370 { SUB, TOKENTYPE_NONE, "SUB" },
371 { THING, TOKENTYPE_OPVAL, "THING" },
372 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
373 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
374 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
375 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
376 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
377 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 378 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 379 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 380 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 381 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 382 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
383};
384
6154021b 385/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 386
bbf60fe6 387STATIC int
704d4215 388S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 389{
7918f24d
NC
390 PERL_ARGS_ASSERT_TOKEREPORT;
391
bbf60fe6 392 if (DEBUG_T_TEST) {
bd61b366 393 const char *name = NULL;
bbf60fe6 394 enum token_type type = TOKENTYPE_NONE;
f54cb97a 395 const struct debug_tokens *p;
396482e1 396 SV* const report = newSVpvs("<== ");
bbf60fe6 397
f54cb97a 398 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
399 if (p->token == (int)rv) {
400 name = p->name;
401 type = p->type;
402 break;
403 }
404 }
405 if (name)
54667de8 406 Perl_sv_catpv(aTHX_ report, name);
239f83d5 407 else if (isGRAPH(rv))
4ebc7986 408 {
bbf60fe6 409 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
410 if ((char)rv == 'p')
411 sv_catpvs(report, " (pending identifier)");
412 }
bbf60fe6 413 else if (!rv)
396482e1 414 sv_catpvs(report, "EOF");
bbf60fe6 415 else
147e3846 416 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
417 switch (type) {
418 case TOKENTYPE_NONE:
bbf60fe6
DM
419 break;
420 case TOKENTYPE_IVAL:
147e3846 421 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 425 PL_op_name[lvalp->ival]);
bbf60fe6
DM
426 break;
427 case TOKENTYPE_PVAL:
704d4215 428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
429 break;
430 case TOKENTYPE_OPVAL:
704d4215 431 if (lvalp->opval) {
401441c0 432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 435 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 436 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
437 }
438
439 }
401441c0 440 else
396482e1 441 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
442 break;
443 }
b6007c36 444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
445 };
446 return (int)rv;
998054bd
SC
447}
448
b6007c36
DM
449
450/* print the buffer with suitable escapes */
451
452STATIC void
15f169a1 453S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 454{
396482e1 455 SV* const tmp = newSVpvs("");
7918f24d
NC
456
457 PERL_ARGS_ASSERT_PRINTBUF;
458
5d37acd6 459 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 460 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 461 GCC_DIAG_RESTORE;
b6007c36
DM
462 SvREFCNT_dec(tmp);
463}
464
8fa7f367
JH
465#endif
466
8290c323
NC
467static int
468S_deprecate_commaless_var_list(pTHX) {
469 PL_expect = XTERM;
470 deprecate("comma-less variable list");
471 return REPORT(','); /* grandfather non-comma-format format */
472}
473
ffb4593c
NT
474/*
475 * S_ao
476 *
f393a21a
FC
477 * This subroutine looks for an '=' next to the operator that has just been
478 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
479 */
480
76e3520e 481STATIC int
cea2e8a9 482S_ao(pTHX_ int toketype)
a0d0e21e 483{
3280af22
NIS
484 if (*PL_bufptr == '=') {
485 PL_bufptr++;
a0d0e21e 486 if (toketype == ANDAND)
6154021b 487 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 488 else if (toketype == OROR)
6154021b 489 pl_yylval.ival = OP_ORASSIGN;
c963b151 490 else if (toketype == DORDOR)
6154021b 491 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
492 toketype = ASSIGNOP;
493 }
b1764551 494 return REPORT(toketype);
a0d0e21e
LW
495}
496
ffb4593c
NT
497/*
498 * S_no_op
499 * When Perl expects an operator and finds something else, no_op
500 * prints the warning. It always prints "<something> found where
501 * operator expected. It prints "Missing semicolon on previous line?"
502 * if the surprise occurs at the start of the line. "do you need to
503 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
504 * where the compiler doesn't know if foo is a method call or a function.
505 * It prints "Missing operator before end of line" if there's nothing
506 * after the missing operator, or "... before <...>" if there is something
507 * after the missing operator.
488bc579
FC
508 *
509 * PL_bufptr is expected to point to the start of the thing that was found,
510 * and s after the next token or partial token.
ffb4593c
NT
511 */
512
76e3520e 513STATIC void
15f169a1 514S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 515{
9d4ba2ae
AL
516 char * const oldbp = PL_bufptr;
517 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 518
7918f24d
NC
519 PERL_ARGS_ASSERT_NO_OP;
520
1189a94a
GS
521 if (!s)
522 s = oldbp;
07c798fb 523 else
1189a94a 524 PL_bufptr = s;
734ab321 525 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
526 if (ckWARN_d(WARN_SYNTAX)) {
527 if (is_first)
528 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
529 "\t(Missing semicolon on previous line?)\n");
530 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 531 const char *t;
8a2bca12 532 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 533 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 534 NOOP;
56da5a46
RGS
535 if (t < PL_bufptr && isSPACE(*t))
536 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 537 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 538 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
539 }
540 else {
541 assert(s >= oldbp);
542 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 543 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 544 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 545 }
07c798fb 546 }
3280af22 547 PL_bufptr = oldbp;
8990e307
LW
548}
549
ffb4593c
NT
550/*
551 * S_missingterm
552 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 553 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
554 * If we're in a delimited string and the delimiter is a control
555 * character, it's reformatted into a two-char sequence like ^C.
556 * This is fatal.
557 */
558
76e3520e 559STATIC void
cea2e8a9 560S_missingterm(pTHX_ char *s)
8990e307 561{
e487ff5e 562 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 563 char q;
cb650135
FC
564 bool uni = FALSE;
565 SV *sv;
8990e307 566 if (s) {
9d4ba2ae 567 char * const nl = strrchr(s,'\n');
d2719217 568 if (nl)
8990e307 569 *nl = '\0';
7f7f7d08 570 uni = UTF;
8990e307 571 }
cb650135 572 else if (PL_multi_close < 32) {
8990e307 573 *tmpbuf = '^';
585ec06d 574 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
575 tmpbuf[2] = '\0';
576 s = tmpbuf;
577 }
578 else {
cb650135
FC
579 if (LIKELY(PL_multi_close < 256)) {
580 *tmpbuf = (char)PL_multi_close;
581 tmpbuf[1] = '\0';
582 }
583 else {
584 uni = TRUE;
585 *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
586 }
8990e307
LW
587 s = tmpbuf;
588 }
589 q = strchr(s,'"') ? '\'' : '"';
cb650135
FC
590 sv = sv_2mortal(newSVpv(s,0));
591 if (uni)
592 SvUTF8_on(sv);
147e3846 593 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf
cb650135 594 "%c anywhere before EOF",q,SVfARG(sv),q);
463ee0b2 595}
79072805 596
dd0ac2b9
FC
597#include "feature.h"
598
0d863452 599/*
0d863452
RH
600 * Check whether the named feature is enabled.
601 */
26ea9e12 602bool
3fff3427 603Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 604{
4a731d7b 605 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
606
607 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
608
609 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 610
26ea9e12
NC
611 if (namelen > MAX_FEATURE_LEN)
612 return FALSE;
3fff3427 613 memcpy(&he_name[8], name, namelen);
7d69d4a6 614
c8ca97b0
NC
615 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
616 REFCOUNTED_HE_EXISTS));
0d863452
RH
617}
618
ffb4593c 619/*
9cbb5ea2
GS
620 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
621 * utf16-to-utf8-reversed.
ffb4593c
NT
622 */
623
c39cd008
GS
624#ifdef PERL_CR_FILTER
625static void
626strip_return(SV *sv)
627{
eb578fdb
KW
628 const char *s = SvPVX_const(sv);
629 const char * const e = s + SvCUR(sv);
7918f24d
NC
630
631 PERL_ARGS_ASSERT_STRIP_RETURN;
632
c39cd008
GS
633 /* outer loop optimized to do nothing if there are no CR-LFs */
634 while (s < e) {
635 if (*s++ == '\r' && *s == '\n') {
636 /* hit a CR-LF, need to copy the rest */
eb578fdb 637 char *d = s - 1;
c39cd008
GS
638 *d++ = *s++;
639 while (s < e) {
640 if (*s == '\r' && s[1] == '\n')
641 s++;
642 *d++ = *s++;
643 }
644 SvCUR(sv) -= s - d;
645 return;
646 }
647 }
648}
a868473f 649
76e3520e 650STATIC I32
c39cd008 651S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 652{
f54cb97a 653 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
654 if (count > 0 && !maxlen)
655 strip_return(sv);
656 return count;
a868473f
NIS
657}
658#endif
659
ffb4593c 660/*
8eaa0acf
Z
661=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
662
663Creates and initialises a new lexer/parser state object, supplying
664a context in which to lex and parse from a new source of Perl code.
665A pointer to the new state object is placed in L</PL_parser>. An entry
666is made on the save stack so that upon unwinding the new state object
667will be destroyed and the former value of L</PL_parser> will be restored.
668Nothing else need be done to clean up the parsing context.
669
2d7f6611 670The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 671non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
672A copy of the string is made, so subsequent modification of C<line>
673does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 674from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
675code in C<line> comes first and must consist of complete lines of input,
676and C<rsfp> supplies the remainder of the source.
8eaa0acf 677
2d7f6611 678The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 679used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
680
681=cut
682*/
ffb4593c 683
27fcb6ee 684/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
685 can share filters with the current parser.
686 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
687 caller, hence isn't owned by the parser, so shouldn't be closed on parser
688 destruction. This is used to handle the case of defaulting to reading the
689 script from the standard input because no filename was given on the command
690 line (without getting confused by situation where STDIN has been closed, so
691 the script handle is opened on fd 0) */
27fcb6ee 692
a0d0e21e 693void
8eaa0acf 694Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 695{
6ef55633 696 const char *s = NULL;
5486870f 697 yy_parser *parser, *oparser;
60d63348 698 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 699 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
700
701 /* create and initialise a parser */
702
199e78b7 703 Newxz(parser, 1, yy_parser);
5486870f 704 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
705 PL_parser = parser;
706
28ac2b49 707 parser->stack = NULL;
df13534a 708 parser->stack_max1 = NULL;
28ac2b49 709 parser->ps = NULL;
acdf0a21 710
e3abe207
DM
711 /* on scope exit, free this parser and restore any outer one */
712 SAVEPARSER(parser);
7c4baf47 713 parser->saved_curcop = PL_curcop;
e3abe207 714
acdf0a21 715 /* initialise lexer state */
8990e307 716
fb205e7a 717 parser->nexttoke = 0;
ca4cfd28 718 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 719 parser->copline = parser->preambling = NOLINE;
5afb0a62 720 parser->lex_state = LEX_NORMAL;
c2598295 721 parser->expect = XSTATE;
2f9285f8 722 parser->rsfp = rsfp;
27fcb6ee
FC
723 parser->rsfp_filters =
724 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
725 ? NULL
726 : MUTABLE_AV(SvREFCNT_inc(
727 oparser->rsfp_filters
728 ? oparser->rsfp_filters
729 : (oparser->rsfp_filters = newAV())
730 ));
2f9285f8 731
199e78b7
DM
732 Newx(parser->lex_brackstack, 120, char);
733 Newx(parser->lex_casestack, 12, char);
734 *parser->lex_casestack = '\0';
d794b522 735 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 736
10efb74f 737 if (line) {
0528fd32 738 STRLEN len;
10efb74f 739 s = SvPV_const(line, len);
0abcdfa4
FC
740 parser->linestr = flags & LEX_START_COPIED
741 ? SvREFCNT_inc_simple_NN(line)
742 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
743 if (!rsfp)
744 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 745 } else {
bf1b738b 746 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 747 }
f06b5848
DM
748 parser->oldoldbufptr =
749 parser->oldbufptr =
750 parser->bufptr =
751 parser->linestart = SvPVX(parser->linestr);
752 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
753 parser->last_lop = parser->last_uni = NULL;
b54f893d 754
6d59e610 755 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
756 |LEX_DONT_CLOSE_RSFP));
757 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
758 |LEX_DONT_CLOSE_RSFP));
737c24fc 759
60d63348 760 parser->in_pod = parser->filtered = 0;
79072805 761}
a687059c 762
e3abe207
DM
763
764/* delete a parser object */
765
766void
767Perl_parser_free(pTHX_ const yy_parser *parser)
768{
7918f24d
NC
769 PERL_ARGS_ASSERT_PARSER_FREE;
770
7c4baf47 771 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
772 SvREFCNT_dec(parser->linestr);
773
87606032 774 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 775 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
776 else if (parser->rsfp && (!parser->old_parser
777 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 778 PerlIO_close(parser->rsfp);
5486870f 779 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 780 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 781 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
782
783 Safefree(parser->lex_brackstack);
784 Safefree(parser->lex_casestack);
785 Safefree(parser->lex_shared);
786 PL_parser = parser->old_parser;
787 Safefree(parser);
788}
789
790void
791Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
792{
3ac7ff8f 793 I32 nexttoke = parser->nexttoke;
3ac7ff8f 794 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 795 while (nexttoke--) {
3ac7ff8f
FC
796 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
797 && parser->nextval[nexttoke].opval
798 && parser->nextval[nexttoke].opval->op_slabbed
799 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 800 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
801 parser->nextval[nexttoke].opval = NULL;
802 }
3ce3dcd9 803 }
e3abe207
DM
804}
805
806
ffb4593c 807/*
f0e67a1d
Z
808=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
809
810Buffer scalar containing the chunk currently under consideration of the
811text currently being lexed. This is always a plain string scalar (for
812which C<SvPOK> is true). It is not intended to be used as a scalar by
813normal scalar means; instead refer to the buffer directly by the pointer
814variables described below.
815
816The lexer maintains various C<char*> pointers to things in the
817C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
818reallocated, all of these pointers must be updated. Don't attempt to
819do this manually, but rather use L</lex_grow_linestr> if you need to
820reallocate the buffer.
821
822The content of the text chunk in the buffer is commonly exactly one
823complete line of input, up to and including a newline terminator,
824but there are situations where it is otherwise. The octets of the
825buffer may be intended to be interpreted as either UTF-8 or Latin-1.
826The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
827flag on this scalar, which may disagree with it.
828
829For direct examination of the buffer, the variable
830L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
831lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
832of these pointers is usually preferable to examination of the scalar
833through normal scalar means.
834
835=for apidoc AmxU|char *|PL_parser-E<gt>bufend
836
837Direct pointer to the end of the chunk of text currently being lexed, the
838end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 839+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
840always located at the end of the buffer, and does not count as part of
841the buffer's contents.
842
843=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
844
845Points to the current position of lexing inside the lexer buffer.
846Characters around this point may be freely examined, within
847the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
848L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
849interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
850
851Lexing code (whether in the Perl core or not) moves this pointer past
852the characters that it consumes. It is also expected to perform some
853bookkeeping whenever a newline character is consumed. This movement
854can be more conveniently performed by the function L</lex_read_to>,
855which handles newlines appropriately.
856
857Interpretation of the buffer's octets can be abstracted out by
858using the slightly higher-level functions L</lex_peek_unichar> and
859L</lex_read_unichar>.
860
861=for apidoc AmxU|char *|PL_parser-E<gt>linestart
862
863Points to the start of the current line inside the lexer buffer.
864This is useful for indicating at which column an error occurred, and
865not much else. This must be updated by any lexing code that consumes
866a newline; the function L</lex_read_to> handles this detail.
867
868=cut
869*/
870
871/*
872=for apidoc Amx|bool|lex_bufutf8
873
874Indicates whether the octets in the lexer buffer
875(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
876of Unicode characters. If not, they should be interpreted as Latin-1
877characters. This is analogous to the C<SvUTF8> flag for scalars.
878
879In UTF-8 mode, it is not guaranteed that the lexer buffer actually
880contains valid UTF-8. Lexing code must be robust in the face of invalid
881encoding.
882
883The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
884is significant, but not the whole story regarding the input character
885encoding. Normally, when a file is being read, the scalar contains octets
886and its C<SvUTF8> flag is off, but the octets should be interpreted as
887UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
888however, the scalar may have the C<SvUTF8> flag on, and in this case its
889octets should be interpreted as UTF-8 unless the C<use bytes> pragma
890is in effect. This logic may change in the future; use this function
891instead of implementing the logic yourself.
892
893=cut
894*/
895
896bool
897Perl_lex_bufutf8(pTHX)
898{
899 return UTF;
900}
901
902/*
903=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
904
905Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 906at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
907pointer to the reallocated buffer. This is necessary before making
908any direct modification of the buffer that would increase its length.
909L</lex_stuff_pvn> provides a more convenient way to insert text into
910the buffer.
911
912Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
913this function updates all of the lexer's variables that point directly
914into the buffer.
915
916=cut
917*/
918
919char *
920Perl_lex_grow_linestr(pTHX_ STRLEN len)
921{
922 SV *linestr;
923 char *buf;
924 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 925 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
926 bool current;
927
f0e67a1d
Z
928 linestr = PL_parser->linestr;
929 buf = SvPVX(linestr);
930 if (len <= SvLEN(linestr))
931 return buf;
98d5e3ef
DM
932
933 /* Is the lex_shared linestr SV the same as the current linestr SV?
934 * Only in this case does re_eval_start need adjusting, since it
935 * points within lex_shared->ls_linestr's buffer */
936 current = (linestr == PL_parser->lex_shared->ls_linestr);
937
f0e67a1d
Z
938 bufend_pos = PL_parser->bufend - buf;
939 bufptr_pos = PL_parser->bufptr - buf;
940 oldbufptr_pos = PL_parser->oldbufptr - buf;
941 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
942 linestart_pos = PL_parser->linestart - buf;
943 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
944 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 945 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 946 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 947
f0e67a1d 948 buf = sv_grow(linestr, len);
c7641931 949
f0e67a1d
Z
950 PL_parser->bufend = buf + bufend_pos;
951 PL_parser->bufptr = buf + bufptr_pos;
952 PL_parser->oldbufptr = buf + oldbufptr_pos;
953 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
954 PL_parser->linestart = buf + linestart_pos;
955 if (PL_parser->last_uni)
956 PL_parser->last_uni = buf + last_uni_pos;
957 if (PL_parser->last_lop)
958 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 959 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 960 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
961 return buf;
962}
963
964/*
83aa740e 965=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
966
967Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
968immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
969reallocating the buffer if necessary. This means that lexing code that
970runs later will see the characters as if they had appeared in the input.
971It is not recommended to do this as part of normal parsing, and most
972uses of this facility run the risk of the inserted characters being
973interpreted in an unintended manner.
974
2d7f6611
KW
975The string to be inserted is represented by C<len> octets starting
976at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
977according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
978The characters are recoded for the lexer buffer, according to how the
979buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 980to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
981function is more convenient.
982
983=cut
984*/
985
986void
83aa740e 987Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 988{
749123ff 989 dVAR;
f0e67a1d
Z
990 char *bufptr;
991 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
992 if (flags & ~(LEX_STUFF_UTF8))
993 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
994 if (UTF) {
995 if (flags & LEX_STUFF_UTF8) {
996 goto plain_copy;
997 } else {
54d004e8 998 STRLEN highhalf = 0; /* Count of variants */
83aa740e 999 const char *p, *e = pv+len;
54d004e8
KW
1000 for (p = pv; p != e; p++) {
1001 if (! UTF8_IS_INVARIANT(*p)) {
1002 highhalf++;
1003 }
1004 }
f0e67a1d
Z
1005 if (!highhalf)
1006 goto plain_copy;
1007 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1008 bufptr = PL_parser->bufptr;
1009 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1010 SvCUR_set(PL_parser->linestr,
1011 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1012 PL_parser->bufend += len+highhalf;
1013 for (p = pv; p != e; p++) {
1014 U8 c = (U8)*p;
54d004e8
KW
1015 if (! UTF8_IS_INVARIANT(c)) {
1016 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1017 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
1018 } else {
1019 *bufptr++ = (char)c;
1020 }
1021 }
1022 }
1023 } else {
1024 if (flags & LEX_STUFF_UTF8) {
1025 STRLEN highhalf = 0;
83aa740e 1026 const char *p, *e = pv+len;
f0e67a1d
Z
1027 for (p = pv; p != e; p++) {
1028 U8 c = (U8)*p;
54d004e8 1029 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1030 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1031 "non-Latin-1 character into Latin-1 input");
54d004e8 1032 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1033 p++;
1034 highhalf++;
75219bac
KW
1035 } else if (! UTF8_IS_INVARIANT(c)) {
1036 _force_out_malformed_utf8_message((U8 *) p, (U8 *) e,
1037 0,
1038 1 /* 1 means die */ );
1039 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1040 }
1041 }
1042 if (!highhalf)
1043 goto plain_copy;
1044 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1045 bufptr = PL_parser->bufptr;
1046 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1047 SvCUR_set(PL_parser->linestr,
1048 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1049 PL_parser->bufend += len-highhalf;
54d004e8
KW
1050 p = pv;
1051 while (p < e) {
1052 if (UTF8_IS_INVARIANT(*p)) {
1053 *bufptr++ = *p;
1054 p++;
f0e67a1d 1055 }
54d004e8
KW
1056 else {
1057 assert(p < e -1 );
a62b247b 1058 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1059 p += 2;
1060 }
f0e67a1d
Z
1061 }
1062 } else {
54d004e8 1063 plain_copy:
f0e67a1d
Z
1064 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1065 bufptr = PL_parser->bufptr;
1066 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1067 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1068 PL_parser->bufend += len;
1069 Copy(pv, bufptr, len, char);
1070 }
1071 }
1072}
1073
1074/*
9dcc53ea
Z
1075=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1076
1077Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1078immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1079reallocating the buffer if necessary. This means that lexing code that
1080runs later will see the characters as if they had appeared in the input.
1081It is not recommended to do this as part of normal parsing, and most
1082uses of this facility run the risk of the inserted characters being
1083interpreted in an unintended manner.
1084
2d7f6611 1085The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1086and continuing to the first nul. These octets are interpreted as either
1087UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1088in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1089to how the buffer is currently being interpreted (L</lex_bufutf8>).
1090If it is not convenient to nul-terminate a string to be inserted, the
1091L</lex_stuff_pvn> function is more appropriate.
1092
1093=cut
1094*/
1095
1096void
1097Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1098{
1099 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1100 lex_stuff_pvn(pv, strlen(pv), flags);
1101}
1102
1103/*
f0e67a1d
Z
1104=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1105
1106Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1107immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1108reallocating the buffer if necessary. This means that lexing code that
1109runs later will see the characters as if they had appeared in the input.
1110It is not recommended to do this as part of normal parsing, and most
1111uses of this facility run the risk of the inserted characters being
1112interpreted in an unintended manner.
1113
2d7f6611 1114The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1115are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1116being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1117not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1118need to construct a scalar.
1119
1120=cut
1121*/
1122
1123void
1124Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1125{
1126 char *pv;
1127 STRLEN len;
1128 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1129 if (flags)
1130 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1131 pv = SvPV(sv, len);
1132 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1133}
1134
1135/*
1136=for apidoc Amx|void|lex_unstuff|char *ptr
1137
1138Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1139C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1140This hides the discarded text from any lexing code that runs later,
1141as if the text had never appeared.
1142
1143This is not the normal way to consume lexed text. For that, use
1144L</lex_read_to>.
1145
1146=cut
1147*/
1148
1149void
1150Perl_lex_unstuff(pTHX_ char *ptr)
1151{
1152 char *buf, *bufend;
1153 STRLEN unstuff_len;
1154 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1155 buf = PL_parser->bufptr;
1156 if (ptr < buf)
1157 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1158 if (ptr == buf)
1159 return;
1160 bufend = PL_parser->bufend;
1161 if (ptr > bufend)
1162 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1163 unstuff_len = ptr - buf;
1164 Move(ptr, buf, bufend+1-ptr, char);
1165 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1166 PL_parser->bufend = bufend - unstuff_len;
1167}
1168
1169/*
1170=for apidoc Amx|void|lex_read_to|char *ptr
1171
1172Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1173to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1174performing the correct bookkeeping whenever a newline character is passed.
1175This is the normal way to consume lexed text.
1176
1177Interpretation of the buffer's octets can be abstracted out by
1178using the slightly higher-level functions L</lex_peek_unichar> and
1179L</lex_read_unichar>.
1180
1181=cut
1182*/
1183
1184void
1185Perl_lex_read_to(pTHX_ char *ptr)
1186{
1187 char *s;
1188 PERL_ARGS_ASSERT_LEX_READ_TO;
1189 s = PL_parser->bufptr;
1190 if (ptr < s || ptr > PL_parser->bufend)
1191 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1192 for (; s != ptr; s++)
1193 if (*s == '\n') {
83944c01 1194 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1195 PL_parser->linestart = s+1;
1196 }
1197 PL_parser->bufptr = ptr;
1198}
1199
1200/*
1201=for apidoc Amx|void|lex_discard_to|char *ptr
1202
1203Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1204up to C<ptr>. The remaining content of the buffer will be moved, and
1205all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1206be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1207it is not permitted to discard text that has yet to be lexed.
1208
1209Normally it is not necessarily to do this directly, because it suffices to
1210use the implicit discarding behaviour of L</lex_next_chunk> and things
1211based on it. However, if a token stretches across multiple lines,
1f317c95 1212and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1213that purpose, then after completion of the token it would be wise to
1214explicitly discard the now-unneeded earlier lines, to avoid future
1215multi-line tokens growing the buffer without bound.
1216
1217=cut
1218*/
1219
1220void
1221Perl_lex_discard_to(pTHX_ char *ptr)
1222{
1223 char *buf;
1224 STRLEN discard_len;
1225 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1226 buf = SvPVX(PL_parser->linestr);
1227 if (ptr < buf)
1228 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1229 if (ptr == buf)
1230 return;
1231 if (ptr > PL_parser->bufptr)
1232 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1233 discard_len = ptr - buf;
1234 if (PL_parser->oldbufptr < ptr)
1235 PL_parser->oldbufptr = ptr;
1236 if (PL_parser->oldoldbufptr < ptr)
1237 PL_parser->oldoldbufptr = ptr;
1238 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1239 PL_parser->last_uni = NULL;
1240 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1241 PL_parser->last_lop = NULL;
1242 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1243 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1244 PL_parser->bufend -= discard_len;
1245 PL_parser->bufptr -= discard_len;
1246 PL_parser->oldbufptr -= discard_len;
1247 PL_parser->oldoldbufptr -= discard_len;
1248 if (PL_parser->last_uni)
1249 PL_parser->last_uni -= discard_len;
1250 if (PL_parser->last_lop)
1251 PL_parser->last_lop -= discard_len;
1252}
1253
1254/*
1255=for apidoc Amx|bool|lex_next_chunk|U32 flags
1256
1257Reads in the next chunk of text to be lexed, appending it to
1258L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1259looked to the end of the current chunk and wants to know more. It is
1260usual, but not necessary, for lexing to have consumed the entirety of
1261the current chunk at this time.
1262
1263If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1264chunk (i.e., the current chunk has been entirely consumed), normally the
1265current chunk will be discarded at the same time that the new chunk is
c5608a1f 1266read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1267will not be discarded. If the current chunk has not been entirely
1268consumed, then it will not be discarded regardless of the flag.
1269
1270Returns true if some new text was added to the buffer, or false if the
1271buffer has reached the end of the input text.
1272
1273=cut
1274*/
1275
1276#define LEX_FAKE_EOF 0x80000000
e47d32dc 1277#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1278
1279bool
1280Perl_lex_next_chunk(pTHX_ U32 flags)
1281{
1282 SV *linestr;
1283 char *buf;
1284 STRLEN old_bufend_pos, new_bufend_pos;
1285 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1286 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1287 bool got_some_for_debugger = 0;
f0e67a1d 1288 bool got_some;
6cdc5cd8
KW
1289 const U8* first_bad_char_loc;
1290
112d1284 1291 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1292 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1293 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1294 return FALSE;
f0e67a1d
Z
1295 linestr = PL_parser->linestr;
1296 buf = SvPVX(linestr);
407f8cf2
KW
1297 if (!(flags & LEX_KEEP_PREVIOUS)
1298 && PL_parser->bufptr == PL_parser->bufend)
1299 {
f0e67a1d
Z
1300 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1301 linestart_pos = 0;
1302 if (PL_parser->last_uni != PL_parser->bufend)
1303 PL_parser->last_uni = NULL;
1304 if (PL_parser->last_lop != PL_parser->bufend)
1305 PL_parser->last_lop = NULL;
1306 last_uni_pos = last_lop_pos = 0;
1307 *buf = 0;
1308 SvCUR(linestr) = 0;
1309 } else {
1310 old_bufend_pos = PL_parser->bufend - buf;
1311 bufptr_pos = PL_parser->bufptr - buf;
1312 oldbufptr_pos = PL_parser->oldbufptr - buf;
1313 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1314 linestart_pos = PL_parser->linestart - buf;
1315 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1316 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1317 }
1318 if (flags & LEX_FAKE_EOF) {
1319 goto eof;
60d63348 1320 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1321 got_some = 0;
1322 } else if (filter_gets(linestr, old_bufend_pos)) {
1323 got_some = 1;
17cc9359 1324 got_some_for_debugger = 1;
112d1284
FC
1325 } else if (flags & LEX_NO_TERM) {
1326 got_some = 0;
f0e67a1d 1327 } else {
580561a3 1328 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1329 SvPVCLEAR(linestr);
f0e67a1d
Z
1330 eof:
1331 /* End of real input. Close filehandle (unless it was STDIN),
1332 * then add implicit termination.
1333 */
87606032 1334 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1335 PerlIO_clearerr(PL_parser->rsfp);
1336 else if (PL_parser->rsfp)
1337 (void)PerlIO_close(PL_parser->rsfp);
1338 PL_parser->rsfp = NULL;
60d63348 1339 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1340 if (!PL_in_eval && PL_minus_p) {
1341 sv_catpvs(linestr,
1342 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1343 PL_minus_n = PL_minus_p = 0;
1344 } else if (!PL_in_eval && PL_minus_n) {
1345 sv_catpvs(linestr, /*{*/";}");
1346 PL_minus_n = 0;
1347 } else
1348 sv_catpvs(linestr, ";");
1349 got_some = 1;
1350 }
1351 buf = SvPVX(linestr);
1352 new_bufend_pos = SvCUR(linestr);
1353 PL_parser->bufend = buf + new_bufend_pos;
1354 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8
KW
1355
1356 if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr,
1357 PL_parser->bufend - PL_parser->bufptr,
1358 &first_bad_char_loc))
1359 {
1360
1361 _force_out_malformed_utf8_message(first_bad_char_loc,
1362 (U8 *) PL_parser->bufend,
1363 0,
1364 1 /* 1 means die */ );
1365 NOT_REACHED; /* NOTREACHED */
1366 }
1367
f0e67a1d
Z
1368 PL_parser->oldbufptr = buf + oldbufptr_pos;
1369 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1370 PL_parser->linestart = buf + linestart_pos;
1371 if (PL_parser->last_uni)
1372 PL_parser->last_uni = buf + last_uni_pos;
1373 if (PL_parser->last_lop)
1374 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1375 if (PL_parser->preambling != NOLINE) {
1376 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1377 PL_parser->preambling = NOLINE;
1378 }
407f8cf2
KW
1379 if ( got_some_for_debugger
1380 && PERLDB_LINE_OR_SAVESRC
1381 && PL_curstash != PL_debstash)
1382 {
f0e67a1d
Z
1383 /* debugger active and we're not compiling the debugger code,
1384 * so store the line into the debugger's array of lines
1385 */
1386 update_debugger_info(NULL, buf+old_bufend_pos,
1387 new_bufend_pos-old_bufend_pos);
1388 }
1389 return got_some;
1390}
1391
1392/*
1393=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1394
1395Looks ahead one (Unicode) character in the text currently being lexed.
1396Returns the codepoint (unsigned integer value) of the next character,
1397or -1 if lexing has reached the end of the input text. To consume the
1398peeked character, use L</lex_read_unichar>.
1399
1400If the next character is in (or extends into) the next chunk of input
1401text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1402discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1403bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1404
1405If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1406is encountered, an exception is generated.
1407
1408=cut
1409*/
1410
1411I32
1412Perl_lex_peek_unichar(pTHX_ U32 flags)
1413{
749123ff 1414 dVAR;
f0e67a1d
Z
1415 char *s, *bufend;
1416 if (flags & ~(LEX_KEEP_PREVIOUS))
1417 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1418 s = PL_parser->bufptr;
1419 bufend = PL_parser->bufend;
1420 if (UTF) {
1421 U8 head;
1422 I32 unichar;
1423 STRLEN len, retlen;
1424 if (s == bufend) {
1425 if (!lex_next_chunk(flags))
1426 return -1;
1427 s = PL_parser->bufptr;
1428 bufend = PL_parser->bufend;
1429 }
1430 head = (U8)*s;
54d004e8 1431 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1432 return head;
54d004e8
KW
1433 if (UTF8_IS_START(head)) {
1434 len = UTF8SKIP(&head);
f0e67a1d
Z
1435 while ((STRLEN)(bufend-s) < len) {
1436 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1437 break;
1438 s = PL_parser->bufptr;
1439 bufend = PL_parser->bufend;
1440 }
1441 }
c80e42f3 1442 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1443 if (retlen == (STRLEN)-1) {
75219bac
KW
1444 _force_out_malformed_utf8_message((U8 *) s,
1445 (U8 *) bufend,
1446 0,
1447 1 /* 1 means die */ );
1448 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1449 }
1450 return unichar;
1451 } else {
1452 if (s == bufend) {
1453 if (!lex_next_chunk(flags))
1454 return -1;
1455 s = PL_parser->bufptr;
1456 }
1457 return (U8)*s;
1458 }
1459}
1460
1461/*
1462=for apidoc Amx|I32|lex_read_unichar|U32 flags
1463
1464Reads the next (Unicode) character in the text currently being lexed.
1465Returns the codepoint (unsigned integer value) of the character read,
1466and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1467if lexing has reached the end of the input text. To non-destructively
1468examine the next character, use L</lex_peek_unichar> instead.
1469
1470If the next character is in (or extends into) the next chunk of input
1471text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1472discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1473bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1474
1475If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1476is encountered, an exception is generated.
1477
1478=cut
1479*/
1480
1481I32
1482Perl_lex_read_unichar(pTHX_ U32 flags)
1483{
1484 I32 c;
1485 if (flags & ~(LEX_KEEP_PREVIOUS))
1486 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1487 c = lex_peek_unichar(flags);
1488 if (c != -1) {
1489 if (c == '\n')
83944c01 1490 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1491 if (UTF)
1492 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1493 else
1494 ++(PL_parser->bufptr);
f0e67a1d
Z
1495 }
1496 return c;
1497}
1498
1499/*
1500=for apidoc Amx|void|lex_read_space|U32 flags
1501
1502Reads optional spaces, in Perl style, in the text currently being
1503lexed. The spaces may include ordinary whitespace characters and
1504Perl-style comments. C<#line> directives are processed if encountered.
1505L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1506at a non-space character (or the end of the input text).
1507
1508If spaces extend into the next chunk of input text, the next chunk will
1509be read in. Normally the current chunk will be discarded at the same
c5608a1f 1510time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1511chunk will not be discarded.
1512
1513=cut
1514*/
1515
21791330 1516#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1517#define LEX_NO_NEXT_CHUNK 0x80000000
1518
f0e67a1d
Z
1519void
1520Perl_lex_read_space(pTHX_ U32 flags)
1521{
1522 char *s, *bufend;
21791330 1523 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1524 bool need_incline = 0;
21791330 1525 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1526 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1527 s = PL_parser->bufptr;
1528 bufend = PL_parser->bufend;
1529 while (1) {
1530 char c = *s;
1531 if (c == '#') {
1532 do {
1533 c = *++s;
1534 } while (!(c == '\n' || (c == 0 && s == bufend)));
1535 } else if (c == '\n') {
1536 s++;
21791330
FC
1537 if (can_incline) {
1538 PL_parser->linestart = s;
1539 if (s == bufend)
1540 need_incline = 1;
1541 else
1542 incline(s);
1543 }
f0e67a1d
Z
1544 } else if (isSPACE(c)) {
1545 s++;
1546 } else if (c == 0 && s == bufend) {
1547 bool got_more;
65c68e17 1548 line_t l;
f0998909
Z
1549 if (flags & LEX_NO_NEXT_CHUNK)
1550 break;
f0e67a1d 1551 PL_parser->bufptr = s;
65c68e17 1552 l = CopLINE(PL_curcop);
851b527a 1553 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1554 got_more = lex_next_chunk(flags);
65c68e17 1555 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1556 s = PL_parser->bufptr;
1557 bufend = PL_parser->bufend;
1558 if (!got_more)
1559 break;
21791330 1560 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1561 incline(s);
1562 need_incline = 0;
1563 }
3c47da3c
FC
1564 } else if (!c) {
1565 s++;
f0e67a1d
Z
1566 } else {
1567 break;
1568 }
1569 }
f0e67a1d
Z
1570 PL_parser->bufptr = s;
1571}
1572
1573/*
fe788d6b
PM
1574
1575=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1576
1577This function performs syntax checking on a prototype, C<proto>.
1578If C<warn> is true, any illegal characters or mismatched brackets
1579will trigger illegalproto warnings, declaring that they were
1580detected in the prototype for C<name>.
1581
1582The return value is C<true> if this is a valid prototype, and
1583C<false> if it is not, regardless of whether C<warn> was C<true> or
1584C<false>.
1585
1586Note that C<NULL> is a valid C<proto> and will always return C<true>.
1587
1588=cut
1589
1590 */
1591
1592bool
1593Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1594{
1595 STRLEN len, origlen;
11327fa1 1596 char *p;
fe788d6b
PM
1597 bool bad_proto = FALSE;
1598 bool in_brackets = FALSE;
1599 bool after_slash = FALSE;
1600 char greedy_proto = ' ';
1601 bool proto_after_greedy_proto = FALSE;
1602 bool must_be_last = FALSE;
1603 bool underscore = FALSE;
f791a21a 1604 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1605
1606 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1607
1608 if (!proto)
1609 return TRUE;
1610
11327fa1 1611 p = SvPV(proto, len);
fe788d6b
PM
1612 origlen = len;
1613 for (; len--; p++) {
1614 if (!isSPACE(*p)) {
1615 if (must_be_last)
1616 proto_after_greedy_proto = TRUE;
f791a21a
PM
1617 if (underscore) {
1618 if (!strchr(";@%", *p))
1619 bad_proto_after_underscore = TRUE;
1620 underscore = FALSE;
1621 }
fe788d6b
PM
1622 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1623 bad_proto = TRUE;
1624 }
1625 else {
fe788d6b
PM
1626 if (*p == '[')
1627 in_brackets = TRUE;
1628 else if (*p == ']')
1629 in_brackets = FALSE;
407f8cf2
KW
1630 else if ((*p == '@' || *p == '%')
1631 && !after_slash
1632 && !in_brackets )
1633 {
fe788d6b
PM
1634 must_be_last = TRUE;
1635 greedy_proto = *p;
1636 }
1637 else if (*p == '_')
f791a21a 1638 underscore = TRUE;
fe788d6b
PM
1639 }
1640 if (*p == '\\')
1641 after_slash = TRUE;
1642 else
1643 after_slash = FALSE;
1644 }
1645 }
1646
1647 if (warn) {
b54d603d 1648 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1649 p -= origlen;
b54d603d
PM
1650 p = SvUTF8(proto)
1651 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1652 origlen, UNI_DISPLAY_ISPRINT)
1653 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1654
fe788d6b
PM
1655 if (proto_after_greedy_proto)
1656 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1657 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1658 greedy_proto, SVfARG(name), p);
50278ed0
PM
1659 if (in_brackets)
1660 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1661 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1662 SVfARG(name), p);
b54d603d 1663 if (bad_proto)
fe788d6b 1664 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1665 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1666 SVfARG(name), p);
1667 if (bad_proto_after_underscore)
1668 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1669 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1670 SVfARG(name), p);
fe788d6b
PM
1671 }
1672
1673 return (! (proto_after_greedy_proto || bad_proto) );
1674}
1675
1676/*
ffb4593c
NT
1677 * S_incline
1678 * This subroutine has nothing to do with tilting, whether at windmills
1679 * or pinball tables. Its name is short for "increment line". It
57843af0 1680 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1681 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1682 * # line 500 "foo.pm"
1683 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1684 */
1685
76e3520e 1686STATIC void
d9095cec 1687S_incline(pTHX_ const char *s)
463ee0b2 1688{
d9095cec
NC
1689 const char *t;
1690 const char *n;
1691 const char *e;
8818d409 1692 line_t line_num;
22ff3130 1693 UV uv;
463ee0b2 1694
7918f24d
NC
1695 PERL_ARGS_ASSERT_INCLINE;
1696
83944c01 1697 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1698 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1699 && s+1 == PL_bufend && *s == ';') {
1700 /* fake newline in string eval */
1701 CopLINE_dec(PL_curcop);
1702 return;
1703 }
463ee0b2
LW
1704 if (*s++ != '#')
1705 return;
d4c19fe8
AL
1706 while (SPACE_OR_TAB(*s))
1707 s++;
847cc851 1708 if (strEQs(s, "line"))
73659bf1
GS
1709 s += 4;
1710 else
1711 return;
084592ab 1712 if (SPACE_OR_TAB(*s))
73659bf1 1713 s++;
4e553d73 1714 else
73659bf1 1715 return;
d4c19fe8
AL
1716 while (SPACE_OR_TAB(*s))
1717 s++;
463ee0b2
LW
1718 if (!isDIGIT(*s))
1719 return;
d4c19fe8 1720
463ee0b2
LW
1721 n = s;
1722 while (isDIGIT(*s))
1723 s++;
07714eb4 1724 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1725 return;
bf4acbe4 1726 while (SPACE_OR_TAB(*s))
463ee0b2 1727 s++;
73659bf1 1728 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1729 s++;
73659bf1
GS
1730 e = t + 1;
1731 }
463ee0b2 1732 else {
c35e046a 1733 t = s;
1bb1a3d6 1734 while (*t && !isSPACE(*t))
c35e046a 1735 t++;
73659bf1 1736 e = t;
463ee0b2 1737 }
bf4acbe4 1738 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1739 e++;
1740 if (*e != '\n' && *e != '\0')
1741 return; /* false alarm */
1742
22ff3130
HS
1743 if (!grok_atoUV(n, &uv, &e))
1744 return;
1745 line_num = ((line_t)uv) - 1;
8818d409 1746
f4dd75d9 1747 if (t - s > 0) {
d9095cec 1748 const STRLEN len = t - s;
3df32bda 1749
d36ee5be 1750 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1751 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1752 * to *{"::_<newfilename"} */
44867030
NC
1753 /* However, the long form of evals is only turned on by the
1754 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1755 GV * const cfgv = CopFILEGV(PL_curcop);
1756 if (cfgv) {
38bd7ad8
FC
1757 char smallbuf[128];
1758 STRLEN tmplen2 = len;
44867030 1759 char *tmpbuf2;
449dd039 1760 GV *gv2;
44867030
NC
1761
1762 if (tmplen2 + 2 <= sizeof smallbuf)
1763 tmpbuf2 = smallbuf;
1764 else
1765 Newx(tmpbuf2, tmplen2 + 2, char);
1766
38bd7ad8
FC
1767 tmpbuf2[0] = '_';
1768 tmpbuf2[1] = '<';
44867030
NC
1769
1770 memcpy(tmpbuf2 + 2, s, tmplen2);
1771 tmplen2 += 2;
1772
8a5ee598 1773 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1774 if (!isGV(gv2)) {
8a5ee598 1775 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1776 /* adjust ${"::_<newfilename"} to store the new file name */
1777 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1778 /* The line number may differ. If that is the case,
1779 alias the saved lines that are in the array.
1780 Otherwise alias the whole array. */
1781 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1782 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1783 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1784 }
38bd7ad8
FC
1785 else if (GvAV(cfgv)) {
1786 AV * const av = GvAV(cfgv);
8818d409
FC
1787 const I32 start = CopLINE(PL_curcop)+1;
1788 I32 items = AvFILLp(av) - start;
1789 if (items > 0) {
1790 AV * const av2 = GvAVn(gv2);
1791 SV **svp = AvARRAY(av) + start;
1792 I32 l = (I32)line_num+1;
1793 while (items--)
1794 av_store(av2, l++, SvREFCNT_inc(*svp++));
1795 }
1796 }
e5527e4b 1797 }
44867030
NC
1798
1799 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1800 }
e66cf94c 1801 }
05ec9bb3 1802 CopFILE_free(PL_curcop);
449dd039 1803 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1804 }
8818d409 1805 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1806}
1807
80a702cd 1808STATIC void
15f169a1 1809S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1810{
1811 AV *av = CopFILEAVx(PL_curcop);
1812 if (av) {
7f1c3e8c
FC
1813 SV * sv;
1814 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1815 else {
1816 sv = *av_fetch(av, 0, 1);
1817 SvUPGRADE(sv, SVt_PVMG);
1818 }
847cc851 1819 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1820 if (orig_sv)
7f1c3e8c 1821 sv_catsv(sv, orig_sv);
5fa550fb 1822 else
7f1c3e8c
FC
1823 sv_catpvn(sv, buf, len);
1824 if (!SvIOK(sv)) {
1825 (void)SvIOK_on(sv);
1826 SvIV_set(sv, 0);
1827 }
1828 if (PL_parser->preambling == NOLINE)
1829 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1830 }
1831}
1832
ffb4593c 1833/*
8c6b0c7d 1834 * skipspace
ffb4593c
NT
1835 * Called to gobble the appropriate amount and type of whitespace.
1836 * Skips comments as well.
71fff7cb 1837 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1838 *
1839 * peekspace
1840 * Same thing, but look ahead without incrementing line numbers or
1841 * adjusting PL_linestart.
ffb4593c
NT
1842 */
1843
3218e223 1844#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1845#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1846
76e3520e 1847STATIC char *
21791330 1848S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1849{
21791330 1850 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1851 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1852 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1853 s++;
f0e67a1d
Z
1854 } else {
1855 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1856 PL_bufptr = s;
21791330 1857 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1858 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1859 LEX_NO_NEXT_CHUNK : 0));
3280af22 1860 s = PL_bufptr;
f0e67a1d
Z
1861 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1862 if (PL_linestart > PL_bufptr)
1863 PL_bufptr = PL_linestart;
1864 return s;
463ee0b2 1865 }
5db06880 1866 return s;
a687059c 1867}
378cc40b 1868
ffb4593c
NT
1869/*
1870 * S_check_uni
1871 * Check the unary operators to ensure there's no ambiguity in how they're
1872 * used. An ambiguous piece of code would be:
1873 * rand + 5
1874 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1875 * the +5 is its argument.
1876 */
1877
76e3520e 1878STATIC void
cea2e8a9 1879S_check_uni(pTHX)
ba106d47 1880{
d4c19fe8
AL
1881 const char *s;
1882 const char *t;
2f3197b3 1883
3280af22 1884 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1885 return;
3280af22
NIS
1886 while (isSPACE(*PL_last_uni))
1887 PL_last_uni++;
c35e046a 1888 s = PL_last_uni;
8a2bca12 1889 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
8ce2ba82 1890 s += UTF ? UTF8SKIP(s) : 1;
3280af22 1891 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1892 return;
6136c704 1893
9b387841 1894 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1895 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1896 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1897}
1898
ffb4593c
NT
1899/*
1900 * LOP : macro to build a list operator. Its behaviour has been replaced
1901 * with a subroutine, S_lop() for which LOP is just another name.
1902 */
1903
a0d0e21e
LW
1904#define LOP(f,x) return lop(f,x,s)
1905
ffb4593c
NT
1906/*
1907 * S_lop
1908 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1909 * - if we have a next token, then it's a list operator (no parens) for
1910 * which the next token has already been parsed; e.g.,
1911 * sort foo @args
1912 * sort foo (@args)
ffb4593c
NT
1913 * - if the next thing is an opening paren, then it's a function
1914 * - else it's a list operator
1915 */
1916
76e3520e 1917STATIC I32
11288bb3 1918S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1919{
7918f24d
NC
1920 PERL_ARGS_ASSERT_LOP;
1921
6154021b 1922 pl_yylval.ival = f;
35c8bce7 1923 CLINE;
3280af22
NIS
1924 PL_bufptr = s;
1925 PL_last_lop = PL_oldbufptr;
eb160463 1926 PL_last_lop_op = (OPCODE)f;
3280af22 1927 if (PL_nexttoke)
78cdf107 1928 goto lstop;
19f1898a 1929 PL_expect = x;
79072805 1930 if (*s == '(')
bbf60fe6 1931 return REPORT(FUNC);
294a536f 1932 s = skipspace(s);
79072805 1933 if (*s == '(')
bbf60fe6 1934 return REPORT(FUNC);
78cdf107
Z
1935 else {
1936 lstop:
1937 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1938 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1939 return REPORT(LSTOP);
78cdf107 1940 }
79072805
LW
1941}
1942
ffb4593c
NT
1943/*
1944 * S_force_next
9cbb5ea2 1945 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1946 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1947 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1948 * will need to set PL_nextval[] and possibly PL_expect to ensure
1949 * the lexer handles the token correctly.
ffb4593c
NT
1950 */
1951
4e553d73 1952STATIC void
cea2e8a9 1953S_force_next(pTHX_ I32 type)
79072805 1954{
704d4215
GG
1955#ifdef DEBUGGING
1956 if (DEBUG_T_TEST) {
1957 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1958 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1959 }
1960#endif
1f7c3e7c 1961 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
1962 PL_nexttype[PL_nexttoke] = type;
1963 PL_nexttoke++;
79072805
LW
1964}
1965
89f35911
FC
1966/*
1967 * S_postderef
1968 *
1969 * This subroutine handles postfix deref syntax after the arrow has already
1970 * been emitted. @* $* etc. are emitted as two separate token right here.
1971 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1972 * only the first, leaving yylex to find the next.
89f35911
FC
1973 */
1974
1975static int
ff25e5db 1976S_postderef(pTHX_ int const funny, char const next)
89f35911 1977{
ff25e5db 1978 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1979 if (next == '*') {
1980 PL_expect = XOPERATOR;
cc624add 1981 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1982 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1983 PL_lex_state = LEX_INTERPEND;
c3492809
FC
1984 if ('@' == funny)
1985 force_next(POSTJOIN);
cc624add 1986 }
89f35911
FC
1987 force_next(next);
1988 PL_bufptr+=2;
1989 }
1990 else {
760ca746
FC
1991 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1992 && !PL_lex_brackets)
1993 PL_lex_dojoin = 2;
89f35911
FC
1994 PL_expect = XOPERATOR;
1995 PL_bufptr++;
1996 }
1997 return funny;
1998}
1999
28ac2b49
Z
2000void
2001Perl_yyunlex(pTHX)
2002{
a7aaec61
Z
2003 int yyc = PL_parser->yychar;
2004 if (yyc != YYEMPTY) {
2005 if (yyc) {
a7aaec61
Z
2006 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2007 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2008 PL_lex_allbrackets--;
a7aaec61 2009 PL_lex_brackets--;
78cdf107
Z
2010 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2011 } else if (yyc == '('/*)*/) {
2012 PL_lex_allbrackets--;
2013 yyc |= (2<<24);
a7aaec61
Z
2014 }
2015 force_next(yyc);
2016 }
28ac2b49
Z
2017 PL_parser->yychar = YYEMPTY;
2018 }
2019}
2020
d0a148a6 2021STATIC SV *
15f169a1 2022S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2023{
740cce10 2024 SV * const sv = newSVpvn_utf8(start, len,
c5f058df
KW
2025 !IN_BYTES
2026 && UTF
2027 && !is_utf8_invariant_string((const U8*)start, len)
2028 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2029 return sv;
2030}
2031
ffb4593c
NT
2032/*
2033 * S_force_word
2034 * When the lexer knows the next thing is a word (for instance, it has
2035 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2036 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2037 * lookahead.
ffb4593c
NT
2038 *
2039 * Arguments:
b1b65b59 2040 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2041 * int token : PL_next* will be this type of bare word
2042 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2043 * int check_keyword : if true, Perl checks to make sure the word isn't
2044 * a keyword (do this if the word is a label, e.g. goto FOO)
2045 * int allow_pack : if true, : characters will also be allowed (require,
2046 * use, etc. do this)
ffb4593c
NT
2047 */
2048
76e3520e 2049STATIC char *
345b3785 2050S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2051{
eb578fdb 2052 char *s;
463ee0b2 2053 STRLEN len;
4e553d73 2054
7918f24d
NC
2055 PERL_ARGS_ASSERT_FORCE_WORD;
2056
294a536f 2057 start = skipspace(start);
463ee0b2 2058 s = start;
407f8cf2 2059 if (isIDFIRST_lazy_if(s,UTF)
e7127e21 2060 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2061 {
3280af22 2062 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2063 if (check_keyword) {
2064 char *s2 = PL_tokenbuf;
487e470d 2065 STRLEN len2 = len;
847cc851 2066 if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
487e470d
FC
2067 s2 += 6, len2 -= 6;
2068 if (keyword(s2, len2, 0))
463ee0b2 2069 return start;
01b5ef50 2070 }
463ee0b2 2071 if (token == METHOD) {
294a536f 2072 s = skipspace(s);
463ee0b2 2073 if (*s == '(')
3280af22 2074 PL_expect = XTERM;
463ee0b2 2075 else {
3280af22 2076 PL_expect = XOPERATOR;
463ee0b2 2077 }
79072805 2078 }
9ded7720 2079 NEXTVAL_NEXTTOKE.opval
275103cd 2080 = newSVOP(OP_CONST,0,
d0a148a6 2081 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2082 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2083 force_next(token);
2084 }
2085 return s;
2086}
2087
ffb4593c
NT
2088/*
2089 * S_force_ident
9cbb5ea2 2090 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2091 * text only contains the "foo" portion. The first argument is a pointer
2092 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2093 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2094 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2095 */
2096
76e3520e 2097STATIC void
5aaab254 2098S_force_ident(pTHX_ const char *s, int kind)
79072805 2099{
7918f24d
NC
2100 PERL_ARGS_ASSERT_FORCE_IDENT;
2101
c9b48522
DD
2102 if (s[0]) {
2103 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2104 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2105 UTF ? SVf_UTF8 : 0));
9ded7720 2106 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2107 force_next(BAREWORD);
748a9306 2108 if (kind) {
11343788 2109 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2110 /* XXX see note in pp_entereval() for why we forgo typo
2111 warnings if the symbol must be introduced in an eval.
2112 GSAR 96-10-12 */
90e5519e 2113 gv_fetchpvn_flags(s, len,
4bff32c5 2114 (PL_in_eval ? GV_ADDMULTI
728847b1 2115 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2116 kind == '$' ? SVt_PV :
2117 kind == '@' ? SVt_PVAV :
2118 kind == '%' ? SVt_PVHV :
a0d0e21e 2119 SVt_PVGV
90e5519e 2120 );
748a9306 2121 }
79072805
LW
2122 }
2123}
2124
3f33d153
FC
2125static void
2126S_force_ident_maybe_lex(pTHX_ char pit)
2127{
3f33d153
FC
2128 NEXTVAL_NEXTTOKE.ival = pit;
2129 force_next('p');
2130}
2131
1571675a
GS
2132NV
2133Perl_str_to_version(pTHX_ SV *sv)
2134{
2135 NV retval = 0.0;
2136 NV nshift = 1.0;
2137 STRLEN len;
cfd0369c 2138 const char *start = SvPV_const(sv,len);
9d4ba2ae 2139 const char * const end = start + len;
504618e9 2140 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2141
2142 PERL_ARGS_ASSERT_STR_TO_VERSION;
2143
1571675a 2144 while (start < end) {
ba210ebe 2145 STRLEN skip;
1571675a
GS
2146 UV n;
2147 if (utf)
9041c2e3 2148 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2149 else {
2150 n = *(U8*)start;
2151 skip = 1;
2152 }
2153 retval += ((NV)n)/nshift;
2154 start += skip;
2155 nshift *= 1000;
2156 }
2157 return retval;
2158}
2159
4e553d73 2160/*
ffb4593c
NT
2161 * S_force_version
2162 * Forces the next token to be a version number.
e759cc13
RGS
2163 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2164 * and if "guessing" is TRUE, then no new token is created (and the caller
2165 * must use an alternative parsing method).
ffb4593c
NT
2166 */
2167
76e3520e 2168STATIC char *
e759cc13 2169S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2170{
5f66b61c 2171 OP *version = NULL;
44dcb63b 2172 char *d;
89bfa8cd 2173
7918f24d
NC
2174 PERL_ARGS_ASSERT_FORCE_VERSION;
2175
294a536f 2176 s = skipspace(s);
89bfa8cd 2177
44dcb63b 2178 d = s;
dd629d5b 2179 if (*d == 'v')
44dcb63b 2180 d++;
44dcb63b 2181 if (isDIGIT(*d)) {
e759cc13
RGS
2182 while (isDIGIT(*d) || *d == '_' || *d == '.')
2183 d++;
4e4da3ac 2184 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2185 SV *ver;
6154021b
RGS
2186 s = scan_num(s, &pl_yylval);
2187 version = pl_yylval.opval;
dd629d5b
GS
2188 ver = cSVOPx(version)->op_sv;
2189 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2190 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2191 SvNV_set(ver, str_to_version(ver));
1571675a 2192 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2193 }
89bfa8cd 2194 }
5db06880 2195 else if (guessing) {
e759cc13 2196 return s;
5db06880 2197 }
89bfa8cd
PP
2198 }
2199
2200 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2201 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2202 force_next(BAREWORD);
89bfa8cd 2203
e759cc13 2204 return s;
89bfa8cd
PP
2205}
2206
ffb4593c 2207/*
91152fc1
DG
2208 * S_force_strict_version
2209 * Forces the next token to be a version number using strict syntax rules.
2210 */
2211
2212STATIC char *
2213S_force_strict_version(pTHX_ char *s)
2214{
91152fc1 2215 OP *version = NULL;
91152fc1
DG
2216 const char *errstr = NULL;
2217
2218 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2219
2220 while (isSPACE(*s)) /* leading whitespace */
2221 s++;
2222
2223 if (is_STRICT_VERSION(s,&errstr)) {
2224 SV *ver = newSV(0);
2225 s = (char *)scan_version(s, ver, 0);
2226 version = newSVOP(OP_CONST, 0, ver);
2227 }
407f8cf2
KW
2228 else if ((*s != ';' && *s != '{' && *s != '}' )
2229 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2230 {
91152fc1
DG
2231 PL_bufptr = s;
2232 if (errstr)
2233 yyerror(errstr); /* version required */
2234 return s;
2235 }
2236
91152fc1 2237 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2238 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2239 force_next(BAREWORD);
91152fc1
DG
2240
2241 return s;
2242}
2243
2244/*
ffb4593c
NT
2245 * S_tokeq
2246 * Tokenize a quoted string passed in as an SV. It finds the next
2247 * chunk, up to end of string or a backslash. It may make a new
2248 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2249 * turns \\ into \.
2250 */
2251
76e3520e 2252STATIC SV *
cea2e8a9 2253S_tokeq(pTHX_ SV *sv)
79072805 2254{
eb578fdb
KW
2255 char *s;
2256 char *send;
2257 char *d;
b3ac6de7 2258 SV *pv = sv;
79072805 2259
7918f24d
NC
2260 PERL_ARGS_ASSERT_TOKEQ;
2261
279b35ad
FC
2262 assert (SvPOK(sv));
2263 assert (SvLEN(sv));
2264 assert (!SvIsCOW(sv));
307ed071 2265 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2266 goto finish;
279b35ad
FC
2267 s = SvPVX(sv);
2268 send = SvEND(sv);
dcb21ed6
NC
2269 /* This is relying on the SV being "well formed" with a trailing '\0' */
2270 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2271 s++;
2272 if (s == send)
b3ac6de7 2273 goto finish;
79072805 2274 d = s;
be4731d2 2275 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2276 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2277 SVs_TEMP | SvUTF8(sv));
be4731d2 2278 }
79072805
LW
2279 while (s < send) {
2280 if (*s == '\\') {
a0d0e21e 2281 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2282 s++; /* all that, just for this */
2283 }
2284 *d++ = *s++;
2285 }
2286 *d = '\0';
95a20fc0 2287 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2288 finish:
3280af22 2289 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2290 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2291 return sv;
2292}
2293
ffb4593c
NT
2294/*
2295 * Now come three functions related to double-quote context,
2296 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2297 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2298 * interact with PL_lex_state, and create fake ( ... ) argument lists
2299 * to handle functions and concatenation.
ecd24171
DM
2300 * For example,
2301 * "foo\lbar"
2302 * is tokenised as
2303 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2304 */
2305
2306/*
2307 * S_sublex_start
6154021b 2308 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2309 *
2310 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2311 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2312 *
16d1d8bd 2313 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2314 *
2315 * Everything else becomes a FUNC.
2316 *
16d1d8bd
LM
2317 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2318 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2319 * call to S_sublex_push().
2320 */
2321
76e3520e 2322STATIC I32
cea2e8a9 2323S_sublex_start(pTHX)
79072805 2324{
eb578fdb 2325 const I32 op_type = pl_yylval.ival;
79072805
LW
2326
2327 if (op_type == OP_NULL) {
6154021b 2328 pl_yylval.opval = PL_lex_op;
5f66b61c 2329 PL_lex_op = NULL;
79072805
LW
2330 return THING;
2331 }
466112bb 2332 if (op_type == OP_CONST) {
67c71cbb
FC
2333 SV *sv = PL_lex_stuff;
2334 PL_lex_stuff = NULL;
2335 sv = tokeq(sv);
b3ac6de7
IZ
2336
2337 if (SvTYPE(sv) == SVt_PVIV) {
2338 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2339 STRLEN len;
96a5add6 2340 const char * const p = SvPV_const(sv, len);
740cce10 2341 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2342 SvREFCNT_dec(sv);
2343 sv = nsv;
4e553d73 2344 }
275103cd 2345 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2346 return THING;
2347 }
2348
7ef70b3d
FC
2349 PL_parser->lex_super_state = PL_lex_state;
2350 PL_parser->lex_sub_inwhat = (U16)op_type;
2351 PL_parser->lex_sub_op = PL_lex_op;
3280af22 2352 PL_lex_state = LEX_INTERPPUSH;
55497cff 2353
3280af22
NIS
2354 PL_expect = XTERM;
2355 if (PL_lex_op) {
6154021b 2356 pl_yylval.opval = PL_lex_op;
5f66b61c 2357 PL_lex_op = NULL;
55497cff
PP
2358 return PMFUNC;
2359 }
2360 else
2361 return FUNC;
2362}
2363
ffb4593c
NT
2364/*
2365 * S_sublex_push
2366 * Create a new scope to save the lexing state. The scope will be
2367 * ended in S_sublex_done. Returns a '(', starting the function arguments
2368 * to the uc, lc, etc. found before.
2369 * Sets PL_lex_state to LEX_INTERPCONCAT.
2370 */
2371
76e3520e 2372STATIC I32
cea2e8a9 2373S_sublex_push(pTHX)
55497cff 2374{
78a635de 2375 LEXSHARED *shared;
801d32ac 2376 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2377 ENTER;
55497cff 2378
7ef70b3d 2379 PL_lex_state = PL_parser->lex_super_state;
cc624add 2380 SAVEI8(PL_lex_dojoin);
3280af22 2381 SAVEI32(PL_lex_brackets);
78cdf107 2382 SAVEI32(PL_lex_allbrackets);
b27dce25 2383 SAVEI32(PL_lex_formbrack);
78cdf107 2384 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2385 SAVEI32(PL_lex_casemods);
2386 SAVEI32(PL_lex_starts);
651b5b28 2387 SAVEI8(PL_lex_state);
7cc34111 2388 SAVESPTR(PL_lex_repl);
7766f137 2389 SAVEVPTR(PL_lex_inpat);
98246f1e 2390 SAVEI16(PL_lex_inwhat);
ffdb8b16 2391 if (is_heredoc)
b42366d4 2392 {
ffdb8b16 2393 SAVECOPLINE(PL_curcop);
b42366d4 2394 SAVEI32(PL_multi_end);
851b527a
FC
2395 SAVEI32(PL_parser->herelines);
2396 PL_parser->herelines = 0;
b42366d4 2397 }
2ca4363d 2398 SAVEIV(PL_multi_close);
3280af22 2399 SAVEPPTR(PL_bufptr);
8452ff4b 2400 SAVEPPTR(PL_bufend);
3280af22
NIS
2401 SAVEPPTR(PL_oldbufptr);
2402 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2403 SAVEPPTR(PL_last_lop);
2404 SAVEPPTR(PL_last_uni);
3280af22
NIS
2405 SAVEPPTR(PL_linestart);
2406 SAVESPTR(PL_linestr);
8edd5f42
RGS
2407 SAVEGENERICPV(PL_lex_brackstack);
2408 SAVEGENERICPV(PL_lex_casestack);
78a635de 2409 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2410 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2411 SAVEI32(PL_copline);
3280af22 2412
99bd9d90 2413 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2414 scopes to find the body of the here-doc. So we put PL_linestr and
2415 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2416 */
60f40a38
FC
2417 PL_parser->lex_shared->ls_linestr = PL_linestr;
2418 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2419
3280af22 2420 PL_linestr = PL_lex_stuff;
7ef70b3d 2421 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2422 PL_lex_stuff = NULL;
7ef70b3d 2423 PL_parser->lex_sub_repl = NULL;
3280af22 2424
eabab8bc
FC
2425 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2426 set for an inner quote-like operator and then an error causes scope-
2427 popping. We must not have a PL_lex_stuff value left dangling, as
2428 that breaks assumptions elsewhere. See bug #123617. */
2429 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2430 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2431
9cbb5ea2
GS
2432 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2433 = SvPVX(PL_linestr);
3280af22 2434 PL_bufend += SvCUR(PL_linestr);
bd61b366 2435 PL_last_lop = PL_last_uni = NULL;
3280af22 2436 SAVEFREESV(PL_linestr);
4dc843bc 2437 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2438
2439 PL_lex_dojoin = FALSE;
b27dce25 2440 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2441 PL_lex_allbrackets = 0;
2442 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2443 Newx(PL_lex_brackstack, 120, char);
2444 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2445 PL_lex_casemods = 0;
2446 *PL_lex_casestack = '\0';
2447 PL_lex_starts = 0;
2448 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2449 if (is_heredoc)
6ddcf93b 2450 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2451 PL_copline = NOLINE;
78a635de
FC
2452
2453 Newxz(shared, 1, LEXSHARED);
2454 shared->ls_prev = PL_parser->lex_shared;
2455 PL_parser->lex_shared = shared;
3280af22 2456
7ef70b3d 2457 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2458 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2459 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2460 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2461 else
5f66b61c 2462 PL_lex_inpat = NULL;
79072805 2463
3a54fd60
DM
2464 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2465 PL_in_eval &= ~EVAL_RE_REPARSING;
2466
55497cff 2467 return '(';
79072805
LW
2468}
2469
ffb4593c
NT
2470/*
2471 * S_sublex_done
2472 * Restores lexer state after a S_sublex_push.
2473 */
2474
76e3520e 2475STATIC I32
cea2e8a9 2476S_sublex_done(pTHX)
79072805 2477{
3280af22 2478 if (!PL_lex_starts++) {
396482e1 2479 SV * const sv = newSVpvs("");
9aa983d2
JH
2480 if (SvUTF8(PL_linestr))
2481 SvUTF8_on(sv);
3280af22 2482 PL_expect = XOPERATOR;
275103cd 2483 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2484 return THING;
2485 }
2486
3280af22
NIS
2487 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2488 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2489 return yylex();
79072805
LW
2490 }
2491
ffb4593c 2492 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2493 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2494 if (PL_lex_repl) {
2495 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2496 PL_linestr = PL_lex_repl;
2497 PL_lex_inpat = 0;
2498 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2499 PL_bufend += SvCUR(PL_linestr);
bd61b366 2500 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2501 PL_lex_dojoin = FALSE;
2502 PL_lex_brackets = 0;
78cdf107
Z
2503 PL_lex_allbrackets = 0;
2504 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2505 PL_lex_casemods = 0;
2506 *PL_lex_casestack = '\0';
2507 PL_lex_starts = 0;
25da4f38 2508 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2509 PL_lex_state = LEX_INTERPNORMAL;
2510 PL_lex_starts++;
e9fa98b2
HS
2511 /* we don't clear PL_lex_repl here, so that we can check later
2512 whether this is an evalled subst; that means we rely on the
2513 logic to ensure sublex_done() is called again only via the
2514 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2515 }
e9fa98b2 2516 else {
3280af22 2517 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2518 PL_lex_repl = NULL;
e9fa98b2 2519 }
ffdb8b16
FC
2520 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2521 CopLINE(PL_curcop) +=
9420b268 2522 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2523 + PL_parser->herelines;
2524 PL_parser->herelines = 0;
ffdb8b16 2525 }
9b6b7be8 2526 return '/';
ffed7fef
LW
2527 }
2528 else {
b42366d4 2529 const line_t l = CopLINE(PL_curcop);
f46d017c 2530 LEAVE;
b42366d4 2531 if (PL_multi_close == '<')
851b527a 2532 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2533 PL_bufend = SvPVX(PL_linestr);
2534 PL_bufend += SvCUR(PL_linestr);
2535 PL_expect = XOPERATOR;
79072805 2536 return ')';
ffed7fef
LW
2537 }
2538}
2539
6f613c73
KW
2540PERL_STATIC_INLINE SV*
2541S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2542{
140b12ad
KW
2543 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2544 * interior, hence to the "}". Finds what the name resolves to, returning
2545 * an SV* containing it; NULL if no valid one found */
2546
dd2b1b72 2547 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2548
0c415a79
KW
2549 HV * table;
2550 SV **cvp;
2551 SV *cv;
2552 SV *rv;
2553 HV *stash;
2554 const U8* first_bad_char_loc;
2555 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2556
6f613c73
KW
2557 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2558
d8d26cac
KW
2559 if (!SvCUR(res)) {
2560 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
2561 "Unknown charname '' is deprecated");
b7e6151c 2562 return res;
d8d26cac 2563 }
b7e6151c 2564
107160e2
KW
2565 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2566 e - backslash_ptr,
2567 &first_bad_char_loc))
2568 {
75219bac
KW
2569 _force_out_malformed_utf8_message(first_bad_char_loc,
2570 (U8 *) PL_parser->bufend,
2571 0,
2572 0 /* 0 means don't die */ );
b59c097b 2573 yyerror_pv(Perl_form(aTHX_
107160e2 2574 "Malformed UTF-8 character immediately after '%.*s'",
b59c097b
AV
2575 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
2576 SVf_UTF8);
107160e2
KW
2577 return NULL;
2578 }
2579
2580 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2581 /* include the <}> */
2582 e - backslash_ptr + 1);
6f613c73 2583 if (! SvPOK(res)) {
b6407c49 2584 SvREFCNT_dec_NN(res);
6f613c73
KW
2585 return NULL;
2586 }
2587
0c415a79
KW
2588 /* See if the charnames handler is the Perl core's, and if so, we can skip
2589 * the validation needed for a user-supplied one, as Perl's does its own
2590 * validation. */
2591 table = GvHV(PL_hintgv); /* ^H */
2592 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2593 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2594 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2595 {
2596 const char * const name = HvNAME(stash);
6a642c21
FC
2597 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2598 && strEQ(name, "_charnames")) {
0c415a79
KW
2599 return res;
2600 }
2601 }
2602
bde9e88d
KW
2603 /* Here, it isn't Perl's charname handler. We can't rely on a
2604 * user-supplied handler to validate the input name. For non-ut8 input,
2605 * look to see that the first character is legal. Then loop through the
2606 * rest checking that each is a continuation */
6f613c73 2607
36897d64
KW
2608 /* This code makes the reasonable assumption that the only Latin1-range
2609 * characters that begin a character name alias are alphabetic, otherwise
2610 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2611
b6ba1137 2612 if (! UTF) {
bde9e88d 2613 if (! isALPHAU(*s)) {
b6ba1137
KW
2614 goto bad_charname;
2615 }
bde9e88d
KW
2616 s++;
2617 while (s < e) {
2618 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2619 goto bad_charname;
2620 }
2d8eb851
KW
2621 if (*s == ' ' && *(s-1) == ' ') {
2622 goto multi_spaces;
bd299e29 2623 }
bde9e88d 2624 s++;
b6ba1137
KW
2625 }
2626 }
2627 else {
bde9e88d
KW
2628 /* Similarly for utf8. For invariants can check directly; for other
2629 * Latin1, can calculate their code point and check; otherwise use a
2630 * swash */
2631 if (UTF8_IS_INVARIANT(*s)) {
2632 if (! isALPHAU(*s)) {
140b12ad
KW
2633 goto bad_charname;
2634 }
bde9e88d
KW
2635 s++;
2636 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2637 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2638 goto bad_charname;
6f613c73 2639 }
bde9e88d 2640 s += 2;
6f613c73 2641 }
bde9e88d
KW
2642 else {
2643 if (! PL_utf8_charname_begin) {
2644 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2645 PL_utf8_charname_begin = _core_swash_init("utf8",
2646 "_Perl_Charname_Begin",
2647 &PL_sv_undef,
2648 1, 0, NULL, &flags);
2649 }
2650 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2651 goto bad_charname;
2652 }
2653 s += UTF8SKIP(s);
2654 }
2655
2656 while (s < e) {
2657 if (UTF8_IS_INVARIANT(*s)) {
2658 if (! isCHARNAME_CONT(*s)) {
2659 goto bad_charname;
2660 }
2d8eb851
KW
2661 if (*s == ' ' && *(s-1) == ' ') {
2662 goto multi_spaces;
bd299e29 2663 }
bde9e88d
KW
2664 s++;
2665 }
2666 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2667 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2668 {
2669 goto bad_charname;
2670 }
2671 s += 2;
2672 }
2673 else {
2674 if (! PL_utf8_charname_continue) {
2675 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2676 PL_utf8_charname_continue = _core_swash_init("utf8",
2677 "_Perl_Charname_Continue",
2678 &PL_sv_undef,
2679 1, 0, NULL, &flags);
2680 }
2681 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2682 goto bad_charname;
2683 }
2684 s += UTF8SKIP(s);
6f613c73
KW
2685 }
2686 }
2d8eb851
KW
2687 }
2688 if (*(s-1) == ' ') {
2689 yyerror_pv(
2690 Perl_form(aTHX_
2691 "charnames alias definitions may not contain trailing "
2692 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2693 (int)(s - backslash_ptr + 1), backslash_ptr,
2694 (int)(e - s + 1), s + 1
2695 ),
2696 UTF ? SVf_UTF8 : 0);
2697 return NULL;
6f613c73
KW
2698 }
2699
94ca1619 2700 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2701 const U8* first_bad_char_loc;
2702 STRLEN len;
2703 const char* const str = SvPV_const(res, len);
2704 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
75219bac
KW
2705 _force_out_malformed_utf8_message(first_bad_char_loc,
2706 (U8 *) PL_parser->bufend,
2707 0,
2708 0 /* 0 means don't die */ );
bde9e88d
KW
2709 yyerror_pv(
2710 Perl_form(aTHX_
2711 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2712 (int) (e - backslash_ptr + 1), backslash_ptr,
2713 (int) ((char *) first_bad_char_loc - str), str
2714 ),
2715 SVf_UTF8);
2716 return NULL;
2717 }
2718 }
140b12ad 2719
bde9e88d 2720 return res;
140b12ad 2721
bde9e88d 2722 bad_charname: {
bde9e88d
KW
2723
2724 /* The final %.*s makes sure that should the trailing NUL be missing
2725 * that this print won't run off the end of the string */
2726 yyerror_pv(
2727 Perl_form(aTHX_
2728 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2729 (int)(s - backslash_ptr + 1), backslash_ptr,
2730 (int)(e - s + 1), s + 1
bde9e88d
KW
2731 ),
2732 UTF ? SVf_UTF8 : 0);
2733 return NULL;
2734 }
2d8eb851
KW
2735
2736 multi_spaces:
2737 yyerror_pv(
2738 Perl_form(aTHX_
2739 "charnames alias definitions may not contain a sequence of "
2740 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2741 (int)(s - backslash_ptr + 1), backslash_ptr,
2742 (int)(e - s + 1), s + 1
2743 ),
2744 UTF ? SVf_UTF8 : 0);
2745 return NULL;
6f613c73
KW
2746}
2747
02aa26ce
NT
2748/*
2749 scan_const
2750
9da1dd8f
DM
2751 Extracts the next constant part of a pattern, double-quoted string,
2752 or transliteration. This is terrifying code.
2753
2754 For example, in parsing the double-quoted string "ab\x63$d", it would
2755 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2756
94def140 2757 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2758 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2759 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2760
94def140
ST
2761 Returns a pointer to the character scanned up to. If this is
2762 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2763 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2764 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2765 by looking at the next characters herself.
2766
02aa26ce 2767 In patterns:
9da1dd8f 2768 expand:
537124e4
KW
2769 \N{FOO} => \N{U+hex_for_character_FOO}
2770 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2771
2772 pass through:
2773 all other \-char, including \N and \N{ apart from \N{ABC}
2774
2775 stops on:
2776 @ and $ where it appears to be a var, but not for $ as tail anchor
2777 \l \L \u \U \Q \E
2778 (?{ or (??{
2779
02aa26ce
NT
2780 In transliterations:
2781 characters are VERY literal, except for - not at the start or end
94def140
ST
2782 of the string, which indicates a range. If the range is in bytes,
2783 scan_const expands the range to the full set of intermediate
2784 characters. If the range is in utf8, the hyphen is replaced with
2785 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2786
2787 In double-quoted strings:
2788 backslashes:
2789 double-quoted style: \r and \n
ff3f963a 2790 constants: \x31, etc.
94def140 2791 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2792 case and quoting: \U \Q \E
2793 stops on @ and $
2794
2795 scan_const does *not* construct ops to handle interpolated strings.
2796 It stops processing as soon as it finds an embedded $ or @ variable
2797 and leaves it to the caller to work out what's going on.
2798
94def140
ST
2799 embedded arrays (whether in pattern or not) could be:
2800 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2801
2802 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2803
2804 $ in pattern could be $foo or could be tail anchor. Assumption:
2805 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2806 followed by one of "()| \r\n\t"
02aa26ce 2807
9da1dd8f 2808 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2809
2810 The structure of the code is
2811 while (there's a character to process) {
94def140
ST
2812 handle transliteration ranges
2813 skip regexp comments /(?#comment)/ and codes /(?{code})/
2814 skip #-initiated comments in //x patterns
2815 check for embedded arrays
02aa26ce
NT
2816 check for embedded scalars
2817 if (backslash) {
94def140 2818 deprecate \1 in substitution replacements
02aa26ce
NT
2819 handle string-changing backslashes \l \U \Q \E, etc.
2820 switch (what was escaped) {
94def140 2821 handle \- in a transliteration (becomes a literal -)
ff3f963a 2822 if a pattern and not \N{, go treat as regular character
94def140
ST
2823 handle \132 (octal characters)
2824 handle \x15 and \x{1234} (hex characters)
ff3f963a 2825 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2826 handle \cV (control characters)
2827 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2828 } (end switch)
77a135fe 2829 continue
02aa26ce 2830 } (end if backslash)
77a135fe 2831 handle regular character
02aa26ce 2832 } (end while character to read)
4e553d73 2833
02aa26ce
NT
2834*/
2835
76e3520e 2836STATIC char *
cea2e8a9 2837S_scan_const(pTHX_ char *start)
79072805 2838{
eb578fdb 2839 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2840 SV *sv = newSV(send - start); /* sv for the constant. See note below
2841 on sizing. */
eb578fdb
KW
2842 char *s = start; /* start of the constant */
2843 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2844 bool dorange = FALSE; /* are we in a translit range? */
2845 bool didrange = FALSE; /* did we just finish a range? */
2846 bool in_charclass = FALSE; /* within /[...]/ */
2847 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2848 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2849 UTF8? But, this can show as true
2850 when the source isn't utf8, as for
2851 example when it is entirely composed
2852 of hex constants */
6f613c73 2853 SV *res; /* result from charnames */
f4240379
KW
2854 STRLEN offset_to_max; /* The offset in the output to where the range
2855 high-end character is temporarily placed */
77a135fe
KW
2856
2857 /* Note on sizing: The scanned constant is placed into sv, which is
2858 * initialized by newSV() assuming one byte of output for every byte of
2859 * input. This routine expects newSV() to allocate an extra byte for a
2860 * trailing NUL, which this routine will append if it gets to the end of
2861 * the input. There may be more bytes of input than output (eg., \N{LATIN
2862 * CAPITAL LETTER A}), or more output than input if the constant ends up
2863 * recoded to utf8, but each time a construct is found that might increase
2864 * the needed size, SvGROW() is called. Its size parameter each time is
2865 * based on the best guess estimate at the time, namely the length used so
2866 * far, plus the length the current construct will occupy, plus room for
2867 * the trailing NUL, plus one byte for every input byte still unscanned */
2868
c3320c2a
KW
2869 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2870 before set */
4c3a8340 2871#ifdef EBCDIC
f4240379
KW
2872 int backslash_N = 0; /* ? was the character from \N{} */
2873 int non_portable_endpoint = 0; /* ? In a range is an endpoint
2874 platform-specific like \x65 */
4c3a8340 2875#endif
012bcf8d 2876
7918f24d
NC
2877 PERL_ARGS_ASSERT_SCAN_CONST;
2878
bb16bae8 2879 assert(PL_lex_inwhat != OP_TRANSR);
7ef70b3d 2880 if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) {
2b9d42f0 2881 /* If we are doing a trans and we know we want UTF8 set expectation */
7ef70b3d
FC
2882 has_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2883 this_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2b9d42f0
NIS
2884 }
2885
b899e89d
FC
2886 /* Protect sv from errors and fatal warnings. */
2887 ENTER_with_name("scan_const");
2888 SAVEFREESV(sv);
2b9d42f0 2889
f4240379
KW
2890 while (s < send
2891 || dorange /* Handle tr/// range at right edge of input */
2892 ) {
ff3f963a 2893
02aa26ce 2894 /* get transliterations out of the way (they're most literal) */
3280af22 2895 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2896
f4240379
KW
2897 /* But there isn't any special handling necessary unless there is a
2898 * range, so for most cases we just drop down and handle the value
2899 * as any other. There are two exceptions.
2900 *
2901 * 1. A minus sign indicates that we are actually going to have
2902 * a range. In this case, skip the '-', set a flag, then drop
2903 * down to handle what should be the end range value.
2904 * 2. After we've handled that value, the next time through, that
2905 * flag is set and we fix up the range.
2906 *
2907 * Ranges entirely within Latin1 are expanded out entirely, in
2908 * order to avoid the significant overhead of making a swash.
2909 * Ranges that extend above Latin1 have to have a swash, so there
4698efb6
KW
2910 * is no advantage to abbreviating them here, so they are stored
2911 * here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte signifies
2912 * a hyphen without any possible ambiguity. On EBCDIC machines, if
f4240379
KW
2913 * the range is expressed as Unicode, the Latin1 portion is
2914 * expanded out even if the entire range extends above Latin1.
2915 * This is because each code point in it has to be processed here
2916 * individually to get its native translation */
2917
2918 if (! dorange) {
2919
2920 /* Here, we don't think we're in a range. If we've processed
2921 * at least one character, then see if this next one is a '-',
2922 * indicating the previous one was the start of a range. But
2923 * don't bother if we're too close to the end for the minus to
2924 * mean that. */
2925 if (*s != '-' || s >= send - 1 || s == start) {
2926
2927 /* A regular character. Process like any other, but first
2928 * clear any flags */
2929 didrange = FALSE;
2930 dorange = FALSE;
e294cc5d 2931#ifdef EBCDIC
f4240379
KW
2932 non_portable_endpoint = 0;
2933 backslash_N = 0;
e294cc5d 2934#endif
f4240379
KW
2935 /* Drops down to generic code to process current byte */
2936 }
2937 else {
2938 if (didrange) { /* Something like y/A-C-Z// */
2939 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
2940 }
e294cc5d 2941
f4240379 2942 dorange = TRUE;
2b9d42f0 2943
f4240379
KW
2944 s++; /* Skip past the minus */
2945
2946 /* d now points to where the end-range character will be
2947 * placed. Save it so won't have to go finding it later,
2948 * and drop down to get that character. (Actually we
2949 * instead save the offset, to handle the case where a
2950 * realloc in the meantime could change the actual
2951 * pointer). We'll finish processing the range the next
2952 * time through the loop */
2953 offset_to_max = d - SvPVX_const(sv);
2954 }
2955 } /* End of not a range */
2956 else {
2957 /* Here we have parsed a range. Now must handle it. At this
2958 * point:
2959 * 'sv' is a SV* that contains the output string we are
2960 * constructing. The final two characters in that string
2961 * are the range start and range end, in order.
2962 * 'd' points to just beyond the range end in the 'sv' string,
2963 * where we would next place something
2964 * 'offset_to_max' is the offset in 'sv' at which the character
2965 * before 'd' begins.
2966 */
2967 const char * max_ptr = SvPVX_const(sv) + offset_to_max;
2968 const char * min_ptr;
2969 IV range_min;
2970 IV range_max; /* last character in range */
2971 STRLEN save_offset;
2972 STRLEN grow;
11327fa1 2973#ifdef EBCDIC
f4240379
KW
2974 bool convert_unicode;
2975 IV real_range_max = 0;
e294cc5d 2976#endif
f4240379
KW
2977
2978 /* Get the range-ends code point values. */
e294cc5d 2979 if (has_utf8) {
f4240379
KW
2980 /* We know the utf8 is valid, because we just constructed
2981 * it ourselves in previous loop iterations */
2982 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
2983 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
2984 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
e294cc5d 2985 }
f4240379
KW
2986 else {
2987 min_ptr = max_ptr - 1;
2988 range_min = * (U8*) min_ptr;
2989 range_max = * (U8*) max_ptr;
2990 }
2991
e294cc5d 2992#ifdef EBCDIC
f4240379
KW
2993 /* On EBCDIC platforms, we may have to deal with portable
2994 * ranges. These happen if at least one range endpoint is a
2995 * Unicode value (\N{...}), or if the range is a subset of
2996 * [A-Z] or [a-z], and both ends are literal characters,
2997 * like 'A', and not like \x{C1} */
2998 if ((convert_unicode
2999 = cBOOL(backslash_N) /* \N{} forces Unicode, hence
3000 portable range */
3001 || ( ! non_portable_endpoint
3002 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3003 || (isUPPER_A(range_min) && isUPPER_A(range_max))))
3004 )) {
3005
3006 /* Special handling is needed for these portable ranges.
3007 * They are defined to all be in Unicode terms, which
3008 * include all Unicode code points between the end points.
3009 * Convert to Unicode to get the Unicode range. Later we
3010 * will convert each code point in the range back to
3011 * native. */
3012 range_min = NATIVE_TO_UNI(range_min);
3013 range_max = NATIVE_TO_UNI(range_max);
3014 }
e294cc5d 3015#endif
8ada0baa 3016
f4240379 3017 if (range_min > range_max) {
11327fa1 3018#ifdef EBCDIC
f4240379
KW
3019 if (convert_unicode) {
3020 /* Need to convert back to native for meaningful
3021 * messages for this platform */
3022 range_min = UNI_TO_NATIVE(range_min);
3023 range_max = UNI_TO_NATIVE(range_max);
3024 }
11327fa1 3025#endif
f4240379
KW
3026
3027 /* Use the characters themselves for the error message if
3028 * ASCII printables; otherwise some visible representation
3029 * of them */
3030 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3031 Perl_croak(aTHX_
3032 "Invalid range \"%c-%c\" in transliteration operator",
3033 (char)range_min, (char)range_max);
3034 }
11327fa1 3035#ifdef EBCDIC
f4240379
KW
3036 else if (convert_unicode) {
3037 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3038 Perl_croak(aTHX_
147e3846 3039 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04" UVXf "}\""
f4240379
KW
3040 " in transliteration operator",
3041 range_min, range_max);
3042 }
11327fa1 3043#endif
f4240379
KW
3044 else {
3045 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3046 Perl_croak(aTHX_
147e3846 3047 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
f4240379
KW
3048 " in transliteration operator",
3049 range_min, range_max);
3050 }
c2e66d9e
GS
3051 }
3052
f4240379
KW
3053 if (has_utf8) {
3054
3055 /* We try to avoid creating a swash. If the upper end of
3056 * this range is below 256, this range won't force a swash;
3057 * otherwise it does force a swash, and as long as we have
3058 * to have one, we might as well not expand things out.
3059 * But if it's EBCDIC, we may have to look at each
3060 * character below 256 if we have to convert to/from
3061 * Unicode values */
3062 if (range_max > 255
c7f1f016 3063#ifdef EBCDIC
f4240379 3064 && (range_min > 255 || ! convert_unicode)
8ada0baa 3065#endif
f4240379
KW
3066 ) {
3067 /* Move the high character one byte to the right; then
3068 * insert between it and the range begin, an illegal
3069 * byte which serves to indicate this is a range (using
3070 * a '-' could be ambiguous). */
3071 char *e = d++;
3072 while (e-- > max_ptr) {
3073 *(e + 1) = *e;
e294cc5d 3074 }
f4240379
KW
3075 *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
3076 goto range_done;
3077 }
3078
3079 /* Here, we're going to expand out the range. For EBCDIC
3080 * the range can extend above 255 (not so in ASCII), so
3081 * for EBCDIC, split it into the parts above and below
3082 * 255/256 */
e294cc5d 3083#ifdef EBCDIC
f4240379
KW
3084 if (range_max > 255) {
3085 real_range_max = range_max;
3086 range_max = 255;
3087 }
e294cc5d 3088#endif
f4240379 3089 }
02aa26ce 3090
f4240379
KW
3091 /* Here we need to expand out the string to contain each
3092 * character in the range. Grow the output to handle this */
3093
3094 save_offset = min_ptr - SvPVX_const(sv);
3095
3096 /* The base growth is the number of code points in the range */
3097 grow = range_max - range_min + 1;
3098 if (has_utf8) {
3099
3100 /* But if the output is UTF-8, some of those characters may
3101 * need two bytes (since the maximum range value here is
3102 * 255, the max bytes per character is two). On ASCII
3103 * platforms, it's not much trouble to get an accurate
3104 * count of what's needed. But on EBCDIC, the ones that
3105 * need 2 bytes are scattered around, so just use a worst
3106 * case value instead of calculating for that platform. */
4c3a8340 3107#ifdef EBCDIC
f4240379
KW
3108 grow *= 2;
3109#else
3110 /* Only those above 127 require 2 bytes. This may be
3111 * everything in the range, or not */
3112 if (range_min > 127) {
3113 grow *= 2;
3114 }
3115 else if (range_max > 127) {
3116 grow += range_max - 127;
3117 }
4c3a8340 3118#endif
f4240379
KW
3119 }
3120
3121 /* Subtract 3 for the bytes that were already accounted for
3122 * (min, max, and the hyphen) */
10656159 3123 d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3);
f4240379 3124
11327fa1
AL
3125#ifdef EBCDIC
3126 /* Here, we expand out the range. */
f4240379
KW
3127 if (convert_unicode) {
3128 IV i;
02aa26ce 3129
f4240379
KW
3130 /* Recall that the min and max are now in Unicode terms, so
3131 * we have to convert each character to its native
3132 * equivalent */
3133 if (has_utf8) {
3134 for (i = range_min; i <= range_max; i++) {
81324705 3135 append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
f4240379
KW
3136 (U8 **) &d);
3137 }
3138 }
3139 else {
3140 for (i = range_min; i <= range_max; i++) {
81324705 3141 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3142 }
3143 }
01ec43d0 3144 }
11327fa1
AL
3145 else
3146#endif
3147 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3148 {
f4240379
KW
3149 IV i;
3150
3151 /* Here, no conversions are necessary, which means that the
3152 * first character in the range is already in 'd' and
3153 * valid, so we can skip overwriting it */
3154 if (has_utf8) {
3155 d += UTF8SKIP(d);
3156 for (i = range_min + 1; i <= range_max; i++) {
81324705 3157 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3158 }
3159 }
3160 else {
3161 d++;
3162 for (i = range_min + 1; i <= range_max; i++) {
3163 *d++ = (char)i;
3164 }
3165 }
a0ed51b3 3166 }
02aa26ce 3167
11327fa1
AL
3168#ifdef EBCDIC
3169 /* If the original range extended above 255, add in that portion. */
f4240379
KW
3170 if (real_range_max) {
3171 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3172 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3173 if (real_range_max > 0x101)
3174 *d++ = (char) ILLEGAL_UTF8_BYTE;
3175 if (real_range_max > 0x100)
3176 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3177 }
11327fa1 3178#endif
02aa26ce 3179
f4240379
KW
3180 range_done:
3181 /* mark the range as done, and continue */
3182 didrange = TRUE;
3183 dorange = FALSE;
3184#ifdef EBCDIC
3185 non_portable_endpoint = 0;
3186 backslash_N = 0;
3187#endif
3188 continue;
3189 } /* End of is a range */
3190 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3191 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3192 char *s1 = s-1;
3193 int esc = 0;
3194 while (s1 >= start && *s1-- == '\\')
3195 esc = !esc;
3196 if (!esc)
3197 in_charclass = TRUE;
3198 }
2866decb 3199
e4a2df84
DM
3200 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3201 char *s1 = s-1;
3202 int esc = 0;
3203 while (s1 >= start && *s1-- == '\\')
3204 esc = !esc;
3205 if (!esc)
3206 in_charclass = FALSE;
3207 }
2866decb 3208
9da1dd8f
DM
3209 /* skip for regexp comments /(?#comment)/, except for the last
3210 * char, which will be done separately.
3211 * Stop on (?{..}) and friends */
3212
c30fc27b 3213 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3214 if (s[2] == '#') {
e994fd66 3215 while (s+1 < send && *s != ')')
5ff03569 3216 *d++ = *s++;
155aba94 3217 }
407f8cf2
KW
3218 else if (!PL_lex_casemods
3219 && ( s[2] == '{' /* This should match regcomp.c */
3220 || (s[2] == '?' && s[3] == '{')))
155aba94 3221 {
9da1dd8f 3222 break;
cc6b7395 3223 }
748a9306 3224 }
02aa26ce
NT
3225
3226 /* likewise skip #-initiated comments in //x patterns */
407f8cf2
KW
3227 else if (*s == '#'
3228 && PL_lex_inpat
3229 && !in_charclass
3230 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3231 {
748a9306 3232 while (s+1 < send && *s != '\n')
5ff03569 3233 *d++ = *s++;
748a9306 3234 }
02aa26ce 3235
9da1dd8f
DM
3236 /* no further processing of single-quoted regex */
3237 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3238 goto default_action;
3239
5d1d4326 3240 /* check for embedded arrays
da6eedaa 3241 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3242 */
1749ea0d 3243 else if (*s == '@' && s[1]) {
9d58dbc4 3244 if (UTF ? isIDFIRST_utf8((U8*)s+1) : isWORDCHAR_A(s[1]))
1749ea0d
ST
3245 break;
3246 if (strchr(":'{$", s[1]))
3247 break;
3248 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3249 break; /* in regexp, neither @+ nor @- are interpolated */
3250 }
02aa26ce
NT
3251
3252 /* check for embedded scalars. only stop if we're sure it's a
3253 variable.
3254 */
79072805 3255 else if (*s == '$') {
3280af22 3256 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3257 break;
77772344 3258 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3259 if (s[1] == '\\') {
3260 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3261 "Possible unintended interpolation of $\\ in regex");
77772344 3262 }
79072805 3263 break; /* in regexp, $ might be tail anchor */
77772344 3264 }
79072805 3265 }
02aa26ce 3266
2b9d42f0
NIS
3267 /* End of else if chain - OP_TRANS rejoin rest */
3268
02aa26ce 3269 /* backslashes */
79072805 3270 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3271 char* e; /* Can be used for ending '}', etc. */
3272
79072805 3273 s++;
02aa26ce 3274
7d0fc23c
KW
3275 /* warn on \1 - \9 in substitution replacements, but note that \11
3276 * is an octal; and \19 is \1 followed by '9' */
407f8cf2
KW
3277 if (PL_lex_inwhat == OP_SUBST
3278 && !PL_lex_inpat
3279 && isDIGIT(*s)
3280 && *s != '0'
3281 && !isDIGIT(s[1]))
79072805 3282 {
c782d7ee 3283 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3284 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3285 *--s = '$';
3286 break;
3287 }
02aa26ce
NT
3288
3289 /* string-change backslash escapes */
838f2281 3290 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3291 --s;
3292 break;
3293 }
ff3f963a
KW
3294 /* In a pattern, process \N, but skip any other backslash escapes.
3295 * This is because we don't want to translate an escape sequence
3296 * into a meta symbol and have the regex compiler use the meta
3297 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3298 * in spite of this, we do have to process \N here while the proper
3299 * charnames handler is in scope. See bugs #56444 and #62056.
85fba779 3300 *
ff3f963a
KW
3301 * There is a complication because \N in a pattern may also stand
3302 * for 'match a non-nl', and not mean a charname, in which case its
3303 * processing should be deferred to the regex compiler. To be a
3304 * charname it must be followed immediately by a '{', and not look
3305 * like \N followed by a curly quantifier, i.e., not something like
3306 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3307 * quantifier */
3308 else if (PL_lex_inpat
3309 && (*s != 'N'
3310 || s[1] != '{'
412f55bb 3311 || regcurly(s + 1)))
ff3f963a 3312 {
4d73d076 3313 *d++ = '\\';
cc74c5bd
ST
3314 goto default_action;
3315 }
02aa26ce 3316
79072805 3317 switch (*s) {
79072805 3318 default:
11b8faa4 3319 {
15861f94 3320 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3321 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3322 "Unrecognized escape \\%c passed through",
3323 *s);
11b8faa4 3324 /* default action is to copy the quoted character */
f9a63242 3325 goto default_action;
11b8faa4 3326 }
02aa26ce 3327
632403cc 3328 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3329 case '0': case '1': case '2': case '3':
3330 case '4': case '5': case '6': case '7':
ba210ebe 3331 {
5e0a247b 3332 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3333 STRLEN len = 3;
06972766 3334 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3335 s += len;
5e0a247b
KW
3336 if (len < 3 && s < send && isDIGIT(*s)
3337 && ckWARN(WARN_MISC))
3338 {
3339 Perl_warner(aTHX_ packWARN(WARN_MISC),
3340 "%s", form_short_octal_warning(s, len));
3341 }
ba210ebe 3342 }
012bcf8d 3343 goto NUM_ESCAPE_INSERT;
02aa26ce 3344
f0a2b745
KW
3345 /* eg. \o{24} indicates the octal constant \024 */
3346 case 'o':
3347 {
454155d9 3348 const char* error;
f0a2b745 3349
00ce5563 3350 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3351 TRUE, /* Output warning */
3352 FALSE, /* Not strict */
17896a85
KW
3353 TRUE, /* Output warnings for
3354 non-portables */
80f4111b 3355 UTF);
454155d9 3356 if (! valid) {
f0a2b745
KW
3357 yyerror(error);
3358 continue;
3359 }
3360 goto NUM_ESCAPE_INSERT;
3361 }
3362
77a135fe 3363 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3364 case 'x':
a0481293 3365 {
a0481293 3366 const char* error;
355860ce 3367
00ce5563 3368 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3369 TRUE, /* Output warning */
3370 FALSE, /* Not strict */
17896a85
KW
3371 TRUE, /* Output warnings for
3372 non-portables */
80f4111b 3373 UTF);
a0481293
KW
3374 if (! valid) {
3375 yyerror(error);
355860ce 3376 continue;
ba210ebe 3377 }
012bcf8d
GS
3378 }
3379
3380 NUM_ESCAPE_INSERT:
b67fd2c5 3381 /* Insert oct or hex escaped character. */
ba7cea30 3382
06972766 3383 /* Here uv is the ordinal of the next character being added */
f4240379
KW
3384 if (UVCHR_IS_INVARIANT(uv)) {
3385 *d++ = (char) uv;
3386 }
3387 else {
9aa983d2 3388 if (!has_utf8 && uv > 255) {
77a135fe
KW
3389 /* Might need to recode whatever we have accumulated so
3390 * far if it contains any chars variant in utf8 or
3391 * utf-ebcdic. */
3392
3393 SvCUR_set(sv, d - SvPVX_const(sv));
3394 SvPOK_on(sv);
3395 *d = '\0';
77a135fe 3396 /* See Note on sizing above. */
5eef447b 3397 sv_utf8_upgrade_flags_grow(
5f560d8a
KW
3398 sv,
3399 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
5eef447b
KW
3400 /* Above-latin1 in string
3401 * implies no encoding */
3402 |SV_UTF8_NO_ENCODING,
5f560d8a 3403 UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3404 d = SvPVX(sv) + SvCUR(sv);
3405 has_utf8 = TRUE;
012bcf8d
GS
3406 }
3407
77a135fe 3408 if (has_utf8) {
b67fd2c5
KW
3409 /* Usually, there will already be enough room in 'sv'
3410 * since such escapes are likely longer than any UTF-8
3411 * sequence they can end up as. This isn't the case on
3412 * EBCDIC where \x{40000000} contains 12 bytes, and the
3413 * UTF-8 for it contains 14. And, we have to allow for
3414 * a trailing NUL. It probably can't happen on ASCII
3415 * platforms, but be safe */
3416 const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv)
3417 + 1;
3418 if (UNLIKELY(needed > SvLEN(sv))) {
3419 SvCUR_set(sv, d - SvPVX_const(sv));
3420 d = sv_grow(sv, needed) + SvCUR(sv);
3421 }
3422
c80e42f3 3423 d = (char*)uvchr_to_utf8((U8*)d, uv);
407f8cf2 3424 if (PL_lex_inwhat == OP_TRANS
7ef70b3d 3425 && PL_parser->lex_sub_op)
407f8cf2 3426 {
7ef70b3d 3427 PL_parser->lex_sub_op->op_private |=
f9a63242
JH
3428 (PL_lex_repl ? OPpTRANS_FROM_UTF
3429 : OPpTRANS_TO_UTF);
f9a63242 3430 }
012bcf8d 3431 }
a0ed51b3 3432 else {
012bcf8d 3433 *d++ = (char)uv;
a0ed51b3 3434 }
012bcf8d 3435 }
f4240379
KW
3436#ifdef EBCDIC
3437 non_portable_endpoint++;
3438#endif
79072805 3439 continue;
02aa26ce 3440
4a2d328f 3441 case 'N':
85fba779
KW
3442 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3443 * named character, like \N{LATIN SMALL LETTER A}, or a named
3444 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
f4240379
KW
3445 * GRAVE} (except y/// can't handle the latter, croaking). For
3446 * convenience all three forms are referred to as "named
3447 * characters" below.
85fba779
KW
3448 *
3449 * For patterns, \N also can mean to match a non-newline. Code
3450 * before this 'switch' statement should already have handled
3451 * this situation, and hence this code only has to deal with
3452 * the named character cases.
3453 *
3454 * For non-patterns, the named characters are converted to
3455 * their string equivalents. In patterns, named characters are
3456 * not converted to their ultimate forms for the same reasons
3457 * that other escapes aren't. Instead, they are converted to
3458 * the \N{U+...} form to get the value from the charnames that
3459 * is in effect right now, while preserving the fact that it
3460 * was a named character, so that the regex compiler knows
3461 * this.
3462 *
3463 * The structure of this section of code (besides checking for
ff3f963a 3464 * errors and upgrading to utf8) is:
f4240379 3465 * If the named character is of the form \N{U+...}, pass it
85fba779
KW
3466 * through if a pattern; otherwise convert the code point
3467 * to utf8
f4240379
KW
3468 * Otherwise must be some \N{NAME}: convert to
3469 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3470 *
3471 * Transliteration is an exception. The conversion to utf8 is
3472 * only done if the code point requires it to be representable.
85fba779 3473 *
b6d67071 3474 * Here, 's' points to the 'N'; the test below is guaranteed to
85fba779
KW
3475 * succeed if we are being called on a pattern, as we already
3476 * know from a test above that the next character is a '{'. A
3477 * non-pattern \N must mean 'named character', which requires
3478 * braces */
ff3f963a
KW
3479 s++;
3480 if (*s != '{') {
3481 yyerror("Missing braces on \\N{}");
3482 continue;
3483 }
3484 s++;
3485
0a96133f 3486 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3487 if (! (e = strchr(s, '}'))) {
3488 if (! PL_lex_inpat) {
5777a3f7 3489 yyerror("Missing right brace on \\N{}");
0a96133f 3490 } else {
4407f1b8 3491 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3492 }
0a96133f 3493 continue;
ff3f963a 3494 }
cddc7ef4 3495
ff3f963a 3496 /* Here it looks like a named character */
cddc7ef4 3497
ff3f963a 3498 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
ff3f963a 3499 s += 2; /* Skip to next char after the 'U+' */
ff3f963a 3500 if (PL_lex_inpat) {
af352bf2
KW
3501
3502 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
fb2eed93
FC
3503 /* Check the syntax. */
3504 const char *orig_s;
3505 orig_s = s - 5;
3506 if (!isXDIGIT(*s)) {
3507 bad_NU:
3508 yyerror(
3509 "Invalid hexadecimal number in \\N{U+...}"
3510 );
3511 s = e + 1;
3512 continue;
3513 }
3514 while (++s < e) {
3515 if (isXDIGIT(*s))
3516 continue;
3517 else if ((*s == '.' || *s == '_')
3518 && isXDIGIT(s[1]))
3519 continue;
3520 goto bad_NU;
4cbd7e22 3521 }
af352bf2 3522
fb2eed93
FC
3523 /* Pass everything through unchanged.
3524 * +1 is for the '}' */
4cbd7e22
FC
3525 Copy(orig_s, d, e - orig_s + 1, char);
3526 d += e - orig_s + 1;
ff3f963a
KW
3527 }
3528 else { /* Not a pattern: convert the hex to string */
fb2eed93
FC
3529 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3530 | PERL_SCAN_SILENT_ILLDIGIT
3531 | PERL_SCAN_DISALLOW_PREFIX;
3532 STRLEN len = e - s;
3533 uv = grok_hex(s, &len, &flags, NULL);
3534 if (len == 0 || (len != (STRLEN)(e - s)))
3535 goto bad_NU;
ff3f963a 3536
f4240379
KW
3537 /* For non-tr///, if the destination is not in utf8,
3538 * unconditionally recode it to be so. This is
3539 * because \N{} implies Unicode semantics, and scalars
3540 * have to be in utf8 to guarantee those semantics.
3541 * tr/// doesn't care about Unicode rules, so no need
3542 * there to upgrade to UTF-8 for small enough code
3543 * points */
3544 if (! has_utf8 && ( uv > 0xFF
3545 || PL_lex_inwhat != OP_TRANS))
3546 {
ff3f963a
KW
3547 SvCUR_set(sv, d - SvPVX_const(sv));
3548 SvPOK_on(sv);
3549 *d = '\0';
3550 /* See Note on sizing above. */
3551 sv_utf8_upgrade_flags_grow(
5f560d8a
KW
3552 sv,
3553 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
b2873c15 3554 OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
ff3f963a
KW
3555 d = SvPVX(sv) + SvCUR(sv);
3556 has_utf8 = TRUE;
3557 }
3558
a46469e6 3559 /* Add the (Unicode) code point to the output. */
3a34ca0b 3560 if (! has_utf8 || OFFUNI_IS_INVARIANT(uv)) {
a46469e6 3561 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3562 }
a46469e6
KW
3563 else {
3564 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3565 }
ff3f963a
KW
3566 }
3567 }
6f613c73
KW
3568 else /* Here is \N{NAME} but not \N{U+...}. */
3569 if ((res = get_and_check_backslash_N_name(s, e)))
3570 {
3571 STRLEN len;
3572 const char *str = SvPV_const(res, len);
3573 if (PL_lex_inpat) {
ff3f963a
KW
3574
3575 if (! len) { /* The name resolved to an empty string */
3576 Copy("\\N{}", d, 4, char);
3577 d += 4;
3578 }
3579 else {
3580 /* In order to not lose information for the regex
3581 * compiler, pass the result in the specially made
3582 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3583 * the code points in hex of each character
3584 * returned by charnames */
3585
3586 const char *str_end = str + len;
3b721c4f 3587 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3588
3589 if (! SvUTF8(res)) {
3590 /* For the non-UTF-8 case, we can determine the
3591 * exact length needed without having to parse
3592 * through the string. Each character takes