This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove pod/perlembed.pod in wake of RT #120093 corrections.
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
69#define PL_preambled (PL_parser->preambled)
70#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 71#define PL_linestr (PL_parser->linestr)
c2598295
DM
72#define PL_expect (PL_parser->expect)
73#define PL_copline (PL_parser->copline)
f06b5848
DM
74#define PL_bufptr (PL_parser->bufptr)
75#define PL_oldbufptr (PL_parser->oldbufptr)
76#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
77#define PL_linestart (PL_parser->linestart)
78#define PL_bufend (PL_parser->bufend)
79#define PL_last_uni (PL_parser->last_uni)
80#define PL_last_lop (PL_parser->last_lop)
81#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 82#define PL_lex_state (PL_parser->lex_state)
2f9285f8 83#define PL_rsfp (PL_parser->rsfp)
5486870f 84#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
85#define PL_in_my (PL_parser->in_my)
86#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 87#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 88#define PL_multi_end (PL_parser->multi_end)
13765c85 89#define PL_error_count (PL_parser->error_count)
199e78b7
DM
90
91#ifdef PERL_MAD
92# define PL_endwhite (PL_parser->endwhite)
93# define PL_faketokens (PL_parser->faketokens)
94# define PL_lasttoke (PL_parser->lasttoke)
95# define PL_nextwhite (PL_parser->nextwhite)
96# define PL_realtokenstart (PL_parser->realtokenstart)
97# define PL_skipwhite (PL_parser->skipwhite)
98# define PL_thisclose (PL_parser->thisclose)
99# define PL_thismad (PL_parser->thismad)
100# define PL_thisopen (PL_parser->thisopen)
101# define PL_thisstuff (PL_parser->thisstuff)
102# define PL_thistoken (PL_parser->thistoken)
103# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
104# define PL_thiswhite (PL_parser->thiswhite)
105# define PL_nexttoke (PL_parser->nexttoke)
106# define PL_curforce (PL_parser->curforce)
107#else
108# define PL_nexttoke (PL_parser->nexttoke)
109# define PL_nexttype (PL_parser->nexttype)
110# define PL_nextval (PL_parser->nextval)
199e78b7
DM
111#endif
112
a1894d81 113static const char* const ident_too_long = "Identifier too long";
8903cb82 114
29595ff2 115#ifdef PERL_MAD
29595ff2 116# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 117# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 118#else
5db06880 119# define CURMAD(slot,sv)
9ded7720 120# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
121#endif
122
a7aaec61
Z
123#define XENUMMASK 0x3f
124#define XFAKEEOF 0x40
125#define XFAKEBRACK 0x80
9059aa12 126
39e02b42
JH
127#ifdef USE_UTF8_SCRIPTS
128# define UTF (!IN_BYTES)
2b9d42f0 129#else
802a15e9 130# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 131#endif
a0ed51b3 132
b1fc3636
CJ
133/* The maximum number of characters preceding the unrecognized one to display */
134#define UNRECOGNIZED_PRECEDE_COUNT 10
135
61f0cdd9 136/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
137 * 1999-02-27 mjd-perl-patch@plover.com */
138#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
139
14bd96d0 140#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 141
ffb4593c
NT
142/* LEX_* are values for PL_lex_state, the state of the lexer.
143 * They are arranged oddly so that the guard on the switch statement
79072805 144 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
145 *
146 * These values refer to the various states within a sublex parse,
147 * i.e. within a double quotish string
79072805
LW
148 */
149
fb73857a 150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805 185#include "keywords.h"
fe14fcc3 186
ffb4593c
NT
187/* CLINE is a macro that ensures PL_copline has a sane value */
188
57843af0 189#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 190
5db06880 191#ifdef PERL_MAD
29595ff2
NC
192# define SKIPSPACE0(s) skipspace0(s)
193# define SKIPSPACE1(s) skipspace1(s)
194# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
195# define PEEKSPACE(s) skipspace2(s,0)
196#else
197# define SKIPSPACE0(s) skipspace(s)
198# define SKIPSPACE1(s) skipspace(s)
199# define SKIPSPACE2(s,tsv) skipspace(s)
200# define PEEKSPACE(s) skipspace(s)
201#endif
202
ffb4593c
NT
203/*
204 * Convenience functions to return different tokens and prime the
9cbb5ea2 205 * lexer for the next token. They all take an argument.
ffb4593c
NT
206 *
207 * TOKEN : generic token (used for '(', DOLSHARP, etc)
208 * OPERATOR : generic operator
209 * AOPERATOR : assignment operator
210 * PREBLOCK : beginning the block after an if, while, foreach, ...
211 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
212 * PREREF : *EXPR where EXPR is not a simple identifier
213 * TERM : expression term
214 * LOOPX : loop exiting command (goto, last, dump, etc)
215 * FTST : file test operator
216 * FUN0 : zero-argument function
7eb971ee 217 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 218 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
219 * BOop : bitwise or or xor
220 * BAop : bitwise and
221 * SHop : shift operator
222 * PWop : power operator
9cbb5ea2 223 * PMop : pattern-matching operator
ffb4593c
NT
224 * Aop : addition-level operator
225 * Mop : multiplication-level operator
226 * Eop : equality-testing operator
e5edeb50 227 * Rop : relational operator <= != gt
ffb4593c
NT
228 *
229 * Also see LOP and lop() below.
230 */
231
998054bd 232#ifdef DEBUGGING /* Serve -DT. */
704d4215 233# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 234#else
bbf60fe6 235# define REPORT(retval) (retval)
998054bd
SC
236#endif
237
bbf60fe6
DM
238#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
239#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
240#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
241#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
242#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
243#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
244#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
245#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
246#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
247#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 248#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
249#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
250#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
251#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
252#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
253#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
254#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
255#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
256#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
257#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
258#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 259
a687059c
LW
260/* This bit of chicanery makes a unary function followed by
261 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
262 * The UNIDOR macro is for unary functions that can be followed by the //
263 * operator (such as C<shift // 0>).
a687059c 264 */
d68ce4ac 265#define UNI3(f,x,have_x) { \
6154021b 266 pl_yylval.ival = f; \
d68ce4ac 267 if (have_x) PL_expect = x; \
376fcdbf
AL
268 PL_bufptr = s; \
269 PL_last_uni = PL_oldbufptr; \
270 PL_last_lop_op = f; \
271 if (*s == '(') \
272 return REPORT( (int)FUNC1 ); \
29595ff2 273 s = PEEKSPACE(s); \
376fcdbf
AL
274 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
275 }
d68ce4ac
FC
276#define UNI(f) UNI3(f,XTERM,1)
277#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
278#define UNIPROTO(f,optional) { \
279 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
280 OPERATOR(f); \
281 }
a687059c 282
d68ce4ac 283#define UNIBRACK(f) UNI3(f,0,0)
79072805 284
9f68db38 285/* grandfather return to old style */
78cdf107
Z
286#define OLDLOP(f) \
287 do { \
288 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
289 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
290 pl_yylval.ival = (f); \
291 PL_expect = XTERM; \
292 PL_bufptr = s; \
293 return (int)LSTOP; \
294 } while(0)
79072805 295
83944c01
FC
296#define COPLINE_INC_WITH_HERELINES \
297 STMT_START { \
298 CopLINE_inc(PL_curcop); \
851b527a
FC
299 if (PL_parser->herelines) \
300 CopLINE(PL_curcop) += PL_parser->herelines, \
301 PL_parser->herelines = 0; \
83944c01 302 } STMT_END
ffdb8b16
FC
303/* Called after scan_str to update CopLINE(PL_curcop), but only when there
304 * is no sublex_push to follow. */
305#define COPLINE_SET_FROM_MULTI_END \
306 STMT_START { \
307 CopLINE_set(PL_curcop, PL_multi_end); \
308 if (PL_multi_end != PL_multi_start) \
851b527a 309 PL_parser->herelines = 0; \
ffdb8b16 310 } STMT_END
83944c01
FC
311
312
8fa7f367
JH
313#ifdef DEBUGGING
314
6154021b 315/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
316enum token_type {
317 TOKENTYPE_NONE,
318 TOKENTYPE_IVAL,
6154021b 319 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 320 TOKENTYPE_PVAL,
aeaef349 321 TOKENTYPE_OPVAL
bbf60fe6
DM
322};
323
6d4a66ac
NC
324static struct debug_tokens {
325 const int token;
326 enum token_type type;
327 const char *name;
328} const debug_tokens[] =
9041c2e3 329{
bbf60fe6
DM
330 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
331 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
332 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
333 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
334 { ARROW, TOKENTYPE_NONE, "ARROW" },
335 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
336 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
337 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
338 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
339 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 340 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
341 { DO, TOKENTYPE_NONE, "DO" },
342 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
343 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
344 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
345 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
346 { ELSE, TOKENTYPE_NONE, "ELSE" },
347 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
348 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
349 { FOR, TOKENTYPE_IVAL, "FOR" },
350 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
351 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
352 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
353 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
354 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 355 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
356 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
357 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
358 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 359 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
360 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
361 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 362 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
363 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
364 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
365 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
366 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
367 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
368 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
369 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
370 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
371 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
372 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
373 { OROP, TOKENTYPE_IVAL, "OROP" },
374 { OROR, TOKENTYPE_NONE, "OROR" },
375 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 376 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
377 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
378 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
379 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
380 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
381 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
382 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
383 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
384 { PREINC, TOKENTYPE_NONE, "PREINC" },
385 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 386 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
387 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
388 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 389 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
390 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
391 { SUB, TOKENTYPE_NONE, "SUB" },
392 { THING, TOKENTYPE_OPVAL, "THING" },
393 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
394 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
395 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
396 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
397 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
398 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 399 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
400 { WHILE, TOKENTYPE_IVAL, "WHILE" },
401 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 402 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 403 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
404};
405
6154021b 406/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 407
bbf60fe6 408STATIC int
704d4215 409S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 410{
97aff369 411 dVAR;
7918f24d
NC
412
413 PERL_ARGS_ASSERT_TOKEREPORT;
414
bbf60fe6 415 if (DEBUG_T_TEST) {
bd61b366 416 const char *name = NULL;
bbf60fe6 417 enum token_type type = TOKENTYPE_NONE;
f54cb97a 418 const struct debug_tokens *p;
396482e1 419 SV* const report = newSVpvs("<== ");
bbf60fe6 420
f54cb97a 421 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
422 if (p->token == (int)rv) {
423 name = p->name;
424 type = p->type;
425 break;
426 }
427 }
428 if (name)
54667de8 429 Perl_sv_catpv(aTHX_ report, name);
74736ae6 430 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 431 {
bbf60fe6 432 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
433 if ((char)rv == 'p')
434 sv_catpvs(report, " (pending identifier)");
435 }
bbf60fe6 436 else if (!rv)
396482e1 437 sv_catpvs(report, "EOF");
bbf60fe6
DM
438 else
439 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
440 switch (type) {
441 case TOKENTYPE_NONE:
bbf60fe6
DM
442 break;
443 case TOKENTYPE_IVAL:
704d4215 444 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
445 break;
446 case TOKENTYPE_OPNUM:
447 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 448 PL_op_name[lvalp->ival]);
bbf60fe6
DM
449 break;
450 case TOKENTYPE_PVAL:
704d4215 451 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
452 break;
453 case TOKENTYPE_OPVAL:
704d4215 454 if (lvalp->opval) {
401441c0 455 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
456 PL_op_name[lvalp->opval->op_type]);
457 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 458 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 459 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
460 }
461
462 }
401441c0 463 else
396482e1 464 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
465 break;
466 }
b6007c36 467 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
468 };
469 return (int)rv;
998054bd
SC
470}
471
b6007c36
DM
472
473/* print the buffer with suitable escapes */
474
475STATIC void
15f169a1 476S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 477{
396482e1 478 SV* const tmp = newSVpvs("");
7918f24d
NC
479
480 PERL_ARGS_ASSERT_PRINTBUF;
481
b6007c36
DM
482 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
483 SvREFCNT_dec(tmp);
484}
485
8fa7f367
JH
486#endif
487
8290c323
NC
488static int
489S_deprecate_commaless_var_list(pTHX) {
490 PL_expect = XTERM;
491 deprecate("comma-less variable list");
492 return REPORT(','); /* grandfather non-comma-format format */
493}
494
ffb4593c
NT
495/*
496 * S_ao
497 *
c963b151
BD
498 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
499 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
500 */
501
76e3520e 502STATIC int
cea2e8a9 503S_ao(pTHX_ int toketype)
a0d0e21e 504{
97aff369 505 dVAR;
3280af22
NIS
506 if (*PL_bufptr == '=') {
507 PL_bufptr++;
a0d0e21e 508 if (toketype == ANDAND)
6154021b 509 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 510 else if (toketype == OROR)
6154021b 511 pl_yylval.ival = OP_ORASSIGN;
c963b151 512 else if (toketype == DORDOR)
6154021b 513 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
514 toketype = ASSIGNOP;
515 }
516 return toketype;
517}
518
ffb4593c
NT
519/*
520 * S_no_op
521 * When Perl expects an operator and finds something else, no_op
522 * prints the warning. It always prints "<something> found where
523 * operator expected. It prints "Missing semicolon on previous line?"
524 * if the surprise occurs at the start of the line. "do you need to
525 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
526 * where the compiler doesn't know if foo is a method call or a function.
527 * It prints "Missing operator before end of line" if there's nothing
528 * after the missing operator, or "... before <...>" if there is something
529 * after the missing operator.
530 */
531
76e3520e 532STATIC void
15f169a1 533S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 534{
97aff369 535 dVAR;
9d4ba2ae
AL
536 char * const oldbp = PL_bufptr;
537 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 538
7918f24d
NC
539 PERL_ARGS_ASSERT_NO_OP;
540
1189a94a
GS
541 if (!s)
542 s = oldbp;
07c798fb 543 else
1189a94a 544 PL_bufptr = s;
734ab321 545 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
546 if (ckWARN_d(WARN_SYNTAX)) {
547 if (is_first)
548 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
549 "\t(Missing semicolon on previous line?)\n");
550 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 551 const char *t;
8a2bca12 552 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 553 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 554 NOOP;
56da5a46
RGS
555 if (t < PL_bufptr && isSPACE(*t))
556 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
557 "\t(Do you need to predeclare %"UTF8f"?)\n",
558 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
559 }
560 else {
561 assert(s >= oldbp);
562 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
563 "\t(Missing operator before %"UTF8f"?)\n",
564 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 565 }
07c798fb 566 }
3280af22 567 PL_bufptr = oldbp;
8990e307
LW
568}
569
ffb4593c
NT
570/*
571 * S_missingterm
572 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 573 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
574 * If we're in a delimited string and the delimiter is a control
575 * character, it's reformatted into a two-char sequence like ^C.
576 * This is fatal.
577 */
578
76e3520e 579STATIC void
cea2e8a9 580S_missingterm(pTHX_ char *s)
8990e307 581{
97aff369 582 dVAR;
8990e307
LW
583 char tmpbuf[3];
584 char q;
585 if (s) {
9d4ba2ae 586 char * const nl = strrchr(s,'\n');
d2719217 587 if (nl)
8990e307
LW
588 *nl = '\0';
589 }
ca8b19a7 590 else if ((U8) PL_multi_close < 32) {
8990e307 591 *tmpbuf = '^';
585ec06d 592 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
593 tmpbuf[2] = '\0';
594 s = tmpbuf;
595 }
596 else {
eb160463 597 *tmpbuf = (char)PL_multi_close;
8990e307
LW
598 tmpbuf[1] = '\0';
599 s = tmpbuf;
600 }
601 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 602 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 603}
79072805 604
dd0ac2b9
FC
605#include "feature.h"
606
0d863452 607/*
0d863452
RH
608 * Check whether the named feature is enabled.
609 */
26ea9e12 610bool
3fff3427 611Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 612{
97aff369 613 dVAR;
4a731d7b 614 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
615
616 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
617
618 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 619
26ea9e12
NC
620 if (namelen > MAX_FEATURE_LEN)
621 return FALSE;
3fff3427 622 memcpy(&he_name[8], name, namelen);
7d69d4a6 623
c8ca97b0
NC
624 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
625 REFCOUNTED_HE_EXISTS));
0d863452
RH
626}
627
ffb4593c 628/*
9cbb5ea2
GS
629 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
630 * utf16-to-utf8-reversed.
ffb4593c
NT
631 */
632
c39cd008
GS
633#ifdef PERL_CR_FILTER
634static void
635strip_return(SV *sv)
636{
eb578fdb
KW
637 const char *s = SvPVX_const(sv);
638 const char * const e = s + SvCUR(sv);
7918f24d
NC
639
640 PERL_ARGS_ASSERT_STRIP_RETURN;
641
c39cd008
GS
642 /* outer loop optimized to do nothing if there are no CR-LFs */
643 while (s < e) {
644 if (*s++ == '\r' && *s == '\n') {
645 /* hit a CR-LF, need to copy the rest */
eb578fdb 646 char *d = s - 1;
c39cd008
GS
647 *d++ = *s++;
648 while (s < e) {
649 if (*s == '\r' && s[1] == '\n')
650 s++;
651 *d++ = *s++;
652 }
653 SvCUR(sv) -= s - d;
654 return;
655 }
656 }
657}
a868473f 658
76e3520e 659STATIC I32
c39cd008 660S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 661{
f54cb97a 662 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
663 if (count > 0 && !maxlen)
664 strip_return(sv);
665 return count;
a868473f
NIS
666}
667#endif
668
ffb4593c 669/*
8eaa0acf
Z
670=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
671
672Creates and initialises a new lexer/parser state object, supplying
673a context in which to lex and parse from a new source of Perl code.
674A pointer to the new state object is placed in L</PL_parser>. An entry
675is made on the save stack so that upon unwinding the new state object
676will be destroyed and the former value of L</PL_parser> will be restored.
677Nothing else need be done to clean up the parsing context.
678
679The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
680non-null, provides a string (in SV form) containing code to be parsed.
681A copy of the string is made, so subsequent modification of I<line>
682does not affect parsing. I<rsfp>, if non-null, provides an input stream
683from which code will be read to be parsed. If both are non-null, the
684code in I<line> comes first and must consist of complete lines of input,
685and I<rsfp> supplies the remainder of the source.
686
e368b3bd
FC
687The I<flags> parameter is reserved for future use. Currently it is only
688used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
689
690=cut
691*/
ffb4593c 692
27fcb6ee 693/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
694 can share filters with the current parser.
695 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
696 caller, hence isn't owned by the parser, so shouldn't be closed on parser
697 destruction. This is used to handle the case of defaulting to reading the
698 script from the standard input because no filename was given on the command
699 line (without getting confused by situation where STDIN has been closed, so
700 the script handle is opened on fd 0) */
27fcb6ee 701
a0d0e21e 702void
8eaa0acf 703Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 704{
97aff369 705 dVAR;
6ef55633 706 const char *s = NULL;
5486870f 707 yy_parser *parser, *oparser;
60d63348 708 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 709 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
710
711 /* create and initialise a parser */
712
199e78b7 713 Newxz(parser, 1, yy_parser);
5486870f 714 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
715 PL_parser = parser;
716
28ac2b49
Z
717 parser->stack = NULL;
718 parser->ps = NULL;
719 parser->stack_size = 0;
acdf0a21 720
e3abe207
DM
721 /* on scope exit, free this parser and restore any outer one */
722 SAVEPARSER(parser);
7c4baf47 723 parser->saved_curcop = PL_curcop;
e3abe207 724
acdf0a21 725 /* initialise lexer state */
8990e307 726
fb205e7a
DM
727#ifdef PERL_MAD
728 parser->curforce = -1;
729#else
730 parser->nexttoke = 0;
731#endif
ca4cfd28 732 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 733 parser->copline = parser->preambling = NOLINE;
5afb0a62 734 parser->lex_state = LEX_NORMAL;
c2598295 735 parser->expect = XSTATE;
2f9285f8 736 parser->rsfp = rsfp;
27fcb6ee
FC
737 parser->rsfp_filters =
738 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
739 ? NULL
740 : MUTABLE_AV(SvREFCNT_inc(
741 oparser->rsfp_filters
742 ? oparser->rsfp_filters
743 : (oparser->rsfp_filters = newAV())
744 ));
2f9285f8 745
199e78b7
DM
746 Newx(parser->lex_brackstack, 120, char);
747 Newx(parser->lex_casestack, 12, char);
748 *parser->lex_casestack = '\0';
d794b522 749 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 750
10efb74f 751 if (line) {
0528fd32 752 STRLEN len;
10efb74f 753 s = SvPV_const(line, len);
0abcdfa4
FC
754 parser->linestr = flags & LEX_START_COPIED
755 ? SvREFCNT_inc_simple_NN(line)
756 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 757 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 758 } else {
bf1b738b 759 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 760 }
f06b5848
DM
761 parser->oldoldbufptr =
762 parser->oldbufptr =
763 parser->bufptr =
764 parser->linestart = SvPVX(parser->linestr);
765 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
766 parser->last_lop = parser->last_uni = NULL;
87606032
NC
767 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
768 |LEX_DONT_CLOSE_RSFP);
737c24fc 769
60d63348 770 parser->in_pod = parser->filtered = 0;
79072805 771}
a687059c 772
e3abe207
DM
773
774/* delete a parser object */
775
776void
777Perl_parser_free(pTHX_ const yy_parser *parser)
778{
7918f24d
NC
779 PERL_ARGS_ASSERT_PARSER_FREE;
780
7c4baf47 781 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
782 SvREFCNT_dec(parser->linestr);
783
87606032 784 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 785 PerlIO_clearerr(parser->rsfp);
799361c3
SH
786 else if (parser->rsfp && (!parser->old_parser ||
787 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 788 PerlIO_close(parser->rsfp);
5486870f 789 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
790 SvREFCNT_dec(parser->lex_stuff);
791 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
792
793 Safefree(parser->lex_brackstack);
794 Safefree(parser->lex_casestack);
795 Safefree(parser->lex_shared);
796 PL_parser = parser->old_parser;
797 Safefree(parser);
798}
799
800void
801Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
802{
803#ifdef PERL_MAD
804 I32 nexttoke = parser->lasttoke;
805#else
806 I32 nexttoke = parser->nexttoke;
807#endif
808 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9
FC
809 while (nexttoke--) {
810#ifdef PERL_MAD
811 if (S_is_opval_token(parser->nexttoke[nexttoke].next_type
3ac7ff8f
FC
812 & 0xffff)
813 && parser->nexttoke[nexttoke].next_val.opval
814 && parser->nexttoke[nexttoke].next_val.opval->op_slabbed
815 && OpSLAB(parser->nexttoke[nexttoke].next_val.opval) == slab) {
816 op_free(parser->nexttoke[nexttoke].next_val.opval);
817 parser->nexttoke[nexttoke].next_val.opval = NULL;
818 }
3ce3dcd9 819#else
3ac7ff8f
FC
820 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
821 && parser->nextval[nexttoke].opval
822 && parser->nextval[nexttoke].opval->op_slabbed
823 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 824 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
825 parser->nextval[nexttoke].opval = NULL;
826 }
3ce3dcd9
FC
827#endif
828 }
e3abe207
DM
829}
830
831
ffb4593c 832/*
f0e67a1d
Z
833=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
834
835Buffer scalar containing the chunk currently under consideration of the
836text currently being lexed. This is always a plain string scalar (for
837which C<SvPOK> is true). It is not intended to be used as a scalar by
838normal scalar means; instead refer to the buffer directly by the pointer
839variables described below.
840
841The lexer maintains various C<char*> pointers to things in the
842C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
843reallocated, all of these pointers must be updated. Don't attempt to
844do this manually, but rather use L</lex_grow_linestr> if you need to
845reallocate the buffer.
846
847The content of the text chunk in the buffer is commonly exactly one
848complete line of input, up to and including a newline terminator,
849but there are situations where it is otherwise. The octets of the
850buffer may be intended to be interpreted as either UTF-8 or Latin-1.
851The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
852flag on this scalar, which may disagree with it.
853
854For direct examination of the buffer, the variable
855L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
856lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
857of these pointers is usually preferable to examination of the scalar
858through normal scalar means.
859
860=for apidoc AmxU|char *|PL_parser-E<gt>bufend
861
862Direct pointer to the end of the chunk of text currently being lexed, the
863end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
864+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
865always located at the end of the buffer, and does not count as part of
866the buffer's contents.
867
868=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
869
870Points to the current position of lexing inside the lexer buffer.
871Characters around this point may be freely examined, within
872the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
873L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
874interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
875
876Lexing code (whether in the Perl core or not) moves this pointer past
877the characters that it consumes. It is also expected to perform some
878bookkeeping whenever a newline character is consumed. This movement
879can be more conveniently performed by the function L</lex_read_to>,
880which handles newlines appropriately.
881
882Interpretation of the buffer's octets can be abstracted out by
883using the slightly higher-level functions L</lex_peek_unichar> and
884L</lex_read_unichar>.
885
886=for apidoc AmxU|char *|PL_parser-E<gt>linestart
887
888Points to the start of the current line inside the lexer buffer.
889This is useful for indicating at which column an error occurred, and
890not much else. This must be updated by any lexing code that consumes
891a newline; the function L</lex_read_to> handles this detail.
892
893=cut
894*/
895
896/*
897=for apidoc Amx|bool|lex_bufutf8
898
899Indicates whether the octets in the lexer buffer
900(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
901of Unicode characters. If not, they should be interpreted as Latin-1
902characters. This is analogous to the C<SvUTF8> flag for scalars.
903
904In UTF-8 mode, it is not guaranteed that the lexer buffer actually
905contains valid UTF-8. Lexing code must be robust in the face of invalid
906encoding.
907
908The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
909is significant, but not the whole story regarding the input character
910encoding. Normally, when a file is being read, the scalar contains octets
911and its C<SvUTF8> flag is off, but the octets should be interpreted as
912UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
913however, the scalar may have the C<SvUTF8> flag on, and in this case its
914octets should be interpreted as UTF-8 unless the C<use bytes> pragma
915is in effect. This logic may change in the future; use this function
916instead of implementing the logic yourself.
917
918=cut
919*/
920
921bool
922Perl_lex_bufutf8(pTHX)
923{
924 return UTF;
925}
926
927/*
928=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
929
930Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
931at least I<len> octets (including terminating NUL). Returns a
932pointer to the reallocated buffer. This is necessary before making
933any direct modification of the buffer that would increase its length.
934L</lex_stuff_pvn> provides a more convenient way to insert text into
935the buffer.
936
937Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
938this function updates all of the lexer's variables that point directly
939into the buffer.
940
941=cut
942*/
943
944char *
945Perl_lex_grow_linestr(pTHX_ STRLEN len)
946{
947 SV *linestr;
948 char *buf;
949 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 950 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
951 linestr = PL_parser->linestr;
952 buf = SvPVX(linestr);
953 if (len <= SvLEN(linestr))
954 return buf;
955 bufend_pos = PL_parser->bufend - buf;
956 bufptr_pos = PL_parser->bufptr - buf;
957 oldbufptr_pos = PL_parser->oldbufptr - buf;
958 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
959 linestart_pos = PL_parser->linestart - buf;
960 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
961 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
962 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
963 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 964
f0e67a1d 965 buf = sv_grow(linestr, len);
c7641931 966
f0e67a1d
Z
967 PL_parser->bufend = buf + bufend_pos;
968 PL_parser->bufptr = buf + bufptr_pos;
969 PL_parser->oldbufptr = buf + oldbufptr_pos;
970 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
971 PL_parser->linestart = buf + linestart_pos;
972 if (PL_parser->last_uni)
973 PL_parser->last_uni = buf + last_uni_pos;
974 if (PL_parser->last_lop)
975 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
976 if (PL_parser->lex_shared->re_eval_start)
977 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
978 return buf;
979}
980
981/*
83aa740e 982=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
983
984Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
985immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
986reallocating the buffer if necessary. This means that lexing code that
987runs later will see the characters as if they had appeared in the input.
988It is not recommended to do this as part of normal parsing, and most
989uses of this facility run the risk of the inserted characters being
990interpreted in an unintended manner.
991
992The string to be inserted is represented by I<len> octets starting
993at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
994according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
995The characters are recoded for the lexer buffer, according to how the
996buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 997to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
998function is more convenient.
999
1000=cut
1001*/
1002
1003void
83aa740e 1004Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 1005{
749123ff 1006 dVAR;
f0e67a1d
Z
1007 char *bufptr;
1008 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1009 if (flags & ~(LEX_STUFF_UTF8))
1010 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1011 if (UTF) {
1012 if (flags & LEX_STUFF_UTF8) {
1013 goto plain_copy;
1014 } else {
54d004e8 1015 STRLEN highhalf = 0; /* Count of variants */
83aa740e 1016 const char *p, *e = pv+len;
54d004e8
KW
1017 for (p = pv; p != e; p++) {
1018 if (! UTF8_IS_INVARIANT(*p)) {
1019 highhalf++;
1020 }
1021 }
f0e67a1d
Z
1022 if (!highhalf)
1023 goto plain_copy;
1024 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1025 bufptr = PL_parser->bufptr;
1026 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1027 SvCUR_set(PL_parser->linestr,
1028 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1029 PL_parser->bufend += len+highhalf;
1030 for (p = pv; p != e; p++) {
1031 U8 c = (U8)*p;
54d004e8
KW
1032 if (! UTF8_IS_INVARIANT(c)) {
1033 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1034 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
1035 } else {
1036 *bufptr++ = (char)c;
1037 }
1038 }
1039 }
1040 } else {
1041 if (flags & LEX_STUFF_UTF8) {
1042 STRLEN highhalf = 0;
83aa740e 1043 const char *p, *e = pv+len;
f0e67a1d
Z
1044 for (p = pv; p != e; p++) {
1045 U8 c = (U8)*p;
54d004e8 1046 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1047 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1048 "non-Latin-1 character into Latin-1 input");
54d004e8 1049 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1050 p++;
1051 highhalf++;
54d004e8 1052 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1053 /* malformed UTF-8 */
1054 ENTER;
1055 SAVESPTR(PL_warnhook);
1056 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1057 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1058 LEAVE;
1059 }
1060 }
1061 if (!highhalf)
1062 goto plain_copy;
1063 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1064 bufptr = PL_parser->bufptr;
1065 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1066 SvCUR_set(PL_parser->linestr,
1067 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1068 PL_parser->bufend += len-highhalf;
54d004e8
KW
1069 p = pv;
1070 while (p < e) {
1071 if (UTF8_IS_INVARIANT(*p)) {
1072 *bufptr++ = *p;
1073 p++;
f0e67a1d 1074 }
54d004e8
KW
1075 else {
1076 assert(p < e -1 );
94bb8c36 1077 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1078 p += 2;
1079 }
f0e67a1d
Z
1080 }
1081 } else {
54d004e8 1082 plain_copy:
f0e67a1d
Z
1083 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1084 bufptr = PL_parser->bufptr;
1085 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1086 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1087 PL_parser->bufend += len;
1088 Copy(pv, bufptr, len, char);
1089 }
1090 }
1091}
1092
1093/*
9dcc53ea
Z
1094=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1095
1096Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1097immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1098reallocating the buffer if necessary. This means that lexing code that
1099runs later will see the characters as if they had appeared in the input.
1100It is not recommended to do this as part of normal parsing, and most
1101uses of this facility run the risk of the inserted characters being
1102interpreted in an unintended manner.
1103
1104The string to be inserted is represented by octets starting at I<pv>
1105and continuing to the first nul. These octets are interpreted as either
1106UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1107in I<flags>. The characters are recoded for the lexer buffer, according
1108to how the buffer is currently being interpreted (L</lex_bufutf8>).
1109If it is not convenient to nul-terminate a string to be inserted, the
1110L</lex_stuff_pvn> function is more appropriate.
1111
1112=cut
1113*/
1114
1115void
1116Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1117{
1118 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1119 lex_stuff_pvn(pv, strlen(pv), flags);
1120}
1121
1122/*
f0e67a1d
Z
1123=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1124
1125Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1126immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1127reallocating the buffer if necessary. This means that lexing code that
1128runs later will see the characters as if they had appeared in the input.
1129It is not recommended to do this as part of normal parsing, and most
1130uses of this facility run the risk of the inserted characters being
1131interpreted in an unintended manner.
1132
1133The string to be inserted is the string value of I<sv>. The characters
1134are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1135being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1136not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1137need to construct a scalar.
1138
1139=cut
1140*/
1141
1142void
1143Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1144{
1145 char *pv;
1146 STRLEN len;
1147 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1148 if (flags)
1149 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1150 pv = SvPV(sv, len);
1151 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1152}
1153
1154/*
1155=for apidoc Amx|void|lex_unstuff|char *ptr
1156
1157Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1158I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1159This hides the discarded text from any lexing code that runs later,
1160as if the text had never appeared.
1161
1162This is not the normal way to consume lexed text. For that, use
1163L</lex_read_to>.
1164
1165=cut
1166*/
1167
1168void
1169Perl_lex_unstuff(pTHX_ char *ptr)
1170{
1171 char *buf, *bufend;
1172 STRLEN unstuff_len;
1173 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1174 buf = PL_parser->bufptr;
1175 if (ptr < buf)
1176 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1177 if (ptr == buf)
1178 return;
1179 bufend = PL_parser->bufend;
1180 if (ptr > bufend)
1181 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1182 unstuff_len = ptr - buf;
1183 Move(ptr, buf, bufend+1-ptr, char);
1184 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1185 PL_parser->bufend = bufend - unstuff_len;
1186}
1187
1188/*
1189=for apidoc Amx|void|lex_read_to|char *ptr
1190
1191Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1192to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1193performing the correct bookkeeping whenever a newline character is passed.
1194This is the normal way to consume lexed text.
1195
1196Interpretation of the buffer's octets can be abstracted out by
1197using the slightly higher-level functions L</lex_peek_unichar> and
1198L</lex_read_unichar>.
1199
1200=cut
1201*/
1202
1203void
1204Perl_lex_read_to(pTHX_ char *ptr)
1205{
1206 char *s;
1207 PERL_ARGS_ASSERT_LEX_READ_TO;
1208 s = PL_parser->bufptr;
1209 if (ptr < s || ptr > PL_parser->bufend)
1210 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1211 for (; s != ptr; s++)
1212 if (*s == '\n') {
83944c01 1213 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1214 PL_parser->linestart = s+1;
1215 }
1216 PL_parser->bufptr = ptr;
1217}
1218
1219/*
1220=for apidoc Amx|void|lex_discard_to|char *ptr
1221
1222Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1223up to I<ptr>. The remaining content of the buffer will be moved, and
1224all pointers into the buffer updated appropriately. I<ptr> must not
1225be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1226it is not permitted to discard text that has yet to be lexed.
1227
1228Normally it is not necessarily to do this directly, because it suffices to
1229use the implicit discarding behaviour of L</lex_next_chunk> and things
1230based on it. However, if a token stretches across multiple lines,
1f317c95 1231and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1232that purpose, then after completion of the token it would be wise to
1233explicitly discard the now-unneeded earlier lines, to avoid future
1234multi-line tokens growing the buffer without bound.
1235
1236=cut
1237*/
1238
1239void
1240Perl_lex_discard_to(pTHX_ char *ptr)
1241{
1242 char *buf;
1243 STRLEN discard_len;
1244 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1245 buf = SvPVX(PL_parser->linestr);
1246 if (ptr < buf)
1247 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1248 if (ptr == buf)
1249 return;
1250 if (ptr > PL_parser->bufptr)
1251 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1252 discard_len = ptr - buf;
1253 if (PL_parser->oldbufptr < ptr)
1254 PL_parser->oldbufptr = ptr;
1255 if (PL_parser->oldoldbufptr < ptr)
1256 PL_parser->oldoldbufptr = ptr;
1257 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1258 PL_parser->last_uni = NULL;
1259 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1260 PL_parser->last_lop = NULL;
1261 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1262 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1263 PL_parser->bufend -= discard_len;
1264 PL_parser->bufptr -= discard_len;
1265 PL_parser->oldbufptr -= discard_len;
1266 PL_parser->oldoldbufptr -= discard_len;
1267 if (PL_parser->last_uni)
1268 PL_parser->last_uni -= discard_len;
1269 if (PL_parser->last_lop)
1270 PL_parser->last_lop -= discard_len;
1271}
1272
1273/*
1274=for apidoc Amx|bool|lex_next_chunk|U32 flags
1275
1276Reads in the next chunk of text to be lexed, appending it to
1277L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1278looked to the end of the current chunk and wants to know more. It is
1279usual, but not necessary, for lexing to have consumed the entirety of
1280the current chunk at this time.
1281
1282If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1283chunk (i.e., the current chunk has been entirely consumed), normally the
1284current chunk will be discarded at the same time that the new chunk is
1285read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1286will not be discarded. If the current chunk has not been entirely
1287consumed, then it will not be discarded regardless of the flag.
1288
1289Returns true if some new text was added to the buffer, or false if the
1290buffer has reached the end of the input text.
1291
1292=cut
1293*/
1294
1295#define LEX_FAKE_EOF 0x80000000
112d1284 1296#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1297
1298bool
1299Perl_lex_next_chunk(pTHX_ U32 flags)
1300{
1301 SV *linestr;
1302 char *buf;
1303 STRLEN old_bufend_pos, new_bufend_pos;
1304 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1305 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1306 bool got_some_for_debugger = 0;
f0e67a1d 1307 bool got_some;
112d1284 1308 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1309 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1310 linestr = PL_parser->linestr;
1311 buf = SvPVX(linestr);
1312 if (!(flags & LEX_KEEP_PREVIOUS) &&
1313 PL_parser->bufptr == PL_parser->bufend) {
1314 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1315 linestart_pos = 0;
1316 if (PL_parser->last_uni != PL_parser->bufend)
1317 PL_parser->last_uni = NULL;
1318 if (PL_parser->last_lop != PL_parser->bufend)
1319 PL_parser->last_lop = NULL;
1320 last_uni_pos = last_lop_pos = 0;
1321 *buf = 0;
1322 SvCUR(linestr) = 0;
1323 } else {
1324 old_bufend_pos = PL_parser->bufend - buf;
1325 bufptr_pos = PL_parser->bufptr - buf;
1326 oldbufptr_pos = PL_parser->oldbufptr - buf;
1327 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1328 linestart_pos = PL_parser->linestart - buf;
1329 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1330 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1331 }
1332 if (flags & LEX_FAKE_EOF) {
1333 goto eof;
60d63348 1334 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1335 got_some = 0;
1336 } else if (filter_gets(linestr, old_bufend_pos)) {
1337 got_some = 1;
17cc9359 1338 got_some_for_debugger = 1;
112d1284
FC
1339 } else if (flags & LEX_NO_TERM) {
1340 got_some = 0;
f0e67a1d 1341 } else {
580561a3
Z
1342 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1343 sv_setpvs(linestr, "");
f0e67a1d
Z
1344 eof:
1345 /* End of real input. Close filehandle (unless it was STDIN),
1346 * then add implicit termination.
1347 */
87606032 1348 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1349 PerlIO_clearerr(PL_parser->rsfp);
1350 else if (PL_parser->rsfp)
1351 (void)PerlIO_close(PL_parser->rsfp);
1352 PL_parser->rsfp = NULL;
60d63348 1353 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1354#ifdef PERL_MAD
1355 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1356 PL_faketokens = 1;
1357#endif
1358 if (!PL_in_eval && PL_minus_p) {
1359 sv_catpvs(linestr,
1360 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1361 PL_minus_n = PL_minus_p = 0;
1362 } else if (!PL_in_eval && PL_minus_n) {
1363 sv_catpvs(linestr, /*{*/";}");
1364 PL_minus_n = 0;
1365 } else
1366 sv_catpvs(linestr, ";");
1367 got_some = 1;
1368 }
1369 buf = SvPVX(linestr);
1370 new_bufend_pos = SvCUR(linestr);
1371 PL_parser->bufend = buf + new_bufend_pos;
1372 PL_parser->bufptr = buf + bufptr_pos;
1373 PL_parser->oldbufptr = buf + oldbufptr_pos;
1374 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1375 PL_parser->linestart = buf + linestart_pos;
1376 if (PL_parser->last_uni)
1377 PL_parser->last_uni = buf + last_uni_pos;
1378 if (PL_parser->last_lop)
1379 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1380 if (PL_parser->preambling != NOLINE) {
1381 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1382 PL_parser->preambling = NOLINE;
1383 }
17cc9359 1384 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1385 PL_curstash != PL_debstash) {
1386 /* debugger active and we're not compiling the debugger code,
1387 * so store the line into the debugger's array of lines
1388 */
1389 update_debugger_info(NULL, buf+old_bufend_pos,
1390 new_bufend_pos-old_bufend_pos);
1391 }
1392 return got_some;
1393}
1394
1395/*
1396=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1397
1398Looks ahead one (Unicode) character in the text currently being lexed.
1399Returns the codepoint (unsigned integer value) of the next character,
1400or -1 if lexing has reached the end of the input text. To consume the
1401peeked character, use L</lex_read_unichar>.
1402
1403If the next character is in (or extends into) the next chunk of input
1404text, the next chunk will be read in. Normally the current chunk will be
1405discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1406then the current chunk will not be discarded.
1407
1408If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1409is encountered, an exception is generated.
1410
1411=cut
1412*/
1413
1414I32
1415Perl_lex_peek_unichar(pTHX_ U32 flags)
1416{
749123ff 1417 dVAR;
f0e67a1d
Z
1418 char *s, *bufend;
1419 if (flags & ~(LEX_KEEP_PREVIOUS))
1420 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1421 s = PL_parser->bufptr;
1422 bufend = PL_parser->bufend;
1423 if (UTF) {
1424 U8 head;
1425 I32 unichar;
1426 STRLEN len, retlen;
1427 if (s == bufend) {
1428 if (!lex_next_chunk(flags))
1429 return -1;
1430 s = PL_parser->bufptr;
1431 bufend = PL_parser->bufend;
1432 }
1433 head = (U8)*s;
54d004e8 1434 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1435 return head;
54d004e8
KW
1436 if (UTF8_IS_START(head)) {
1437 len = UTF8SKIP(&head);
f0e67a1d
Z
1438 while ((STRLEN)(bufend-s) < len) {
1439 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1440 break;
1441 s = PL_parser->bufptr;
1442 bufend = PL_parser->bufend;
1443 }
1444 }
c80e42f3 1445 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1446 if (retlen == (STRLEN)-1) {
1447 /* malformed UTF-8 */
1448 ENTER;
1449 SAVESPTR(PL_warnhook);
1450 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1451 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1452 LEAVE;
1453 }
1454 return unichar;
1455 } else {
1456 if (s == bufend) {
1457 if (!lex_next_chunk(flags))
1458 return -1;
1459 s = PL_parser->bufptr;
1460 }
1461 return (U8)*s;
1462 }
1463}
1464
1465/*
1466=for apidoc Amx|I32|lex_read_unichar|U32 flags
1467
1468Reads the next (Unicode) character in the text currently being lexed.
1469Returns the codepoint (unsigned integer value) of the character read,
1470and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1471if lexing has reached the end of the input text. To non-destructively
1472examine the next character, use L</lex_peek_unichar> instead.
1473
1474If the next character is in (or extends into) the next chunk of input
1475text, the next chunk will be read in. Normally the current chunk will be
1476discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1477then the current chunk will not be discarded.
1478
1479If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1480is encountered, an exception is generated.
1481
1482=cut
1483*/
1484
1485I32
1486Perl_lex_read_unichar(pTHX_ U32 flags)
1487{
1488 I32 c;
1489 if (flags & ~(LEX_KEEP_PREVIOUS))
1490 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1491 c = lex_peek_unichar(flags);
1492 if (c != -1) {
1493 if (c == '\n')
83944c01 1494 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1495 if (UTF)
1496 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1497 else
1498 ++(PL_parser->bufptr);
f0e67a1d
Z
1499 }
1500 return c;
1501}
1502
1503/*
1504=for apidoc Amx|void|lex_read_space|U32 flags
1505
1506Reads optional spaces, in Perl style, in the text currently being
1507lexed. The spaces may include ordinary whitespace characters and
1508Perl-style comments. C<#line> directives are processed if encountered.
1509L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1510at a non-space character (or the end of the input text).
1511
1512If spaces extend into the next chunk of input text, the next chunk will
1513be read in. Normally the current chunk will be discarded at the same
1514time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1515chunk will not be discarded.
1516
1517=cut
1518*/
1519
21791330 1520#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1521#define LEX_NO_NEXT_CHUNK 0x80000000
1522
f0e67a1d
Z
1523void
1524Perl_lex_read_space(pTHX_ U32 flags)
1525{
1526 char *s, *bufend;
21791330 1527 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1528 bool need_incline = 0;
21791330 1529 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d
Z
1530 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1531#ifdef PERL_MAD
1532 if (PL_skipwhite) {
1533 sv_free(PL_skipwhite);
1534 PL_skipwhite = NULL;
1535 }
1536 if (PL_madskills)
1537 PL_skipwhite = newSVpvs("");
1538#endif /* PERL_MAD */
1539 s = PL_parser->bufptr;
1540 bufend = PL_parser->bufend;
1541 while (1) {
1542 char c = *s;
1543 if (c == '#') {
1544 do {
1545 c = *++s;
1546 } while (!(c == '\n' || (c == 0 && s == bufend)));
1547 } else if (c == '\n') {
1548 s++;
21791330
FC
1549 if (can_incline) {
1550 PL_parser->linestart = s;
1551 if (s == bufend)
1552 need_incline = 1;
1553 else
1554 incline(s);
1555 }
f0e67a1d
Z
1556 } else if (isSPACE(c)) {
1557 s++;
1558 } else if (c == 0 && s == bufend) {
1559 bool got_more;
65c68e17 1560 line_t l;
f0e67a1d
Z
1561#ifdef PERL_MAD
1562 if (PL_madskills)
1563 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1564#endif /* PERL_MAD */
f0998909
Z
1565 if (flags & LEX_NO_NEXT_CHUNK)
1566 break;
f0e67a1d 1567 PL_parser->bufptr = s;
65c68e17 1568 l = CopLINE(PL_curcop);
851b527a 1569 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1570 got_more = lex_next_chunk(flags);
65c68e17 1571 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1572 s = PL_parser->bufptr;
1573 bufend = PL_parser->bufend;
1574 if (!got_more)
1575 break;
21791330 1576 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1577 incline(s);
1578 need_incline = 0;
1579 }
1580 } else {
1581 break;
1582 }
1583 }
1584#ifdef PERL_MAD
1585 if (PL_madskills)
1586 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1587#endif /* PERL_MAD */
1588 PL_parser->bufptr = s;
1589}
1590
1591/*
fe788d6b
PM
1592
1593=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1594
1595This function performs syntax checking on a prototype, C<proto>.
1596If C<warn> is true, any illegal characters or mismatched brackets
1597will trigger illegalproto warnings, declaring that they were
1598detected in the prototype for C<name>.
1599
1600The return value is C<true> if this is a valid prototype, and
1601C<false> if it is not, regardless of whether C<warn> was C<true> or
1602C<false>.
1603
1604Note that C<NULL> is a valid C<proto> and will always return C<true>.
1605
1606=cut
1607
1608 */
1609
1610bool
1611Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1612{
1613 STRLEN len, origlen;
1614 char *p = proto ? SvPV(proto, len) : NULL;
1615 bool bad_proto = FALSE;
1616 bool in_brackets = FALSE;
1617 bool after_slash = FALSE;
1618 char greedy_proto = ' ';
1619 bool proto_after_greedy_proto = FALSE;
1620 bool must_be_last = FALSE;
1621 bool underscore = FALSE;
f791a21a 1622 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1623
1624 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1625
1626 if (!proto)
1627 return TRUE;
1628
1629 origlen = len;
1630 for (; len--; p++) {
1631 if (!isSPACE(*p)) {
1632 if (must_be_last)
1633 proto_after_greedy_proto = TRUE;
f791a21a
PM
1634 if (underscore) {
1635 if (!strchr(";@%", *p))
1636 bad_proto_after_underscore = TRUE;
1637 underscore = FALSE;
1638 }
fe788d6b
PM
1639 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1640 bad_proto = TRUE;
1641 }
1642 else {
fe788d6b
PM
1643 if (*p == '[')
1644 in_brackets = TRUE;
1645 else if (*p == ']')
1646 in_brackets = FALSE;
1647 else if ((*p == '@' || *p == '%') &&
1648 !after_slash &&
1649 !in_brackets ) {
1650 must_be_last = TRUE;
1651 greedy_proto = *p;
1652 }
1653 else if (*p == '_')
f791a21a 1654 underscore = TRUE;
fe788d6b
PM
1655 }
1656 if (*p == '\\')
1657 after_slash = TRUE;
1658 else
1659 after_slash = FALSE;
1660 }
1661 }
1662
1663 if (warn) {
b54d603d 1664 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1665 p -= origlen;
b54d603d
PM
1666 p = SvUTF8(proto)
1667 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1668 origlen, UNI_DISPLAY_ISPRINT)
1669 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1670
fe788d6b
PM
1671 if (proto_after_greedy_proto)
1672 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1673 "Prototype after '%c' for %"SVf" : %s",
1674 greedy_proto, SVfARG(name), p);
50278ed0
PM
1675 if (in_brackets)
1676 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1677 "Missing ']' in prototype for %"SVf" : %s",
1678 SVfARG(name), p);
b54d603d 1679 if (bad_proto)
fe788d6b 1680 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1681 "Illegal character in prototype for %"SVf" : %s",
1682 SVfARG(name), p);
1683 if (bad_proto_after_underscore)
1684 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1685 "Illegal character after '_' in prototype for %"SVf" : %s",
1686 SVfARG(name), p);
fe788d6b
PM
1687 }
1688
1689 return (! (proto_after_greedy_proto || bad_proto) );
1690}
1691
1692/*
ffb4593c
NT
1693 * S_incline
1694 * This subroutine has nothing to do with tilting, whether at windmills
1695 * or pinball tables. Its name is short for "increment line". It
57843af0 1696 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1697 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1698 * # line 500 "foo.pm"
1699 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1700 */
1701
76e3520e 1702STATIC void
d9095cec 1703S_incline(pTHX_ const char *s)
463ee0b2 1704{
97aff369 1705 dVAR;
d9095cec
NC
1706 const char *t;
1707 const char *n;
1708 const char *e;
8818d409 1709 line_t line_num;
463ee0b2 1710
7918f24d
NC
1711 PERL_ARGS_ASSERT_INCLINE;
1712
83944c01 1713 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1714 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1715 && s+1 == PL_bufend && *s == ';') {
1716 /* fake newline in string eval */
1717 CopLINE_dec(PL_curcop);
1718 return;
1719 }
463ee0b2
LW
1720 if (*s++ != '#')
1721 return;
d4c19fe8
AL
1722 while (SPACE_OR_TAB(*s))
1723 s++;
73659bf1
GS
1724 if (strnEQ(s, "line", 4))
1725 s += 4;
1726 else
1727 return;
084592ab 1728 if (SPACE_OR_TAB(*s))
73659bf1 1729 s++;
4e553d73 1730 else
73659bf1 1731 return;
d4c19fe8
AL
1732 while (SPACE_OR_TAB(*s))
1733 s++;
463ee0b2
LW
1734 if (!isDIGIT(*s))
1735 return;
d4c19fe8 1736
463ee0b2
LW
1737 n = s;
1738 while (isDIGIT(*s))
1739 s++;
07714eb4 1740 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1741 return;
bf4acbe4 1742 while (SPACE_OR_TAB(*s))
463ee0b2 1743 s++;
73659bf1 1744 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1745 s++;
73659bf1
GS
1746 e = t + 1;
1747 }
463ee0b2 1748 else {
c35e046a
AL
1749 t = s;
1750 while (!isSPACE(*t))
1751 t++;
73659bf1 1752 e = t;
463ee0b2 1753 }
bf4acbe4 1754 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1755 e++;
1756 if (*e != '\n' && *e != '\0')
1757 return; /* false alarm */
1758
8818d409
FC
1759 line_num = atoi(n)-1;
1760
f4dd75d9 1761 if (t - s > 0) {
d9095cec 1762 const STRLEN len = t - s;
3df32bda 1763
d36ee5be 1764 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1765 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1766 * to *{"::_<newfilename"} */
44867030
NC
1767 /* However, the long form of evals is only turned on by the
1768 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1769 GV * const cfgv = CopFILEGV(PL_curcop);
1770 if (cfgv) {
38bd7ad8
FC
1771 char smallbuf[128];
1772 STRLEN tmplen2 = len;
44867030 1773 char *tmpbuf2;
449dd039 1774 GV *gv2;
44867030
NC
1775
1776 if (tmplen2 + 2 <= sizeof smallbuf)
1777 tmpbuf2 = smallbuf;
1778 else
1779 Newx(tmpbuf2, tmplen2 + 2, char);
1780
38bd7ad8
FC
1781 tmpbuf2[0] = '_';
1782 tmpbuf2[1] = '<';
44867030
NC
1783
1784 memcpy(tmpbuf2 + 2, s, tmplen2);
1785 tmplen2 += 2;
1786
8a5ee598 1787 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1788 if (!isGV(gv2)) {
8a5ee598 1789 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1790 /* adjust ${"::_<newfilename"} to store the new file name */
1791 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1792 /* The line number may differ. If that is the case,
1793 alias the saved lines that are in the array.
1794 Otherwise alias the whole array. */
1795 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1796 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1797 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1798 }
38bd7ad8
FC
1799 else if (GvAV(cfgv)) {
1800 AV * const av = GvAV(cfgv);
8818d409
FC
1801 const I32 start = CopLINE(PL_curcop)+1;
1802 I32 items = AvFILLp(av) - start;
1803 if (items > 0) {
1804 AV * const av2 = GvAVn(gv2);
1805 SV **svp = AvARRAY(av) + start;
1806 I32 l = (I32)line_num+1;
1807 while (items--)
1808 av_store(av2, l++, SvREFCNT_inc(*svp++));
1809 }
1810 }
e5527e4b 1811 }
44867030
NC
1812
1813 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1814 }
e66cf94c 1815 }
05ec9bb3 1816 CopFILE_free(PL_curcop);
449dd039 1817 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1818 }
8818d409 1819 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1820}
1821
21791330
FC
1822#define skipspace(s) skipspace_flags(s, 0)
1823
29595ff2 1824#ifdef PERL_MAD
cd81e915 1825/* skip space before PL_thistoken */
29595ff2
NC
1826
1827STATIC char *
5aaab254 1828S_skipspace0(pTHX_ char *s)
29595ff2 1829{
7918f24d
NC
1830 PERL_ARGS_ASSERT_SKIPSPACE0;
1831
29595ff2
NC
1832 s = skipspace(s);
1833 if (!PL_madskills)
1834 return s;
cd81e915
NC
1835 if (PL_skipwhite) {
1836 if (!PL_thiswhite)
6b29d1f5 1837 PL_thiswhite = newSVpvs("");
cd81e915
NC
1838 sv_catsv(PL_thiswhite, PL_skipwhite);
1839 sv_free(PL_skipwhite);
1840 PL_skipwhite = 0;
1841 }
1842 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1843 return s;
1844}
1845
cd81e915 1846/* skip space after PL_thistoken */
29595ff2
NC
1847
1848STATIC char *
5aaab254 1849S_skipspace1(pTHX_ char *s)
29595ff2 1850{
d4c19fe8 1851 const char *start = s;
29595ff2
NC
1852 I32 startoff = start - SvPVX(PL_linestr);
1853
7918f24d
NC
1854 PERL_ARGS_ASSERT_SKIPSPACE1;
1855
29595ff2
NC
1856 s = skipspace(s);
1857 if (!PL_madskills)
1858 return s;
1859 start = SvPVX(PL_linestr) + startoff;
cd81e915 1860 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1861 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1862 PL_thistoken = newSVpvn(tstart, start - tstart);
1863 }
1864 PL_realtokenstart = -1;
1865 if (PL_skipwhite) {
1866 if (!PL_nextwhite)
6b29d1f5 1867 PL_nextwhite = newSVpvs("");
cd81e915
NC
1868 sv_catsv(PL_nextwhite, PL_skipwhite);
1869 sv_free(PL_skipwhite);
1870 PL_skipwhite = 0;
29595ff2
NC
1871 }
1872 return s;
1873}
1874
1875STATIC char *
5aaab254 1876S_skipspace2(pTHX_ char *s, SV **svp)
29595ff2 1877{
c35e046a 1878 char *start;
c35e046a
AL
1879 const I32 startoff = s - SvPVX(PL_linestr);
1880
7918f24d
NC
1881 PERL_ARGS_ASSERT_SKIPSPACE2;
1882
29595ff2 1883 s = skipspace(s);
29595ff2
NC
1884 if (!PL_madskills || !svp)
1885 return s;
1886 start = SvPVX(PL_linestr) + startoff;
cd81e915 1887 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1888 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1889 PL_thistoken = newSVpvn(tstart, start - tstart);
1890 PL_realtokenstart = -1;
29595ff2 1891 }
cd81e915 1892 if (PL_skipwhite) {
29595ff2 1893 if (!*svp)
6b29d1f5 1894 *svp = newSVpvs("");
cd81e915
NC
1895 sv_setsv(*svp, PL_skipwhite);
1896 sv_free(PL_skipwhite);
1897 PL_skipwhite = 0;
29595ff2
NC
1898 }
1899
1900 return s;
1901}
1902#endif
1903
80a702cd 1904STATIC void
15f169a1 1905S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1906{
1907 AV *av = CopFILEAVx(PL_curcop);
1908 if (av) {
7f1c3e8c
FC
1909 SV * sv;
1910 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1911 else {
1912 sv = *av_fetch(av, 0, 1);
1913 SvUPGRADE(sv, SVt_PVMG);
1914 }
1915 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1916 if (orig_sv)
7f1c3e8c 1917 sv_catsv(sv, orig_sv);
5fa550fb 1918 else
7f1c3e8c
FC
1919 sv_catpvn(sv, buf, len);
1920 if (!SvIOK(sv)) {
1921 (void)SvIOK_on(sv);
1922 SvIV_set(sv, 0);
1923 }
1924 if (PL_parser->preambling == NOLINE)
1925 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1926 }
1927}
1928
ffb4593c
NT
1929/*
1930 * S_skipspace
1931 * Called to gobble the appropriate amount and type of whitespace.
1932 * Skips comments as well.
1933 */
1934
76e3520e 1935STATIC char *
21791330 1936S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1937{
5db06880 1938#ifdef PERL_MAD
f0e67a1d
Z
1939 char *start = s;
1940#endif /* PERL_MAD */
21791330 1941 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
f0e67a1d 1942#ifdef PERL_MAD
cd81e915
NC
1943 if (PL_skipwhite) {
1944 sv_free(PL_skipwhite);
f0e67a1d 1945 PL_skipwhite = NULL;
5db06880 1946 }
f0e67a1d 1947#endif /* PERL_MAD */
3280af22 1948 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1949 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1950 s++;
f0e67a1d
Z
1951 } else {
1952 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1953 PL_bufptr = s;
21791330 1954 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1955 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1956 LEX_NO_NEXT_CHUNK : 0));
3280af22 1957 s = PL_bufptr;
f0e67a1d
Z
1958 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1959 if (PL_linestart > PL_bufptr)
1960 PL_bufptr = PL_linestart;
1961 return s;
463ee0b2 1962 }
5db06880 1963#ifdef PERL_MAD
f0e67a1d
Z
1964 if (PL_madskills)
1965 PL_skipwhite = newSVpvn(start, s-start);
1966#endif /* PERL_MAD */
5db06880 1967 return s;
a687059c 1968}
378cc40b 1969
ffb4593c
NT
1970/*
1971 * S_check_uni
1972 * Check the unary operators to ensure there's no ambiguity in how they're
1973 * used. An ambiguous piece of code would be:
1974 * rand + 5
1975 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1976 * the +5 is its argument.
1977 */
1978
76e3520e 1979STATIC void
cea2e8a9 1980S_check_uni(pTHX)
ba106d47 1981{
97aff369 1982 dVAR;
d4c19fe8
AL
1983 const char *s;
1984 const char *t;
2f3197b3 1985
3280af22 1986 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1987 return;
3280af22
NIS
1988 while (isSPACE(*PL_last_uni))
1989 PL_last_uni++;
c35e046a 1990 s = PL_last_uni;
8a2bca12 1991 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1992 s++;
3280af22 1993 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1994 return;
6136c704 1995
9b387841
NC
1996 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1997 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1998 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1999}
2000
ffb4593c
NT
2001/*
2002 * LOP : macro to build a list operator. Its behaviour has been replaced
2003 * with a subroutine, S_lop() for which LOP is just another name.
2004 */
2005
a0d0e21e
LW
2006#define LOP(f,x) return lop(f,x,s)
2007
ffb4593c
NT
2008/*
2009 * S_lop
2010 * Build a list operator (or something that might be one). The rules:
2011 * - if we have a next token, then it's a list operator [why?]
2012 * - if the next thing is an opening paren, then it's a function
2013 * - else it's a list operator
2014 */
2015
76e3520e 2016STATIC I32
a0be28da 2017S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 2018{
97aff369 2019 dVAR;
7918f24d
NC
2020
2021 PERL_ARGS_ASSERT_LOP;
2022
6154021b 2023 pl_yylval.ival = f;
35c8bce7 2024 CLINE;
3280af22
NIS
2025 PL_expect = x;
2026 PL_bufptr = s;
2027 PL_last_lop = PL_oldbufptr;
eb160463 2028 PL_last_lop_op = (OPCODE)f;
5db06880
NC
2029#ifdef PERL_MAD
2030 if (PL_lasttoke)
78cdf107 2031 goto lstop;
5db06880 2032#else
3280af22 2033 if (PL_nexttoke)
78cdf107 2034 goto lstop;
5db06880 2035#endif
79072805 2036 if (*s == '(')
bbf60fe6 2037 return REPORT(FUNC);
29595ff2 2038 s = PEEKSPACE(s);
79072805 2039 if (*s == '(')
bbf60fe6 2040 return REPORT(FUNC);
78cdf107
Z
2041 else {
2042 lstop:
2043 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2044 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 2045 return REPORT(LSTOP);
78cdf107 2046 }
79072805
LW
2047}
2048
5db06880
NC
2049#ifdef PERL_MAD
2050 /*
2051 * S_start_force
2052 * Sets up for an eventual force_next(). start_force(0) basically does
2053 * an unshift, while start_force(-1) does a push. yylex removes items
2054 * on the "pop" end.
2055 */
2056
2057STATIC void
2058S_start_force(pTHX_ int where)
2059{
2060 int i;
2061
cd81e915 2062 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 2063 where = PL_lasttoke;
cd81e915
NC
2064 assert(PL_curforce < 0 || PL_curforce == where);
2065 if (PL_curforce != where) {
5db06880
NC
2066 for (i = PL_lasttoke; i > where; --i) {
2067 PL_nexttoke[i] = PL_nexttoke[i-1];
2068 }
2069 PL_lasttoke++;
2070 }
cd81e915 2071 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 2072 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
2073 PL_curforce = where;
2074 if (PL_nextwhite) {
5db06880 2075 if (PL_madskills)
6b29d1f5 2076 curmad('^', newSVpvs(""));
cd81e915 2077 CURMAD('_', PL_nextwhite);
5db06880
NC
2078 }
2079}
2080
2081STATIC void
2082S_curmad(pTHX_ char slot, SV *sv)
2083{
2084 MADPROP **where;
2085
2086 if (!sv)
2087 return;
cd81e915
NC
2088 if (PL_curforce < 0)
2089 where = &PL_thismad;
5db06880 2090 else
cd81e915 2091 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 2092
cd81e915 2093 if (PL_faketokens)
76f68e9b 2094 sv_setpvs(sv, "");
5db06880
NC
2095 else {
2096 if (!IN_BYTES) {
2097 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
2098 SvUTF8_on(sv);
2099 else if (PL_encoding) {
2100 sv_recode_to_utf8(sv, PL_encoding);
2101 }
2102 }
2103 }
2104
2105 /* keep a slot open for the head of the list? */
2106 if (slot != '_' && *where && (*where)->mad_key == '^') {
2107 (*where)->mad_key = slot;
daba3364 2108 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
2109 (*where)->mad_val = (void*)sv;
2110 }
2111 else
2112 addmad(newMADsv(slot, sv), where, 0);
2113}
2114#else
b3f24c00
MHM
2115# define start_force(where) NOOP
2116# define curmad(slot, sv) NOOP
5db06880
NC
2117#endif
2118
ffb4593c
NT
2119/*
2120 * S_force_next
9cbb5ea2 2121 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2122 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2123 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
2124 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
2125 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
2126 */
2127
4e553d73 2128STATIC void
cea2e8a9 2129S_force_next(pTHX_ I32 type)
79072805 2130{
97aff369 2131 dVAR;
704d4215
GG
2132#ifdef DEBUGGING
2133 if (DEBUG_T_TEST) {
2134 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2135 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2136 }
2137#endif
5db06880 2138#ifdef PERL_MAD
cd81e915 2139 if (PL_curforce < 0)
5db06880 2140 start_force(PL_lasttoke);
cd81e915 2141 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2142 if (PL_lex_state != LEX_KNOWNEXT)
2143 PL_lex_defer = PL_lex_state;
2144 PL_lex_state = LEX_KNOWNEXT;
2145 PL_lex_expect = PL_expect;
cd81e915 2146 PL_curforce = -1;
5db06880 2147#else
3280af22
NIS
2148 PL_nexttype[PL_nexttoke] = type;
2149 PL_nexttoke++;
2150 if (PL_lex_state != LEX_KNOWNEXT) {
2151 PL_lex_defer = PL_lex_state;
2152 PL_lex_expect = PL_expect;
2153 PL_lex_state = LEX_KNOWNEXT;
79072805 2154 }
5db06880 2155#endif
79072805
LW
2156}
2157
28ac2b49
Z
2158void
2159Perl_yyunlex(pTHX)
2160{
a7aaec61
Z
2161 int yyc = PL_parser->yychar;
2162 if (yyc != YYEMPTY) {
2163 if (yyc) {
2164 start_force(-1);
2165 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2166 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2167 PL_lex_allbrackets--;
a7aaec61 2168 PL_lex_brackets--;
78cdf107
Z
2169 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2170 } else if (yyc == '('/*)*/) {
2171 PL_lex_allbrackets--;
2172 yyc |= (2<<24);
a7aaec61
Z
2173 }
2174 force_next(yyc);
2175 }
28ac2b49
Z
2176 PL_parser->yychar = YYEMPTY;
2177 }
2178}
2179
d0a148a6 2180STATIC SV *
15f169a1 2181S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2182{
97aff369 2183 dVAR;
740cce10 2184 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2185 !IN_BYTES
2186 && UTF
2187 && !is_ascii_string((const U8*)start, len)
740cce10 2188 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2189 return sv;
2190}
2191
ffb4593c
NT
2192/*
2193 * S_force_word
2194 * When the lexer knows the next thing is a word (for instance, it has
2195 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2196 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2197 * lookahead.
ffb4593c
NT
2198 *
2199 * Arguments:
b1b65b59 2200 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2201 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2202 * int check_keyword : if true, Perl checks to make sure the word isn't
2203 * a keyword (do this if the word is a label, e.g. goto FOO)
2204 * int allow_pack : if true, : characters will also be allowed (require,
2205 * use, etc. do this)
9cbb5ea2 2206 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2207 */
2208
76e3520e 2209STATIC char *
345b3785 2210S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2211{
97aff369 2212 dVAR;
eb578fdb 2213 char *s;
463ee0b2 2214 STRLEN len;
4e553d73 2215
7918f24d
NC
2216 PERL_ARGS_ASSERT_FORCE_WORD;
2217
29595ff2 2218 start = SKIPSPACE1(start);
463ee0b2 2219 s = start;
7e2040f0 2220 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2221 (allow_pack && *s == ':') )
a0d0e21e 2222 {
3280af22 2223 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2224 if (check_keyword) {
2225 char *s2 = PL_tokenbuf;
2226 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2227 s2 += 6, len -= 6;
2228 if (keyword(s2, len, 0))
463ee0b2 2229 return start;
01b5ef50 2230 }
cd81e915 2231 start_force(PL_curforce);
5db06880
NC
2232 if (PL_madskills)
2233 curmad('X', newSVpvn(start,s-start));
463ee0b2 2234 if (token == METHOD) {
29595ff2 2235 s = SKIPSPACE1(s);
463ee0b2 2236 if (*s == '(')
3280af22 2237 PL_expect = XTERM;
463ee0b2 2238 else {
3280af22 2239 PL_expect = XOPERATOR;
463ee0b2 2240 }
79072805 2241 }
e74e6b3d 2242 if (PL_madskills)
63575281 2243 curmad('g', newSVpvs( "forced" ));
9ded7720 2244 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2245 = (OP*)newSVOP(OP_CONST,0,
2246 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2247 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2248 force_next(token);
2249 }
2250 return s;
2251}
2252
ffb4593c
NT
2253/*
2254 * S_force_ident
9cbb5ea2 2255 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2256 * text only contains the "foo" portion. The first argument is a pointer
2257 * to the "foo", and the second argument is the type symbol to prefix.
2258 * Forces the next token to be a "WORD".
9cbb5ea2 2259 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2260 */
2261
76e3520e 2262STATIC void
5aaab254 2263S_force_ident(pTHX_ const char *s, int kind)
79072805 2264{
97aff369 2265 dVAR;
7918f24d
NC
2266
2267 PERL_ARGS_ASSERT_FORCE_IDENT;
2268
c9b48522
DD
2269 if (s[0]) {
2270 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2271 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2272 UTF ? SVf_UTF8 : 0));
cd81e915 2273 start_force(PL_curforce);
9ded7720 2274 NEXTVAL_NEXTTOKE.opval = o;
79072805 2275 force_next(WORD);
748a9306 2276 if (kind) {
11343788 2277 o->op_private = OPpCONST_ENTERED;
55497cff 2278 /* XXX see note in pp_entereval() for why we forgo typo
2279 warnings if the symbol must be introduced in an eval.
2280 GSAR 96-10-12 */
90e5519e 2281 gv_fetchpvn_flags(s, len,
728847b1
BF
2282 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2283 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2284 kind == '$' ? SVt_PV :
2285 kind == '@' ? SVt_PVAV :
2286 kind == '%' ? SVt_PVHV :
a0d0e21e 2287 SVt_PVGV
90e5519e 2288 );
748a9306 2289 }
79072805
LW
2290 }
2291}
2292
3f33d153
FC
2293static void
2294S_force_ident_maybe_lex(pTHX_ char pit)
2295{
2296 start_force(PL_curforce);
2297 NEXTVAL_NEXTTOKE.ival = pit;
2298 force_next('p');
2299}
2300
1571675a
GS
2301NV
2302Perl_str_to_version(pTHX_ SV *sv)
2303{
2304 NV retval = 0.0;
2305 NV nshift = 1.0;
2306 STRLEN len;
cfd0369c 2307 const char *start = SvPV_const(sv,len);
9d4ba2ae 2308 const char * const end = start + len;
504618e9 2309 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2310
2311 PERL_ARGS_ASSERT_STR_TO_VERSION;
2312
1571675a 2313 while (start < end) {
ba210ebe 2314 STRLEN skip;
1571675a
GS
2315 UV n;
2316 if (utf)
9041c2e3 2317 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2318 else {
2319 n = *(U8*)start;
2320 skip = 1;
2321 }
2322 retval += ((NV)n)/nshift;
2323 start += skip;
2324 nshift *= 1000;
2325 }
2326 return retval;
2327}
2328
4e553d73 2329/*
ffb4593c
NT
2330 * S_force_version
2331 * Forces the next token to be a version number.
e759cc13
RGS
2332 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2333 * and if "guessing" is TRUE, then no new token is created (and the caller
2334 * must use an alternative parsing method).
ffb4593c
NT
2335 */
2336
76e3520e 2337STATIC char *
e759cc13 2338S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2339{
97aff369 2340 dVAR;
5f66b61c 2341 OP *version = NULL;
44dcb63b 2342 char *d;
5db06880
NC
2343#ifdef PERL_MAD
2344 I32 startoff = s - SvPVX(PL_linestr);
2345#endif
89bfa8cd 2346
7918f24d
NC
2347 PERL_ARGS_ASSERT_FORCE_VERSION;
2348
29595ff2 2349 s = SKIPSPACE1(s);
89bfa8cd 2350
44dcb63b 2351 d = s;
dd629d5b 2352 if (*d == 'v')
44dcb63b 2353 d++;
44dcb63b 2354 if (isDIGIT(*d)) {
e759cc13
RGS
2355 while (isDIGIT(*d) || *d == '_' || *d == '.')
2356 d++;
5db06880
NC
2357#ifdef PERL_MAD
2358 if (PL_madskills) {
cd81e915 2359 start_force(PL_curforce);
5db06880
NC
2360 curmad('X', newSVpvn(s,d-s));
2361 }
2362#endif
4e4da3ac 2363 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2364 SV *ver;
8d08d9ba 2365#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2366 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2367 setlocale(LC_NUMERIC, "C");
8d08d9ba 2368#endif
6154021b 2369 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2370#ifdef USE_LOCALE_NUMERIC
2371 setlocale(LC_NUMERIC, loc);
909d3787 2372 Safefree(loc);
8d08d9ba 2373#endif
6154021b 2374 version = pl_yylval.opval;
dd629d5b
GS
2375 ver = cSVOPx(version)->op_sv;
2376 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2377 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2378 SvNV_set(ver, str_to_version(ver));
1571675a 2379 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2380 }
89bfa8cd 2381 }
5db06880
NC
2382 else if (guessing) {
2383#ifdef PERL_MAD
2384 if (PL_madskills) {
cd81e915
NC
2385 sv_free(PL_nextwhite); /* let next token collect whitespace */
2386 PL_nextwhite = 0;
5db06880
NC
2387 s = SvPVX(PL_linestr) + startoff;
2388 }
2389#endif
e759cc13 2390 return s;
5db06880 2391 }
89bfa8cd 2392 }
2393
5db06880
NC
2394#ifdef PERL_MAD
2395 if (PL_madskills && !version) {
cd81e915
NC
2396 sv_free(PL_nextwhite); /* let next token collect whitespace */
2397 PL_nextwhite = 0;
5db06880
NC
2398 s = SvPVX(PL_linestr) + startoff;
2399 }
2400#endif
89bfa8cd 2401 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2402 start_force(PL_curforce);
9ded7720 2403 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2404 force_next(WORD);
89bfa8cd 2405
e759cc13 2406 return s;
89bfa8cd 2407}
2408
ffb4593c 2409/*
91152fc1
DG
2410 * S_force_strict_version
2411 * Forces the next token to be a version number using strict syntax rules.
2412 */
2413
2414STATIC char *
2415S_force_strict_version(pTHX_ char *s)
2416{
2417 dVAR;
2418 OP *version = NULL;
2419#ifdef PERL_MAD
2420 I32 startoff = s - SvPVX(PL_linestr);
2421#endif
2422 const char *errstr = NULL;
2423
2424 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2425
2426 while (isSPACE(*s)) /* leading whitespace */
2427 s++;
2428
2429 if (is_STRICT_VERSION(s,&errstr)) {
2430 SV *ver = newSV(0);
2431 s = (char *)scan_version(s, ver, 0);
2432 version = newSVOP(OP_CONST, 0, ver);
2433 }
4e4da3ac
Z
2434 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2435 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2436 {
91152fc1
DG
2437 PL_bufptr = s;
2438 if (errstr)
2439 yyerror(errstr); /* version required */
2440 return s;
2441 }
2442
2443#ifdef PERL_MAD
2444 if (PL_madskills && !version) {
2445 sv_free(PL_nextwhite); /* let next token collect whitespace */
2446 PL_nextwhite = 0;
2447 s = SvPVX(PL_linestr) + startoff;
2448 }
2449#endif
2450 /* NOTE: The parser sees the package name and the VERSION swapped */
2451 start_force(PL_curforce);
2452 NEXTVAL_NEXTTOKE.opval = version;
2453 force_next(WORD);
2454
2455 return s;
2456}
2457
2458/*
ffb4593c
NT
2459 * S_tokeq
2460 * Tokenize a quoted string passed in as an SV. It finds the next
2461 * chunk, up to end of string or a backslash. It may make a new
2462 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2463 * turns \\ into \.
2464 */
2465
76e3520e 2466STATIC SV *
cea2e8a9 2467S_tokeq(pTHX_ SV *sv)
79072805 2468{
97aff369 2469 dVAR;
eb578fdb
KW
2470 char *s;
2471 char *send;
2472 char *d;
b3ac6de7
IZ
2473 STRLEN len = 0;
2474 SV *pv = sv;
79072805 2475
7918f24d
NC
2476 PERL_ARGS_ASSERT_TOKEQ;
2477
79072805 2478 if (!SvLEN(sv))
b3ac6de7 2479 goto finish;
79072805 2480
a0d0e21e 2481 s = SvPV_force(sv, len);
21a311ee 2482 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2483 goto finish;
463ee0b2 2484 send = s + len;
dcb21ed6
NC
2485 /* This is relying on the SV being "well formed" with a trailing '\0' */
2486 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2487 s++;
2488 if (s == send)
b3ac6de7 2489 goto finish;
79072805 2490 d = s;
be4731d2 2491 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2492 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2493 }
79072805
LW
2494 while (s < send) {
2495 if (*s == '\\') {
a0d0e21e 2496 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2497 s++; /* all that, just for this */
2498 }
2499 *d++ = *s++;
2500 }
2501 *d = '\0';
95a20fc0 2502 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2503 finish:
3280af22 2504 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2505 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2506 return sv;
2507}
2508
ffb4593c
NT
2509/*
2510 * Now come three functions related to double-quote context,
2511 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2512 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2513 * interact with PL_lex_state, and create fake ( ... ) argument lists
2514 * to handle functions and concatenation.
ecd24171
DM
2515 * For example,
2516 * "foo\lbar"
2517 * is tokenised as
2518 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2519 */
2520
2521/*
2522 * S_sublex_start
6154021b 2523 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2524 *
2525 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2526 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2527 *
2528 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2529 *
2530 * Everything else becomes a FUNC.
2531 *
2532 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2533 * had an OP_CONST or OP_READLINE). This just sets us up for a
2534 * call to S_sublex_push().
2535 */
2536
76e3520e 2537STATIC I32
cea2e8a9 2538S_sublex_start(pTHX)
79072805 2539{
97aff369 2540 dVAR;
eb578fdb 2541 const I32 op_type = pl_yylval.ival;
79072805
LW
2542
2543 if (op_type == OP_NULL) {
6154021b 2544 pl_yylval.opval = PL_lex_op;
5f66b61c 2545 PL_lex_op = NULL;
79072805
LW
2546 return THING;
2547 }
2548 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2549 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2550
2551 if (SvTYPE(sv) == SVt_PVIV) {
2552 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2553 STRLEN len;
96a5add6 2554 const char * const p = SvPV_const(sv, len);
740cce10 2555 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2556 SvREFCNT_dec(sv);
2557 sv = nsv;
4e553d73 2558 }
6154021b 2559 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2560 PL_lex_stuff = NULL;
6f33ba73
RGS
2561 /* Allow <FH> // "foo" */
2562 if (op_type == OP_READLINE)
2563 PL_expect = XTERMORDORDOR;
79072805
LW
2564 return THING;
2565 }
e3f73d4e 2566 else if (op_type == OP_BACKTICK && PL_lex_op) {
06df4f41 2567 /* readpipe() was overridden */
e3f73d4e 2568 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2569 pl_yylval.opval = PL_lex_op;
9b201d7d 2570 PL_lex_op = NULL;
e3f73d4e
RGS
2571 PL_lex_stuff = NULL;
2572 return THING;
2573 }
79072805 2574
3280af22 2575 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2576 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2577 PL_sublex_info.sub_op = PL_lex_op;
2578 PL_lex_state = LEX_INTERPPUSH;
55497cff 2579
3280af22
NIS
2580 PL_expect = XTERM;
2581 if (PL_lex_op) {
6154021b 2582 pl_yylval.opval = PL_lex_op;
5f66b61c 2583 PL_lex_op = NULL;
55497cff 2584 return PMFUNC;
2585 }
2586 else
2587 return FUNC;
2588}
2589
ffb4593c
NT
2590/*
2591 * S_sublex_push
2592 * Create a new scope to save the lexing state. The scope will be
2593 * ended in S_sublex_done. Returns a '(', starting the function arguments
2594 * to the uc, lc, etc. found before.
2595 * Sets PL_lex_state to LEX_INTERPCONCAT.
2596 */
2597
76e3520e 2598STATIC I32
cea2e8a9 2599S_sublex_push(pTHX)
55497cff 2600{
27da23d5 2601 dVAR;
78a635de 2602 LEXSHARED *shared;
801d32ac 2603 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2604 ENTER;
55497cff 2605
3280af22 2606 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2607 SAVEBOOL(PL_lex_dojoin);
3280af22 2608 SAVEI32(PL_lex_brackets);
78cdf107 2609 SAVEI32(PL_lex_allbrackets);
b27dce25 2610 SAVEI32(PL_lex_formbrack);
78cdf107 2611 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2612 SAVEI32(PL_lex_casemods);
2613 SAVEI32(PL_lex_starts);
651b5b28 2614 SAVEI8(PL_lex_state);
7cc34111 2615 SAVESPTR(PL_lex_repl);
7766f137 2616 SAVEVPTR(PL_lex_inpat);
98246f1e 2617 SAVEI16(PL_lex_inwhat);
ffdb8b16 2618 if (is_heredoc)
b42366d4 2619 {
ffdb8b16 2620 SAVECOPLINE(PL_curcop);
b42366d4 2621 SAVEI32(PL_multi_end);
851b527a
FC
2622 SAVEI32(PL_parser->herelines);
2623 PL_parser->herelines = 0;
b42366d4
FC
2624 }
2625 SAVEI8(PL_multi_close);
3280af22 2626 SAVEPPTR(PL_bufptr);
8452ff4b 2627 SAVEPPTR(PL_bufend);
3280af22
NIS
2628 SAVEPPTR(PL_oldbufptr);
2629 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2630 SAVEPPTR(PL_last_lop);
2631 SAVEPPTR(PL_last_uni);
3280af22
NIS
2632 SAVEPPTR(PL_linestart);
2633 SAVESPTR(PL_linestr);
8edd5f42
RGS
2634 SAVEGENERICPV(PL_lex_brackstack);
2635 SAVEGENERICPV(PL_lex_casestack);
78a635de 2636 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2637 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2638 SAVEI32(PL_copline);
3280af22 2639
99bd9d90 2640 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2641 scopes to find the body of the here-doc. So we put PL_linestr and
2642 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2643 */
60f40a38
FC
2644 PL_parser->lex_shared->ls_linestr = PL_linestr;
2645 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2646
3280af22 2647 PL_linestr = PL_lex_stuff;
7cc34111 2648 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2649 PL_lex_stuff = NULL;
7cc34111 2650 PL_sublex_info.repl = NULL;
3280af22 2651
9cbb5ea2
GS
2652 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2653 = SvPVX(PL_linestr);
3280af22 2654 PL_bufend += SvCUR(PL_linestr);
bd61b366 2655 PL_last_lop = PL_last_uni = NULL;
3280af22 2656 SAVEFREESV(PL_linestr);
4dc843bc 2657 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2658
2659 PL_lex_dojoin = FALSE;
b27dce25 2660 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2661 PL_lex_allbrackets = 0;
2662 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2663 Newx(PL_lex_brackstack, 120, char);
2664 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2665 PL_lex_casemods = 0;
2666 *PL_lex_casestack = '\0';
2667 PL_lex_starts = 0;
2668 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2669 if (is_heredoc)
6ddcf93b 2670 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2671 PL_copline = NOLINE;
78a635de
FC
2672
2673 Newxz(shared, 1, LEXSHARED);
2674 shared->ls_prev = PL_parser->lex_shared;
2675 PL_parser->lex_shared = shared;
3280af22
NIS
2676
2677 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2678 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2679 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2680 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2681 else
5f66b61c 2682 PL_lex_inpat = NULL;
79072805 2683
3a54fd60
DM
2684 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2685 PL_in_eval &= ~EVAL_RE_REPARSING;
2686
55497cff 2687 return '(';
79072805
LW
2688}
2689
ffb4593c
NT
2690/*
2691 * S_sublex_done
2692 * Restores lexer state after a S_sublex_push.
2693 */
2694
76e3520e 2695STATIC I32
cea2e8a9 2696S_sublex_done(pTHX)
79072805 2697{
27da23d5 2698 dVAR;
3280af22 2699 if (!PL_lex_starts++) {
396482e1 2700 SV * const sv = newSVpvs("");
9aa983d2
JH
2701 if (SvUTF8(PL_linestr))
2702 SvUTF8_on(sv);
3280af22 2703 PL_expect = XOPERATOR;
6154021b 2704 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2705 return THING;
2706 }
2707
3280af22
NIS
2708 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2709 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2710 return yylex();
79072805
LW
2711 }
2712
ffb4593c 2713 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2714 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2715 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2716 PL_linestr = PL_lex_repl;
2717 PL_lex_inpat = 0;
2718 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2719 PL_bufend += SvCUR(PL_linestr);
bd61b366 2720 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2721 PL_lex_dojoin = FALSE;
2722 PL_lex_brackets = 0;
78cdf107
Z
2723 PL_lex_allbrackets = 0;
2724 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2725 PL_lex_casemods = 0;
2726 *PL_lex_casestack = '\0';
2727 PL_lex_starts = 0;
25da4f38 2728 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2729 PL_lex_state = LEX_INTERPNORMAL;
2730 PL_lex_starts++;
e9fa98b2
HS
2731 /* we don't clear PL_lex_repl here, so that we can check later
2732 whether this is an evalled subst; that means we rely on the
2733 logic to ensure sublex_done() is called again only via the
2734 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2735 }
e9fa98b2 2736 else {
3280af22 2737 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2738 PL_lex_repl = NULL;
e9fa98b2 2739 }
ffdb8b16
FC
2740 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2741 CopLINE(PL_curcop) +=
2742 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2743 + PL_parser->herelines;
2744 PL_parser->herelines = 0;
ffdb8b16 2745 }
79072805 2746 return ',';
ffed7fef
LW
2747 }
2748 else {
b42366d4 2749 const line_t l = CopLINE(PL_curcop);
5db06880
NC
2750#ifdef PERL_MAD
2751 if (PL_madskills) {
cd81e915
NC
2752 if (PL_thiswhite) {
2753 if (!PL_endwhite)
6b29d1f5 2754 PL_endwhite = newSVpvs("");
cd81e915
NC
2755 sv_catsv(PL_endwhite, PL_thiswhite);
2756 PL_thiswhite = 0;
2757 }
2758 if (PL_thistoken)
76f68e9b 2759 sv_setpvs(PL_thistoken,"");
5db06880 2760 else
cd81e915 2761 PL_realtokenstart = -1;
5db06880
NC
2762 }
2763#endif
f46d017c 2764 LEAVE;
b42366d4 2765 if (PL_multi_close == '<')
851b527a 2766 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2767 PL_bufend = SvPVX(PL_linestr);
2768 PL_bufend += SvCUR(PL_linestr);
2769 PL_expect = XOPERATOR;
09bef843 2770 PL_sublex_info.sub_inwhat = 0;
79072805 2771 return ')';
ffed7fef
LW
2772 }
2773}
2774
6f613c73
KW
2775PERL_STATIC_INLINE SV*
2776S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2777{
140b12ad
KW
2778 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2779 * interior, hence to the "}". Finds what the name resolves to, returning
2780 * an SV* containing it; NULL if no valid one found */
2781
dd2b1b72 2782 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2783
0c415a79
KW
2784 HV * table;
2785 SV **cvp;
2786 SV *cv;
2787 SV *rv;
2788 HV *stash;
2789 const U8* first_bad_char_loc;
2790 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2791
6f613c73
KW
2792 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2793
107160e2
KW
2794 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2795 e - backslash_ptr,
2796 &first_bad_char_loc))
2797 {
2798 /* If warnings are on, this will print a more detailed analysis of what
2799 * is wrong than the error message below */
c80e42f3 2800 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2801 e - ((char *) first_bad_char_loc),
2802 NULL, 0);
2803
2804 /* We deliberately don't try to print the malformed character, which
2805 * might not print very well; it also may be just the first of many
2806 * malformations, so don't print what comes after it */
2807 yyerror(Perl_form(aTHX_
2808 "Malformed UTF-8 character immediately after '%.*s'",
2809 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2810 return NULL;
2811 }
2812
2813 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2814 /* include the <}> */
2815 e - backslash_ptr + 1);
6f613c73 2816 if (! SvPOK(res)) {
b6407c49 2817 SvREFCNT_dec_NN(res);
6f613c73
KW
2818 return NULL;
2819 }
2820
0c415a79
KW
2821 /* See if the charnames handler is the Perl core's, and if so, we can skip
2822 * the validation needed for a user-supplied one, as Perl's does its own
2823 * validation. */
2824 table = GvHV(PL_hintgv); /* ^H */
2825 cvp = hv_fetchs(table, "charnames", FALSE);
67a057d6
FC
2826 if (cvp && (cv = *cvp) && SvROK(cv) && ((rv = SvRV(cv)) != NULL)
2827 && SvTYPE(rv) == SVt_PVCV && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2828 {
2829 const char * const name = HvNAME(stash);
2830 if strEQ(name, "_charnames") {
2831 return res;
2832 }
2833 }
2834
bde9e88d
KW
2835 /* Here, it isn't Perl's charname handler. We can't rely on a
2836 * user-supplied handler to validate the input name. For non-ut8 input,
2837 * look to see that the first character is legal. Then loop through the
2838 * rest checking that each is a continuation */
6f613c73 2839
b6ba1137
KW
2840 /* This code needs to be sync'ed with a regex in _charnames.pm which does
2841 * the same thing */
2842
b6ba1137 2843 if (! UTF) {
bde9e88d 2844 if (! isALPHAU(*s)) {
b6ba1137
KW
2845 goto bad_charname;
2846 }
bde9e88d
KW
2847 s++;
2848 while (s < e) {
2849 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2850 goto bad_charname;
2851 }
3036c853 2852 if (*s == ' ' && *(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2853 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2854 "A sequence of multiple spaces in a charnames "
2855 "alias definition is deprecated");
bd299e29 2856 }
bde9e88d 2857 s++;
b6ba1137 2858 }
3036c853 2859 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2860 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2861 "Trailing white-space in a charnames alias "
2862 "definition is deprecated");
bd299e29 2863 }
b6ba1137
KW
2864 }
2865 else {
bde9e88d
KW
2866 /* Similarly for utf8. For invariants can check directly; for other
2867 * Latin1, can calculate their code point and check; otherwise use a
2868 * swash */
2869 if (UTF8_IS_INVARIANT(*s)) {
2870 if (! isALPHAU(*s)) {
140b12ad
KW
2871 goto bad_charname;
2872 }
bde9e88d
KW
2873 s++;
2874 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2875 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2876 goto bad_charname;
6f613c73 2877 }
bde9e88d 2878 s += 2;
6f613c73 2879 }
bde9e88d
KW
2880 else {
2881 if (! PL_utf8_charname_begin) {
2882 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2883 PL_utf8_charname_begin = _core_swash_init("utf8",
2884 "_Perl_Charname_Begin",
2885 &PL_sv_undef,
2886 1, 0, NULL, &flags);
2887 }
2888 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2889 goto bad_charname;
2890 }
2891 s += UTF8SKIP(s);
2892 }
2893
2894 while (s < e) {
2895 if (UTF8_IS_INVARIANT(*s)) {
2896 if (! isCHARNAME_CONT(*s)) {
2897 goto bad_charname;
2898 }
3036c853
FC
2899 if (*s == ' ' && *(s-1) == ' '
2900 && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2901 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2902 "A sequence of multiple spaces in a charnam"
2903 "es alias definition is deprecated");
bd299e29 2904 }
bde9e88d
KW
2905 s++;
2906 }
2907 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2908 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2909 {
2910 goto bad_charname;
2911 }
2912 s += 2;
2913 }
2914 else {
2915 if (! PL_utf8_charname_continue) {
2916 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2917 PL_utf8_charname_continue = _core_swash_init("utf8",
2918 "_Perl_Charname_Continue",
2919 &PL_sv_undef,
2920 1, 0, NULL, &flags);
2921 }
2922 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2923 goto bad_charname;
2924 }
2925 s += UTF8SKIP(s);
6f613c73
KW
2926 }
2927 }
3036c853 2928 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2929 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2930 "Trailing white-space in a charnames alias "
2931 "definition is deprecated");
bd299e29 2932 }
6f613c73
KW
2933 }
2934
94ca1619 2935 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2936 const U8* first_bad_char_loc;
2937 STRLEN len;
2938 const char* const str = SvPV_const(res, len);
2939 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2940 /* If warnings are on, this will print a more detailed analysis of
2941 * what is wrong than the error message below */
c80e42f3 2942 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2943 (char *) first_bad_char_loc - str,
2944 NULL, 0);
2945
2946 /* We deliberately don't try to print the malformed character,
2947 * which might not print very well; it also may be just the first
2948 * of many malformations, so don't print what comes after it */
2949 yyerror_pv(
2950 Perl_form(aTHX_
2951 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2952 (int) (e - backslash_ptr + 1), backslash_ptr,
2953 (int) ((char *) first_bad_char_loc - str), str
2954 ),
2955 SVf_UTF8);
2956 return NULL;
2957 }
2958 }
140b12ad 2959
bde9e88d 2960 return res;
140b12ad 2961
bde9e88d
KW
2962 bad_charname: {
2963 int bad_char_size = ((UTF) ? UTF8SKIP(s) : 1);
2964
2965 /* The final %.*s makes sure that should the trailing NUL be missing
2966 * that this print won't run off the end of the string */
2967 yyerror_pv(
2968 Perl_form(aTHX_
2969 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2970 (int)(s - backslash_ptr + bad_char_size), backslash_ptr,
2971 (int)(e - s + bad_char_size), s + bad_char_size
2972 ),
2973 UTF ? SVf_UTF8 : 0);
2974 return NULL;
2975 }
6f613c73
KW
2976}
2977
02aa26ce
NT
2978/*
2979 scan_const
2980
9da1dd8f
DM
2981 Extracts the next constant part of a pattern, double-quoted string,
2982 or transliteration. This is terrifying code.
2983
2984 For example, in parsing the double-quoted string "ab\x63$d", it would
2985 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2986
94def140 2987 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2988 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2989 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2990
94def140
TS
2991 Returns a pointer to the character scanned up to. If this is
2992 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2993 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2994 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2995 by looking at the next characters herself.
2996
02aa26ce 2997 In patterns:
9da1dd8f 2998 expand:
537124e4
KW
2999 \N{FOO} => \N{U+hex_for_character_FOO}
3000 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
3001
3002 pass through:
3003 all other \-char, including \N and \N{ apart from \N{ABC}
3004
3005 stops on:
3006 @ and $ where it appears to be a var, but not for $ as tail anchor
3007 \l \L \u \U \Q \E
3008 (?{ or (??{
3009
02aa26ce
NT
3010
3011 In transliterations:
3012 characters are VERY literal, except for - not at the start or end
94def140
TS
3013 of the string, which indicates a range. If the range is in bytes,
3014 scan_const expands the range to the full set of intermediate
3015 characters. If the range is in utf8, the hyphen is replaced with
3016 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
3017
3018 In double-quoted strings:
3019 backslashes:
3020 double-quoted style: \r and \n
ff3f963a 3021 constants: \x31, etc.
94def140 3022 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
3023 case and quoting: \U \Q \E
3024 stops on @ and $
3025
3026 scan_const does *not* construct ops to handle interpolated strings.
3027 It stops processing as soon as it finds an embedded $ or @ variable
3028 and leaves it to the caller to work out what's going on.
3029
94def140
TS
3030 embedded arrays (whether in pattern or not) could be:
3031 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
3032
3033 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
3034
3035 $ in pattern could be $foo or could be tail anchor. Assumption:
3036 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 3037 followed by one of "()| \r\n\t"
02aa26ce 3038
9da1dd8f 3039 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
3040
3041 The structure of the code is
3042 while (there's a character to process) {
94def140
TS
3043 handle transliteration ranges
3044 skip regexp comments /(?#comment)/ and codes /(?{code})/
3045 skip #-initiated comments in //x patterns
3046 check for embedded arrays
02aa26ce
NT
3047 check for embedded scalars
3048 if (backslash) {
94def140 3049 deprecate \1 in substitution replacements
02aa26ce
NT
3050 handle string-changing backslashes \l \U \Q \E, etc.
3051 switch (what was escaped) {
94def140 3052 handle \- in a transliteration (becomes a literal -)
ff3f963a 3053 if a pattern and not \N{, go treat as regular character
94def140
TS
3054 handle \132 (octal characters)
3055 handle \x15 and \x{1234} (hex characters)
ff3f963a 3056 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
3057 handle \cV (control characters)
3058 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 3059 } (end switch)
77a135fe 3060 continue
02aa26ce 3061 } (end if backslash)
77a135fe 3062 handle regular character
02aa26ce 3063 } (end while character to read)
4e553d73 3064
02aa26ce
NT
3065*/
3066
76e3520e 3067STATIC char *
cea2e8a9 3068S_scan_const(pTHX_ char *start)
79072805 3069{
97aff369 3070 dVAR;
eb578fdb 3071 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
3072 SV *sv = newSV(send - start); /* sv for the constant. See
3073 note below on sizing. */
eb578fdb
KW
3074 char *s = start; /* start of the constant */
3075 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 3076 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 3077 bool didrange = FALSE; /* did we just finish a range? */
2866decb 3078 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
3079 bool has_utf8 = FALSE; /* Output constant is UTF8 */
3080 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
3081 to be UTF8? But, this can
3082 show as true when the source
3083 isn't utf8, as for example
3084 when it is entirely composed
3085 of hex constants */
6f613c73 3086 SV *res; /* result from charnames */
77a135fe
KW
3087
3088 /* Note on sizing: The scanned constant is placed into sv, which is
3089 * initialized by newSV() assuming one byte of output for every byte of
3090 * input. This routine expects newSV() to allocate an extra byte for a
3091 * trailing NUL, which this routine will append if it gets to the end of
3092 * the input. There may be more bytes of input than output (eg., \N{LATIN
3093 * CAPITAL LETTER A}), or more output than input if the constant ends up
3094 * recoded to utf8, but each time a construct is found that might increase
3095 * the needed size, SvGROW() is called. Its size parameter each time is
3096 * based on the best guess estimate at the time, namely the length used so
3097 * far, plus the length the current construct will occupy, plus room for
3098 * the trailing NUL, plus one byte for every input byte still unscanned */
3099
c3320c2a
KW
3100 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3101 before set */
4c3a8340
TS
3102#ifdef EBCDIC
3103 UV literal_endpoint = 0;
e294cc5d 3104 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 3105#endif
012bcf8d 3106
7918f24d
NC
3107 PERL_ARGS_ASSERT_SCAN_CONST;
3108
bb16bae8 3109 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
3110 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
3111 /* If we are doing a trans and we know we want UTF8 set expectation */
3112 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
3113 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3114 }
3115
b899e89d
FC
3116 /* Protect sv from errors and fatal warnings. */
3117 ENTER_with_name("scan_const");
3118 SAVEFREESV(sv);
2b9d42f0 3119
79072805 3120 while (s < send || dorange) {
ff3f963a 3121
02aa26ce 3122 /* get transliterations out of the way (they're most literal) */
3280af22 3123 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3124 /* expand a range A-Z to the full set of characters. AIE! */
79072805 3125 if (dorange) {
1ba5c669
JH
3126 I32 i; /* current expanded character */
3127 I32 min; /* first character in range */
3128 I32 max; /* last character in range */
02aa26ce 3129
e294cc5d
JH
3130#ifdef EBCDIC
3131 UV uvmax = 0;
3132#endif
3133
3134 if (has_utf8
3135#ifdef EBCDIC
3136 && !native_range
3137#endif
1953db30 3138 ) {
9d4ba2ae 3139 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
3140 char *e = d++;
3141 while (e-- > c)
3142 *(e + 1) = *e;
e7214ce8 3143 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
3144 /* mark the range as done, and continue */
3145 dorange = FALSE;
3146 didrange = TRUE;
3147 continue;
3148 }
2b9d42f0 3149
95a20fc0 3150 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
3151#ifdef EBCDIC
3152 SvGROW(sv,
3153 SvLEN(sv) + (has_utf8 ?
3154 (512 - UTF_CONTINUATION_MARK +
3155 UNISKIP(0x100))
3156 : 256));
3157 /* How many two-byte within 0..255: 128 in UTF-8,
3158 * 96 in UTF-8-mod. */
3159#else
9cbb5ea2 3160 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 3161#endif
9cbb5ea2 3162 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
3163#ifdef EBCDIC
3164 if (has_utf8) {
3165 int j;
3166 for (j = 0; j <= 1; j++) {
3167 char * const c = (char*)utf8_hop((U8*)d, -1);
3168 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
3169 if (j)
3170 min = (U8)uv;
3171 else if (uv < 256)
3172 max = (U8)uv;
3173 else {
3174 max = (U8)0xff; /* only to \xff */
3175 uvmax = uv; /* \x{100} to uvmax */
3176 }
3177 d = c; /* eat endpoint chars */
3178 }
3179 }
3180 else {
3181#endif
3182 d -= 2; /* eat the first char and the - */
3183 min = (U8)*d; /* first char in range */
3184 max = (U8)d[1]; /* last char in range */
3185#ifdef EBCDIC
3186 }
3187#endif
8ada0baa 3188
c2e66d9e 3189 if (min > max) {
01ec43d0 3190 Perl_croak(aTHX_
d1573ac7 3191 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 3192 (char)min, (char)max);
c2e66d9e
GS
3193 }
3194
c7f1f016 3195#ifdef EBCDIC
4c3a8340 3196 if (literal_endpoint == 2 &&
c286a9a6 3197 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
3198 (isUPPER_A(min) && isUPPER_A(max))))
3199 {
3200 for (i = min; i <= max; i++) {
3201 if (isALPHA_A(i))
3202 *d++ = i;
8ada0baa
JH
3203 }
3204 }
3205 else
3206#endif
3207 for (i = min; i <= max; i++)
e294cc5d
JH
3208#ifdef EBCDIC
3209 if (has_utf8) {
55d09dc8 3210 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
3211 }
3212 else
3213#endif
3214 *d++ = (char)i;
3215
3216#ifdef EBCDIC
3217 if (uvmax) {
3218 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
3219 if (uvmax > 0x101)
e7214ce8 3220 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
3221 if (uvmax > 0x100)
3222 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
3223 }
3224#endif
02aa26ce
NT
3225
3226 /* mark the range as done, and continue */
79072805 3227 dorange = FALSE;
01ec43d0 3228 didrange = TRUE;
4c3a8340
TS
3229#ifdef EBCDIC
3230 literal_endpoint = 0;
3231#endif
79072805 3232 continue;
4e553d73 3233 }
02aa26ce
NT
3234
3235 /* range begins (ignore - as first or last char) */
79072805 3236 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 3237 if (didrange) {
1fafa243 3238 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 3239 }
e294cc5d
JH
3240 if (has_utf8
3241#ifdef EBCDIC
3242 && !native_range
3243#endif
3244 ) {
e7214ce8 3245 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
3246 s++;
3247 continue;
3248 }
79072805
LW
3249 dorange = TRUE;
3250 s++;
01ec43d0
GS
3251 }
3252 else {
3253 didrange = FALSE;
4c3a8340
TS
3254#ifdef EBCDIC
3255 literal_endpoint = 0;
e294cc5d 3256 native_range = TRUE;
4c3a8340 3257#endif
01ec43d0 3258 }
79072805 3259 }
02aa26ce
NT
3260
3261 /* if we get here, we're not doing a transliteration */
3262
e4a2df84
DM
3263 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3264 char *s1 = s-1;
3265 int esc = 0;
3266 while (s1 >= start && *s1-- == '\\')
3267 esc = !esc;
3268 if (!esc)
3269 in_charclass = TRUE;
3270 }
2866decb 3271
e4a2df84
DM
3272 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3273 char *s1 = s-1;
3274 int esc = 0;
3275 while (s1 >= start && *s1-- == '\\')
3276 esc = !esc;
3277 if (!esc)
3278 in_charclass = FALSE;
3279 }
2866decb 3280
9da1dd8f
DM
3281 /* skip for regexp comments /(?#comment)/, except for the last
3282 * char, which will be done separately.
3283 * Stop on (?{..}) and friends */
3284
c30fc27b 3285 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3286 if (s[2] == '#') {
e994fd66 3287 while (s+1 < send && *s != ')')
5ff03569 3288 *d++ = *s++;
155aba94 3289 }
c30fc27b 3290 else if (!PL_lex_casemods &&
d3cec5e5
DM
3291 ( s[2] == '{' /* This should match regcomp.c */
3292 || (s[2] == '?' && s[3] == '{')))
155aba94 3293 {
9da1dd8f 3294 break;
cc6b7395 3295 }
748a9306 3296 }
02aa26ce
NT
3297
3298 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3299 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3300 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3301 while (s+1 < send && *s != '\n')
5ff03569 3302 *d++ = *s++;
748a9306 3303 }
02aa26ce 3304
9da1dd8f
DM
3305 /* no further processing of single-quoted regex */
3306 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3307 goto default_action;
3308
5d1d4326 3309 /* check for embedded arrays
da6eedaa 3310 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3311 */
1749ea0d 3312 else if (*s == '@' && s[1]) {
8a2bca12 3313 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
TS
3314 break;
3315 if (strchr(":'{$", s[1]))
3316 break;
3317 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3318 break; /* in regexp, neither @+ nor @- are interpolated */
3319 }
02aa26ce
NT
3320
3321 /* check for embedded scalars. only stop if we're sure it's a
3322 variable.
3323 */
79072805 3324 else if (*s == '$') {
3280af22 3325 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3326 break;
77772344 3327 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3328 if (s[1] == '\\') {
3329 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3330 "Possible unintended interpolation of $\\ in regex");
77772344 3331 }
79072805 3332 break; /* in regexp, $ might be tail anchor */
77772344 3333 }
79072805 3334 }
02aa26ce 3335
2b9d42f0
NIS
3336 /* End of else if chain - OP_TRANS rejoin rest */
3337
02aa26ce 3338 /* backslashes */
79072805 3339 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3340 char* e; /* Can be used for ending '}', etc. */
3341
79072805 3342 s++;
02aa26ce 3343
7d0fc23c
KW
3344 /* warn on \1 - \9 in substitution replacements, but note that \11
3345 * is an octal; and \19 is \1 followed by '9' */
3280af22 3346 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3347 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3348 {
a2a5de95 3349 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3350 *--s = '$';
3351 break;
3352 }
02aa26ce
NT
3353
3354 /* string-change backslash escapes */
838f2281 3355 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3356 --s;
3357 break;
3358 }
ff3f963a
KW
3359 /* In a pattern, process \N, but skip any other backslash escapes.
3360 * This is because we don't want to translate an escape sequence
3361 * into a meta symbol and have the regex compiler use the meta
3362 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3363 * in spite of this, we do have to process \N here while the proper
3364 * charnames handler is in scope. See bugs #56444 and #62056.
3365 * There is a complication because \N in a pattern may also stand
3366 * for 'match a non-nl', and not mean a charname, in which case its
3367 * processing should be deferred to the regex compiler. To be a
3368 * charname it must be followed immediately by a '{', and not look
3369 * like \N followed by a curly quantifier, i.e., not something like
3370 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3371 * quantifier */
3372 else if (PL_lex_inpat
3373 && (*s != 'N'
3374 || s[1] != '{'
4d68ffa0 3375 || regcurly(s + 1, FALSE)))
ff3f963a 3376 {
4d73d076 3377 *d++ = '\\';
cc74c5bd
TS
3378 goto default_action;
3379 }
02aa26ce 3380
79072805 3381 switch (*s) {
02aa26ce
NT
3382
3383 /* quoted - in transliterations */
79072805 3384 case '-':
3280af22 3385 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3386 *d++ = *s++;
3387 continue;
3388 }
3389 /* FALL THROUGH */
3390 default:
11b8faa4 3391 {
15861f94 3392 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3393 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3394 "Unrecognized escape \\%c passed through",
3395 *s);
11b8faa4 3396 /* default action is to copy the quoted character */
f9a63242 3397 goto default_action;
11b8faa4 3398 }
02aa26ce 3399
632403cc 3400 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3401 case '0': case '1': case '2': case '3':
3402 case '4': case '5': case '6': case '7':
ba210ebe 3403 {
5e0a247b 3404 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3405 STRLEN len = 3;
06972766 3406 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3407 s += len;
5e0a247b
KW
3408 if (len < 3 && s < send && isDIGIT(*s)
3409 && ckWARN(WARN_MISC))
3410 {
3411 Perl_warner(aTHX_ packWARN(WARN_MISC),
3412 "%s", form_short_octal_warning(s, len));
3413 }
ba210ebe 3414 }
012bcf8d 3415 goto NUM_ESCAPE_INSERT;
02aa26ce 3416
f0a2b745
KW
3417 /* eg. \o{24} indicates the octal constant \024 */
3418 case 'o':
3419 {
454155d9 3420 const char* error;
f0a2b745 3421
00ce5563 3422 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3423 TRUE, /* Output warning */
3424 FALSE, /* Not strict */
17896a85
KW
3425 TRUE, /* Output warnings for
3426 non-portables */
80f4111b 3427 UTF);
454155d9 3428 if (! valid) {
f0a2b745
KW
3429 yyerror(error);
3430 continue;
3431 }
3432 goto NUM_ESCAPE_INSERT;
3433 }
3434
77a135fe 3435 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3436 case 'x':
a0481293 3437 {
a0481293 3438 const char* error;
355860ce 3439
00ce5563 3440 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3441 TRUE, /* Output warning */
3442 FALSE, /* Not strict */
17896a85
KW
3443 TRUE, /* Output warnings for
3444 non-portables */
80f4111b 3445 UTF);
a0481293
KW
3446 if (! valid) {
3447 yyerror(error);
355860ce 3448 continue;
ba210ebe 3449 }
012bcf8d
GS
3450 }
3451
3452 NUM_ESCAPE_INSERT:
ff3f963a
KW
3453 /* Insert oct or hex escaped character. There will always be
3454 * enough room in sv since such escapes will be longer than any
3455 * UTF-8 sequence they can end up as, except if they force us
3456 * to recode the rest of the string into utf8 */
ba7cea30 3457
06972766 3458 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3459 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3460 if (!has_utf8 && uv > 255) {
77a135fe
KW
3461 /* Might need to recode whatever we have accumulated so
3462 * far if it contains any chars variant in utf8 or
3463 * utf-ebcdic. */
3464
3465 SvCUR_set(sv, d - SvPVX_const(sv));
3466 SvPOK_on(sv);
3467 *d = '\0';
77a135fe 3468 /* See Note on sizing above. */
7bf79863
KW
3469 sv_utf8_upgrade_flags_grow(sv,
3470 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3471 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3472 d = SvPVX(sv) + SvCUR(sv);
3473 has_utf8 = TRUE;
012bcf8d
GS
3474 }
3475
77a135fe 3476 if (has_utf8) {
c80e42f3 3477 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3478 if (PL_lex_inwhat == OP_TRANS &&
3479 PL_sublex_info.sub_op) {
3480 PL_sublex_info.sub_op->op_private |=
3481 (PL_lex_repl ? OPpTRANS_FROM_UTF
3482 : OPpTRANS_TO_UTF);
f9a63242 3483 }
e294cc5d
JH
3484#ifdef EBCDIC
3485 if (uv > 255 && !dorange)
3486 native_range = FALSE;
3487#endif
012bcf8d 3488 }
a0ed51b3 3489 else {
012bcf8d 3490 *d++ = (char)uv;
a0ed51b3 3491 }
012bcf8d
GS
3492 }
3493 else {
c4d5f83a 3494 *d++ = (char) uv;
a0ed51b3 3495 }
79072805 3496 continue;
02aa26ce 3497
4a2d328f 3498 case 'N':
ff3f963a
KW
3499 /* In a non-pattern \N must be a named character, like \N{LATIN
3500 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3501 * mean to match a non-newline. For non-patterns, named
3502 * characters are converted to their string equivalents. In
3503 * patterns, named characters are not converted to their
3504 * ultimate forms for the same reasons that other escapes
3505 * aren't. Instead, they are converted to the \N{U+...} form
3506 * to get the value from the charnames that is in effect right
3507 * now, while preserving the fact that it was a named character
3508 * so that the regex compiler knows this */
3509
ff3f963a
KW
3510 /* The structure of this section of code (besides checking for
3511 * errors and upgrading to utf8) is:
3512 * Further disambiguate between the two meanings of \N, and if
3513 * not a charname, go process it elsewhere
0a96133f
KW
3514 * If of form \N{U+...}, pass it through if a pattern;
3515 * otherwise convert to utf8
3516 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3517 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3518
3519 /* Here, s points to the 'N'; the test below is guaranteed to
3520 * succeed if we are being called on a pattern as we already
3521 * know from a test above that the next character is a '{'.
3522 * On a non-pattern \N must mean 'named sequence, which
3523 * requires braces */
3524 s++;
3525 if (*s != '{') {
3526 yyerror("Missing braces on \\N{}");
3527 continue;
3528 }
3529 s++;
3530
0a96133f 3531 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3532 if (! (e = strchr(s, '}'))) {
3533 if (! PL_lex_inpat) {
5777a3f7 3534 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3535 } else {
3536 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3537 }
0a96133f 3538 continue;
ff3f963a 3539 }
cddc7ef4 3540
ff3f963a 3541 /* Here it looks like a named character */
cddc7ef4 3542
ff3f963a
KW
3543 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3544 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3545 | PERL_SCAN_DISALLOW_PREFIX;
3546 STRLEN len;
3547
3548 /* For \N{U+...}, the '...' is a unicode value even on
3549 * EBCDIC machines */
3550 s += 2; /* Skip to next char after the 'U+' */
3551 len = e - s;
3552 uv = grok_hex(s, &len, &flags, NULL);
3553 if (len == 0 || len != (STRLEN)(e - s)) {
3554 yyerror("Invalid hexadecimal number in \\N{U+...}");
3555 s = e + 1;
3556 continue;
3557 }
3558
3559 if (PL_lex_inpat) {
3560
e2a7e165
KW
3561 /* On non-EBCDIC platforms, pass through to the regex
3562 * compiler unchanged. The reason we evaluated the
3563 * number above is to make sure there wasn't a syntax
3564 * error. But on EBCDIC we convert to native so
3565 * downstream code can continue to assume it's native
3566 */
ff3f963a 3567 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3568#ifdef EBCDIC
3569 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3570 and the \0 */
3571 "\\N{U+%X}",
3572 (unsigned int) UNI_TO_NATIVE(uv));
3573#else
ff3f963a
KW
3574 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3575 d += e - s + 1;
e2a7e165 3576#endif
ff3f963a
KW
3577 }
3578 else { /* Not a pattern: convert the hex to string */
3579
3580 /* If destination is not in utf8, unconditionally
3581 * recode it to be so. This is because \N{} implies
3582 * Unicode semantics, and scalars have to be in utf8
3583 * to guarantee those semantics */
3584 if (! has_utf8) {
3585 SvCUR_set(sv, d - SvPVX_const(sv));
3586 SvPOK_on(sv);
3587 *d = '\0';
3588 /* See Note on sizing above. */
3589 sv_utf8_upgrade_flags_grow(
3590 sv,
3591 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3592 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3593 d = SvPVX(sv) + SvCUR(sv);
3594 has_utf8 = TRUE;
3595 }
3596
a46469e6 3597 /* Add the (Unicode) code point to the output. */
ff3f963a 3598 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3599 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3600 }
a46469e6
KW
3601 else {
3602 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3603 }
ff3f963a
KW
3604 }
3605 }
6f613c73
KW
3606 else /* Here is \N{NAME} but not \N{U+...}. */
3607 if ((res = get_and_check_backslash_N_name(s, e)))
3608 {
3609 STRLEN len;
3610 const char *str = SvPV_const(res, len);
3611 if (PL_lex_inpat) {
ff3f963a
KW
3612
3613 if (! len) { /* The name resolved to an empty string */
3614 Copy("\\N{}", d, 4, char);
3615 d += 4;
3616 }
3617 else {
3618 /* In order to not lose information for the regex
3619 * compiler, pass the result in the specially made
3620 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3621 * the code points in hex of each character
3622 * returned by charnames */
3623
3624 const char *str_end = str + len;
3b721c4f 3625 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3626
3627 if (! SvUTF8(res)) {
3628 /* For the non-UTF-8 case, we can determine the
3629 * exact length needed without having to parse
3630 * through the string. Each character takes up
3631 * 2 hex digits plus either a trailing dot or
3632 * the "}" */
3633 d = off + SvGROW(sv, off
3634 + 3 * len
3635 + 6 /* For the "\N{U+", and
3636 trailing NUL */
3637 + (STRLEN)(send - e));
3638 Copy("\\N{U+", d, 5, char);
3639 d += 5;
3640 while (str < str_end) {
3641 char hex_string[4];
3642 my_snprintf(hex_string, sizeof(hex_string),
3643 "%02X.", (U8) *str);
3644 Copy(hex_string, d, 3, char);
3645 d += 3;
3646 str++;
3647 }
3648 d--; /* We will overwrite below the final
3649 dot with a right brace */
3650 }
3651 else {
1953db30
KW
3652 STRLEN char_length; /* cur char's byte length */
3653
3654 /* and the number of bytes after this is
3655 * translated into hex digits */
3656 STRLEN output_length;
3657
3658 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3659 * for max('U+', '.'); and 1 for NUL */
3660 char hex_string[2 * UTF8_MAXBYTES + 5];
3661
3662 /* Get the first character of the result. */
a46469e6 3663 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3664 len,
3665 &char_length,
3666 UTF8_ALLOW_ANYUV);
3667 /* Convert first code point to hex, including
a46469e6 3668 * the boiler plate before it. */
1953db30
KW
3669 output_length =
3670 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3671 "\\N{U+%X",
3672 (unsigned int) uv);
1953db30
KW
3673
3674 /* Make sure there is enough space to hold it */
3675 d = off + SvGROW(sv, off
3676 + output_length
3677 + (STRLEN)(send - e)
3678 + 2); /* '}' + NUL */
3679 /* And output it */
3680 Copy(hex_string, d, output_le