This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
charnames: Eliminate need to sync code in two places
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
69#define PL_preambled (PL_parser->preambled)
70#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 71#define PL_linestr (PL_parser->linestr)
c2598295
DM
72#define PL_expect (PL_parser->expect)
73#define PL_copline (PL_parser->copline)
f06b5848
DM
74#define PL_bufptr (PL_parser->bufptr)
75#define PL_oldbufptr (PL_parser->oldbufptr)
76#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
77#define PL_linestart (PL_parser->linestart)
78#define PL_bufend (PL_parser->bufend)
79#define PL_last_uni (PL_parser->last_uni)
80#define PL_last_lop (PL_parser->last_lop)
81#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 82#define PL_lex_state (PL_parser->lex_state)
2f9285f8 83#define PL_rsfp (PL_parser->rsfp)
5486870f 84#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
85#define PL_in_my (PL_parser->in_my)
86#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 87#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 88#define PL_multi_end (PL_parser->multi_end)
13765c85 89#define PL_error_count (PL_parser->error_count)
199e78b7
DM
90
91#ifdef PERL_MAD
92# define PL_endwhite (PL_parser->endwhite)
93# define PL_faketokens (PL_parser->faketokens)
94# define PL_lasttoke (PL_parser->lasttoke)
95# define PL_nextwhite (PL_parser->nextwhite)
96# define PL_realtokenstart (PL_parser->realtokenstart)
97# define PL_skipwhite (PL_parser->skipwhite)
98# define PL_thisclose (PL_parser->thisclose)
99# define PL_thismad (PL_parser->thismad)
100# define PL_thisopen (PL_parser->thisopen)
101# define PL_thisstuff (PL_parser->thisstuff)
102# define PL_thistoken (PL_parser->thistoken)
103# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
104# define PL_thiswhite (PL_parser->thiswhite)
105# define PL_nexttoke (PL_parser->nexttoke)
106# define PL_curforce (PL_parser->curforce)
107#else
108# define PL_nexttoke (PL_parser->nexttoke)
109# define PL_nexttype (PL_parser->nexttype)
110# define PL_nextval (PL_parser->nextval)
199e78b7
DM
111#endif
112
a1894d81 113static const char* const ident_too_long = "Identifier too long";
8903cb82 114
29595ff2 115#ifdef PERL_MAD
29595ff2 116# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 117# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 118#else
5db06880 119# define CURMAD(slot,sv)
9ded7720 120# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
121#endif
122
a7aaec61
Z
123#define XENUMMASK 0x3f
124#define XFAKEEOF 0x40
125#define XFAKEBRACK 0x80
9059aa12 126
39e02b42
JH
127#ifdef USE_UTF8_SCRIPTS
128# define UTF (!IN_BYTES)
2b9d42f0 129#else
802a15e9 130# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 131#endif
a0ed51b3 132
b1fc3636
CJ
133/* The maximum number of characters preceding the unrecognized one to display */
134#define UNRECOGNIZED_PRECEDE_COUNT 10
135
61f0cdd9 136/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
137 * 1999-02-27 mjd-perl-patch@plover.com */
138#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
139
14bd96d0 140#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 141
ffb4593c
NT
142/* LEX_* are values for PL_lex_state, the state of the lexer.
143 * They are arranged oddly so that the guard on the switch statement
79072805 144 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
145 *
146 * These values refer to the various states within a sublex parse,
147 * i.e. within a double quotish string
79072805
LW
148 */
149
fb73857a 150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805 185#include "keywords.h"
fe14fcc3 186
ffb4593c
NT
187/* CLINE is a macro that ensures PL_copline has a sane value */
188
57843af0 189#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 190
5db06880 191#ifdef PERL_MAD
29595ff2
NC
192# define SKIPSPACE0(s) skipspace0(s)
193# define SKIPSPACE1(s) skipspace1(s)
194# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
195# define PEEKSPACE(s) skipspace2(s,0)
196#else
197# define SKIPSPACE0(s) skipspace(s)
198# define SKIPSPACE1(s) skipspace(s)
199# define SKIPSPACE2(s,tsv) skipspace(s)
200# define PEEKSPACE(s) skipspace(s)
201#endif
202
ffb4593c
NT
203/*
204 * Convenience functions to return different tokens and prime the
9cbb5ea2 205 * lexer for the next token. They all take an argument.
ffb4593c
NT
206 *
207 * TOKEN : generic token (used for '(', DOLSHARP, etc)
208 * OPERATOR : generic operator
209 * AOPERATOR : assignment operator
210 * PREBLOCK : beginning the block after an if, while, foreach, ...
211 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
212 * PREREF : *EXPR where EXPR is not a simple identifier
213 * TERM : expression term
89f35911 214 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
215 * LOOPX : loop exiting command (goto, last, dump, etc)
216 * FTST : file test operator
217 * FUN0 : zero-argument function
7eb971ee 218 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 219 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
220 * BOop : bitwise or or xor
221 * BAop : bitwise and
222 * SHop : shift operator
223 * PWop : power operator
9cbb5ea2 224 * PMop : pattern-matching operator
ffb4593c
NT
225 * Aop : addition-level operator
226 * Mop : multiplication-level operator
227 * Eop : equality-testing operator
e5edeb50 228 * Rop : relational operator <= != gt
ffb4593c
NT
229 *
230 * Also see LOP and lop() below.
231 */
232
998054bd 233#ifdef DEBUGGING /* Serve -DT. */
704d4215 234# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 235#else
bbf60fe6 236# define REPORT(retval) (retval)
998054bd
SC
237#endif
238
bbf60fe6
DM
239#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
240#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
241#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
242#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
243#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
244#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
245#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 246#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
6154021b
RGS
247#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
248#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
249#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 250#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
251#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
252#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
253#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
254#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
255#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
256#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
257#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
258#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
259#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
260#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 261
a687059c
LW
262/* This bit of chicanery makes a unary function followed by
263 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
264 * The UNIDOR macro is for unary functions that can be followed by the //
265 * operator (such as C<shift // 0>).
a687059c 266 */
d68ce4ac 267#define UNI3(f,x,have_x) { \
6154021b 268 pl_yylval.ival = f; \
d68ce4ac 269 if (have_x) PL_expect = x; \
376fcdbf
AL
270 PL_bufptr = s; \
271 PL_last_uni = PL_oldbufptr; \
272 PL_last_lop_op = f; \
273 if (*s == '(') \
274 return REPORT( (int)FUNC1 ); \
29595ff2 275 s = PEEKSPACE(s); \
376fcdbf
AL
276 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
277 }
d68ce4ac
FC
278#define UNI(f) UNI3(f,XTERM,1)
279#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
280#define UNIPROTO(f,optional) { \
281 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
282 OPERATOR(f); \
283 }
a687059c 284
d68ce4ac 285#define UNIBRACK(f) UNI3(f,0,0)
79072805 286
9f68db38 287/* grandfather return to old style */
78cdf107
Z
288#define OLDLOP(f) \
289 do { \
290 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
291 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
292 pl_yylval.ival = (f); \
293 PL_expect = XTERM; \
294 PL_bufptr = s; \
295 return (int)LSTOP; \
296 } while(0)
79072805 297
83944c01
FC
298#define COPLINE_INC_WITH_HERELINES \
299 STMT_START { \
300 CopLINE_inc(PL_curcop); \
851b527a
FC
301 if (PL_parser->herelines) \
302 CopLINE(PL_curcop) += PL_parser->herelines, \
303 PL_parser->herelines = 0; \
83944c01 304 } STMT_END
ffdb8b16
FC
305/* Called after scan_str to update CopLINE(PL_curcop), but only when there
306 * is no sublex_push to follow. */
307#define COPLINE_SET_FROM_MULTI_END \
308 STMT_START { \
309 CopLINE_set(PL_curcop, PL_multi_end); \
310 if (PL_multi_end != PL_multi_start) \
851b527a 311 PL_parser->herelines = 0; \
ffdb8b16 312 } STMT_END
83944c01
FC
313
314
8fa7f367
JH
315#ifdef DEBUGGING
316
6154021b 317/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
318enum token_type {
319 TOKENTYPE_NONE,
320 TOKENTYPE_IVAL,
6154021b 321 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 322 TOKENTYPE_PVAL,
aeaef349 323 TOKENTYPE_OPVAL
bbf60fe6
DM
324};
325
6d4a66ac
NC
326static struct debug_tokens {
327 const int token;
328 enum token_type type;
329 const char *name;
330} const debug_tokens[] =
9041c2e3 331{
bbf60fe6
DM
332 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
333 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
334 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
335 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
336 { ARROW, TOKENTYPE_NONE, "ARROW" },
337 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
338 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
339 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
340 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
341 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 342 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
343 { DO, TOKENTYPE_NONE, "DO" },
344 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
345 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
346 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
347 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
348 { ELSE, TOKENTYPE_NONE, "ELSE" },
349 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
350 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
351 { FOR, TOKENTYPE_IVAL, "FOR" },
352 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
353 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
354 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
355 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
356 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 357 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
358 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
359 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
360 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 361 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
362 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
363 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 364 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
365 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
366 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
367 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
368 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
369 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
370 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
371 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
372 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
373 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
374 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
375 { OROP, TOKENTYPE_IVAL, "OROP" },
376 { OROR, TOKENTYPE_NONE, "OROR" },
377 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 378 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
379 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
380 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 381 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 382 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
383 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
384 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
385 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
386 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
387 { PREINC, TOKENTYPE_NONE, "PREINC" },
388 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 389 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
390 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
391 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 392 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
393 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
394 { SUB, TOKENTYPE_NONE, "SUB" },
395 { THING, TOKENTYPE_OPVAL, "THING" },
396 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
397 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
398 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
399 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
400 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
401 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 402 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
403 { WHILE, TOKENTYPE_IVAL, "WHILE" },
404 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 405 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 406 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
407};
408
6154021b 409/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 410
bbf60fe6 411STATIC int
704d4215 412S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 413{
97aff369 414 dVAR;
7918f24d
NC
415
416 PERL_ARGS_ASSERT_TOKEREPORT;
417
bbf60fe6 418 if (DEBUG_T_TEST) {
bd61b366 419 const char *name = NULL;
bbf60fe6 420 enum token_type type = TOKENTYPE_NONE;
f54cb97a 421 const struct debug_tokens *p;
396482e1 422 SV* const report = newSVpvs("<== ");
bbf60fe6 423
f54cb97a 424 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
425 if (p->token == (int)rv) {
426 name = p->name;
427 type = p->type;
428 break;
429 }
430 }
431 if (name)
54667de8 432 Perl_sv_catpv(aTHX_ report, name);
74736ae6 433 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 434 {
bbf60fe6 435 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
436 if ((char)rv == 'p')
437 sv_catpvs(report, " (pending identifier)");
438 }
bbf60fe6 439 else if (!rv)
396482e1 440 sv_catpvs(report, "EOF");
bbf60fe6
DM
441 else
442 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
443 switch (type) {
444 case TOKENTYPE_NONE:
bbf60fe6
DM
445 break;
446 case TOKENTYPE_IVAL:
704d4215 447 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
448 break;
449 case TOKENTYPE_OPNUM:
450 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 451 PL_op_name[lvalp->ival]);
bbf60fe6
DM
452 break;
453 case TOKENTYPE_PVAL:
704d4215 454 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
455 break;
456 case TOKENTYPE_OPVAL:
704d4215 457 if (lvalp->opval) {
401441c0 458 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
459 PL_op_name[lvalp->opval->op_type]);
460 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 461 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 462 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
463 }
464
465 }
401441c0 466 else
396482e1 467 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
468 break;
469 }
b6007c36 470 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
471 };
472 return (int)rv;
998054bd
SC
473}
474
b6007c36
DM
475
476/* print the buffer with suitable escapes */
477
478STATIC void
15f169a1 479S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 480{
396482e1 481 SV* const tmp = newSVpvs("");
7918f24d
NC
482
483 PERL_ARGS_ASSERT_PRINTBUF;
484
5d37acd6 485 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 486 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 487 GCC_DIAG_RESTORE;
b6007c36
DM
488 SvREFCNT_dec(tmp);
489}
490
8fa7f367
JH
491#endif
492
8290c323
NC
493static int
494S_deprecate_commaless_var_list(pTHX) {
495 PL_expect = XTERM;
496 deprecate("comma-less variable list");
497 return REPORT(','); /* grandfather non-comma-format format */
498}
499
ffb4593c
NT
500/*
501 * S_ao
502 *
c963b151
BD
503 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
504 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
505 */
506
76e3520e 507STATIC int
cea2e8a9 508S_ao(pTHX_ int toketype)
a0d0e21e 509{
97aff369 510 dVAR;
3280af22
NIS
511 if (*PL_bufptr == '=') {
512 PL_bufptr++;
a0d0e21e 513 if (toketype == ANDAND)
6154021b 514 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 515 else if (toketype == OROR)
6154021b 516 pl_yylval.ival = OP_ORASSIGN;
c963b151 517 else if (toketype == DORDOR)
6154021b 518 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
519 toketype = ASSIGNOP;
520 }
521 return toketype;
522}
523
ffb4593c
NT
524/*
525 * S_no_op
526 * When Perl expects an operator and finds something else, no_op
527 * prints the warning. It always prints "<something> found where
528 * operator expected. It prints "Missing semicolon on previous line?"
529 * if the surprise occurs at the start of the line. "do you need to
530 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
531 * where the compiler doesn't know if foo is a method call or a function.
532 * It prints "Missing operator before end of line" if there's nothing
533 * after the missing operator, or "... before <...>" if there is something
534 * after the missing operator.
535 */
536
76e3520e 537STATIC void
15f169a1 538S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 539{
97aff369 540 dVAR;
9d4ba2ae
AL
541 char * const oldbp = PL_bufptr;
542 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 543
7918f24d
NC
544 PERL_ARGS_ASSERT_NO_OP;
545
1189a94a
GS
546 if (!s)
547 s = oldbp;
07c798fb 548 else
1189a94a 549 PL_bufptr = s;
734ab321 550 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
551 if (ckWARN_d(WARN_SYNTAX)) {
552 if (is_first)
553 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
554 "\t(Missing semicolon on previous line?)\n");
555 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 556 const char *t;
8a2bca12 557 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 558 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 559 NOOP;
56da5a46
RGS
560 if (t < PL_bufptr && isSPACE(*t))
561 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
562 "\t(Do you need to predeclare %"UTF8f"?)\n",
563 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
564 }
565 else {
566 assert(s >= oldbp);
567 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
568 "\t(Missing operator before %"UTF8f"?)\n",
569 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 570 }
07c798fb 571 }
3280af22 572 PL_bufptr = oldbp;
8990e307
LW
573}
574
ffb4593c
NT
575/*
576 * S_missingterm
577 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 578 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
579 * If we're in a delimited string and the delimiter is a control
580 * character, it's reformatted into a two-char sequence like ^C.
581 * This is fatal.
582 */
583
76e3520e 584STATIC void
cea2e8a9 585S_missingterm(pTHX_ char *s)
8990e307 586{
97aff369 587 dVAR;
8990e307
LW
588 char tmpbuf[3];
589 char q;
590 if (s) {
9d4ba2ae 591 char * const nl = strrchr(s,'\n');
d2719217 592 if (nl)
8990e307
LW
593 *nl = '\0';
594 }
ca8b19a7 595 else if ((U8) PL_multi_close < 32) {
8990e307 596 *tmpbuf = '^';
585ec06d 597 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
598 tmpbuf[2] = '\0';
599 s = tmpbuf;
600 }
601 else {
eb160463 602 *tmpbuf = (char)PL_multi_close;
8990e307
LW
603 tmpbuf[1] = '\0';
604 s = tmpbuf;
605 }
606 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 607 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 608}
79072805 609
dd0ac2b9
FC
610#include "feature.h"
611
0d863452 612/*
0d863452
RH
613 * Check whether the named feature is enabled.
614 */
26ea9e12 615bool
3fff3427 616Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 617{
97aff369 618 dVAR;
4a731d7b 619 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
620
621 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
622
623 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 624
26ea9e12
NC
625 if (namelen > MAX_FEATURE_LEN)
626 return FALSE;
3fff3427 627 memcpy(&he_name[8], name, namelen);
7d69d4a6 628
c8ca97b0
NC
629 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
630 REFCOUNTED_HE_EXISTS));
0d863452
RH
631}
632
ffb4593c 633/*
9cbb5ea2
GS
634 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
635 * utf16-to-utf8-reversed.
ffb4593c
NT
636 */
637
c39cd008
GS
638#ifdef PERL_CR_FILTER
639static void
640strip_return(SV *sv)
641{
eb578fdb
KW
642 const char *s = SvPVX_const(sv);
643 const char * const e = s + SvCUR(sv);
7918f24d
NC
644
645 PERL_ARGS_ASSERT_STRIP_RETURN;
646
c39cd008
GS
647 /* outer loop optimized to do nothing if there are no CR-LFs */
648 while (s < e) {
649 if (*s++ == '\r' && *s == '\n') {
650 /* hit a CR-LF, need to copy the rest */
eb578fdb 651 char *d = s - 1;
c39cd008
GS
652 *d++ = *s++;
653 while (s < e) {
654 if (*s == '\r' && s[1] == '\n')
655 s++;
656 *d++ = *s++;
657 }
658 SvCUR(sv) -= s - d;
659 return;
660 }
661 }
662}
a868473f 663
76e3520e 664STATIC I32
c39cd008 665S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 666{
f54cb97a 667 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
668 if (count > 0 && !maxlen)
669 strip_return(sv);
670 return count;
a868473f
NIS
671}
672#endif
673
ffb4593c 674/*
8eaa0acf
Z
675=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
676
677Creates and initialises a new lexer/parser state object, supplying
678a context in which to lex and parse from a new source of Perl code.
679A pointer to the new state object is placed in L</PL_parser>. An entry
680is made on the save stack so that upon unwinding the new state object
681will be destroyed and the former value of L</PL_parser> will be restored.
682Nothing else need be done to clean up the parsing context.
683
684The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
685non-null, provides a string (in SV form) containing code to be parsed.
686A copy of the string is made, so subsequent modification of I<line>
687does not affect parsing. I<rsfp>, if non-null, provides an input stream
688from which code will be read to be parsed. If both are non-null, the
689code in I<line> comes first and must consist of complete lines of input,
690and I<rsfp> supplies the remainder of the source.
691
e368b3bd
FC
692The I<flags> parameter is reserved for future use. Currently it is only
693used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
694
695=cut
696*/
ffb4593c 697
27fcb6ee 698/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
699 can share filters with the current parser.
700 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
701 caller, hence isn't owned by the parser, so shouldn't be closed on parser
702 destruction. This is used to handle the case of defaulting to reading the
703 script from the standard input because no filename was given on the command
704 line (without getting confused by situation where STDIN has been closed, so
705 the script handle is opened on fd 0) */
27fcb6ee 706
a0d0e21e 707void
8eaa0acf 708Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 709{
97aff369 710 dVAR;
6ef55633 711 const char *s = NULL;
5486870f 712 yy_parser *parser, *oparser;
60d63348 713 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 714 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
715
716 /* create and initialise a parser */
717
199e78b7 718 Newxz(parser, 1, yy_parser);
5486870f 719 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
720 PL_parser = parser;
721
28ac2b49
Z
722 parser->stack = NULL;
723 parser->ps = NULL;
724 parser->stack_size = 0;
acdf0a21 725
e3abe207
DM
726 /* on scope exit, free this parser and restore any outer one */
727 SAVEPARSER(parser);
7c4baf47 728 parser->saved_curcop = PL_curcop;
e3abe207 729
acdf0a21 730 /* initialise lexer state */
8990e307 731
fb205e7a
DM
732#ifdef PERL_MAD
733 parser->curforce = -1;
734#else
735 parser->nexttoke = 0;
736#endif
ca4cfd28 737 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 738 parser->copline = parser->preambling = NOLINE;
5afb0a62 739 parser->lex_state = LEX_NORMAL;
c2598295 740 parser->expect = XSTATE;
2f9285f8 741 parser->rsfp = rsfp;
27fcb6ee
FC
742 parser->rsfp_filters =
743 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
744 ? NULL
745 : MUTABLE_AV(SvREFCNT_inc(
746 oparser->rsfp_filters
747 ? oparser->rsfp_filters
748 : (oparser->rsfp_filters = newAV())
749 ));
2f9285f8 750
199e78b7
DM
751 Newx(parser->lex_brackstack, 120, char);
752 Newx(parser->lex_casestack, 12, char);
753 *parser->lex_casestack = '\0';
d794b522 754 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 755
10efb74f 756 if (line) {
0528fd32 757 STRLEN len;
10efb74f 758 s = SvPV_const(line, len);
0abcdfa4
FC
759 parser->linestr = flags & LEX_START_COPIED
760 ? SvREFCNT_inc_simple_NN(line)
761 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 762 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 763 } else {
bf1b738b 764 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 765 }
f06b5848
DM
766 parser->oldoldbufptr =
767 parser->oldbufptr =
768 parser->bufptr =
769 parser->linestart = SvPVX(parser->linestr);
770 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
771 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
772
773 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
774 |LEX_DONT_CLOSE_RSFP));
775 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
776 |LEX_DONT_CLOSE_RSFP));
737c24fc 777
60d63348 778 parser->in_pod = parser->filtered = 0;
79072805 779}
a687059c 780
e3abe207
DM
781
782/* delete a parser object */
783
784void
785Perl_parser_free(pTHX_ const yy_parser *parser)
786{
7918f24d
NC
787 PERL_ARGS_ASSERT_PARSER_FREE;
788
7c4baf47 789 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
790 SvREFCNT_dec(parser->linestr);
791
87606032 792 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 793 PerlIO_clearerr(parser->rsfp);
799361c3
SH
794 else if (parser->rsfp && (!parser->old_parser ||
795 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 796 PerlIO_close(parser->rsfp);
5486870f 797 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
798 SvREFCNT_dec(parser->lex_stuff);
799 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
800
801 Safefree(parser->lex_brackstack);
802 Safefree(parser->lex_casestack);
803 Safefree(parser->lex_shared);
804 PL_parser = parser->old_parser;
805 Safefree(parser);
806}
807
808void
809Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
810{
811#ifdef PERL_MAD
812 I32 nexttoke = parser->lasttoke;
813#else
814 I32 nexttoke = parser->nexttoke;
815#endif
816 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9
FC
817 while (nexttoke--) {
818#ifdef PERL_MAD
819 if (S_is_opval_token(parser->nexttoke[nexttoke].next_type
3ac7ff8f
FC
820 & 0xffff)
821 && parser->nexttoke[nexttoke].next_val.opval
822 && parser->nexttoke[nexttoke].next_val.opval->op_slabbed
823 && OpSLAB(parser->nexttoke[nexttoke].next_val.opval) == slab) {
824 op_free(parser->nexttoke[nexttoke].next_val.opval);
825 parser->nexttoke[nexttoke].next_val.opval = NULL;
826 }
3ce3dcd9 827#else
3ac7ff8f
FC
828 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
829 && parser->nextval[nexttoke].opval
830 && parser->nextval[nexttoke].opval->op_slabbed
831 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 832 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
833 parser->nextval[nexttoke].opval = NULL;
834 }
3ce3dcd9
FC
835#endif
836 }
e3abe207
DM
837}
838
839
ffb4593c 840/*
f0e67a1d
Z
841=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
842
843Buffer scalar containing the chunk currently under consideration of the
844text currently being lexed. This is always a plain string scalar (for
845which C<SvPOK> is true). It is not intended to be used as a scalar by
846normal scalar means; instead refer to the buffer directly by the pointer
847variables described below.
848
849The lexer maintains various C<char*> pointers to things in the
850C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
851reallocated, all of these pointers must be updated. Don't attempt to
852do this manually, but rather use L</lex_grow_linestr> if you need to
853reallocate the buffer.
854
855The content of the text chunk in the buffer is commonly exactly one
856complete line of input, up to and including a newline terminator,
857but there are situations where it is otherwise. The octets of the
858buffer may be intended to be interpreted as either UTF-8 or Latin-1.
859The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
860flag on this scalar, which may disagree with it.
861
862For direct examination of the buffer, the variable
863L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
864lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
865of these pointers is usually preferable to examination of the scalar
866through normal scalar means.
867
868=for apidoc AmxU|char *|PL_parser-E<gt>bufend
869
870Direct pointer to the end of the chunk of text currently being lexed, the
871end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
872+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
873always located at the end of the buffer, and does not count as part of
874the buffer's contents.
875
876=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
877
878Points to the current position of lexing inside the lexer buffer.
879Characters around this point may be freely examined, within
880the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
881L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
882interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
883
884Lexing code (whether in the Perl core or not) moves this pointer past
885the characters that it consumes. It is also expected to perform some
886bookkeeping whenever a newline character is consumed. This movement
887can be more conveniently performed by the function L</lex_read_to>,
888which handles newlines appropriately.
889
890Interpretation of the buffer's octets can be abstracted out by
891using the slightly higher-level functions L</lex_peek_unichar> and
892L</lex_read_unichar>.
893
894=for apidoc AmxU|char *|PL_parser-E<gt>linestart
895
896Points to the start of the current line inside the lexer buffer.
897This is useful for indicating at which column an error occurred, and
898not much else. This must be updated by any lexing code that consumes
899a newline; the function L</lex_read_to> handles this detail.
900
901=cut
902*/
903
904/*
905=for apidoc Amx|bool|lex_bufutf8
906
907Indicates whether the octets in the lexer buffer
908(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
909of Unicode characters. If not, they should be interpreted as Latin-1
910characters. This is analogous to the C<SvUTF8> flag for scalars.
911
912In UTF-8 mode, it is not guaranteed that the lexer buffer actually
913contains valid UTF-8. Lexing code must be robust in the face of invalid
914encoding.
915
916The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
917is significant, but not the whole story regarding the input character
918encoding. Normally, when a file is being read, the scalar contains octets
919and its C<SvUTF8> flag is off, but the octets should be interpreted as
920UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
921however, the scalar may have the C<SvUTF8> flag on, and in this case its
922octets should be interpreted as UTF-8 unless the C<use bytes> pragma
923is in effect. This logic may change in the future; use this function
924instead of implementing the logic yourself.
925
926=cut
927*/
928
929bool
930Perl_lex_bufutf8(pTHX)
931{
932 return UTF;
933}
934
935/*
936=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
937
938Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
939at least I<len> octets (including terminating NUL). Returns a
940pointer to the reallocated buffer. This is necessary before making
941any direct modification of the buffer that would increase its length.
942L</lex_stuff_pvn> provides a more convenient way to insert text into
943the buffer.
944
945Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
946this function updates all of the lexer's variables that point directly
947into the buffer.
948
949=cut
950*/
951
952char *
953Perl_lex_grow_linestr(pTHX_ STRLEN len)
954{
955 SV *linestr;
956 char *buf;
957 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 958 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
959 linestr = PL_parser->linestr;
960 buf = SvPVX(linestr);
961 if (len <= SvLEN(linestr))
962 return buf;
963 bufend_pos = PL_parser->bufend - buf;
964 bufptr_pos = PL_parser->bufptr - buf;
965 oldbufptr_pos = PL_parser->oldbufptr - buf;
966 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
967 linestart_pos = PL_parser->linestart - buf;
968 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
969 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
970 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
971 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 972
f0e67a1d 973 buf = sv_grow(linestr, len);
c7641931 974
f0e67a1d
Z
975 PL_parser->bufend = buf + bufend_pos;
976 PL_parser->bufptr = buf + bufptr_pos;
977 PL_parser->oldbufptr = buf + oldbufptr_pos;
978 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
979 PL_parser->linestart = buf + linestart_pos;
980 if (PL_parser->last_uni)
981 PL_parser->last_uni = buf + last_uni_pos;
982 if (PL_parser->last_lop)
983 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
984 if (PL_parser->lex_shared->re_eval_start)
985 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
986 return buf;
987}
988
989/*
83aa740e 990=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
991
992Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
993immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
994reallocating the buffer if necessary. This means that lexing code that
995runs later will see the characters as if they had appeared in the input.
996It is not recommended to do this as part of normal parsing, and most
997uses of this facility run the risk of the inserted characters being
998interpreted in an unintended manner.
999
1000The string to be inserted is represented by I<len> octets starting
1001at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1002according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
1003The characters are recoded for the lexer buffer, according to how the
1004buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1005to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1006function is more convenient.
1007
1008=cut
1009*/
1010
1011void
83aa740e 1012Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 1013{
749123ff 1014 dVAR;
f0e67a1d
Z
1015 char *bufptr;
1016 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1017 if (flags & ~(LEX_STUFF_UTF8))
1018 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1019 if (UTF) {
1020 if (flags & LEX_STUFF_UTF8) {
1021 goto plain_copy;
1022 } else {
54d004e8 1023 STRLEN highhalf = 0; /* Count of variants */
83aa740e 1024 const char *p, *e = pv+len;
54d004e8
KW
1025 for (p = pv; p != e; p++) {
1026 if (! UTF8_IS_INVARIANT(*p)) {
1027 highhalf++;
1028 }
1029 }
f0e67a1d
Z
1030 if (!highhalf)
1031 goto plain_copy;
1032 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1033 bufptr = PL_parser->bufptr;
1034 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1035 SvCUR_set(PL_parser->linestr,
1036 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1037 PL_parser->bufend += len+highhalf;
1038 for (p = pv; p != e; p++) {
1039 U8 c = (U8)*p;
54d004e8
KW
1040 if (! UTF8_IS_INVARIANT(c)) {
1041 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1042 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
1043 } else {
1044 *bufptr++ = (char)c;
1045 }
1046 }
1047 }
1048 } else {
1049 if (flags & LEX_STUFF_UTF8) {
1050 STRLEN highhalf = 0;
83aa740e 1051 const char *p, *e = pv+len;
f0e67a1d
Z
1052 for (p = pv; p != e; p++) {
1053 U8 c = (U8)*p;
54d004e8 1054 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1055 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1056 "non-Latin-1 character into Latin-1 input");
54d004e8 1057 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1058 p++;
1059 highhalf++;
54d004e8 1060 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1061 /* malformed UTF-8 */
1062 ENTER;
1063 SAVESPTR(PL_warnhook);
1064 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1065 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1066 LEAVE;
1067 }
1068 }
1069 if (!highhalf)
1070 goto plain_copy;
1071 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1072 bufptr = PL_parser->bufptr;
1073 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1074 SvCUR_set(PL_parser->linestr,
1075 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1076 PL_parser->bufend += len-highhalf;
54d004e8
KW
1077 p = pv;
1078 while (p < e) {
1079 if (UTF8_IS_INVARIANT(*p)) {
1080 *bufptr++ = *p;
1081 p++;
f0e67a1d 1082 }
54d004e8
KW
1083 else {
1084 assert(p < e -1 );
94bb8c36 1085 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1086 p += 2;
1087 }
f0e67a1d
Z
1088 }
1089 } else {
54d004e8 1090 plain_copy:
f0e67a1d
Z
1091 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1092 bufptr = PL_parser->bufptr;
1093 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1094 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1095 PL_parser->bufend += len;
1096 Copy(pv, bufptr, len, char);
1097 }
1098 }
1099}
1100
1101/*
9dcc53ea
Z
1102=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1103
1104Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1105immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1106reallocating the buffer if necessary. This means that lexing code that
1107runs later will see the characters as if they had appeared in the input.
1108It is not recommended to do this as part of normal parsing, and most
1109uses of this facility run the risk of the inserted characters being
1110interpreted in an unintended manner.
1111
1112The string to be inserted is represented by octets starting at I<pv>
1113and continuing to the first nul. These octets are interpreted as either
1114UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1115in I<flags>. The characters are recoded for the lexer buffer, according
1116to how the buffer is currently being interpreted (L</lex_bufutf8>).
1117If it is not convenient to nul-terminate a string to be inserted, the
1118L</lex_stuff_pvn> function is more appropriate.
1119
1120=cut
1121*/
1122
1123void
1124Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1125{
1126 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1127 lex_stuff_pvn(pv, strlen(pv), flags);
1128}
1129
1130/*
f0e67a1d
Z
1131=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1132
1133Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1134immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1135reallocating the buffer if necessary. This means that lexing code that
1136runs later will see the characters as if they had appeared in the input.
1137It is not recommended to do this as part of normal parsing, and most
1138uses of this facility run the risk of the inserted characters being
1139interpreted in an unintended manner.
1140
1141The string to be inserted is the string value of I<sv>. The characters
1142are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1143being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1144not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1145need to construct a scalar.
1146
1147=cut
1148*/
1149
1150void
1151Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1152{
1153 char *pv;
1154 STRLEN len;
1155 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1156 if (flags)
1157 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1158 pv = SvPV(sv, len);
1159 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1160}
1161
1162/*
1163=for apidoc Amx|void|lex_unstuff|char *ptr
1164
1165Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1166I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1167This hides the discarded text from any lexing code that runs later,
1168as if the text had never appeared.
1169
1170This is not the normal way to consume lexed text. For that, use
1171L</lex_read_to>.
1172
1173=cut
1174*/
1175
1176void
1177Perl_lex_unstuff(pTHX_ char *ptr)
1178{
1179 char *buf, *bufend;
1180 STRLEN unstuff_len;
1181 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1182 buf = PL_parser->bufptr;
1183 if (ptr < buf)
1184 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1185 if (ptr == buf)
1186 return;
1187 bufend = PL_parser->bufend;
1188 if (ptr > bufend)
1189 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1190 unstuff_len = ptr - buf;
1191 Move(ptr, buf, bufend+1-ptr, char);
1192 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1193 PL_parser->bufend = bufend - unstuff_len;
1194}
1195
1196/*
1197=for apidoc Amx|void|lex_read_to|char *ptr
1198
1199Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1200to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1201performing the correct bookkeeping whenever a newline character is passed.
1202This is the normal way to consume lexed text.
1203
1204Interpretation of the buffer's octets can be abstracted out by
1205using the slightly higher-level functions L</lex_peek_unichar> and
1206L</lex_read_unichar>.
1207
1208=cut
1209*/
1210
1211void
1212Perl_lex_read_to(pTHX_ char *ptr)
1213{
1214 char *s;
1215 PERL_ARGS_ASSERT_LEX_READ_TO;
1216 s = PL_parser->bufptr;
1217 if (ptr < s || ptr > PL_parser->bufend)
1218 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1219 for (; s != ptr; s++)
1220 if (*s == '\n') {
83944c01 1221 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1222 PL_parser->linestart = s+1;
1223 }
1224 PL_parser->bufptr = ptr;
1225}
1226
1227/*
1228=for apidoc Amx|void|lex_discard_to|char *ptr
1229
1230Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1231up to I<ptr>. The remaining content of the buffer will be moved, and
1232all pointers into the buffer updated appropriately. I<ptr> must not
1233be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1234it is not permitted to discard text that has yet to be lexed.
1235
1236Normally it is not necessarily to do this directly, because it suffices to
1237use the implicit discarding behaviour of L</lex_next_chunk> and things
1238based on it. However, if a token stretches across multiple lines,
1f317c95 1239and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1240that purpose, then after completion of the token it would be wise to
1241explicitly discard the now-unneeded earlier lines, to avoid future
1242multi-line tokens growing the buffer without bound.
1243
1244=cut
1245*/
1246
1247void
1248Perl_lex_discard_to(pTHX_ char *ptr)
1249{
1250 char *buf;
1251 STRLEN discard_len;
1252 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1253 buf = SvPVX(PL_parser->linestr);
1254 if (ptr < buf)
1255 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1256 if (ptr == buf)
1257 return;
1258 if (ptr > PL_parser->bufptr)
1259 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1260 discard_len = ptr - buf;
1261 if (PL_parser->oldbufptr < ptr)
1262 PL_parser->oldbufptr = ptr;
1263 if (PL_parser->oldoldbufptr < ptr)
1264 PL_parser->oldoldbufptr = ptr;
1265 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1266 PL_parser->last_uni = NULL;
1267 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1268 PL_parser->last_lop = NULL;
1269 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1270 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1271 PL_parser->bufend -= discard_len;
1272 PL_parser->bufptr -= discard_len;
1273 PL_parser->oldbufptr -= discard_len;
1274 PL_parser->oldoldbufptr -= discard_len;
1275 if (PL_parser->last_uni)
1276 PL_parser->last_uni -= discard_len;
1277 if (PL_parser->last_lop)
1278 PL_parser->last_lop -= discard_len;
1279}
1280
1281/*
1282=for apidoc Amx|bool|lex_next_chunk|U32 flags
1283
1284Reads in the next chunk of text to be lexed, appending it to
1285L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1286looked to the end of the current chunk and wants to know more. It is
1287usual, but not necessary, for lexing to have consumed the entirety of
1288the current chunk at this time.
1289
1290If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1291chunk (i.e., the current chunk has been entirely consumed), normally the
1292current chunk will be discarded at the same time that the new chunk is
1293read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1294will not be discarded. If the current chunk has not been entirely
1295consumed, then it will not be discarded regardless of the flag.
1296
1297Returns true if some new text was added to the buffer, or false if the
1298buffer has reached the end of the input text.
1299
1300=cut
1301*/
1302
1303#define LEX_FAKE_EOF 0x80000000
112d1284 1304#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1305
1306bool
1307Perl_lex_next_chunk(pTHX_ U32 flags)
1308{
1309 SV *linestr;
1310 char *buf;
1311 STRLEN old_bufend_pos, new_bufend_pos;
1312 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1313 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1314 bool got_some_for_debugger = 0;
f0e67a1d 1315 bool got_some;
112d1284 1316 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1317 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1318 linestr = PL_parser->linestr;
1319 buf = SvPVX(linestr);
1320 if (!(flags & LEX_KEEP_PREVIOUS) &&
1321 PL_parser->bufptr == PL_parser->bufend) {
1322 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1323 linestart_pos = 0;
1324 if (PL_parser->last_uni != PL_parser->bufend)
1325 PL_parser->last_uni = NULL;
1326 if (PL_parser->last_lop != PL_parser->bufend)
1327 PL_parser->last_lop = NULL;
1328 last_uni_pos = last_lop_pos = 0;
1329 *buf = 0;
1330 SvCUR(linestr) = 0;
1331 } else {
1332 old_bufend_pos = PL_parser->bufend - buf;
1333 bufptr_pos = PL_parser->bufptr - buf;
1334 oldbufptr_pos = PL_parser->oldbufptr - buf;
1335 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1336 linestart_pos = PL_parser->linestart - buf;
1337 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1338 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1339 }
1340 if (flags & LEX_FAKE_EOF) {
1341 goto eof;
60d63348 1342 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1343 got_some = 0;
1344 } else if (filter_gets(linestr, old_bufend_pos)) {
1345 got_some = 1;
17cc9359 1346 got_some_for_debugger = 1;
112d1284
FC
1347 } else if (flags & LEX_NO_TERM) {
1348 got_some = 0;
f0e67a1d 1349 } else {
580561a3
Z
1350 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1351 sv_setpvs(linestr, "");
f0e67a1d
Z
1352 eof:
1353 /* End of real input. Close filehandle (unless it was STDIN),
1354 * then add implicit termination.
1355 */
87606032 1356 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1357 PerlIO_clearerr(PL_parser->rsfp);
1358 else if (PL_parser->rsfp)
1359 (void)PerlIO_close(PL_parser->rsfp);
1360 PL_parser->rsfp = NULL;
60d63348 1361 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1362#ifdef PERL_MAD
1363 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1364 PL_faketokens = 1;
1365#endif
1366 if (!PL_in_eval && PL_minus_p) {
1367 sv_catpvs(linestr,
1368 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1369 PL_minus_n = PL_minus_p = 0;
1370 } else if (!PL_in_eval && PL_minus_n) {
1371 sv_catpvs(linestr, /*{*/";}");
1372 PL_minus_n = 0;
1373 } else
1374 sv_catpvs(linestr, ";");
1375 got_some = 1;
1376 }
1377 buf = SvPVX(linestr);
1378 new_bufend_pos = SvCUR(linestr);
1379 PL_parser->bufend = buf + new_bufend_pos;
1380 PL_parser->bufptr = buf + bufptr_pos;
1381 PL_parser->oldbufptr = buf + oldbufptr_pos;
1382 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1383 PL_parser->linestart = buf + linestart_pos;
1384 if (PL_parser->last_uni)
1385 PL_parser->last_uni = buf + last_uni_pos;
1386 if (PL_parser->last_lop)
1387 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1388 if (PL_parser->preambling != NOLINE) {
1389 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1390 PL_parser->preambling = NOLINE;
1391 }
17cc9359 1392 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1393 PL_curstash != PL_debstash) {
1394 /* debugger active and we're not compiling the debugger code,
1395 * so store the line into the debugger's array of lines
1396 */
1397 update_debugger_info(NULL, buf+old_bufend_pos,
1398 new_bufend_pos-old_bufend_pos);
1399 }
1400 return got_some;
1401}
1402
1403/*
1404=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1405
1406Looks ahead one (Unicode) character in the text currently being lexed.
1407Returns the codepoint (unsigned integer value) of the next character,
1408or -1 if lexing has reached the end of the input text. To consume the
1409peeked character, use L</lex_read_unichar>.
1410
1411If the next character is in (or extends into) the next chunk of input
1412text, the next chunk will be read in. Normally the current chunk will be
1413discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1414then the current chunk will not be discarded.
1415
1416If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1417is encountered, an exception is generated.
1418
1419=cut
1420*/
1421
1422I32
1423Perl_lex_peek_unichar(pTHX_ U32 flags)
1424{
749123ff 1425 dVAR;
f0e67a1d
Z
1426 char *s, *bufend;
1427 if (flags & ~(LEX_KEEP_PREVIOUS))
1428 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1429 s = PL_parser->bufptr;
1430 bufend = PL_parser->bufend;
1431 if (UTF) {
1432 U8 head;
1433 I32 unichar;
1434 STRLEN len, retlen;
1435 if (s == bufend) {
1436 if (!lex_next_chunk(flags))
1437 return -1;
1438 s = PL_parser->bufptr;
1439 bufend = PL_parser->bufend;
1440 }
1441 head = (U8)*s;
54d004e8 1442 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1443 return head;
54d004e8
KW
1444 if (UTF8_IS_START(head)) {
1445 len = UTF8SKIP(&head);
f0e67a1d
Z
1446 while ((STRLEN)(bufend-s) < len) {
1447 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1448 break;
1449 s = PL_parser->bufptr;
1450 bufend = PL_parser->bufend;
1451 }
1452 }
c80e42f3 1453 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1454 if (retlen == (STRLEN)-1) {
1455 /* malformed UTF-8 */
1456 ENTER;
1457 SAVESPTR(PL_warnhook);
1458 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1459 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1460 LEAVE;
1461 }
1462 return unichar;
1463 } else {
1464 if (s == bufend) {
1465 if (!lex_next_chunk(flags))
1466 return -1;
1467 s = PL_parser->bufptr;
1468 }
1469 return (U8)*s;
1470 }
1471}
1472
1473/*
1474=for apidoc Amx|I32|lex_read_unichar|U32 flags
1475
1476Reads the next (Unicode) character in the text currently being lexed.
1477Returns the codepoint (unsigned integer value) of the character read,
1478and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1479if lexing has reached the end of the input text. To non-destructively
1480examine the next character, use L</lex_peek_unichar> instead.
1481
1482If the next character is in (or extends into) the next chunk of input
1483text, the next chunk will be read in. Normally the current chunk will be
1484discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1485then the current chunk will not be discarded.
1486
1487If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1488is encountered, an exception is generated.
1489
1490=cut
1491*/
1492
1493I32
1494Perl_lex_read_unichar(pTHX_ U32 flags)
1495{
1496 I32 c;
1497 if (flags & ~(LEX_KEEP_PREVIOUS))
1498 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1499 c = lex_peek_unichar(flags);
1500 if (c != -1) {
1501 if (c == '\n')
83944c01 1502 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1503 if (UTF)
1504 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1505 else
1506 ++(PL_parser->bufptr);
f0e67a1d
Z
1507 }
1508 return c;
1509}
1510
1511/*
1512=for apidoc Amx|void|lex_read_space|U32 flags
1513
1514Reads optional spaces, in Perl style, in the text currently being
1515lexed. The spaces may include ordinary whitespace characters and
1516Perl-style comments. C<#line> directives are processed if encountered.
1517L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1518at a non-space character (or the end of the input text).
1519
1520If spaces extend into the next chunk of input text, the next chunk will
1521be read in. Normally the current chunk will be discarded at the same
1522time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1523chunk will not be discarded.
1524
1525=cut
1526*/
1527
21791330 1528#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1529#define LEX_NO_NEXT_CHUNK 0x80000000
1530
f0e67a1d
Z
1531void
1532Perl_lex_read_space(pTHX_ U32 flags)
1533{
1534 char *s, *bufend;
21791330 1535 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1536 bool need_incline = 0;
21791330 1537 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d
Z
1538 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1539#ifdef PERL_MAD
1540 if (PL_skipwhite) {
1541 sv_free(PL_skipwhite);
1542 PL_skipwhite = NULL;
1543 }
1544 if (PL_madskills)
1545 PL_skipwhite = newSVpvs("");
1546#endif /* PERL_MAD */
1547 s = PL_parser->bufptr;
1548 bufend = PL_parser->bufend;
1549 while (1) {
1550 char c = *s;
1551 if (c == '#') {
1552 do {
1553 c = *++s;
1554 } while (!(c == '\n' || (c == 0 && s == bufend)));
1555 } else if (c == '\n') {
1556 s++;
21791330
FC
1557 if (can_incline) {
1558 PL_parser->linestart = s;
1559 if (s == bufend)
1560 need_incline = 1;
1561 else
1562 incline(s);
1563 }
f0e67a1d
Z
1564 } else if (isSPACE(c)) {
1565 s++;
1566 } else if (c == 0 && s == bufend) {
1567 bool got_more;
65c68e17 1568 line_t l;
f0e67a1d
Z
1569#ifdef PERL_MAD
1570 if (PL_madskills)
1571 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1572#endif /* PERL_MAD */
f0998909
Z
1573 if (flags & LEX_NO_NEXT_CHUNK)
1574 break;
f0e67a1d 1575 PL_parser->bufptr = s;
65c68e17 1576 l = CopLINE(PL_curcop);
851b527a 1577 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1578 got_more = lex_next_chunk(flags);
65c68e17 1579 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1580 s = PL_parser->bufptr;
1581 bufend = PL_parser->bufend;
1582 if (!got_more)
1583 break;
21791330 1584 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1585 incline(s);
1586 need_incline = 0;
1587 }
1588 } else {
1589 break;
1590 }
1591 }
1592#ifdef PERL_MAD
1593 if (PL_madskills)
1594 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1595#endif /* PERL_MAD */
1596 PL_parser->bufptr = s;
1597}
1598
1599/*
fe788d6b
PM
1600
1601=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1602
1603This function performs syntax checking on a prototype, C<proto>.
1604If C<warn> is true, any illegal characters or mismatched brackets
1605will trigger illegalproto warnings, declaring that they were
1606detected in the prototype for C<name>.
1607
1608The return value is C<true> if this is a valid prototype, and
1609C<false> if it is not, regardless of whether C<warn> was C<true> or
1610C<false>.
1611
1612Note that C<NULL> is a valid C<proto> and will always return C<true>.
1613
1614=cut
1615
1616 */
1617
1618bool
1619Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1620{
1621 STRLEN len, origlen;
1622 char *p = proto ? SvPV(proto, len) : NULL;
1623 bool bad_proto = FALSE;
1624 bool in_brackets = FALSE;
1625 bool after_slash = FALSE;
1626 char greedy_proto = ' ';
1627 bool proto_after_greedy_proto = FALSE;
1628 bool must_be_last = FALSE;
1629 bool underscore = FALSE;
f791a21a 1630 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1631
1632 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1633
1634 if (!proto)
1635 return TRUE;
1636
1637 origlen = len;
1638 for (; len--; p++) {
1639 if (!isSPACE(*p)) {
1640 if (must_be_last)
1641 proto_after_greedy_proto = TRUE;
f791a21a
PM
1642 if (underscore) {
1643 if (!strchr(";@%", *p))
1644 bad_proto_after_underscore = TRUE;
1645 underscore = FALSE;
1646 }
fe788d6b
PM
1647 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1648 bad_proto = TRUE;
1649 }
1650 else {
fe788d6b
PM
1651 if (*p == '[')
1652 in_brackets = TRUE;
1653 else if (*p == ']')
1654 in_brackets = FALSE;
1655 else if ((*p == '@' || *p == '%') &&
1656 !after_slash &&
1657 !in_brackets ) {
1658 must_be_last = TRUE;
1659 greedy_proto = *p;
1660 }
1661 else if (*p == '_')
f791a21a 1662 underscore = TRUE;
fe788d6b
PM
1663 }
1664 if (*p == '\\')
1665 after_slash = TRUE;
1666 else
1667 after_slash = FALSE;
1668 }
1669 }
1670
1671 if (warn) {
b54d603d 1672 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1673 p -= origlen;
b54d603d
PM
1674 p = SvUTF8(proto)
1675 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1676 origlen, UNI_DISPLAY_ISPRINT)
1677 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1678
fe788d6b
PM
1679 if (proto_after_greedy_proto)
1680 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1681 "Prototype after '%c' for %"SVf" : %s",
1682 greedy_proto, SVfARG(name), p);
50278ed0
PM
1683 if (in_brackets)
1684 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1685 "Missing ']' in prototype for %"SVf" : %s",
1686 SVfARG(name), p);
b54d603d 1687 if (bad_proto)
fe788d6b 1688 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1689 "Illegal character in prototype for %"SVf" : %s",
1690 SVfARG(name), p);
1691 if (bad_proto_after_underscore)
1692 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1693 "Illegal character after '_' in prototype for %"SVf" : %s",
1694 SVfARG(name), p);
fe788d6b
PM
1695 }
1696
1697 return (! (proto_after_greedy_proto || bad_proto) );
1698}
1699
1700/*
ffb4593c
NT
1701 * S_incline
1702 * This subroutine has nothing to do with tilting, whether at windmills
1703 * or pinball tables. Its name is short for "increment line". It
57843af0 1704 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1705 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1706 * # line 500 "foo.pm"
1707 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1708 */
1709
76e3520e 1710STATIC void
d9095cec 1711S_incline(pTHX_ const char *s)
463ee0b2 1712{
97aff369 1713 dVAR;
d9095cec
NC
1714 const char *t;
1715 const char *n;
1716 const char *e;
8818d409 1717 line_t line_num;
463ee0b2 1718
7918f24d
NC
1719 PERL_ARGS_ASSERT_INCLINE;
1720
83944c01 1721 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1722 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1723 && s+1 == PL_bufend && *s == ';') {
1724 /* fake newline in string eval */
1725 CopLINE_dec(PL_curcop);
1726 return;
1727 }
463ee0b2
LW
1728 if (*s++ != '#')
1729 return;
d4c19fe8
AL
1730 while (SPACE_OR_TAB(*s))
1731 s++;
73659bf1
GS
1732 if (strnEQ(s, "line", 4))
1733 s += 4;
1734 else
1735 return;
084592ab 1736 if (SPACE_OR_TAB(*s))
73659bf1 1737 s++;
4e553d73 1738 else
73659bf1 1739 return;
d4c19fe8
AL
1740 while (SPACE_OR_TAB(*s))
1741 s++;
463ee0b2
LW
1742 if (!isDIGIT(*s))
1743 return;
d4c19fe8 1744
463ee0b2
LW
1745 n = s;
1746 while (isDIGIT(*s))
1747 s++;
07714eb4 1748 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1749 return;
bf4acbe4 1750 while (SPACE_OR_TAB(*s))
463ee0b2 1751 s++;
73659bf1 1752 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1753 s++;
73659bf1
GS
1754 e = t + 1;
1755 }
463ee0b2 1756 else {
c35e046a
AL
1757 t = s;
1758 while (!isSPACE(*t))
1759 t++;
73659bf1 1760 e = t;
463ee0b2 1761 }
bf4acbe4 1762 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1763 e++;
1764 if (*e != '\n' && *e != '\0')
1765 return; /* false alarm */
1766
8818d409
FC
1767 line_num = atoi(n)-1;
1768
f4dd75d9 1769 if (t - s > 0) {
d9095cec 1770 const STRLEN len = t - s;
3df32bda 1771
d36ee5be 1772 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1773 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1774 * to *{"::_<newfilename"} */
44867030
NC
1775 /* However, the long form of evals is only turned on by the
1776 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1777 GV * const cfgv = CopFILEGV(PL_curcop);
1778 if (cfgv) {
38bd7ad8
FC
1779 char smallbuf[128];
1780 STRLEN tmplen2 = len;
44867030 1781 char *tmpbuf2;
449dd039 1782 GV *gv2;
44867030
NC
1783
1784 if (tmplen2 + 2 <= sizeof smallbuf)
1785 tmpbuf2 = smallbuf;
1786 else
1787 Newx(tmpbuf2, tmplen2 + 2, char);
1788
38bd7ad8
FC
1789 tmpbuf2[0] = '_';
1790 tmpbuf2[1] = '<';
44867030
NC
1791
1792 memcpy(tmpbuf2 + 2, s, tmplen2);
1793 tmplen2 += 2;
1794
8a5ee598 1795 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1796 if (!isGV(gv2)) {
8a5ee598 1797 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1798 /* adjust ${"::_<newfilename"} to store the new file name */
1799 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1800 /* The line number may differ. If that is the case,
1801 alias the saved lines that are in the array.
1802 Otherwise alias the whole array. */
1803 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1804 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1805 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1806 }
38bd7ad8
FC
1807 else if (GvAV(cfgv)) {
1808 AV * const av = GvAV(cfgv);
8818d409
FC
1809 const I32 start = CopLINE(PL_curcop)+1;
1810 I32 items = AvFILLp(av) - start;
1811 if (items > 0) {
1812 AV * const av2 = GvAVn(gv2);
1813 SV **svp = AvARRAY(av) + start;
1814 I32 l = (I32)line_num+1;
1815 while (items--)
1816 av_store(av2, l++, SvREFCNT_inc(*svp++));
1817 }
1818 }
e5527e4b 1819 }
44867030
NC
1820
1821 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1822 }
e66cf94c 1823 }
05ec9bb3 1824 CopFILE_free(PL_curcop);
449dd039 1825 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1826 }
8818d409 1827 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1828}
1829
21791330
FC
1830#define skipspace(s) skipspace_flags(s, 0)
1831
29595ff2 1832#ifdef PERL_MAD
cd81e915 1833/* skip space before PL_thistoken */
29595ff2
NC
1834
1835STATIC char *
5aaab254 1836S_skipspace0(pTHX_ char *s)
29595ff2 1837{
7918f24d
NC
1838 PERL_ARGS_ASSERT_SKIPSPACE0;
1839
29595ff2
NC
1840 s = skipspace(s);
1841 if (!PL_madskills)
1842 return s;
cd81e915
NC
1843 if (PL_skipwhite) {
1844 if (!PL_thiswhite)
6b29d1f5 1845 PL_thiswhite = newSVpvs("");
cd81e915
NC
1846 sv_catsv(PL_thiswhite, PL_skipwhite);
1847 sv_free(PL_skipwhite);
1848 PL_skipwhite = 0;
1849 }
1850 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1851 return s;
1852}
1853
cd81e915 1854/* skip space after PL_thistoken */
29595ff2
NC
1855
1856STATIC char *
5aaab254 1857S_skipspace1(pTHX_ char *s)
29595ff2 1858{
d4c19fe8 1859 const char *start = s;
29595ff2
NC
1860 I32 startoff = start - SvPVX(PL_linestr);
1861
7918f24d
NC
1862 PERL_ARGS_ASSERT_SKIPSPACE1;
1863
29595ff2
NC
1864 s = skipspace(s);
1865 if (!PL_madskills)
1866 return s;
1867 start = SvPVX(PL_linestr) + startoff;
cd81e915 1868 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1869 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1870 PL_thistoken = newSVpvn(tstart, start - tstart);
1871 }
1872 PL_realtokenstart = -1;
1873 if (PL_skipwhite) {
1874 if (!PL_nextwhite)
6b29d1f5 1875 PL_nextwhite = newSVpvs("");
cd81e915
NC
1876 sv_catsv(PL_nextwhite, PL_skipwhite);
1877 sv_free(PL_skipwhite);
1878 PL_skipwhite = 0;
29595ff2
NC
1879 }
1880 return s;
1881}
1882
1883STATIC char *
5aaab254 1884S_skipspace2(pTHX_ char *s, SV **svp)
29595ff2 1885{
c35e046a 1886 char *start;
c35e046a
AL
1887 const I32 startoff = s - SvPVX(PL_linestr);
1888
7918f24d
NC
1889 PERL_ARGS_ASSERT_SKIPSPACE2;
1890
29595ff2 1891 s = skipspace(s);
29595ff2
NC
1892 if (!PL_madskills || !svp)
1893 return s;
1894 start = SvPVX(PL_linestr) + startoff;
cd81e915 1895 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1896 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1897 PL_thistoken = newSVpvn(tstart, start - tstart);
1898 PL_realtokenstart = -1;
29595ff2 1899 }
cd81e915 1900 if (PL_skipwhite) {
29595ff2 1901 if (!*svp)
6b29d1f5 1902 *svp = newSVpvs("");
cd81e915
NC
1903 sv_setsv(*svp, PL_skipwhite);
1904 sv_free(PL_skipwhite);
1905 PL_skipwhite = 0;
29595ff2
NC
1906 }
1907
1908 return s;
1909}
1910#endif
1911
80a702cd 1912STATIC void
15f169a1 1913S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1914{
1915 AV *av = CopFILEAVx(PL_curcop);
1916 if (av) {
7f1c3e8c
FC
1917 SV * sv;
1918 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1919 else {
1920 sv = *av_fetch(av, 0, 1);
1921 SvUPGRADE(sv, SVt_PVMG);
1922 }
1923 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1924 if (orig_sv)
7f1c3e8c 1925 sv_catsv(sv, orig_sv);
5fa550fb 1926 else
7f1c3e8c
FC
1927 sv_catpvn(sv, buf, len);
1928 if (!SvIOK(sv)) {
1929 (void)SvIOK_on(sv);
1930 SvIV_set(sv, 0);
1931 }
1932 if (PL_parser->preambling == NOLINE)
1933 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1934 }
1935}
1936
ffb4593c
NT
1937/*
1938 * S_skipspace
1939 * Called to gobble the appropriate amount and type of whitespace.
1940 * Skips comments as well.
1941 */
1942
76e3520e 1943STATIC char *
21791330 1944S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1945{
5db06880 1946#ifdef PERL_MAD
f0e67a1d
Z
1947 char *start = s;
1948#endif /* PERL_MAD */
21791330 1949 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
f0e67a1d 1950#ifdef PERL_MAD
cd81e915
NC
1951 if (PL_skipwhite) {
1952 sv_free(PL_skipwhite);
f0e67a1d 1953 PL_skipwhite = NULL;
5db06880 1954 }
f0e67a1d 1955#endif /* PERL_MAD */
3280af22 1956 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1957 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1958 s++;
f0e67a1d
Z
1959 } else {
1960 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1961 PL_bufptr = s;
21791330 1962 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1963 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1964 LEX_NO_NEXT_CHUNK : 0));
3280af22 1965 s = PL_bufptr;
f0e67a1d
Z
1966 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1967 if (PL_linestart > PL_bufptr)
1968 PL_bufptr = PL_linestart;
1969 return s;
463ee0b2 1970 }
5db06880 1971#ifdef PERL_MAD
f0e67a1d
Z
1972 if (PL_madskills)
1973 PL_skipwhite = newSVpvn(start, s-start);
1974#endif /* PERL_MAD */
5db06880 1975 return s;
a687059c 1976}
378cc40b 1977
ffb4593c
NT
1978/*
1979 * S_check_uni
1980 * Check the unary operators to ensure there's no ambiguity in how they're
1981 * used. An ambiguous piece of code would be:
1982 * rand + 5
1983 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1984 * the +5 is its argument.
1985 */
1986
76e3520e 1987STATIC void
cea2e8a9 1988S_check_uni(pTHX)
ba106d47 1989{
97aff369 1990 dVAR;
d4c19fe8
AL
1991 const char *s;
1992 const char *t;
2f3197b3 1993
3280af22 1994 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1995 return;
3280af22
NIS
1996 while (isSPACE(*PL_last_uni))
1997 PL_last_uni++;
c35e046a 1998 s = PL_last_uni;
8a2bca12 1999 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 2000 s++;
3280af22 2001 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 2002 return;
6136c704 2003
9b387841
NC
2004 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
2005 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
2006 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
2007}
2008
ffb4593c
NT
2009/*
2010 * LOP : macro to build a list operator. Its behaviour has been replaced
2011 * with a subroutine, S_lop() for which LOP is just another name.
2012 */
2013
a0d0e21e
LW
2014#define LOP(f,x) return lop(f,x,s)
2015
ffb4593c
NT
2016/*
2017 * S_lop
2018 * Build a list operator (or something that might be one). The rules:
2019 * - if we have a next token, then it's a list operator [why?]
2020 * - if the next thing is an opening paren, then it's a function
2021 * - else it's a list operator
2022 */
2023
76e3520e 2024STATIC I32
a0be28da 2025S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 2026{
97aff369 2027 dVAR;
7918f24d
NC
2028
2029 PERL_ARGS_ASSERT_LOP;
2030
6154021b 2031 pl_yylval.ival = f;
35c8bce7 2032 CLINE;
3280af22
NIS
2033 PL_expect = x;
2034 PL_bufptr = s;
2035 PL_last_lop = PL_oldbufptr;
eb160463 2036 PL_last_lop_op = (OPCODE)f;
5db06880
NC
2037#ifdef PERL_MAD
2038 if (PL_lasttoke)
78cdf107 2039 goto lstop;
5db06880 2040#else
3280af22 2041 if (PL_nexttoke)
78cdf107 2042 goto lstop;
5db06880 2043#endif
79072805 2044 if (*s == '(')
bbf60fe6 2045 return REPORT(FUNC);
29595ff2 2046 s = PEEKSPACE(s);
79072805 2047 if (*s == '(')
bbf60fe6 2048 return REPORT(FUNC);
78cdf107
Z
2049 else {
2050 lstop:
2051 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2052 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 2053 return REPORT(LSTOP);
78cdf107 2054 }
79072805
LW
2055}
2056
5db06880
NC
2057#ifdef PERL_MAD
2058 /*
2059 * S_start_force
2060 * Sets up for an eventual force_next(). start_force(0) basically does
2061 * an unshift, while start_force(-1) does a push. yylex removes items
2062 * on the "pop" end.
2063 */
2064
2065STATIC void
2066S_start_force(pTHX_ int where)
2067{
2068 int i;
2069
cd81e915 2070 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 2071 where = PL_lasttoke;
cd81e915
NC
2072 assert(PL_curforce < 0 || PL_curforce == where);
2073 if (PL_curforce != where) {
5db06880
NC
2074 for (i = PL_lasttoke; i > where; --i) {
2075 PL_nexttoke[i] = PL_nexttoke[i-1];
2076 }
2077 PL_lasttoke++;
2078 }
cd81e915 2079 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 2080 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
2081 PL_curforce = where;
2082 if (PL_nextwhite) {
5db06880 2083 if (PL_madskills)
6b29d1f5 2084 curmad('^', newSVpvs(""));
cd81e915 2085 CURMAD('_', PL_nextwhite);
5db06880
NC
2086 }
2087}
2088
2089STATIC void
2090S_curmad(pTHX_ char slot, SV *sv)
2091{
2092 MADPROP **where;
2093
2094 if (!sv)
2095 return;
cd81e915
NC
2096 if (PL_curforce < 0)
2097 where = &PL_thismad;
5db06880 2098 else
cd81e915 2099 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 2100
cd81e915 2101 if (PL_faketokens)
76f68e9b 2102 sv_setpvs(sv, "");
5db06880
NC
2103 else {
2104 if (!IN_BYTES) {
2105 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
2106 SvUTF8_on(sv);
2107 else if (PL_encoding) {
2108 sv_recode_to_utf8(sv, PL_encoding);
2109 }
2110 }
2111 }
2112
2113 /* keep a slot open for the head of the list? */
2114 if (slot != '_' && *where && (*where)->mad_key == '^') {
2115 (*where)->mad_key = slot;
daba3364 2116 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
2117 (*where)->mad_val = (void*)sv;
2118 }
2119 else
2120 addmad(newMADsv(slot, sv), where, 0);
2121}
2122#else
b3f24c00
MHM
2123# define start_force(where) NOOP
2124# define curmad(slot, sv) NOOP
5db06880
NC
2125#endif
2126
ffb4593c
NT
2127/*
2128 * S_force_next
9cbb5ea2 2129 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2130 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2131 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
2132 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
2133 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
2134 */
2135
4e553d73 2136STATIC void
cea2e8a9 2137S_force_next(pTHX_ I32 type)
79072805 2138{
97aff369 2139 dVAR;
704d4215
GG
2140#ifdef DEBUGGING
2141 if (DEBUG_T_TEST) {
2142 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2143 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2144 }
2145#endif
5db06880 2146#ifdef PERL_MAD
cd81e915 2147 if (PL_curforce < 0)
5db06880 2148 start_force(PL_lasttoke);
cd81e915 2149 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2150 if (PL_lex_state != LEX_KNOWNEXT)
2151 PL_lex_defer = PL_lex_state;
2152 PL_lex_state = LEX_KNOWNEXT;
2153 PL_lex_expect = PL_expect;
cd81e915 2154 PL_curforce = -1;
5db06880 2155#else
3280af22
NIS
2156 PL_nexttype[PL_nexttoke] = type;
2157 PL_nexttoke++;
2158 if (PL_lex_state != LEX_KNOWNEXT) {
2159 PL_lex_defer = PL_lex_state;
2160 PL_lex_expect = PL_expect;
2161 PL_lex_state = LEX_KNOWNEXT;
79072805 2162 }
5db06880 2163#endif
79072805
LW
2164}
2165
89f35911
FC
2166/*
2167 * S_postderef
2168 *
2169 * This subroutine handles postfix deref syntax after the arrow has already
2170 * been emitted. @* $* etc. are emitted as two separate token right here.
2171 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2172 * only the first, leaving yylex to find the next.
89f35911
FC
2173 */
2174
2175static int
ff25e5db 2176S_postderef(pTHX_ int const funny, char const next)
89f35911
FC
2177{
2178 dVAR;
ff25e5db 2179 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
2180 assert(strchr("*[{", next));
2181 if (next == '*') {
2182 PL_expect = XOPERATOR;
cc624add 2183 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 2184 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add
FC
2185 PL_lex_state = LEX_INTERPEND;
2186 start_force(PL_curforce);
2187 force_next(POSTJOIN);
2188 }
89f35911
FC
2189 start_force(PL_curforce);
2190 force_next(next);
2191 PL_bufptr+=2;
2192 }
2193 else {
760ca746
FC
2194 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
2195 && !PL_lex_brackets)
2196 PL_lex_dojoin = 2;
89f35911
FC
2197 PL_expect = XOPERATOR;
2198 PL_bufptr++;
2199 }
2200 return funny;
2201}
2202
28ac2b49
Z
2203void
2204Perl_yyunlex(pTHX)
2205{
a7aaec61
Z
2206 int yyc = PL_parser->yychar;
2207 if (yyc != YYEMPTY) {
2208 if (yyc) {
2209 start_force(-1);
2210 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2211 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2212 PL_lex_allbrackets--;
a7aaec61 2213 PL_lex_brackets--;
78cdf107
Z
2214 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2215 } else if (yyc == '('/*)*/) {
2216 PL_lex_allbrackets--;
2217 yyc |= (2<<24);
a7aaec61
Z
2218 }
2219 force_next(yyc);
2220 }
28ac2b49
Z
2221 PL_parser->yychar = YYEMPTY;
2222 }
2223}
2224
d0a148a6 2225STATIC SV *
15f169a1 2226S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2227{
97aff369 2228 dVAR;
740cce10 2229 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2230 !IN_BYTES
2231 && UTF
2232 && !is_ascii_string((const U8*)start, len)
740cce10 2233 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2234 return sv;
2235}
2236
ffb4593c
NT
2237/*
2238 * S_force_word
2239 * When the lexer knows the next thing is a word (for instance, it has
2240 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2241 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2242 * lookahead.
ffb4593c
NT
2243 *
2244 * Arguments:
b1b65b59 2245 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2246 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2247 * int check_keyword : if true, Perl checks to make sure the word isn't
2248 * a keyword (do this if the word is a label, e.g. goto FOO)
2249 * int allow_pack : if true, : characters will also be allowed (require,
2250 * use, etc. do this)
9cbb5ea2 2251 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2252 */
2253
76e3520e 2254STATIC char *
345b3785 2255S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2256{
97aff369 2257 dVAR;
eb578fdb 2258 char *s;
463ee0b2 2259 STRLEN len;
4e553d73 2260
7918f24d
NC
2261 PERL_ARGS_ASSERT_FORCE_WORD;
2262
29595ff2 2263 start = SKIPSPACE1(start);
463ee0b2 2264 s = start;
7e2040f0 2265 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2266 (allow_pack && *s == ':') )
a0d0e21e 2267 {
3280af22 2268 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2269 if (check_keyword) {
2270 char *s2 = PL_tokenbuf;
2271 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2272 s2 += 6, len -= 6;
2273 if (keyword(s2, len, 0))
463ee0b2 2274 return start;
01b5ef50 2275 }
cd81e915 2276 start_force(PL_curforce);
5db06880
NC
2277 if (PL_madskills)
2278 curmad('X', newSVpvn(start,s-start));
463ee0b2 2279 if (token == METHOD) {
29595ff2 2280 s = SKIPSPACE1(s);
463ee0b2 2281 if (*s == '(')
3280af22 2282 PL_expect = XTERM;
463ee0b2 2283 else {
3280af22 2284 PL_expect = XOPERATOR;
463ee0b2 2285 }
79072805 2286 }
e74e6b3d 2287 if (PL_madskills)
63575281 2288 curmad('g', newSVpvs( "forced" ));
9ded7720 2289 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2290 = (OP*)newSVOP(OP_CONST,0,
2291 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2292 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2293 force_next(token);
2294 }
2295 return s;
2296}
2297
ffb4593c
NT
2298/*
2299 * S_force_ident
9cbb5ea2 2300 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2301 * text only contains the "foo" portion. The first argument is a pointer
2302 * to the "foo", and the second argument is the type symbol to prefix.
2303 * Forces the next token to be a "WORD".
9cbb5ea2 2304 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2305 */
2306
76e3520e 2307STATIC void
5aaab254 2308S_force_ident(pTHX_ const char *s, int kind)
79072805 2309{
97aff369 2310 dVAR;
7918f24d
NC
2311
2312 PERL_ARGS_ASSERT_FORCE_IDENT;
2313
c9b48522
DD
2314 if (s[0]) {
2315 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2316 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2317 UTF ? SVf_UTF8 : 0));
cd81e915 2318 start_force(PL_curforce);
9ded7720 2319 NEXTVAL_NEXTTOKE.opval = o;
79072805 2320 force_next(WORD);
748a9306 2321 if (kind) {
11343788 2322 o->op_private = OPpCONST_ENTERED;
55497cff 2323 /* XXX see note in pp_entereval() for why we forgo typo
2324 warnings if the symbol must be introduced in an eval.
2325 GSAR 96-10-12 */
90e5519e 2326 gv_fetchpvn_flags(s, len,
728847b1
BF
2327 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2328 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2329 kind == '$' ? SVt_PV :
2330 kind == '@' ? SVt_PVAV :
2331 kind == '%' ? SVt_PVHV :
a0d0e21e 2332 SVt_PVGV
90e5519e 2333 );
748a9306 2334 }
79072805
LW
2335 }
2336}
2337
3f33d153
FC
2338static void
2339S_force_ident_maybe_lex(pTHX_ char pit)
2340{
2341 start_force(PL_curforce);
2342 NEXTVAL_NEXTTOKE.ival = pit;
2343 force_next('p');
2344}
2345
1571675a
GS
2346NV
2347Perl_str_to_version(pTHX_ SV *sv)
2348{
2349 NV retval = 0.0;
2350 NV nshift = 1.0;
2351 STRLEN len;
cfd0369c 2352 const char *start = SvPV_const(sv,len);
9d4ba2ae 2353 const char * const end = start + len;
504618e9 2354 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2355
2356 PERL_ARGS_ASSERT_STR_TO_VERSION;
2357
1571675a 2358 while (start < end) {
ba210ebe 2359 STRLEN skip;
1571675a
GS
2360 UV n;
2361 if (utf)
9041c2e3 2362 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2363 else {
2364 n = *(U8*)start;
2365 skip = 1;
2366 }
2367 retval += ((NV)n)/nshift;
2368 start += skip;
2369 nshift *= 1000;
2370 }
2371 return retval;
2372}
2373
4e553d73 2374/*
ffb4593c
NT
2375 * S_force_version
2376 * Forces the next token to be a version number.
e759cc13
RGS
2377 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2378 * and if "guessing" is TRUE, then no new token is created (and the caller
2379 * must use an alternative parsing method).
ffb4593c
NT
2380 */
2381
76e3520e 2382STATIC char *
e759cc13 2383S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2384{
97aff369 2385 dVAR;
5f66b61c 2386 OP *version = NULL;
44dcb63b 2387 char *d;
5db06880
NC
2388#ifdef PERL_MAD
2389 I32 startoff = s - SvPVX(PL_linestr);
2390#endif
89bfa8cd 2391
7918f24d
NC
2392 PERL_ARGS_ASSERT_FORCE_VERSION;
2393
29595ff2 2394 s = SKIPSPACE1(s);
89bfa8cd 2395
44dcb63b 2396 d = s;
dd629d5b 2397 if (*d == 'v')
44dcb63b 2398 d++;
44dcb63b 2399 if (isDIGIT(*d)) {
e759cc13
RGS
2400 while (isDIGIT(*d) || *d == '_' || *d == '.')
2401 d++;
5db06880
NC
2402#ifdef PERL_MAD
2403 if (PL_madskills) {
cd81e915 2404 start_force(PL_curforce);
5db06880
NC
2405 curmad('X', newSVpvn(s,d-s));
2406 }
2407#endif
4e4da3ac 2408 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2409 SV *ver;
6154021b
RGS
2410 s = scan_num(s, &pl_yylval);
2411 version = pl_yylval.opval;
dd629d5b
GS
2412 ver = cSVOPx(version)->op_sv;
2413 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2414 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2415 SvNV_set(ver, str_to_version(ver));
1571675a 2416 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2417 }
89bfa8cd 2418 }
5db06880
NC
2419 else if (guessing) {
2420#ifdef PERL_MAD
2421 if (PL_madskills) {
cd81e915
NC
2422 sv_free(PL_nextwhite); /* let next token collect whitespace */
2423 PL_nextwhite = 0;
5db06880
NC
2424 s = SvPVX(PL_linestr) + startoff;
2425 }
2426#endif
e759cc13 2427 return s;
5db06880 2428 }
89bfa8cd 2429 }
2430
5db06880
NC
2431#ifdef PERL_MAD
2432 if (PL_madskills && !version) {
cd81e915
NC
2433 sv_free(PL_nextwhite); /* let next token collect whitespace */
2434 PL_nextwhite = 0;
5db06880
NC
2435 s = SvPVX(PL_linestr) + startoff;
2436 }
2437#endif
89bfa8cd 2438 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2439 start_force(PL_curforce);
9ded7720 2440 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2441 force_next(WORD);
89bfa8cd 2442
e759cc13 2443 return s;
89bfa8cd 2444}
2445
ffb4593c 2446/*
91152fc1
DG
2447 * S_force_strict_version
2448 * Forces the next token to be a version number using strict syntax rules.
2449 */
2450
2451STATIC char *
2452S_force_strict_version(pTHX_ char *s)
2453{
2454 dVAR;
2455 OP *version = NULL;
2456#ifdef PERL_MAD
2457 I32 startoff = s - SvPVX(PL_linestr);
2458#endif
2459 const char *errstr = NULL;
2460
2461 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2462
2463 while (isSPACE(*s)) /* leading whitespace */
2464 s++;
2465
2466 if (is_STRICT_VERSION(s,&errstr)) {
2467 SV *ver = newSV(0);
2468 s = (char *)scan_version(s, ver, 0);
2469 version = newSVOP(OP_CONST, 0, ver);
2470 }
4e4da3ac
Z
2471 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2472 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2473 {
91152fc1
DG
2474 PL_bufptr = s;
2475 if (errstr)
2476 yyerror(errstr); /* version required */
2477 return s;
2478 }
2479
2480#ifdef PERL_MAD
2481 if (PL_madskills && !version) {
2482 sv_free(PL_nextwhite); /* let next token collect whitespace */
2483 PL_nextwhite = 0;
2484 s = SvPVX(PL_linestr) + startoff;
2485 }
2486#endif
2487 /* NOTE: The parser sees the package name and the VERSION swapped */
2488 start_force(PL_curforce);
2489 NEXTVAL_NEXTTOKE.opval = version;
2490 force_next(WORD);
2491
2492 return s;
2493}
2494
2495/*
ffb4593c
NT
2496 * S_tokeq
2497 * Tokenize a quoted string passed in as an SV. It finds the next
2498 * chunk, up to end of string or a backslash. It may make a new
2499 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2500 * turns \\ into \.
2501 */
2502
76e3520e 2503STATIC SV *
cea2e8a9 2504S_tokeq(pTHX_ SV *sv)
79072805 2505{
97aff369 2506 dVAR;
eb578fdb
KW
2507 char *s;
2508 char *send;
2509 char *d;
b3ac6de7 2510 SV *pv = sv;
79072805 2511
7918f24d
NC
2512 PERL_ARGS_ASSERT_TOKEQ;
2513
279b35ad
FC
2514 assert (SvPOK(sv));
2515 assert (SvLEN(sv));
2516 assert (!SvIsCOW(sv));
307ed071 2517 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2518 goto finish;
279b35ad
FC
2519 s = SvPVX(sv);
2520 send = SvEND(sv);
dcb21ed6
NC
2521 /* This is relying on the SV being "well formed" with a trailing '\0' */
2522 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2523 s++;
2524 if (s == send)
b3ac6de7 2525 goto finish;
79072805 2526 d = s;
be4731d2 2527 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2528 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2529 SVs_TEMP | SvUTF8(sv));
be4731d2 2530 }
79072805
LW
2531 while (s < send) {
2532 if (*s == '\\') {
a0d0e21e 2533 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2534 s++; /* all that, just for this */
2535 }
2536 *d++ = *s++;
2537 }
2538 *d = '\0';
95a20fc0 2539 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2540 finish:
3280af22 2541 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2542 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2543 return sv;
2544}
2545
ffb4593c
NT
2546/*
2547 * Now come three functions related to double-quote context,
2548 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2549 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2550 * interact with PL_lex_state, and create fake ( ... ) argument lists
2551 * to handle functions and concatenation.
ecd24171
DM
2552 * For example,
2553 * "foo\lbar"
2554 * is tokenised as
2555 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2556 */
2557
2558/*
2559 * S_sublex_start
6154021b 2560 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2561 *
2562 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2563 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2564 *
2565 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2566 *
2567 * Everything else becomes a FUNC.
2568 *
2569 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2570 * had an OP_CONST or OP_READLINE). This just sets us up for a
2571 * call to S_sublex_push().
2572 */
2573
76e3520e 2574STATIC I32
cea2e8a9 2575S_sublex_start(pTHX)
79072805 2576{
97aff369 2577 dVAR;
eb578fdb 2578 const I32 op_type = pl_yylval.ival;
79072805
LW
2579
2580 if (op_type == OP_NULL) {
6154021b 2581 pl_yylval.opval = PL_lex_op;
5f66b61c 2582 PL_lex_op = NULL;
79072805
LW
2583 return THING;
2584 }
466112bb 2585 if (op_type == OP_CONST) {
3280af22 2586 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2587
2588 if (SvTYPE(sv) == SVt_PVIV) {
2589 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2590 STRLEN len;
96a5add6 2591 const char * const p = SvPV_const(sv, len);
740cce10 2592 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2593 SvREFCNT_dec(sv);
2594 sv = nsv;
4e553d73 2595 }
6154021b 2596 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2597 PL_lex_stuff = NULL;
79072805
LW
2598 return THING;
2599 }
2600
3280af22 2601 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2602 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2603 PL_sublex_info.sub_op = PL_lex_op;
2604 PL_lex_state = LEX_INTERPPUSH;
55497cff 2605
3280af22
NIS
2606 PL_expect = XTERM;
2607 if (PL_lex_op) {
6154021b 2608 pl_yylval.opval = PL_lex_op;
5f66b61c 2609 PL_lex_op = NULL;
55497cff 2610 return PMFUNC;
2611 }
2612 else
2613 return FUNC;
2614}
2615
ffb4593c
NT
2616/*
2617 * S_sublex_push
2618 * Create a new scope to save the lexing state. The scope will be
2619 * ended in S_sublex_done. Returns a '(', starting the function arguments
2620 * to the uc, lc, etc. found before.
2621 * Sets PL_lex_state to LEX_INTERPCONCAT.
2622 */
2623
76e3520e 2624STATIC I32
cea2e8a9 2625S_sublex_push(pTHX)
55497cff 2626{
27da23d5 2627 dVAR;
78a635de 2628 LEXSHARED *shared;
801d32ac 2629 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2630 ENTER;
55497cff 2631
3280af22 2632 PL_lex_state = PL_sublex_info.super_state;
cc624add 2633 SAVEI8(PL_lex_dojoin);
3280af22 2634 SAVEI32(PL_lex_brackets);
78cdf107 2635 SAVEI32(PL_lex_allbrackets);
b27dce25 2636 SAVEI32(PL_lex_formbrack);
78cdf107 2637 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2638 SAVEI32(PL_lex_casemods);
2639 SAVEI32(PL_lex_starts);
651b5b28 2640 SAVEI8(PL_lex_state);
7cc34111 2641 SAVESPTR(PL_lex_repl);
7766f137 2642 SAVEVPTR(PL_lex_inpat);
98246f1e 2643 SAVEI16(PL_lex_inwhat);
ffdb8b16 2644 if (is_heredoc)
b42366d4 2645 {
ffdb8b16 2646 SAVECOPLINE(PL_curcop);
b42366d4 2647 SAVEI32(PL_multi_end);
851b527a
FC
2648 SAVEI32(PL_parser->herelines);
2649 PL_parser->herelines = 0;
b42366d4
FC
2650 }
2651 SAVEI8(PL_multi_close);
3280af22 2652 SAVEPPTR(PL_bufptr);
8452ff4b 2653 SAVEPPTR(PL_bufend);
3280af22
NIS
2654 SAVEPPTR(PL_oldbufptr);
2655 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2656 SAVEPPTR(PL_last_lop);
2657 SAVEPPTR(PL_last_uni);
3280af22
NIS
2658 SAVEPPTR(PL_linestart);
2659 SAVESPTR(PL_linestr);
8edd5f42
RGS
2660 SAVEGENERICPV(PL_lex_brackstack);
2661 SAVEGENERICPV(PL_lex_casestack);
78a635de 2662 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2663 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2664 SAVEI32(PL_copline);
3280af22 2665
99bd9d90 2666 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2667 scopes to find the body of the here-doc. So we put PL_linestr and
2668 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2669 */
60f40a38
FC
2670 PL_parser->lex_shared->ls_linestr = PL_linestr;
2671 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2672
3280af22 2673 PL_linestr = PL_lex_stuff;
7cc34111 2674 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2675 PL_lex_stuff = NULL;
7cc34111 2676 PL_sublex_info.repl = NULL;
3280af22 2677
9cbb5ea2
GS
2678 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2679 = SvPVX(PL_linestr);
3280af22 2680 PL_bufend += SvCUR(PL_linestr);
bd61b366 2681 PL_last_lop = PL_last_uni = NULL;
3280af22 2682 SAVEFREESV(PL_linestr);
4dc843bc 2683 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2684
2685 PL_lex_dojoin = FALSE;
b27dce25 2686 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2687 PL_lex_allbrackets = 0;
2688 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2689 Newx(PL_lex_brackstack, 120, char);
2690 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2691 PL_lex_casemods = 0;
2692 *PL_lex_casestack = '\0';
2693 PL_lex_starts = 0;
2694 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2695 if (is_heredoc)
6ddcf93b 2696 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2697 PL_copline = NOLINE;
78a635de
FC
2698
2699 Newxz(shared, 1, LEXSHARED);
2700 shared->ls_prev = PL_parser->lex_shared;
2701 PL_parser->lex_shared = shared;
3280af22
NIS
2702
2703 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2704 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2705 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2706 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2707 else
5f66b61c 2708 PL_lex_inpat = NULL;
79072805 2709
3a54fd60
DM
2710 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2711 PL_in_eval &= ~EVAL_RE_REPARSING;
2712
55497cff 2713 return '(';
79072805
LW
2714}
2715
ffb4593c
NT
2716/*
2717 * S_sublex_done
2718 * Restores lexer state after a S_sublex_push.
2719 */
2720
76e3520e 2721STATIC I32
cea2e8a9 2722S_sublex_done(pTHX)
79072805 2723{
27da23d5 2724 dVAR;
3280af22 2725 if (!PL_lex_starts++) {
396482e1 2726 SV * const sv = newSVpvs("");
9aa983d2
JH
2727 if (SvUTF8(PL_linestr))
2728 SvUTF8_on(sv);
3280af22 2729 PL_expect = XOPERATOR;
6154021b 2730 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2731 return THING;
2732 }
2733
3280af22
NIS
2734 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2735 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2736 return yylex();
79072805
LW
2737 }
2738
ffb4593c 2739 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2740 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2741 if (PL_lex_repl) {
2742 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2743 PL_linestr = PL_lex_repl;
2744 PL_lex_inpat = 0;
2745 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2746 PL_bufend += SvCUR(PL_linestr);
bd61b366 2747 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2748 PL_lex_dojoin = FALSE;
2749 PL_lex_brackets = 0;
78cdf107
Z
2750 PL_lex_allbrackets = 0;
2751 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2752 PL_lex_casemods = 0;
2753 *PL_lex_casestack = '\0';
2754 PL_lex_starts = 0;
25da4f38 2755 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2756 PL_lex_state = LEX_INTERPNORMAL;
2757 PL_lex_starts++;
e9fa98b2
HS
2758 /* we don't clear PL_lex_repl here, so that we can check later
2759 whether this is an evalled subst; that means we rely on the
2760 logic to ensure sublex_done() is called again only via the
2761 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2762 }
e9fa98b2 2763 else {
3280af22 2764 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2765 PL_lex_repl = NULL;
e9fa98b2 2766 }
ffdb8b16
FC
2767 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2768 CopLINE(PL_curcop) +=
2769 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2770 + PL_parser->herelines;
2771 PL_parser->herelines = 0;
ffdb8b16 2772 }
79072805 2773 return ',';
ffed7fef
LW
2774 }
2775 else {
b42366d4 2776 const line_t l = CopLINE(PL_curcop);
5db06880
NC
2777#ifdef PERL_MAD
2778 if (PL_madskills) {
cd81e915
NC
2779 if (PL_thiswhite) {
2780 if (!PL_endwhite)
6b29d1f5 2781 PL_endwhite = newSVpvs("");
cd81e915
NC
2782 sv_catsv(PL_endwhite, PL_thiswhite);
2783 PL_thiswhite = 0;
2784 }
2785 if (PL_thistoken)
76f68e9b 2786 sv_setpvs(PL_thistoken,"");
5db06880 2787 else
cd81e915 2788 PL_realtokenstart = -1;
5db06880
NC
2789 }
2790#endif
f46d017c 2791 LEAVE;
b42366d4 2792 if (PL_multi_close == '<')
851b527a 2793 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2794 PL_bufend = SvPVX(PL_linestr);
2795 PL_bufend += SvCUR(PL_linestr);
2796 PL_expect = XOPERATOR;
09bef843 2797 PL_sublex_info.sub_inwhat = 0;
79072805 2798 return ')';
ffed7fef
LW
2799 }
2800}
2801
6f613c73
KW
2802PERL_STATIC_INLINE SV*
2803S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2804{
140b12ad
KW
2805 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2806 * interior, hence to the "}". Finds what the name resolves to, returning
2807 * an SV* containing it; NULL if no valid one found */
2808
dd2b1b72 2809 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2810
0c415a79
KW
2811 HV * table;
2812 SV **cvp;
2813 SV *cv;
2814 SV *rv;
2815 HV *stash;
2816 const U8* first_bad_char_loc;
2817 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2818
6f613c73
KW
2819 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2820
107160e2
KW
2821 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2822 e - backslash_ptr,
2823 &first_bad_char_loc))
2824 {
2825 /* If warnings are on, this will print a more detailed analysis of what
2826 * is wrong than the error message below */
c80e42f3 2827 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2828 e - ((char *) first_bad_char_loc),
2829 NULL, 0);
2830
2831 /* We deliberately don't try to print the malformed character, which
2832 * might not print very well; it also may be just the first of many
2833 * malformations, so don't print what comes after it */
2834 yyerror(Perl_form(aTHX_
2835 "Malformed UTF-8 character immediately after '%.*s'",
2836 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2837 return NULL;
2838 }
2839
2840 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2841 /* include the <}> */
2842 e - backslash_ptr + 1);
6f613c73 2843 if (! SvPOK(res)) {
b6407c49 2844 SvREFCNT_dec_NN(res);
6f613c73
KW
2845 return NULL;
2846 }
2847
0c415a79
KW
2848 /* See if the charnames handler is the Perl core's, and if so, we can skip
2849 * the validation needed for a user-supplied one, as Perl's does its own
2850 * validation. */
2851 table = GvHV(PL_hintgv); /* ^H */
2852 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2853 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2854 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2855 {
2856 const char * const name = HvNAME(stash);
6a642c21
FC
2857 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2858 && strEQ(name, "_charnames")) {
0c415a79
KW
2859 return res;
2860 }
2861 }
2862
bde9e88d
KW
2863 /* Here, it isn't Perl's charname handler. We can't rely on a
2864 * user-supplied handler to validate the input name. For non-ut8 input,
2865 * look to see that the first character is legal. Then loop through the
2866 * rest checking that each is a continuation */
6f613c73 2867
36897d64
KW
2868 /* This code makes the reasonable assumption that the only Latin1-range
2869 * characters that begin a character name alias are alphabetic, otherwise
2870 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2871
b6ba1137 2872 if (! UTF) {
bde9e88d 2873 if (! isALPHAU(*s)) {
b6ba1137
KW
2874 goto bad_charname;
2875 }
bde9e88d
KW
2876 s++;
2877 while (s < e) {
2878 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2879 goto bad_charname;
2880 }
3036c853 2881 if (*s == ' ' && *(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2882 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2883 "A sequence of multiple spaces in a charnames "
2884 "alias definition is deprecated");
bd299e29 2885 }
bde9e88d 2886 s++;
b6ba1137 2887 }
3036c853 2888 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2889 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2890 "Trailing white-space in a charnames alias "
2891 "definition is deprecated");
bd299e29 2892 }
b6ba1137
KW
2893 }
2894 else {
bde9e88d
KW
2895 /* Similarly for utf8. For invariants can check directly; for other
2896 * Latin1, can calculate their code point and check; otherwise use a
2897 * swash */
2898 if (UTF8_IS_INVARIANT(*s)) {
2899 if (! isALPHAU(*s)) {
140b12ad
KW
2900 goto bad_charname;
2901 }
bde9e88d
KW
2902 s++;
2903 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2904 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2905 goto bad_charname;
6f613c73 2906 }
bde9e88d 2907 s += 2;
6f613c73 2908 }
bde9e88d
KW
2909 else {
2910 if (! PL_utf8_charname_begin) {
2911 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2912 PL_utf8_charname_begin = _core_swash_init("utf8",
2913 "_Perl_Charname_Begin",
2914 &PL_sv_undef,
2915 1, 0, NULL, &flags);
2916 }
2917 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2918 goto bad_charname;
2919 }
2920 s += UTF8SKIP(s);
2921 }
2922
2923 while (s < e) {
2924 if (UTF8_IS_INVARIANT(*s)) {
2925 if (! isCHARNAME_CONT(*s)) {
2926 goto bad_charname;
2927 }
3036c853
FC
2928 if (*s == ' ' && *(s-1) == ' '
2929 && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2930 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2931 "A sequence of multiple spaces in a charnam"
2932 "es alias definition is deprecated");
bd299e29 2933 }
bde9e88d
KW
2934 s++;
2935 }
2936 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2937 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2938 {
2939 goto bad_charname;
2940 }
2941 s += 2;
2942 }
2943 else {
2944 if (! PL_utf8_charname_continue) {
2945 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2946 PL_utf8_charname_continue = _core_swash_init("utf8",
2947 "_Perl_Charname_Continue",
2948 &PL_sv_undef,
2949 1, 0, NULL, &flags);
2950 }
2951 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2952 goto bad_charname;
2953 }
2954 s += UTF8SKIP(s);
6f613c73
KW
2955 }
2956 }
3036c853 2957 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2958 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2959 "Trailing white-space in a charnames alias "
2960 "definition is deprecated");
bd299e29 2961 }
6f613c73
KW
2962 }
2963
94ca1619 2964 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2965 const U8* first_bad_char_loc;
2966 STRLEN len;
2967 const char* const str = SvPV_const(res, len);
2968 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2969 /* If warnings are on, this will print a more detailed analysis of
2970 * what is wrong than the error message below */
c80e42f3 2971 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2972 (char *) first_bad_char_loc - str,
2973 NULL, 0);
2974
2975 /* We deliberately don't try to print the malformed character,
2976 * which might not print very well; it also may be just the first
2977 * of many malformations, so don't print what comes after it */
2978 yyerror_pv(
2979 Perl_form(aTHX_
2980 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2981 (int) (e - backslash_ptr + 1), backslash_ptr,
2982 (int) ((char *) first_bad_char_loc - str), str
2983 ),
2984 SVf_UTF8);
2985 return NULL;
2986 }
2987 }
140b12ad 2988
bde9e88d 2989 return res;
140b12ad 2990
bde9e88d
KW
2991 bad_charname: {
2992 int bad_char_size = ((UTF) ? UTF8SKIP(s) : 1);
2993
2994 /* The final %.*s makes sure that should the trailing NUL be missing
2995 * that this print won't run off the end of the string */
2996 yyerror_pv(
2997 Perl_form(aTHX_
2998 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2999 (int)(s - backslash_ptr + bad_char_size), backslash_ptr,
3000 (int)(e - s + bad_char_size), s + bad_char_size
3001 ),
3002 UTF ? SVf_UTF8 : 0);
3003 return NULL;
3004 }
6f613c73
KW
3005}
3006
02aa26ce
NT
3007/*
3008 scan_const
3009
9da1dd8f
DM
3010 Extracts the next constant part of a pattern, double-quoted string,
3011 or transliteration. This is terrifying code.
3012
3013 For example, in parsing the double-quoted string "ab\x63$d", it would
3014 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 3015
94def140 3016 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 3017 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 3018 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 3019
94def140
TS
3020 Returns a pointer to the character scanned up to. If this is
3021 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 3022 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 3023 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
3024 by looking at the next characters herself.
3025
02aa26ce 3026 In patterns:
9da1dd8f 3027 expand:
537124e4
KW
3028 \N{FOO} => \N{U+hex_for_character_FOO}
3029 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
3030
3031 pass through:
3032 all other \-char, including \N and \N{ apart from \N{ABC}
3033
3034 stops on:
3035 @ and $ where it appears to be a var, but not for $ as tail anchor
3036 \l \L \u \U \Q \E
3037 (?{ or (??{
3038
02aa26ce
NT
3039
3040 In transliterations:
3041 characters are VERY literal, except for - not at the start or end
94def140
TS
3042 of the string, which indicates a range. If the range is in bytes,
3043 scan_const expands the range to the full set of intermediate
3044 characters. If the range is in utf8, the hyphen is replaced with
3045 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
3046
3047 In double-quoted strings:
3048 backslashes:
3049 double-quoted style: \r and \n
ff3f963a 3050 constants: \x31, etc.
94def140 3051 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
3052 case and quoting: \U \Q \E
3053 stops on @ and $
3054
3055 scan_const does *not* construct ops to handle interpolated strings.
3056 It stops processing as soon as it finds an embedded $ or @ variable
3057 and leaves it to the caller to work out what's going on.
3058
94def140
TS
3059 embedded arrays (whether in pattern or not) could be:
3060 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
3061
3062 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
3063
3064 $ in pattern could be $foo or could be tail anchor. Assumption:
3065 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 3066 followed by one of "()| \r\n\t"
02aa26ce 3067
9da1dd8f 3068 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
3069
3070 The structure of the code is
3071 while (there's a character to process) {
94def140
TS
3072 handle transliteration ranges
3073 skip regexp comments /(?#comment)/ and codes /(?{code})/
3074 skip #-initiated comments in //x patterns
3075 check for embedded arrays
02aa26ce
NT
3076 check for embedded scalars
3077 if (backslash) {
94def140 3078 deprecate \1 in substitution replacements
02aa26ce
NT
3079 handle string-changing backslashes \l \U \Q \E, etc.
3080 switch (what was escaped) {
94def140 3081 handle \- in a transliteration (becomes a literal -)
ff3f963a 3082 if a pattern and not \N{, go treat as regular character
94def140
TS
3083 handle \132 (octal characters)
3084 handle \x15 and \x{1234} (hex characters)
ff3f963a 3085 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
3086 handle \cV (control characters)
3087 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 3088 } (end switch)
77a135fe 3089 continue
02aa26ce 3090 } (end if backslash)
77a135fe 3091 handle regular character
02aa26ce 3092 } (end while character to read)
4e553d73 3093
02aa26ce
NT
3094*/
3095
76e3520e 3096STATIC char *
cea2e8a9 3097S_scan_const(pTHX_ char *start)
79072805 3098{
97aff369 3099 dVAR;
eb578fdb 3100 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
3101 SV *sv = newSV(send - start); /* sv for the constant. See
3102 note below on sizing. */
eb578fdb
KW
3103 char *s = start; /* start of the constant */
3104 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 3105 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 3106 bool didrange = FALSE; /* did we just finish a range? */
2866decb 3107 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
3108 bool has_utf8 = FALSE; /* Output constant is UTF8 */
3109 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
3110 to be UTF8? But, this can
3111 show as true when the source
3112 isn't utf8, as for example
3113 when it is entirely composed
3114 of hex constants */
6f613c73 3115 SV *res; /* result from charnames */
77a135fe
KW
3116
3117 /* Note on sizing: The scanned constant is placed into sv, which is
3118 * initialized by newSV() assuming one byte of output for every byte of
3119 * input. This routine expects newSV() to allocate an extra byte for a
3120 * trailing NUL, which this routine will append if it gets to the end of
3121 * the input. There may be more bytes of input than output (eg., \N{LATIN
3122 * CAPITAL LETTER A}), or more output than input if the constant ends up
3123 * recoded to utf8, but each time a construct is found that might increase
3124 * the needed size, SvGROW() is called. Its size parameter each time is
3125 * based on the best guess estimate at the time, namely the length used so
3126 * far, plus the length the current construct will occupy, plus room for
3127 * the trailing NUL, plus one byte for every input byte still unscanned */
3128
c3320c2a
KW
3129 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3130 before set */
4c3a8340
TS
3131#ifdef EBCDIC
3132 UV literal_endpoint = 0;
e294cc5d 3133 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 3134#endif
012bcf8d 3135
7918f24d
NC
3136 PERL_ARGS_ASSERT_SCAN_CONST;
3137
bb16bae8 3138 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
3139 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
3140 /* If we are doing a trans and we know we want UTF8 set expectation */
3141 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
3142 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3143 }
3144
b899e89d
FC
3145 /* Protect sv from errors and fatal warnings. */
3146 ENTER_with_name("scan_const");
3147 SAVEFREESV(sv);
2b9d42f0 3148
79072805 3149 while (s < send || dorange) {
ff3f963a 3150
02aa26ce 3151 /* get transliterations out of the way (they're most literal) */
3280af22 3152 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3153 /* expand a range A-Z to the full set of characters. AIE! */
79072805 3154 if (dorange) {
1ba5c669
JH
3155 I32 i; /* current expanded character */
3156 I32 min; /* first character in range */
3157 I32 max; /* last character in range */
02aa26ce 3158
e294cc5d
JH
3159#ifdef EBCDIC
3160 UV uvmax = 0;
3161#endif
3162
3163 if (has_utf8
3164#ifdef EBCDIC
3165 && !native_range
3166#endif
1953db30 3167 ) {
9d4ba2ae 3168 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
3169 char *e = d++;
3170 while (e-- > c)
3171 *(e + 1) = *e;
e7214ce8 3172 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
3173 /* mark the range as done, and continue */
3174 dorange = FALSE;
3175 didrange = TRUE;
3176 continue;
3177 }
2b9d42f0 3178
95a20fc0 3179 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
3180#ifdef EBCDIC
3181 SvGROW(sv,
3182 SvLEN(sv) + (has_utf8 ?
3183 (512 - UTF_CONTINUATION_MARK +
3184 UNISKIP(0x100))
3185 : 256));
3186 /* How many two-byte within 0..255: 128 in UTF-8,
3187 * 96 in UTF-8-mod. */
3188#else
9cbb5ea2 3189 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 3190#endif
9cbb5ea2 3191 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
3192#ifdef EBCDIC
3193 if (has_utf8) {
3194 int j;
3195 for (j = 0; j <= 1; j++) {
3196 char * const c = (char*)utf8_hop((U8*)d, -1);
3197 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
3198 if (j)
3199 min = (U8)uv;
3200 else if (uv < 256)
3201 max = (U8)uv;
3202 else {
3203 max = (U8)0xff; /* only to \xff */
3204 uvmax = uv; /* \x{100} to uvmax */
3205 }
3206 d = c; /* eat endpoint chars */
3207 }
3208 }
3209 else {
3210#endif
3211 d -= 2; /* eat the first char and the - */
3212 min = (U8)*d; /* first char in range */
3213 max = (U8)d[1]; /* last char in range */
3214#ifdef EBCDIC
3215 }
3216#endif
8ada0baa 3217
c2e66d9e 3218 if (min > max) {
01ec43d0 3219 Perl_croak(aTHX_
d1573ac7 3220 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 3221 (char)min, (char)max);
c2e66d9e
GS
3222 }
3223
c7f1f016 3224#ifdef EBCDIC
4c3a8340 3225 if (literal_endpoint == 2 &&
c286a9a6 3226 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
3227 (isUPPER_A(min) && isUPPER_A(max))))
3228 {
3229 for (i = min; i <= max; i++) {
3230 if (isALPHA_A(i))
3231 *d++ = i;
8ada0baa
JH
3232 }
3233 }
3234 else
3235#endif
3236 for (i = min; i <= max; i++)
e294cc5d
JH
3237#ifdef EBCDIC
3238 if (has_utf8) {
55d09dc8 3239 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
3240 }
3241 else
3242#endif
3243 *d++ = (char)i;
3244
3245#ifdef EBCDIC
3246 if (uvmax) {
3247 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
3248 if (uvmax > 0x101)
e7214ce8 3249 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
3250 if (uvmax > 0x100)
3251 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
3252 }
3253#endif
02aa26ce
NT
3254
3255 /* mark the range as done, and continue */
79072805 3256 dorange = FALSE;
01ec43d0 3257 didrange = TRUE;
4c3a8340
TS
3258#ifdef EBCDIC
3259 literal_endpoint = 0;
3260#endif
79072805 3261 continue;
4e553d73 3262 }
02aa26ce
NT
3263
3264 /* range begins (ignore - as first or last char) */
79072805 3265 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 3266 if (didrange) {
1fafa243 3267 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 3268 }
e294cc5d
JH
3269 if (has_utf8
3270#ifdef EBCDIC
3271 && !native_range
3272#endif
3273 ) {
e7214ce8 3274 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
3275 s++;
3276 continue;
3277 }
79072805
LW
3278 dorange = TRUE;
3279 s++;
01ec43d0
GS
3280 }
3281 else {
3282 didrange = FALSE;
4c3a8340
TS
3283#ifdef EBCDIC
3284 literal_endpoint = 0;
e294cc5d 3285 native_range = TRUE;
4c3a8340 3286#endif
01ec43d0 3287 }
79072805 3288 }
02aa26ce
NT
3289
3290 /* if we get here, we're not doing a transliteration */
3291
e4a2df84
DM
3292 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3293 char *s1 = s-1;
3294 int esc = 0;
3295 while (s1 >= start && *s1-- == '\\')
3296 esc = !esc;
3297 if (!esc)
3298 in_charclass = TRUE;
3299 }
2866decb 3300
e4a2df84
DM
3301 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3302 char *s1 = s-1;
3303 int esc = 0;
3304 while (s1 >= start && *s1-- == '\\')
3305 esc = !esc;
3306 if (!esc)
3307 in_charclass = FALSE;
3308 }
2866decb 3309
9da1dd8f
DM
3310 /* skip for regexp comments /(?#comment)/, except for the last
3311 * char, which will be done separately.
3312 * Stop on (?{..}) and friends */
3313
c30fc27b 3314 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3315 if (s[2] == '#') {
e994fd66 3316 while (s+1 < send && *s != ')')
5ff03569 3317 *d++ = *s++;
155aba94 3318 }
c30fc27b 3319 else if (!PL_lex_casemods &&
d3cec5e5
DM
3320 ( s[2] == '{' /* This should match regcomp.c */
3321 || (s[2] == '?' && s[3] == '{')))
155aba94 3322 {
9da1dd8f 3323 break;
cc6b7395 3324 }
748a9306 3325 }
02aa26ce
NT
3326
3327 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3328 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3329 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3330 while (s+1 < send && *s != '\n')
5ff03569 3331 *d++ = *s++;
748a9306 3332 }
02aa26ce 3333
9da1dd8f
DM
3334 /* no further processing of single-quoted regex */
3335 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3336 goto default_action;
3337
5d1d4326 3338 /* check for embedded arrays
da6eedaa 3339 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3340 */
1749ea0d 3341 else if (*s == '@' && s[1]) {
8a2bca12 3342 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
TS
3343 break;
3344 if (strchr(":'{$", s[1]))
3345 break;
3346 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3347 break; /* in regexp, neither @+ nor @- are interpolated */
3348 }
02aa26ce
NT
3349
3350 /* check for embedded scalars. only stop if we're sure it's a
3351 variable.
3352 */
79072805 3353 else if (*s == '$') {
3280af22 3354 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3355 break;
77772344 3356 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3357 if (s[1] == '\\') {
3358 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3359 "Possible unintended interpolation of $\\ in regex");
77772344 3360 }
79072805 3361 break; /* in regexp, $ might be tail anchor */
77772344 3362 }
79072805 3363 }
02aa26ce 3364
2b9d42f0
NIS
3365 /* End of else if chain - OP_TRANS rejoin rest */
3366
02aa26ce 3367 /* backslashes */
79072805 3368 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3369 char* e; /* Can be used for ending '}', etc. */
3370
79072805 3371 s++;
02aa26ce 3372
7d0fc23c
KW
3373 /* warn on \1 - \9 in substitution replacements, but note that \11
3374 * is an octal; and \19 is \1 followed by '9' */
3280af22 3375 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3376 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3377 {
c782d7ee 3378 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3379 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3380 *--s = '$';
3381 break;
3382 }
02aa26ce
NT
3383
3384 /* string-change backslash escapes */
838f2281 3385 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3386 --s;
3387 break;
3388 }
ff3f963a
KW
3389 /* In a pattern, process \N, but skip any other backslash escapes.
3390 * This is because we don't want to translate an escape sequence
3391 * into a meta symbol and have the regex compiler use the meta
3392 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3393 * in spite of this, we do have to process \N here while the proper
3394 * charnames handler is in scope. See bugs #56444 and #62056.
3395 * There is a complication because \N in a pattern may also stand
3396 * for 'match a non-nl', and not mean a charname, in which case its
3397 * processing should be deferred to the regex compiler. To be a
3398 * charname it must be followed immediately by a '{', and not look
3399 * like \N followed by a curly quantifier, i.e., not something like
3400 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3401 * quantifier */
3402 else if (PL_lex_inpat
3403 && (*s != 'N'
3404 || s[1] != '{'
4d68ffa0 3405 || regcurly(s + 1, FALSE)))
ff3f963a 3406 {
4d73d076 3407 *d++ = '\\';
cc74c5bd
TS
3408 goto default_action;
3409 }
02aa26ce 3410
79072805 3411 switch (*s) {
02aa26ce
NT
3412
3413 /* quoted - in transliterations */
79072805 3414 case '-':
3280af22 3415 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3416 *d++ = *s++;
3417 continue;
3418 }
924ba076 3419 /* FALLTHROUGH */
79072805 3420 default:
11b8faa4 3421 {
15861f94 3422 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3423 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3424 "Unrecognized escape \\%c passed through",
3425 *s);
11b8faa4 3426 /* default action is to copy the quoted character */
f9a63242 3427 goto default_action;
11b8faa4 3428 }
02aa26ce 3429
632403cc 3430 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3431 case '0': case '1': case '2': case '3':
3432 case '4': case '5': case '6': case '7':
ba210ebe 3433 {
5e0a247b 3434 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3435 STRLEN len = 3;
06972766 3436 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3437 s += len;
5e0a247b
KW
3438 if (len < 3 && s < send && isDIGIT(*s)
3439 && ckWARN(WARN_MISC))
3440 {
3441 Perl_warner(aTHX_ packWARN(WARN_MISC),
3442 "%s", form_short_octal_warning(s, len));
3443 }
ba210ebe 3444 }
012bcf8d 3445 goto NUM_ESCAPE_INSERT;
02aa26ce 3446
f0a2b745
KW
3447 /* eg. \o{24} indicates the octal constant \024 */
3448 case 'o':
3449 {
454155d9 3450 const char* error;
f0a2b745 3451
00ce5563 3452 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3453 TRUE, /* Output warning */
3454 FALSE, /* Not strict */
17896a85
KW
3455 TRUE, /* Output warnings for
3456 non-portables */
80f4111b 3457 UTF);
454155d9 3458 if (! valid) {
f0a2b745
KW
3459 yyerror(error);
3460 continue;
3461 }
3462 goto NUM_ESCAPE_INSERT;
3463 }
3464
77a135fe 3465 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3466 case 'x':
a0481293 3467 {
a0481293 3468 const char* error;
355860ce 3469
00ce5563 3470 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3471 TRUE, /* Output warning */
3472 FALSE, /* Not strict */
17896a85
KW
3473 TRUE, /* Output warnings for
3474 non-portables */
80f4111b 3475 UTF);
a0481293
KW
3476 if (! valid) {
3477 yyerror(error);
355860ce 3478 continue;
ba210ebe 3479 }
012bcf8d
GS
3480 }
3481
3482 NUM_ESCAPE_INSERT:
ff3f963a
KW
3483 /* Insert oct or hex escaped character. There will always be
3484 * enough room in sv since such escapes will be longer than any
3485 * UTF-8 sequence they can end up as, except if they force us
3486 * to recode the rest of the string into utf8 */
ba7cea30 3487
06972766 3488 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3489 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3490 if (!has_utf8 && uv > 255) {
77a135fe
KW
3491 /* Might need to recode whatever we have accumulated so
3492 * far if it contains any chars variant in utf8 or
3493 * utf-ebcdic. */
3494
3495 SvCUR_set(sv, d - SvPVX_const(sv));
3496 SvPOK_on(sv);
3497 *d = '\0';
77a135fe 3498 /* See Note on sizing above. */
7bf79863
KW
3499 sv_utf8_upgrade_flags_grow(sv,
3500 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3501 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3502 d = SvPVX(sv) + SvCUR(sv);
3503 has_utf8 = TRUE;
012bcf8d
GS
3504 }
3505
77a135fe 3506 if (has_utf8) {
c80e42f3 3507 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3508 if (PL_lex_inwhat == OP_TRANS &&
3509 PL_sublex_info.sub_op) {
3510 PL_sublex_info.sub_op->op_private |=
3511 (PL_lex_repl ? OPpTRANS_FROM_UTF
3512 : OPpTRANS_TO_UTF);
f9a63242 3513 }
e294cc5d
JH
3514#ifdef EBCDIC
3515 if (uv > 255 && !dorange)
3516 native_range = FALSE;
3517#endif
012bcf8d 3518 }
a0ed51b3 3519 else {
012bcf8d 3520 *d++ = (char)uv;
a0ed51b3 3521 }
012bcf8d
GS
3522 }
3523 else {
c4d5f83a 3524 *d++ = (char) uv;
a0ed51b3 3525 }
79072805 3526 continue;
02aa26ce 3527
4a2d328f 3528 case 'N':
ff3f963a
KW
3529 /* In a non-pattern \N must be a named character, like \N{LATIN
3530 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3531 * mean to match a non-newline. For non-patterns, named
3532 * characters are converted to their string equivalents. In
3533 * patterns, named characters are not converted to their
3534 * ultimate forms for the same reasons that other escapes
3535 * aren't. Instead, they are converted to the \N{U+...} form
3536 * to get the value from the charnames that is in effect right
3537 * now, while preserving the fact that it was a named character
3538 * so that the regex compiler knows this */
3539
ff3f963a
KW
3540 /* The structure of this section of code (besides checking for
3541 * errors and upgrading to utf8) is:
3542 * Further disambiguate between the two meanings of \N, and if
3543 * not a charname, go process it elsewhere
0a96133f
KW
3544 * If of form \N{U+...}, pass it through if a pattern;
3545 * otherwise convert to utf8
3546 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3547 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3548
3549 /* Here, s points to the 'N'; the test below is guaranteed to
3550 * succeed if we are being called on a pattern as we already
3551 * know from a test above that the next character is a '{'.
3552 * On a non-pattern \N must mean 'named sequence, which
3553 * requires braces */
3554 s++;
3555 if (*s != '{') {
3556 yyerror("Missing braces on \\N{}");
3557 continue;
3558 }
3559 s++;
3560
0a96133f 3561 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3562 if (! (e = strchr(s, '}'))) {
3563 if (! PL_lex_inpat) {
5777a3f7 3564 yyerror("Missing right brace on \\N{}");
0a96133f 3565 } else {
4407f1b8 3566 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3567 }
0a96133f 3568 continue;
ff3f963a 3569 }
cddc7ef4 3570
ff3f963a 3571 /* Here it looks like a named character */
cddc7ef4 3572
ff3f963a
KW
3573 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3574 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3575 | PERL_SCAN_DISALLOW_PREFIX;
3576 STRLEN len;
3577
3578 /* For \N{U+...}, the '...' is a unicode value even on
3579 * EBCDIC machines */
3580 s += 2; /* Skip to next char after the 'U+' */
3581 len = e - s;
3582 uv = grok_hex(s, &len, &flags, NULL);
3583 if (len == 0 || len != (STRLEN)(e - s)) {
3584 yyerror("Invalid hexadecimal number in \\N{U+...}");
3585 s = e + 1;
3586 continue;
3587 }
3588
3589 if (PL_lex_inpat) {
3590
e2a7e165
KW
3591 /* On non-EBCDIC platforms, pass through to the regex
3592 * compiler unchanged. The reason we evaluated the
3593 * number above is to make sure there wasn't a syntax
3594 * error. But on EBCDIC we convert to native so
3595 * downstream code can continue to assume it's native
3596 */
ff3f963a 3597 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3598#ifdef EBCDIC
3599 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3600 and the \0 */
3601 "\\N{U+%X}",
3602 (unsigned int) UNI_TO_NATIVE(uv));
3603#else
ff3f963a
KW
3604 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3605 d += e - s + 1;
e2a7e165 3606#endif
ff3f963a
KW
3607 }
3608 else { /* Not a pattern: convert the hex to string */
3609
3610 /* If destination is not in utf8, unconditionally
3611 * recode it to be so. This is because \N{} implies
3612 * Unicode semantics, and scalars have to be in utf8
3613 * to guarantee those semantics */
3614 if (! has_utf8) {
3615 SvCUR_set(sv, d - SvPVX_const(sv));
3616 SvPOK_on(sv);
3617 *d = '\0';
3618 /* See Note on sizing above. */
3619 sv_utf8_upgrade_flags_grow(
3620 sv,
3621 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3622 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3623 d = SvPVX(sv) + SvCUR(sv);
3624 has_utf8 = TRUE;
3625 }
3626
a46469e6 3627 /* Add the (Unicode) code point to the output. */
ff3f963a 3628 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3629 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3630 }
a46469e6
KW
3631 else {
3632 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3633 }
ff3f963a
KW
3634 }
3635 }
6f613c73
KW
3636 else /* Here is \N{NAME} but not \N{U+...}. */
3637 if ((res = get_and_check_backslash_N_name(s, e)))
3638 {
3639 STRLEN len;
3640 const char *str = SvPV_const(res, len);
3641 if (PL_lex_inpat) {
ff3f963a
KW
3642
3643 if (! len) { /* The name resolved to an empty string */
3644 Copy("\\N{}", d, 4, char);
3645 d += 4;
3646 }
3647 else {
3648 /* In order to not lose information for the regex
3649 * compiler, pass the result in the specially made
3650 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3651 * the code points in hex of each character
3652 * returned by charnames */
3653
3654 const char *str_end = str + len;
3b721c4f 3655 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3656
3657 if (! SvUTF8(res)) {
3658 /* For the non-UTF-8 case, we can determine the
3659 * exact length needed without having to parse
3660 * through the string. Each character takes up
3661 * 2 hex digits plus either a trailing dot or
3662 * the "}" */
3663 d = off + SvGROW(sv, off
3664 + 3 * len
3665 + 6 /* For the "\N{U+", and
3666 trailing NUL */
3667 + (STRLEN)(send - e));
3668 Copy("\\N{U+", d, 5, char);
3669 d += 5;
3670 while (str < str_end) {
3671 char hex_string[4];
3672 my_snprintf(hex_string, sizeof(hex_string),
3673 "%02X.", (U8) *str);
3674 Copy(hex_string, d, 3, char);
3675 d += 3;
3676 str++;
3677 }
3678 d--; /* We will overwrite below the final
3679