This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Treat a consecutive semicolons in a prototype as 1
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
69#define PL_preambled (PL_parser->preambled)
70#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 71#define PL_linestr (PL_parser->linestr)
c2598295
DM
72#define PL_expect (PL_parser->expect)
73#define PL_copline (PL_parser->copline)
f06b5848
DM
74#define PL_bufptr (PL_parser->bufptr)
75#define PL_oldbufptr (PL_parser->oldbufptr)
76#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
77#define PL_linestart (PL_parser->linestart)
78#define PL_bufend (PL_parser->bufend)
79#define PL_last_uni (PL_parser->last_uni)
80#define PL_last_lop (PL_parser->last_lop)
81#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 82#define PL_lex_state (PL_parser->lex_state)
2f9285f8 83#define PL_rsfp (PL_parser->rsfp)
5486870f 84#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
85#define PL_in_my (PL_parser->in_my)
86#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 87#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 88#define PL_multi_end (PL_parser->multi_end)
13765c85 89#define PL_error_count (PL_parser->error_count)
199e78b7
DM
90
91#ifdef PERL_MAD
92# define PL_endwhite (PL_parser->endwhite)
93# define PL_faketokens (PL_parser->faketokens)
94# define PL_lasttoke (PL_parser->lasttoke)
95# define PL_nextwhite (PL_parser->nextwhite)
96# define PL_realtokenstart (PL_parser->realtokenstart)
97# define PL_skipwhite (PL_parser->skipwhite)
98# define PL_thisclose (PL_parser->thisclose)
99# define PL_thismad (PL_parser->thismad)
100# define PL_thisopen (PL_parser->thisopen)
101# define PL_thisstuff (PL_parser->thisstuff)
102# define PL_thistoken (PL_parser->thistoken)
103# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
104# define PL_thiswhite (PL_parser->thiswhite)
105# define PL_nexttoke (PL_parser->nexttoke)
106# define PL_curforce (PL_parser->curforce)
107#else
108# define PL_nexttoke (PL_parser->nexttoke)
109# define PL_nexttype (PL_parser->nexttype)
110# define PL_nextval (PL_parser->nextval)
199e78b7
DM
111#endif
112
a1894d81 113static const char* const ident_too_long = "Identifier too long";
8903cb82 114
29595ff2 115#ifdef PERL_MAD
29595ff2 116# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 117# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 118#else
5db06880 119# define CURMAD(slot,sv)
9ded7720 120# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
121#endif
122
a7aaec61
Z
123#define XENUMMASK 0x3f
124#define XFAKEEOF 0x40
125#define XFAKEBRACK 0x80
9059aa12 126
39e02b42
JH
127#ifdef USE_UTF8_SCRIPTS
128# define UTF (!IN_BYTES)
2b9d42f0 129#else
802a15e9 130# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 131#endif
a0ed51b3 132
b1fc3636
CJ
133/* The maximum number of characters preceding the unrecognized one to display */
134#define UNRECOGNIZED_PRECEDE_COUNT 10
135
61f0cdd9 136/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
137 * 1999-02-27 mjd-perl-patch@plover.com */
138#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
139
14bd96d0 140#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 141
ffb4593c
NT
142/* LEX_* are values for PL_lex_state, the state of the lexer.
143 * They are arranged oddly so that the guard on the switch statement
79072805 144 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
145 *
146 * These values refer to the various states within a sublex parse,
147 * i.e. within a double quotish string
79072805
LW
148 */
149
fb73857a
PP
150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805
LW
185#ifdef ff_next
186#undef ff_next
d48672a2
LW
187#endif
188
79072805 189#include "keywords.h"
fe14fcc3 190
ffb4593c
NT
191/* CLINE is a macro that ensures PL_copline has a sane value */
192
ae986130
LW
193#ifdef CLINE
194#undef CLINE
195#endif
57843af0 196#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 197
5db06880 198#ifdef PERL_MAD
29595ff2
NC
199# define SKIPSPACE0(s) skipspace0(s)
200# define SKIPSPACE1(s) skipspace1(s)
201# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
202# define PEEKSPACE(s) skipspace2(s,0)
203#else
204# define SKIPSPACE0(s) skipspace(s)
205# define SKIPSPACE1(s) skipspace(s)
206# define SKIPSPACE2(s,tsv) skipspace(s)
207# define PEEKSPACE(s) skipspace(s)
208#endif
209
ffb4593c
NT
210/*
211 * Convenience functions to return different tokens and prime the
9cbb5ea2 212 * lexer for the next token. They all take an argument.
ffb4593c
NT
213 *
214 * TOKEN : generic token (used for '(', DOLSHARP, etc)
215 * OPERATOR : generic operator
216 * AOPERATOR : assignment operator
217 * PREBLOCK : beginning the block after an if, while, foreach, ...
218 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
219 * PREREF : *EXPR where EXPR is not a simple identifier
220 * TERM : expression term
221 * LOOPX : loop exiting command (goto, last, dump, etc)
222 * FTST : file test operator
223 * FUN0 : zero-argument function
7eb971ee 224 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 225 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
226 * BOop : bitwise or or xor
227 * BAop : bitwise and
228 * SHop : shift operator
229 * PWop : power operator
9cbb5ea2 230 * PMop : pattern-matching operator
ffb4593c
NT
231 * Aop : addition-level operator
232 * Mop : multiplication-level operator
233 * Eop : equality-testing operator
e5edeb50 234 * Rop : relational operator <= != gt
ffb4593c
NT
235 *
236 * Also see LOP and lop() below.
237 */
238
998054bd 239#ifdef DEBUGGING /* Serve -DT. */
704d4215 240# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 241#else
bbf60fe6 242# define REPORT(retval) (retval)
998054bd
SC
243#endif
244
bbf60fe6
DM
245#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
246#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
247#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
248#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
249#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
250#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
251#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
252#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
253#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
254#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 255#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
256#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
257#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
258#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
259#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
260#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
261#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
262#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
263#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
264#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
265#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 266
a687059c
LW
267/* This bit of chicanery makes a unary function followed by
268 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
269 * The UNIDOR macro is for unary functions that can be followed by the //
270 * operator (such as C<shift // 0>).
a687059c 271 */
d68ce4ac 272#define UNI3(f,x,have_x) { \
6154021b 273 pl_yylval.ival = f; \
d68ce4ac 274 if (have_x) PL_expect = x; \
376fcdbf
AL
275 PL_bufptr = s; \
276 PL_last_uni = PL_oldbufptr; \
277 PL_last_lop_op = f; \
278 if (*s == '(') \
279 return REPORT( (int)FUNC1 ); \
29595ff2 280 s = PEEKSPACE(s); \
376fcdbf
AL
281 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
282 }
d68ce4ac
FC
283#define UNI(f) UNI3(f,XTERM,1)
284#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
285#define UNIPROTO(f,optional) { \
286 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
287 OPERATOR(f); \
288 }
a687059c 289
d68ce4ac 290#define UNIBRACK(f) UNI3(f,0,0)
79072805 291
9f68db38 292/* grandfather return to old style */
78cdf107
Z
293#define OLDLOP(f) \
294 do { \
295 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
296 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
297 pl_yylval.ival = (f); \
298 PL_expect = XTERM; \
299 PL_bufptr = s; \
300 return (int)LSTOP; \
301 } while(0)
79072805 302
83944c01
FC
303#define COPLINE_INC_WITH_HERELINES \
304 STMT_START { \
305 CopLINE_inc(PL_curcop); \
d794b522
FC
306 if (PL_parser->lex_shared->herelines) \
307 CopLINE(PL_curcop) += PL_parser->lex_shared->herelines, \
308 PL_parser->lex_shared->herelines = 0; \
83944c01
FC
309 } STMT_END
310
311
8fa7f367
JH
312#ifdef DEBUGGING
313
6154021b 314/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
315enum token_type {
316 TOKENTYPE_NONE,
317 TOKENTYPE_IVAL,
6154021b 318 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 319 TOKENTYPE_PVAL,
aeaef349 320 TOKENTYPE_OPVAL
bbf60fe6
DM
321};
322
6d4a66ac
NC
323static struct debug_tokens {
324 const int token;
325 enum token_type type;
326 const char *name;
327} const debug_tokens[] =
9041c2e3 328{
bbf60fe6
DM
329 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
330 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
331 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
332 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
333 { ARROW, TOKENTYPE_NONE, "ARROW" },
334 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
335 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
336 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
337 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
338 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 339 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
340 { DO, TOKENTYPE_NONE, "DO" },
341 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
342 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
343 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
344 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
345 { ELSE, TOKENTYPE_NONE, "ELSE" },
346 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
347 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
348 { FOR, TOKENTYPE_IVAL, "FOR" },
349 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
350 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
351 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
352 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
353 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 354 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
355 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
356 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
357 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 358 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
359 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
360 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 361 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
362 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
363 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
364 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
365 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
366 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
367 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
368 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
369 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
370 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
371 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
372 { OROP, TOKENTYPE_IVAL, "OROP" },
373 { OROR, TOKENTYPE_NONE, "OROR" },
374 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 375 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
376 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
377 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
378 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
379 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
380 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
381 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
382 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
383 { PREINC, TOKENTYPE_NONE, "PREINC" },
384 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 385 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
386 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
387 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 388 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
389 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
390 { SUB, TOKENTYPE_NONE, "SUB" },
391 { THING, TOKENTYPE_OPVAL, "THING" },
392 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
393 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
394 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
395 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
396 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
397 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 398 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
399 { WHILE, TOKENTYPE_IVAL, "WHILE" },
400 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 401 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 402 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
403};
404
6154021b 405/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 406
bbf60fe6 407STATIC int
704d4215 408S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 409{
97aff369 410 dVAR;
7918f24d
NC
411
412 PERL_ARGS_ASSERT_TOKEREPORT;
413
bbf60fe6 414 if (DEBUG_T_TEST) {
bd61b366 415 const char *name = NULL;
bbf60fe6 416 enum token_type type = TOKENTYPE_NONE;
f54cb97a 417 const struct debug_tokens *p;
396482e1 418 SV* const report = newSVpvs("<== ");
bbf60fe6 419
f54cb97a 420 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
421 if (p->token == (int)rv) {
422 name = p->name;
423 type = p->type;
424 break;
425 }
426 }
427 if (name)
54667de8 428 Perl_sv_catpv(aTHX_ report, name);
74736ae6 429 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 430 {
bbf60fe6 431 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
432 if ((char)rv == 'p')
433 sv_catpvs(report, " (pending identifier)");
434 }
bbf60fe6 435 else if (!rv)
396482e1 436 sv_catpvs(report, "EOF");
bbf60fe6
DM
437 else
438 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
439 switch (type) {
440 case TOKENTYPE_NONE:
bbf60fe6
DM
441 break;
442 case TOKENTYPE_IVAL:
704d4215 443 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
444 break;
445 case TOKENTYPE_OPNUM:
446 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 447 PL_op_name[lvalp->ival]);
bbf60fe6
DM
448 break;
449 case TOKENTYPE_PVAL:
704d4215 450 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
451 break;
452 case TOKENTYPE_OPVAL:
704d4215 453 if (lvalp->opval) {
401441c0 454 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
455 PL_op_name[lvalp->opval->op_type]);
456 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 457 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 458 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
459 }
460
461 }
401441c0 462 else
396482e1 463 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
464 break;
465 }
b6007c36 466 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
467 };
468 return (int)rv;
998054bd
SC
469}
470
b6007c36
DM
471
472/* print the buffer with suitable escapes */
473
474STATIC void
15f169a1 475S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 476{
396482e1 477 SV* const tmp = newSVpvs("");
7918f24d
NC
478
479 PERL_ARGS_ASSERT_PRINTBUF;
480
b6007c36
DM
481 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
482 SvREFCNT_dec(tmp);
483}
484
8fa7f367
JH
485#endif
486
8290c323
NC
487static int
488S_deprecate_commaless_var_list(pTHX) {
489 PL_expect = XTERM;
490 deprecate("comma-less variable list");
491 return REPORT(','); /* grandfather non-comma-format format */
492}
493
ffb4593c
NT
494/*
495 * S_ao
496 *
c963b151
BD
497 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
498 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
499 */
500
76e3520e 501STATIC int
cea2e8a9 502S_ao(pTHX_ int toketype)
a0d0e21e 503{
97aff369 504 dVAR;
3280af22
NIS
505 if (*PL_bufptr == '=') {
506 PL_bufptr++;
a0d0e21e 507 if (toketype == ANDAND)
6154021b 508 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 509 else if (toketype == OROR)
6154021b 510 pl_yylval.ival = OP_ORASSIGN;
c963b151 511 else if (toketype == DORDOR)
6154021b 512 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
513 toketype = ASSIGNOP;
514 }
515 return toketype;
516}
517
ffb4593c
NT
518/*
519 * S_no_op
520 * When Perl expects an operator and finds something else, no_op
521 * prints the warning. It always prints "<something> found where
522 * operator expected. It prints "Missing semicolon on previous line?"
523 * if the surprise occurs at the start of the line. "do you need to
524 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
525 * where the compiler doesn't know if foo is a method call or a function.
526 * It prints "Missing operator before end of line" if there's nothing
527 * after the missing operator, or "... before <...>" if there is something
528 * after the missing operator.
529 */
530
76e3520e 531STATIC void
15f169a1 532S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 533{
97aff369 534 dVAR;
9d4ba2ae
AL
535 char * const oldbp = PL_bufptr;
536 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 537
7918f24d
NC
538 PERL_ARGS_ASSERT_NO_OP;
539
1189a94a
GS
540 if (!s)
541 s = oldbp;
07c798fb 542 else
1189a94a 543 PL_bufptr = s;
734ab321 544 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
545 if (ckWARN_d(WARN_SYNTAX)) {
546 if (is_first)
547 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
548 "\t(Missing semicolon on previous line?)\n");
549 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 550 const char *t;
8a2bca12 551 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 552 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 553 NOOP;
56da5a46
RGS
554 if (t < PL_bufptr && isSPACE(*t))
555 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
556 "\t(Do you need to predeclare %"SVf"?)\n",
557 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
558 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
559 }
560 else {
561 assert(s >= oldbp);
562 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
563 "\t(Missing operator before %"SVf"?)\n",
564 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
565 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 566 }
07c798fb 567 }
3280af22 568 PL_bufptr = oldbp;
8990e307
LW
569}
570
ffb4593c
NT
571/*
572 * S_missingterm
573 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 574 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
575 * If we're in a delimited string and the delimiter is a control
576 * character, it's reformatted into a two-char sequence like ^C.
577 * This is fatal.
578 */
579
76e3520e 580STATIC void
cea2e8a9 581S_missingterm(pTHX_ char *s)
8990e307 582{
97aff369 583 dVAR;
8990e307
LW
584 char tmpbuf[3];
585 char q;
586 if (s) {
9d4ba2ae 587 char * const nl = strrchr(s,'\n');
d2719217 588 if (nl)
8990e307
LW
589 *nl = '\0';
590 }
463559e7 591 else if (isCNTRL(PL_multi_close)) {
8990e307 592 *tmpbuf = '^';
585ec06d 593 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
594 tmpbuf[2] = '\0';
595 s = tmpbuf;
596 }
597 else {
eb160463 598 *tmpbuf = (char)PL_multi_close;
8990e307
LW
599 tmpbuf[1] = '\0';
600 s = tmpbuf;
601 }
602 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 603 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 604}
79072805 605
dd0ac2b9
FC
606#include "feature.h"
607
0d863452 608/*
0d863452
RH
609 * Check whether the named feature is enabled.
610 */
26ea9e12 611bool
3fff3427 612Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 613{
97aff369 614 dVAR;
4a731d7b 615 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
616
617 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
618
619 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 620
26ea9e12
NC
621 if (namelen > MAX_FEATURE_LEN)
622 return FALSE;
3fff3427 623 memcpy(&he_name[8], name, namelen);
7d69d4a6 624
c8ca97b0
NC
625 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
626 REFCOUNTED_HE_EXISTS));
0d863452
RH
627}
628
ffb4593c 629/*
9cbb5ea2
GS
630 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
631 * utf16-to-utf8-reversed.
ffb4593c
NT
632 */
633
c39cd008
GS
634#ifdef PERL_CR_FILTER
635static void
636strip_return(SV *sv)
637{
eb578fdb
KW
638 const char *s = SvPVX_const(sv);
639 const char * const e = s + SvCUR(sv);
7918f24d
NC
640
641 PERL_ARGS_ASSERT_STRIP_RETURN;
642
c39cd008
GS
643 /* outer loop optimized to do nothing if there are no CR-LFs */
644 while (s < e) {
645 if (*s++ == '\r' && *s == '\n') {
646 /* hit a CR-LF, need to copy the rest */
eb578fdb 647 char *d = s - 1;
c39cd008
GS
648 *d++ = *s++;
649 while (s < e) {
650 if (*s == '\r' && s[1] == '\n')
651 s++;
652 *d++ = *s++;
653 }
654 SvCUR(sv) -= s - d;
655 return;
656 }
657 }
658}
a868473f 659
76e3520e 660STATIC I32
c39cd008 661S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 662{
f54cb97a 663 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
664 if (count > 0 && !maxlen)
665 strip_return(sv);
666 return count;
a868473f
NIS
667}
668#endif
669
ffb4593c 670/*
8eaa0acf
Z
671=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
672
673Creates and initialises a new lexer/parser state object, supplying
674a context in which to lex and parse from a new source of Perl code.
675A pointer to the new state object is placed in L</PL_parser>. An entry
676is made on the save stack so that upon unwinding the new state object
677will be destroyed and the former value of L</PL_parser> will be restored.
678Nothing else need be done to clean up the parsing context.
679
680The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
681non-null, provides a string (in SV form) containing code to be parsed.
682A copy of the string is made, so subsequent modification of I<line>
683does not affect parsing. I<rsfp>, if non-null, provides an input stream
684from which code will be read to be parsed. If both are non-null, the
685code in I<line> comes first and must consist of complete lines of input,
686and I<rsfp> supplies the remainder of the source.
687
e368b3bd
FC
688The I<flags> parameter is reserved for future use. Currently it is only
689used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
690
691=cut
692*/
ffb4593c 693
27fcb6ee 694/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
695 can share filters with the current parser.
696 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
697 caller, hence isn't owned by the parser, so shouldn't be closed on parser
698 destruction. This is used to handle the case of defaulting to reading the
699 script from the standard input because no filename was given on the command
700 line (without getting confused by situation where STDIN has been closed, so
701 the script handle is opened on fd 0) */
27fcb6ee 702
a0d0e21e 703void
8eaa0acf 704Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 705{
97aff369 706 dVAR;
6ef55633 707 const char *s = NULL;
5486870f 708 yy_parser *parser, *oparser;
60d63348 709 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 710 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
711
712 /* create and initialise a parser */
713
199e78b7 714 Newxz(parser, 1, yy_parser);
5486870f 715 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
716 PL_parser = parser;
717
28ac2b49
Z
718 parser->stack = NULL;
719 parser->ps = NULL;
720 parser->stack_size = 0;
acdf0a21 721
e3abe207
DM
722 /* on scope exit, free this parser and restore any outer one */
723 SAVEPARSER(parser);
7c4baf47 724 parser->saved_curcop = PL_curcop;
e3abe207 725
acdf0a21 726 /* initialise lexer state */
8990e307 727
fb205e7a
DM
728#ifdef PERL_MAD
729 parser->curforce = -1;
730#else
731 parser->nexttoke = 0;
732#endif
ca4cfd28 733 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 734 parser->copline = NOLINE;
5afb0a62 735 parser->lex_state = LEX_NORMAL;
c2598295 736 parser->expect = XSTATE;
2f9285f8 737 parser->rsfp = rsfp;
27fcb6ee
FC
738 parser->rsfp_filters =
739 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
740 ? NULL
741 : MUTABLE_AV(SvREFCNT_inc(
742 oparser->rsfp_filters
743 ? oparser->rsfp_filters
744 : (oparser->rsfp_filters = newAV())
745 ));
2f9285f8 746
199e78b7
DM
747 Newx(parser->lex_brackstack, 120, char);
748 Newx(parser->lex_casestack, 12, char);
749 *parser->lex_casestack = '\0';
d794b522 750 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 751
10efb74f 752 if (line) {
0528fd32 753 STRLEN len;
10efb74f 754 s = SvPV_const(line, len);
0abcdfa4
FC
755 parser->linestr = flags & LEX_START_COPIED
756 ? SvREFCNT_inc_simple_NN(line)
757 : newSVpvn_flags(s, len, SvUTF8(line));
11076590 758 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
759 } else {
760 parser->linestr = newSVpvs("\n;");
8990e307 761 }
f06b5848
DM
762 parser->oldoldbufptr =
763 parser->oldbufptr =
764 parser->bufptr =
765 parser->linestart = SvPVX(parser->linestr);
766 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
767 parser->last_lop = parser->last_uni = NULL;
87606032
NC
768 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
769 |LEX_DONT_CLOSE_RSFP);
737c24fc 770
60d63348 771 parser->in_pod = parser->filtered = 0;
79072805 772}
a687059c 773
e3abe207
DM
774
775/* delete a parser object */
776
777void
778Perl_parser_free(pTHX_ const yy_parser *parser)
779{
7918f24d
NC
780 PERL_ARGS_ASSERT_PARSER_FREE;
781
7c4baf47 782 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
783 SvREFCNT_dec(parser->linestr);
784
87606032 785 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 786 PerlIO_clearerr(parser->rsfp);
799361c3
SH
787 else if (parser->rsfp && (!parser->old_parser ||
788 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 789 PerlIO_close(parser->rsfp);
5486870f 790 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
791 SvREFCNT_dec(parser->lex_stuff);
792 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
793
794 Safefree(parser->lex_brackstack);
795 Safefree(parser->lex_casestack);
796 Safefree(parser->lex_shared);
797 PL_parser = parser->old_parser;
798 Safefree(parser);
799}
800
801void
802Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
803{
804#ifdef PERL_MAD
805 I32 nexttoke = parser->lasttoke;
806#else
807 I32 nexttoke = parser->nexttoke;
808#endif
809 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9
FC
810 while (nexttoke--) {
811#ifdef PERL_MAD
812 if (S_is_opval_token(parser->nexttoke[nexttoke].next_type
3ac7ff8f
FC
813 & 0xffff)
814 && parser->nexttoke[nexttoke].next_val.opval
815 && parser->nexttoke[nexttoke].next_val.opval->op_slabbed
816 && OpSLAB(parser->nexttoke[nexttoke].next_val.opval) == slab) {
817 op_free(parser->nexttoke[nexttoke].next_val.opval);
818 parser->nexttoke[nexttoke].next_val.opval = NULL;
819 }
3ce3dcd9 820#else
3ac7ff8f
FC
821 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
822 && parser->nextval[nexttoke].opval
823 && parser->nextval[nexttoke].opval->op_slabbed
824 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 825 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
826 parser->nextval[nexttoke].opval = NULL;
827 }
3ce3dcd9
FC
828#endif
829 }
e3abe207
DM
830}
831
832
ffb4593c 833/*
f0e67a1d
Z
834=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
835
836Buffer scalar containing the chunk currently under consideration of the
837text currently being lexed. This is always a plain string scalar (for
838which C<SvPOK> is true). It is not intended to be used as a scalar by
839normal scalar means; instead refer to the buffer directly by the pointer
840variables described below.
841
842The lexer maintains various C<char*> pointers to things in the
843C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
844reallocated, all of these pointers must be updated. Don't attempt to
845do this manually, but rather use L</lex_grow_linestr> if you need to
846reallocate the buffer.
847
848The content of the text chunk in the buffer is commonly exactly one
849complete line of input, up to and including a newline terminator,
850but there are situations where it is otherwise. The octets of the
851buffer may be intended to be interpreted as either UTF-8 or Latin-1.
852The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
853flag on this scalar, which may disagree with it.
854
855For direct examination of the buffer, the variable
856L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
857lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
858of these pointers is usually preferable to examination of the scalar
859through normal scalar means.
860
861=for apidoc AmxU|char *|PL_parser-E<gt>bufend
862
863Direct pointer to the end of the chunk of text currently being lexed, the
864end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
865+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
866always located at the end of the buffer, and does not count as part of
867the buffer's contents.
868
869=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
870
871Points to the current position of lexing inside the lexer buffer.
872Characters around this point may be freely examined, within
873the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
874L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
875interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
876
877Lexing code (whether in the Perl core or not) moves this pointer past
878the characters that it consumes. It is also expected to perform some
879bookkeeping whenever a newline character is consumed. This movement
880can be more conveniently performed by the function L</lex_read_to>,
881which handles newlines appropriately.
882
883Interpretation of the buffer's octets can be abstracted out by
884using the slightly higher-level functions L</lex_peek_unichar> and
885L</lex_read_unichar>.
886
887=for apidoc AmxU|char *|PL_parser-E<gt>linestart
888
889Points to the start of the current line inside the lexer buffer.
890This is useful for indicating at which column an error occurred, and
891not much else. This must be updated by any lexing code that consumes
892a newline; the function L</lex_read_to> handles this detail.
893
894=cut
895*/
896
897/*
898=for apidoc Amx|bool|lex_bufutf8
899
900Indicates whether the octets in the lexer buffer
901(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
902of Unicode characters. If not, they should be interpreted as Latin-1
903characters. This is analogous to the C<SvUTF8> flag for scalars.
904
905In UTF-8 mode, it is not guaranteed that the lexer buffer actually
906contains valid UTF-8. Lexing code must be robust in the face of invalid
907encoding.
908
909The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
910is significant, but not the whole story regarding the input character
911encoding. Normally, when a file is being read, the scalar contains octets
912and its C<SvUTF8> flag is off, but the octets should be interpreted as
913UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
914however, the scalar may have the C<SvUTF8> flag on, and in this case its
915octets should be interpreted as UTF-8 unless the C<use bytes> pragma
916is in effect. This logic may change in the future; use this function
917instead of implementing the logic yourself.
918
919=cut
920*/
921
922bool
923Perl_lex_bufutf8(pTHX)
924{
925 return UTF;
926}
927
928/*
929=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
930
931Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
932at least I<len> octets (including terminating NUL). Returns a
933pointer to the reallocated buffer. This is necessary before making
934any direct modification of the buffer that would increase its length.
935L</lex_stuff_pvn> provides a more convenient way to insert text into
936the buffer.
937
938Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
939this function updates all of the lexer's variables that point directly
940into the buffer.
941
942=cut
943*/
944
945char *
946Perl_lex_grow_linestr(pTHX_ STRLEN len)
947{
948 SV *linestr;
949 char *buf;
950 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 951 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
952 linestr = PL_parser->linestr;
953 buf = SvPVX(linestr);
954 if (len <= SvLEN(linestr))
955 return buf;
956 bufend_pos = PL_parser->bufend - buf;
957 bufptr_pos = PL_parser->bufptr - buf;
958 oldbufptr_pos = PL_parser->oldbufptr - buf;
959 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
960 linestart_pos = PL_parser->linestart - buf;
961 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
962 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
963 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
964 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 965
f0e67a1d 966 buf = sv_grow(linestr, len);
c7641931 967
f0e67a1d
Z
968 PL_parser->bufend = buf + bufend_pos;
969 PL_parser->bufptr = buf + bufptr_pos;
970 PL_parser->oldbufptr = buf + oldbufptr_pos;
971 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
972 PL_parser->linestart = buf + linestart_pos;
973 if (PL_parser->last_uni)
974 PL_parser->last_uni = buf + last_uni_pos;
975 if (PL_parser->last_lop)
976 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
977 if (PL_parser->lex_shared->re_eval_start)
978 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
979 return buf;
980}
981
982/*
83aa740e 983=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
984
985Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
986immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
987reallocating the buffer if necessary. This means that lexing code that
988runs later will see the characters as if they had appeared in the input.
989It is not recommended to do this as part of normal parsing, and most
990uses of this facility run the risk of the inserted characters being
991interpreted in an unintended manner.
992
993The string to be inserted is represented by I<len> octets starting
994at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
995according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
996The characters are recoded for the lexer buffer, according to how the
997buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 998to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
999function is more convenient.
1000
1001=cut
1002*/
1003
1004void
83aa740e 1005Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 1006{
749123ff 1007 dVAR;
f0e67a1d
Z
1008 char *bufptr;
1009 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1010 if (flags & ~(LEX_STUFF_UTF8))
1011 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1012 if (UTF) {
1013 if (flags & LEX_STUFF_UTF8) {
1014 goto plain_copy;
1015 } else {
54d004e8 1016 STRLEN highhalf = 0; /* Count of variants */
83aa740e 1017 const char *p, *e = pv+len;
54d004e8
KW
1018 for (p = pv; p != e; p++) {
1019 if (! UTF8_IS_INVARIANT(*p)) {
1020 highhalf++;
1021 }
1022 }
f0e67a1d
Z
1023 if (!highhalf)
1024 goto plain_copy;
1025 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1026 bufptr = PL_parser->bufptr;
1027 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1028 SvCUR_set(PL_parser->linestr,
1029 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1030 PL_parser->bufend += len+highhalf;
1031 for (p = pv; p != e; p++) {
1032 U8 c = (U8)*p;
54d004e8
KW
1033 if (! UTF8_IS_INVARIANT(c)) {
1034 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1035 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
1036 } else {
1037 *bufptr++ = (char)c;
1038 }
1039 }
1040 }
1041 } else {
1042 if (flags & LEX_STUFF_UTF8) {
1043 STRLEN highhalf = 0;
83aa740e 1044 const char *p, *e = pv+len;
f0e67a1d
Z
1045 for (p = pv; p != e; p++) {
1046 U8 c = (U8)*p;
54d004e8 1047 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1048 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1049 "non-Latin-1 character into Latin-1 input");
54d004e8 1050 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1051 p++;
1052 highhalf++;
54d004e8 1053 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1054 /* malformed UTF-8 */
1055 ENTER;
1056 SAVESPTR(PL_warnhook);
1057 PL_warnhook = PERL_WARNHOOK_FATAL;
1058 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1059 LEAVE;
1060 }
1061 }
1062 if (!highhalf)
1063 goto plain_copy;
1064 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1065 bufptr = PL_parser->bufptr;
1066 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1067 SvCUR_set(PL_parser->linestr,
1068 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1069 PL_parser->bufend += len-highhalf;
54d004e8
KW
1070 p = pv;
1071 while (p < e) {
1072 if (UTF8_IS_INVARIANT(*p)) {
1073 *bufptr++ = *p;
1074 p++;
f0e67a1d 1075 }
54d004e8
KW
1076 else {
1077 assert(p < e -1 );
1078 *bufptr++ = TWO_BYTE_UTF8_TO_UNI(*p, *(p+1));
1079 p += 2;
1080 }
f0e67a1d
Z
1081 }
1082 } else {
54d004e8 1083 plain_copy:
f0e67a1d
Z
1084 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1085 bufptr = PL_parser->bufptr;
1086 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1087 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1088 PL_parser->bufend += len;
1089 Copy(pv, bufptr, len, char);
1090 }
1091 }
1092}
1093
1094/*
9dcc53ea
Z
1095=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1096
1097Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1098immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1099reallocating the buffer if necessary. This means that lexing code that
1100runs later will see the characters as if they had appeared in the input.
1101It is not recommended to do this as part of normal parsing, and most
1102uses of this facility run the risk of the inserted characters being
1103interpreted in an unintended manner.
1104
1105The string to be inserted is represented by octets starting at I<pv>
1106and continuing to the first nul. These octets are interpreted as either
1107UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1108in I<flags>. The characters are recoded for the lexer buffer, according
1109to how the buffer is currently being interpreted (L</lex_bufutf8>).
1110If it is not convenient to nul-terminate a string to be inserted, the
1111L</lex_stuff_pvn> function is more appropriate.
1112
1113=cut
1114*/
1115
1116void
1117Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1118{
1119 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1120 lex_stuff_pvn(pv, strlen(pv), flags);
1121}
1122
1123/*
f0e67a1d
Z
1124=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1125
1126Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1127immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1128reallocating the buffer if necessary. This means that lexing code that
1129runs later will see the characters as if they had appeared in the input.
1130It is not recommended to do this as part of normal parsing, and most
1131uses of this facility run the risk of the inserted characters being
1132interpreted in an unintended manner.
1133
1134The string to be inserted is the string value of I<sv>. The characters
1135are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1136being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1137not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1138need to construct a scalar.
1139
1140=cut
1141*/
1142
1143void
1144Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1145{
1146 char *pv;
1147 STRLEN len;
1148 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1149 if (flags)
1150 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1151 pv = SvPV(sv, len);
1152 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1153}
1154
1155/*
1156=for apidoc Amx|void|lex_unstuff|char *ptr
1157
1158Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1159I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1160This hides the discarded text from any lexing code that runs later,
1161as if the text had never appeared.
1162
1163This is not the normal way to consume lexed text. For that, use
1164L</lex_read_to>.
1165
1166=cut
1167*/
1168
1169void
1170Perl_lex_unstuff(pTHX_ char *ptr)
1171{
1172 char *buf, *bufend;
1173 STRLEN unstuff_len;
1174 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1175 buf = PL_parser->bufptr;
1176 if (ptr < buf)
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1178 if (ptr == buf)
1179 return;
1180 bufend = PL_parser->bufend;
1181 if (ptr > bufend)
1182 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1183 unstuff_len = ptr - buf;
1184 Move(ptr, buf, bufend+1-ptr, char);
1185 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1186 PL_parser->bufend = bufend - unstuff_len;
1187}
1188
1189/*
1190=for apidoc Amx|void|lex_read_to|char *ptr
1191
1192Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1193to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1194performing the correct bookkeeping whenever a newline character is passed.
1195This is the normal way to consume lexed text.
1196
1197Interpretation of the buffer's octets can be abstracted out by
1198using the slightly higher-level functions L</lex_peek_unichar> and
1199L</lex_read_unichar>.
1200
1201=cut
1202*/
1203
1204void
1205Perl_lex_read_to(pTHX_ char *ptr)
1206{
1207 char *s;
1208 PERL_ARGS_ASSERT_LEX_READ_TO;
1209 s = PL_parser->bufptr;
1210 if (ptr < s || ptr > PL_parser->bufend)
1211 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1212 for (; s != ptr; s++)
1213 if (*s == '\n') {
83944c01 1214 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1215 PL_parser->linestart = s+1;
1216 }
1217 PL_parser->bufptr = ptr;
1218}
1219
1220/*
1221=for apidoc Amx|void|lex_discard_to|char *ptr
1222
1223Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1224up to I<ptr>. The remaining content of the buffer will be moved, and
1225all pointers into the buffer updated appropriately. I<ptr> must not
1226be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1227it is not permitted to discard text that has yet to be lexed.
1228
1229Normally it is not necessarily to do this directly, because it suffices to
1230use the implicit discarding behaviour of L</lex_next_chunk> and things
1231based on it. However, if a token stretches across multiple lines,
1f317c95 1232and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1233that purpose, then after completion of the token it would be wise to
1234explicitly discard the now-unneeded earlier lines, to avoid future
1235multi-line tokens growing the buffer without bound.
1236
1237=cut
1238*/
1239
1240void
1241Perl_lex_discard_to(pTHX_ char *ptr)
1242{
1243 char *buf;
1244 STRLEN discard_len;
1245 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1246 buf = SvPVX(PL_parser->linestr);
1247 if (ptr < buf)
1248 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1249 if (ptr == buf)
1250 return;
1251 if (ptr > PL_parser->bufptr)
1252 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1253 discard_len = ptr - buf;
1254 if (PL_parser->oldbufptr < ptr)
1255 PL_parser->oldbufptr = ptr;
1256 if (PL_parser->oldoldbufptr < ptr)
1257 PL_parser->oldoldbufptr = ptr;
1258 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1259 PL_parser->last_uni = NULL;
1260 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1261 PL_parser->last_lop = NULL;
1262 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1263 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1264 PL_parser->bufend -= discard_len;
1265 PL_parser->bufptr -= discard_len;
1266 PL_parser->oldbufptr -= discard_len;
1267 PL_parser->oldoldbufptr -= discard_len;
1268 if (PL_parser->last_uni)
1269 PL_parser->last_uni -= discard_len;
1270 if (PL_parser->last_lop)
1271 PL_parser->last_lop -= discard_len;
1272}
1273
1274/*
1275=for apidoc Amx|bool|lex_next_chunk|U32 flags
1276
1277Reads in the next chunk of text to be lexed, appending it to
1278L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1279looked to the end of the current chunk and wants to know more. It is
1280usual, but not necessary, for lexing to have consumed the entirety of
1281the current chunk at this time.
1282
1283If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1284chunk (i.e., the current chunk has been entirely consumed), normally the
1285current chunk will be discarded at the same time that the new chunk is
1286read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1287will not be discarded. If the current chunk has not been entirely
1288consumed, then it will not be discarded regardless of the flag.
1289
1290Returns true if some new text was added to the buffer, or false if the
1291buffer has reached the end of the input text.
1292
1293=cut
1294*/
1295
1296#define LEX_FAKE_EOF 0x80000000
112d1284 1297#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1298
1299bool
1300Perl_lex_next_chunk(pTHX_ U32 flags)
1301{
1302 SV *linestr;
1303 char *buf;
1304 STRLEN old_bufend_pos, new_bufend_pos;
1305 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1306 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1307 bool got_some_for_debugger = 0;
f0e67a1d 1308 bool got_some;
112d1284 1309 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1310 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1311 linestr = PL_parser->linestr;
1312 buf = SvPVX(linestr);
1313 if (!(flags & LEX_KEEP_PREVIOUS) &&
1314 PL_parser->bufptr == PL_parser->bufend) {
1315 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1316 linestart_pos = 0;
1317 if (PL_parser->last_uni != PL_parser->bufend)
1318 PL_parser->last_uni = NULL;
1319 if (PL_parser->last_lop != PL_parser->bufend)
1320 PL_parser->last_lop = NULL;
1321 last_uni_pos = last_lop_pos = 0;
1322 *buf = 0;
1323 SvCUR(linestr) = 0;
1324 } else {
1325 old_bufend_pos = PL_parser->bufend - buf;
1326 bufptr_pos = PL_parser->bufptr - buf;
1327 oldbufptr_pos = PL_parser->oldbufptr - buf;
1328 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1329 linestart_pos = PL_parser->linestart - buf;
1330 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1331 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1332 }
1333 if (flags & LEX_FAKE_EOF) {
1334 goto eof;
60d63348 1335 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1336 got_some = 0;
1337 } else if (filter_gets(linestr, old_bufend_pos)) {
1338 got_some = 1;
17cc9359 1339 got_some_for_debugger = 1;
112d1284
FC
1340 } else if (flags & LEX_NO_TERM) {
1341 got_some = 0;
f0e67a1d 1342 } else {
580561a3
Z
1343 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1344 sv_setpvs(linestr, "");
f0e67a1d
Z
1345 eof:
1346 /* End of real input. Close filehandle (unless it was STDIN),
1347 * then add implicit termination.
1348 */
87606032 1349 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1350 PerlIO_clearerr(PL_parser->rsfp);
1351 else if (PL_parser->rsfp)
1352 (void)PerlIO_close(PL_parser->rsfp);
1353 PL_parser->rsfp = NULL;
60d63348 1354 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1355#ifdef PERL_MAD
1356 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1357 PL_faketokens = 1;
1358#endif
1359 if (!PL_in_eval && PL_minus_p) {
1360 sv_catpvs(linestr,
1361 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1362 PL_minus_n = PL_minus_p = 0;
1363 } else if (!PL_in_eval && PL_minus_n) {
1364 sv_catpvs(linestr, /*{*/";}");
1365 PL_minus_n = 0;
1366 } else
1367 sv_catpvs(linestr, ";");
1368 got_some = 1;
1369 }
1370 buf = SvPVX(linestr);
1371 new_bufend_pos = SvCUR(linestr);
1372 PL_parser->bufend = buf + new_bufend_pos;
1373 PL_parser->bufptr = buf + bufptr_pos;
1374 PL_parser->oldbufptr = buf + oldbufptr_pos;
1375 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1376 PL_parser->linestart = buf + linestart_pos;
1377 if (PL_parser->last_uni)
1378 PL_parser->last_uni = buf + last_uni_pos;
1379 if (PL_parser->last_lop)
1380 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1381 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1382 PL_curstash != PL_debstash) {
1383 /* debugger active and we're not compiling the debugger code,
1384 * so store the line into the debugger's array of lines
1385 */
1386 update_debugger_info(NULL, buf+old_bufend_pos,
1387 new_bufend_pos-old_bufend_pos);
1388 }
1389 return got_some;
1390}
1391
1392/*
1393=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1394
1395Looks ahead one (Unicode) character in the text currently being lexed.
1396Returns the codepoint (unsigned integer value) of the next character,
1397or -1 if lexing has reached the end of the input text. To consume the
1398peeked character, use L</lex_read_unichar>.
1399
1400If the next character is in (or extends into) the next chunk of input
1401text, the next chunk will be read in. Normally the current chunk will be
1402discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1403then the current chunk will not be discarded.
1404
1405If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1406is encountered, an exception is generated.
1407
1408=cut
1409*/
1410
1411I32
1412Perl_lex_peek_unichar(pTHX_ U32 flags)
1413{
749123ff 1414 dVAR;
f0e67a1d
Z
1415 char *s, *bufend;
1416 if (flags & ~(LEX_KEEP_PREVIOUS))
1417 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1418 s = PL_parser->bufptr;
1419 bufend = PL_parser->bufend;
1420 if (UTF) {
1421 U8 head;
1422 I32 unichar;
1423 STRLEN len, retlen;
1424 if (s == bufend) {
1425 if (!lex_next_chunk(flags))
1426 return -1;
1427 s = PL_parser->bufptr;
1428 bufend = PL_parser->bufend;
1429 }
1430 head = (U8)*s;
54d004e8 1431 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1432 return head;
54d004e8
KW
1433 if (UTF8_IS_START(head)) {
1434 len = UTF8SKIP(&head);
f0e67a1d
Z
1435 while ((STRLEN)(bufend-s) < len) {
1436 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1437 break;
1438 s = PL_parser->bufptr;
1439 bufend = PL_parser->bufend;
1440 }
1441 }
1442 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1443 if (retlen == (STRLEN)-1) {
1444 /* malformed UTF-8 */
1445 ENTER;
1446 SAVESPTR(PL_warnhook);
1447 PL_warnhook = PERL_WARNHOOK_FATAL;
1448 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1449 LEAVE;
1450 }
1451 return unichar;
1452 } else {
1453 if (s == bufend) {
1454 if (!lex_next_chunk(flags))
1455 return -1;
1456 s = PL_parser->bufptr;
1457 }
1458 return (U8)*s;
1459 }
1460}
1461
1462/*
1463=for apidoc Amx|I32|lex_read_unichar|U32 flags
1464
1465Reads the next (Unicode) character in the text currently being lexed.
1466Returns the codepoint (unsigned integer value) of the character read,
1467and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1468if lexing has reached the end of the input text. To non-destructively
1469examine the next character, use L</lex_peek_unichar> instead.
1470
1471If the next character is in (or extends into) the next chunk of input
1472text, the next chunk will be read in. Normally the current chunk will be
1473discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1474then the current chunk will not be discarded.
1475
1476If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1477is encountered, an exception is generated.
1478
1479=cut
1480*/
1481
1482I32
1483Perl_lex_read_unichar(pTHX_ U32 flags)
1484{
1485 I32 c;
1486 if (flags & ~(LEX_KEEP_PREVIOUS))
1487 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1488 c = lex_peek_unichar(flags);
1489 if (c != -1) {
1490 if (c == '\n')
83944c01 1491 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1492 if (UTF)
1493 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1494 else
1495 ++(PL_parser->bufptr);
f0e67a1d
Z
1496 }
1497 return c;
1498}
1499
1500/*
1501=for apidoc Amx|void|lex_read_space|U32 flags
1502
1503Reads optional spaces, in Perl style, in the text currently being
1504lexed. The spaces may include ordinary whitespace characters and
1505Perl-style comments. C<#line> directives are processed if encountered.
1506L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1507at a non-space character (or the end of the input text).
1508
1509If spaces extend into the next chunk of input text, the next chunk will
1510be read in. Normally the current chunk will be discarded at the same
1511time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1512chunk will not be discarded.
1513
1514=cut
1515*/
1516
f0998909
Z
1517#define LEX_NO_NEXT_CHUNK 0x80000000
1518
f0e67a1d
Z
1519void
1520Perl_lex_read_space(pTHX_ U32 flags)
1521{
1522 char *s, *bufend;
1523 bool need_incline = 0;
f0998909 1524 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1525 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1526#ifdef PERL_MAD
1527 if (PL_skipwhite) {
1528 sv_free(PL_skipwhite);
1529 PL_skipwhite = NULL;
1530 }
1531 if (PL_madskills)
1532 PL_skipwhite = newSVpvs("");
1533#endif /* PERL_MAD */
1534 s = PL_parser->bufptr;
1535 bufend = PL_parser->bufend;
1536 while (1) {
1537 char c = *s;
1538 if (c == '#') {
1539 do {
1540 c = *++s;
1541 } while (!(c == '\n' || (c == 0 && s == bufend)));
1542 } else if (c == '\n') {
1543 s++;
1544 PL_parser->linestart = s;
1545 if (s == bufend)
1546 need_incline = 1;
1547 else
1548 incline(s);
1549 } else if (isSPACE(c)) {
1550 s++;
1551 } else if (c == 0 && s == bufend) {
1552 bool got_more;
1553#ifdef PERL_MAD
1554 if (PL_madskills)
1555 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1556#endif /* PERL_MAD */
f0998909
Z
1557 if (flags & LEX_NO_NEXT_CHUNK)
1558 break;
f0e67a1d 1559 PL_parser->bufptr = s;
83944c01 1560 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1561 got_more = lex_next_chunk(flags);
1562 CopLINE_dec(PL_curcop);
1563 s = PL_parser->bufptr;
1564 bufend = PL_parser->bufend;
1565 if (!got_more)
1566 break;
1567 if (need_incline && PL_parser->rsfp) {
1568 incline(s);
1569 need_incline = 0;
1570 }
1571 } else {
1572 break;
1573 }
1574 }
1575#ifdef PERL_MAD
1576 if (PL_madskills)
1577 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1578#endif /* PERL_MAD */
1579 PL_parser->bufptr = s;
1580}
1581
1582/*
ffb4593c
NT
1583 * S_incline
1584 * This subroutine has nothing to do with tilting, whether at windmills
1585 * or pinball tables. Its name is short for "increment line". It
57843af0 1586 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1587 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1588 * # line 500 "foo.pm"
1589 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1590 */
1591
76e3520e 1592STATIC void
d9095cec 1593S_incline(pTHX_ const char *s)
463ee0b2 1594{
97aff369 1595 dVAR;
d9095cec
NC
1596 const char *t;
1597 const char *n;
1598 const char *e;
8818d409 1599 line_t line_num;
463ee0b2 1600
7918f24d
NC
1601 PERL_ARGS_ASSERT_INCLINE;
1602
83944c01 1603 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1604 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1605 && s+1 == PL_bufend && *s == ';') {
1606 /* fake newline in string eval */
1607 CopLINE_dec(PL_curcop);
1608 return;
1609 }
463ee0b2
LW
1610 if (*s++ != '#')
1611 return;
d4c19fe8
AL
1612 while (SPACE_OR_TAB(*s))
1613 s++;
73659bf1
GS
1614 if (strnEQ(s, "line", 4))
1615 s += 4;
1616 else
1617 return;
084592ab 1618 if (SPACE_OR_TAB(*s))
73659bf1 1619 s++;
4e553d73 1620 else
73659bf1 1621 return;
d4c19fe8
AL
1622 while (SPACE_OR_TAB(*s))
1623 s++;
463ee0b2
LW
1624 if (!isDIGIT(*s))
1625 return;
d4c19fe8 1626
463ee0b2
LW
1627 n = s;
1628 while (isDIGIT(*s))
1629 s++;
07714eb4 1630 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1631 return;
bf4acbe4 1632 while (SPACE_OR_TAB(*s))
463ee0b2 1633 s++;
73659bf1 1634 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1635 s++;
73659bf1
GS
1636 e = t + 1;
1637 }
463ee0b2 1638 else {
c35e046a
AL
1639 t = s;
1640 while (!isSPACE(*t))
1641 t++;
73659bf1 1642 e = t;
463ee0b2 1643 }
bf4acbe4 1644 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1645 e++;
1646 if (*e != '\n' && *e != '\0')
1647 return; /* false alarm */
1648
8818d409
FC
1649 line_num = atoi(n)-1;
1650
f4dd75d9 1651 if (t - s > 0) {
d9095cec 1652 const STRLEN len = t - s;
19bad673
NC
1653 SV *const temp_sv = CopFILESV(PL_curcop);
1654 const char *cf;
1655 STRLEN tmplen;
1656
1657 if (temp_sv) {
1658 cf = SvPVX(temp_sv);
1659 tmplen = SvCUR(temp_sv);
1660 } else {
1661 cf = NULL;
1662 tmplen = 0;
1663 }
1664
d1299d44 1665 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1666 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1667 * to *{"::_<newfilename"} */
44867030
NC
1668 /* However, the long form of evals is only turned on by the
1669 debugger - usually they're "(eval %lu)" */
1670 char smallbuf[128];
1671 char *tmpbuf;
1672 GV **gvp;
d9095cec 1673 STRLEN tmplen2 = len;
798b63bc 1674 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1675 tmpbuf = smallbuf;
1676 else
2ae0db35 1677 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1678 tmpbuf[0] = '_';
1679 tmpbuf[1] = '<';
2ae0db35 1680 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1681 tmplen += 2;
8a5ee598
RGS
1682 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1683 if (gvp) {
44867030
NC
1684 char *tmpbuf2;
1685 GV *gv2;
1686
1687 if (tmplen2 + 2 <= sizeof smallbuf)
1688 tmpbuf2 = smallbuf;
1689 else
1690 Newx(tmpbuf2, tmplen2 + 2, char);
1691
1692 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1693 /* Either they malloc'd it, or we malloc'd it,
1694 so no prefix is present in ours. */
1695 tmpbuf2[0] = '_';
1696 tmpbuf2[1] = '<';
1697 }
1698
1699 memcpy(tmpbuf2 + 2, s, tmplen2);
1700 tmplen2 += 2;
1701
8a5ee598 1702 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1703 if (!isGV(gv2)) {
8a5ee598 1704 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1705 /* adjust ${"::_<newfilename"} to store the new file name */
1706 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1707 /* The line number may differ. If that is the case,
1708 alias the saved lines that are in the array.
1709 Otherwise alias the whole array. */
1710 if (CopLINE(PL_curcop) == line_num) {
1711 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1712 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1713 }
1714 else if (GvAV(*gvp)) {
1715 AV * const av = GvAV(*gvp);
1716 const I32 start = CopLINE(PL_curcop)+1;
1717 I32 items = AvFILLp(av) - start;
1718 if (items > 0) {
1719 AV * const av2 = GvAVn(gv2);
1720 SV **svp = AvARRAY(av) + start;
1721 I32 l = (I32)line_num+1;
1722 while (items--)
1723 av_store(av2, l++, SvREFCNT_inc(*svp++));
1724 }
1725 }
e5527e4b 1726 }
44867030
NC
1727
1728 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1729 }
e66cf94c 1730 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1731 }
05ec9bb3 1732 CopFILE_free(PL_curcop);
d9095cec 1733 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1734 }
8818d409 1735 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1736}
1737
29595ff2 1738#ifdef PERL_MAD
cd81e915 1739/* skip space before PL_thistoken */
29595ff2
NC
1740
1741STATIC char *
5aaab254 1742S_skipspace0(pTHX_ char *s)
29595ff2 1743{
7918f24d
NC
1744 PERL_ARGS_ASSERT_SKIPSPACE0;
1745
29595ff2
NC
1746 s = skipspace(s);
1747 if (!PL_madskills)
1748 return s;
cd81e915
NC
1749 if (PL_skipwhite) {
1750 if (!PL_thiswhite)
6b29d1f5 1751 PL_thiswhite = newSVpvs("");
cd81e915
NC
1752 sv_catsv(PL_thiswhite, PL_skipwhite);
1753 sv_free(PL_skipwhite);
1754 PL_skipwhite = 0;
1755 }
1756 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1757 return s;
1758}
1759
cd81e915 1760/* skip space after PL_thistoken */
29595ff2
NC
1761
1762STATIC char *
5aaab254 1763S_skipspace1(pTHX_ char *s)
29595ff2 1764{
d4c19fe8 1765 const char *start = s;
29595ff2
NC
1766 I32 startoff = start - SvPVX(PL_linestr);
1767
7918f24d
NC
1768 PERL_ARGS_ASSERT_SKIPSPACE1;
1769
29595ff2
NC
1770 s = skipspace(s);
1771 if (!PL_madskills)
1772 return s;
1773 start = SvPVX(PL_linestr) + startoff;
cd81e915 1774 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1775 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1776 PL_thistoken = newSVpvn(tstart, start - tstart);
1777 }
1778 PL_realtokenstart = -1;
1779 if (PL_skipwhite) {
1780 if (!PL_nextwhite)
6b29d1f5 1781 PL_nextwhite = newSVpvs("");
cd81e915
NC
1782 sv_catsv(PL_nextwhite, PL_skipwhite);
1783 sv_free(PL_skipwhite);
1784 PL_skipwhite = 0;
29595ff2
NC
1785 }
1786 return s;
1787}
1788
1789STATIC char *
5aaab254 1790S_skipspace2(pTHX_ char *s, SV **svp)
29595ff2 1791{
c35e046a
AL
1792 char *start;
1793 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1794 const I32 startoff = s - SvPVX(PL_linestr);
1795
7918f24d
NC
1796 PERL_ARGS_ASSERT_SKIPSPACE2;
1797
29595ff2
NC
1798 s = skipspace(s);
1799 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1800 if (!PL_madskills || !svp)
1801 return s;
1802 start = SvPVX(PL_linestr) + startoff;
cd81e915 1803 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1804 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1805 PL_thistoken = newSVpvn(tstart, start - tstart);
1806 PL_realtokenstart = -1;
29595ff2 1807 }
cd81e915 1808 if (PL_skipwhite) {
29595ff2 1809 if (!*svp)
6b29d1f5 1810 *svp = newSVpvs("");
cd81e915
NC
1811 sv_setsv(*svp, PL_skipwhite);
1812 sv_free(PL_skipwhite);
1813 PL_skipwhite = 0;
29595ff2
NC
1814 }
1815
1816 return s;
1817}
1818#endif
1819
80a702cd 1820STATIC void
15f169a1 1821S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1822{
1823 AV *av = CopFILEAVx(PL_curcop);
1824 if (av) {
b9f83d2f 1825 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1826 if (orig_sv)
1827 sv_setsv(sv, orig_sv);
1828 else
1829 sv_setpvn(sv, buf, len);
80a702cd
RGS
1830 (void)SvIOK_on(sv);
1831 SvIV_set(sv, 0);
1832 av_store(av, (I32)CopLINE(PL_curcop), sv);
1833 }
1834}
1835
ffb4593c
NT
1836/*
1837 * S_skipspace
1838 * Called to gobble the appropriate amount and type of whitespace.
1839 * Skips comments as well.
1840 */
1841
76e3520e 1842STATIC char *
5aaab254 1843S_skipspace(pTHX_ char *s)
a687059c 1844{
5db06880 1845#ifdef PERL_MAD
f0e67a1d
Z
1846 char *start = s;
1847#endif /* PERL_MAD */
7918f24d 1848 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1849#ifdef PERL_MAD
cd81e915
NC
1850 if (PL_skipwhite) {
1851 sv_free(PL_skipwhite);
f0e67a1d 1852 PL_skipwhite = NULL;
5db06880 1853 }
f0e67a1d 1854#endif /* PERL_MAD */
3280af22 1855 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1856 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1857 s++;
f0e67a1d
Z
1858 } else {
1859 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1860 PL_bufptr = s;
f0998909
Z
1861 lex_read_space(LEX_KEEP_PREVIOUS |
1862 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1863 LEX_NO_NEXT_CHUNK : 0));
3280af22 1864 s = PL_bufptr;
f0e67a1d
Z
1865 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1866 if (PL_linestart > PL_bufptr)
1867 PL_bufptr = PL_linestart;
1868 return s;
463ee0b2 1869 }
5db06880 1870#ifdef PERL_MAD
f0e67a1d
Z
1871 if (PL_madskills)
1872 PL_skipwhite = newSVpvn(start, s-start);
1873#endif /* PERL_MAD */
5db06880 1874 return s;
a687059c 1875}
378cc40b 1876
ffb4593c
NT
1877/*
1878 * S_check_uni
1879 * Check the unary operators to ensure there's no ambiguity in how they're
1880 * used. An ambiguous piece of code would be:
1881 * rand + 5
1882 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1883 * the +5 is its argument.
1884 */
1885
76e3520e 1886STATIC void
cea2e8a9 1887S_check_uni(pTHX)
ba106d47 1888{
97aff369 1889 dVAR;
d4c19fe8
AL
1890 const char *s;
1891 const char *t;
2f3197b3 1892
3280af22 1893 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1894 return;
3280af22
NIS
1895 while (isSPACE(*PL_last_uni))
1896 PL_last_uni++;
c35e046a 1897 s = PL_last_uni;
8a2bca12 1898 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1899 s++;
3280af22 1900 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1901 return;
6136c704 1902
9b387841
NC
1903 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1904 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1905 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1906}
1907
ffb4593c
NT
1908/*
1909 * LOP : macro to build a list operator. Its behaviour has been replaced
1910 * with a subroutine, S_lop() for which LOP is just another name.
1911 */
1912
a0d0e21e
LW
1913#define LOP(f,x) return lop(f,x,s)
1914
ffb4593c
NT
1915/*
1916 * S_lop
1917 * Build a list operator (or something that might be one). The rules:
1918 * - if we have a next token, then it's a list operator [why?]
1919 * - if the next thing is an opening paren, then it's a function
1920 * - else it's a list operator
1921 */
1922
76e3520e 1923STATIC I32
a0be28da 1924S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1925{
97aff369 1926 dVAR;
7918f24d
NC
1927
1928 PERL_ARGS_ASSERT_LOP;
1929
6154021b 1930 pl_yylval.ival = f;
35c8bce7 1931 CLINE;
3280af22
NIS
1932 PL_expect = x;
1933 PL_bufptr = s;
1934 PL_last_lop = PL_oldbufptr;
eb160463 1935 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1936#ifdef PERL_MAD
1937 if (PL_lasttoke)
78cdf107 1938 goto lstop;
5db06880 1939#else
3280af22 1940 if (PL_nexttoke)
78cdf107 1941 goto lstop;
5db06880 1942#endif
79072805 1943 if (*s == '(')
bbf60fe6 1944 return REPORT(FUNC);
29595ff2 1945 s = PEEKSPACE(s);
79072805 1946 if (*s == '(')
bbf60fe6 1947 return REPORT(FUNC);
78cdf107
Z
1948 else {
1949 lstop:
1950 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1951 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1952 return REPORT(LSTOP);
78cdf107 1953 }
79072805
LW
1954}
1955
5db06880
NC
1956#ifdef PERL_MAD
1957 /*
1958 * S_start_force
1959 * Sets up for an eventual force_next(). start_force(0) basically does
1960 * an unshift, while start_force(-1) does a push. yylex removes items
1961 * on the "pop" end.
1962 */
1963
1964STATIC void
1965S_start_force(pTHX_ int where)
1966{
1967 int i;
1968
cd81e915 1969 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1970 where = PL_lasttoke;
cd81e915
NC
1971 assert(PL_curforce < 0 || PL_curforce == where);
1972 if (PL_curforce != where) {
5db06880
NC
1973 for (i = PL_lasttoke; i > where; --i) {
1974 PL_nexttoke[i] = PL_nexttoke[i-1];
1975 }
1976 PL_lasttoke++;
1977 }
cd81e915 1978 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1979 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1980 PL_curforce = where;
1981 if (PL_nextwhite) {
5db06880 1982 if (PL_madskills)
6b29d1f5 1983 curmad('^', newSVpvs(""));
cd81e915 1984 CURMAD('_', PL_nextwhite);
5db06880
NC
1985 }
1986}
1987
1988STATIC void
1989S_curmad(pTHX_ char slot, SV *sv)
1990{
1991 MADPROP **where;
1992
1993 if (!sv)
1994 return;
cd81e915
NC
1995 if (PL_curforce < 0)
1996 where = &PL_thismad;
5db06880 1997 else
cd81e915 1998 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1999
cd81e915 2000 if (PL_faketokens)
76f68e9b 2001 sv_setpvs(sv, "");
5db06880
NC
2002 else {
2003 if (!IN_BYTES) {
2004 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
2005 SvUTF8_on(sv);
2006 else if (PL_encoding) {
2007 sv_recode_to_utf8(sv, PL_encoding);
2008 }
2009 }
2010 }
2011
2012 /* keep a slot open for the head of the list? */
2013 if (slot != '_' && *where && (*where)->mad_key == '^') {
2014 (*where)->mad_key = slot;
daba3364 2015 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
2016 (*where)->mad_val = (void*)sv;
2017 }
2018 else
2019 addmad(newMADsv(slot, sv), where, 0);
2020}
2021#else
b3f24c00
MHM
2022# define start_force(where) NOOP
2023# define curmad(slot, sv) NOOP
5db06880
NC
2024#endif
2025
ffb4593c
NT
2026/*
2027 * S_force_next
9cbb5ea2 2028 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2029 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2030 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
2031 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
2032 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
2033 */
2034
4e553d73 2035STATIC void
cea2e8a9 2036S_force_next(pTHX_ I32 type)
79072805 2037{
97aff369 2038 dVAR;
704d4215
GG
2039#ifdef DEBUGGING
2040 if (DEBUG_T_TEST) {
2041 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2042 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2043 }
2044#endif
5db06880 2045#ifdef PERL_MAD
cd81e915 2046 if (PL_curforce < 0)
5db06880 2047 start_force(PL_lasttoke);
cd81e915 2048 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2049 if (PL_lex_state != LEX_KNOWNEXT)
2050 PL_lex_defer = PL_lex_state;
2051 PL_lex_state = LEX_KNOWNEXT;
2052 PL_lex_expect = PL_expect;
cd81e915 2053 PL_curforce = -1;
5db06880 2054#else
3280af22
NIS
2055 PL_nexttype[PL_nexttoke] = type;
2056 PL_nexttoke++;
2057 if (PL_lex_state != LEX_KNOWNEXT) {
2058 PL_lex_defer = PL_lex_state;
2059 PL_lex_expect = PL_expect;
2060 PL_lex_state = LEX_KNOWNEXT;
79072805 2061 }
5db06880 2062#endif
79072805
LW
2063}
2064
28ac2b49
Z
2065void
2066Perl_yyunlex(pTHX)
2067{
a7aaec61
Z
2068 int yyc = PL_parser->yychar;
2069 if (yyc != YYEMPTY) {
2070 if (yyc) {
2071 start_force(-1);
2072 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2073 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2074 PL_lex_allbrackets--;
a7aaec61 2075 PL_lex_brackets--;
78cdf107
Z
2076 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2077 } else if (yyc == '('/*)*/) {
2078 PL_lex_allbrackets--;
2079 yyc |= (2<<24);
a7aaec61
Z
2080 }
2081 force_next(yyc);
2082 }
28ac2b49
Z
2083 PL_parser->yychar = YYEMPTY;
2084 }
2085}
2086
d0a148a6 2087STATIC SV *
15f169a1 2088S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2089{
97aff369 2090 dVAR;
740cce10 2091 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2092 !IN_BYTES
2093 && UTF
2094 && !is_ascii_string((const U8*)start, len)
740cce10 2095 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2096 return sv;
2097}
2098
ffb4593c
NT
2099/*
2100 * S_force_word
2101 * When the lexer knows the next thing is a word (for instance, it has
2102 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2103 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2104 * lookahead.
ffb4593c
NT
2105 *
2106 * Arguments:
b1b65b59 2107 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2108 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2109 * int check_keyword : if true, Perl checks to make sure the word isn't
2110 * a keyword (do this if the word is a label, e.g. goto FOO)
2111 * int allow_pack : if true, : characters will also be allowed (require,
2112 * use, etc. do this)
9cbb5ea2 2113 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2114 */
2115
76e3520e 2116STATIC char *
345b3785 2117S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2118{
97aff369 2119 dVAR;
eb578fdb 2120 char *s;
463ee0b2 2121 STRLEN len;
4e553d73 2122
7918f24d
NC
2123 PERL_ARGS_ASSERT_FORCE_WORD;
2124
29595ff2 2125 start = SKIPSPACE1(start);
463ee0b2 2126 s = start;
7e2040f0 2127 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2128 (allow_pack && *s == ':') )
a0d0e21e 2129 {
3280af22 2130 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2131 if (check_keyword) {
2132 char *s2 = PL_tokenbuf;
2133 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2134 s2 += 6, len -= 6;
2135 if (keyword(s2, len, 0))
463ee0b2 2136 return start;
01b5ef50 2137 }
cd81e915 2138 start_force(PL_curforce);
5db06880
NC
2139 if (PL_madskills)
2140 curmad('X', newSVpvn(start,s-start));
463ee0b2 2141 if (token == METHOD) {
29595ff2 2142 s = SKIPSPACE1(s);
463ee0b2 2143 if (*s == '(')
3280af22 2144 PL_expect = XTERM;
463ee0b2 2145 else {
3280af22 2146 PL_expect = XOPERATOR;
463ee0b2 2147 }
79072805 2148 }
e74e6b3d 2149 if (PL_madskills)
63575281 2150 curmad('g', newSVpvs( "forced" ));
9ded7720 2151 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2152 = (OP*)newSVOP(OP_CONST,0,
2153 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2154 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2155 force_next(token);
2156 }
2157 return s;
2158}
2159
ffb4593c
NT
2160/*
2161 * S_force_ident
9cbb5ea2 2162 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2163 * text only contains the "foo" portion. The first argument is a pointer
2164 * to the "foo", and the second argument is the type symbol to prefix.
2165 * Forces the next token to be a "WORD".
9cbb5ea2 2166 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2167 */
2168
76e3520e 2169STATIC void
5aaab254 2170S_force_ident(pTHX_ const char *s, int kind)
79072805 2171{
97aff369 2172 dVAR;
7918f24d
NC
2173
2174 PERL_ARGS_ASSERT_FORCE_IDENT;
2175
c9b48522
DD
2176 if (s[0]) {
2177 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2178 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2179 UTF ? SVf_UTF8 : 0));
cd81e915 2180 start_force(PL_curforce);
9ded7720 2181 NEXTVAL_NEXTTOKE.opval = o;
79072805 2182 force_next(WORD);
748a9306 2183 if (kind) {
11343788 2184 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2185 /* XXX see note in pp_entereval() for why we forgo typo
2186 warnings if the symbol must be introduced in an eval.
2187 GSAR 96-10-12 */
90e5519e 2188 gv_fetchpvn_flags(s, len,
728847b1
BF
2189 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2190 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2191 kind == '$' ? SVt_PV :
2192 kind == '@' ? SVt_PVAV :
2193 kind == '%' ? SVt_PVHV :
a0d0e21e 2194 SVt_PVGV
90e5519e 2195 );
748a9306 2196 }
79072805
LW
2197 }
2198}
2199
3f33d153
FC
2200static void
2201S_force_ident_maybe_lex(pTHX_ char pit)
2202{
2203 start_force(PL_curforce);
2204 NEXTVAL_NEXTTOKE.ival = pit;
2205 force_next('p');
2206}
2207
1571675a
GS
2208NV
2209Perl_str_to_version(pTHX_ SV *sv)
2210{
2211 NV retval = 0.0;
2212 NV nshift = 1.0;
2213 STRLEN len;
cfd0369c 2214 const char *start = SvPV_const(sv,len);
9d4ba2ae 2215 const char * const end = start + len;
504618e9 2216 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2217
2218 PERL_ARGS_ASSERT_STR_TO_VERSION;
2219
1571675a 2220 while (start < end) {
ba210ebe 2221 STRLEN skip;
1571675a
GS
2222 UV n;
2223 if (utf)
9041c2e3 2224 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2225 else {
2226 n = *(U8*)start;
2227 skip = 1;
2228 }
2229 retval += ((NV)n)/nshift;
2230 start += skip;
2231 nshift *= 1000;
2232 }
2233 return retval;
2234}
2235
4e553d73 2236/*
ffb4593c
NT
2237 * S_force_version
2238 * Forces the next token to be a version number.
e759cc13
RGS
2239 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2240 * and if "guessing" is TRUE, then no new token is created (and the caller
2241 * must use an alternative parsing method).
ffb4593c
NT
2242 */
2243
76e3520e 2244STATIC char *
e759cc13 2245S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2246{
97aff369 2247 dVAR;
5f66b61c 2248 OP *version = NULL;
44dcb63b 2249 char *d;
5db06880
NC
2250#ifdef PERL_MAD
2251 I32 startoff = s - SvPVX(PL_linestr);
2252#endif
89bfa8cd 2253
7918f24d
NC
2254 PERL_ARGS_ASSERT_FORCE_VERSION;
2255
29595ff2 2256 s = SKIPSPACE1(s);
89bfa8cd 2257
44dcb63b 2258 d = s;
dd629d5b 2259 if (*d == 'v')
44dcb63b 2260 d++;
44dcb63b 2261 if (isDIGIT(*d)) {
e759cc13
RGS
2262 while (isDIGIT(*d) || *d == '_' || *d == '.')
2263 d++;
5db06880
NC
2264#ifdef PERL_MAD
2265 if (PL_madskills) {
cd81e915 2266 start_force(PL_curforce);
5db06880
NC
2267 curmad('X', newSVpvn(s,d-s));
2268 }
2269#endif
4e4da3ac 2270 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2271 SV *ver;
8d08d9ba 2272#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2273 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2274 setlocale(LC_NUMERIC, "C");
8d08d9ba 2275#endif
6154021b 2276 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2277#ifdef USE_LOCALE_NUMERIC
2278 setlocale(LC_NUMERIC, loc);
909d3787 2279 Safefree(loc);
8d08d9ba 2280#endif
6154021b 2281 version = pl_yylval.opval;
dd629d5b
GS
2282 ver = cSVOPx(version)->op_sv;
2283 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2284 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2285 SvNV_set(ver, str_to_version(ver));
1571675a 2286 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2287 }
89bfa8cd 2288 }
5db06880
NC
2289 else if (guessing) {
2290#ifdef PERL_MAD
2291 if (PL_madskills) {
cd81e915
NC
2292 sv_free(PL_nextwhite); /* let next token collect whitespace */
2293 PL_nextwhite = 0;
5db06880
NC
2294 s = SvPVX(PL_linestr) + startoff;
2295 }
2296#endif
e759cc13 2297 return s;
5db06880 2298 }
89bfa8cd
PP
2299 }
2300
5db06880
NC
2301#ifdef PERL_MAD
2302 if (PL_madskills && !version) {
cd81e915
NC
2303 sv_free(PL_nextwhite); /* let next token collect whitespace */
2304 PL_nextwhite = 0;
5db06880
NC
2305 s = SvPVX(PL_linestr) + startoff;
2306 }
2307#endif
89bfa8cd 2308 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2309 start_force(PL_curforce);
9ded7720 2310 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2311 force_next(WORD);
89bfa8cd 2312
e759cc13 2313 return s;
89bfa8cd
PP
2314}
2315
ffb4593c 2316/*
91152fc1
DG
2317 * S_force_strict_version
2318 * Forces the next token to be a version number using strict syntax rules.
2319 */
2320
2321STATIC char *
2322S_force_strict_version(pTHX_ char *s)
2323{
2324 dVAR;
2325 OP *version = NULL;
2326#ifdef PERL_MAD
2327 I32 startoff = s - SvPVX(PL_linestr);
2328#endif
2329 const char *errstr = NULL;
2330
2331 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2332
2333 while (isSPACE(*s)) /* leading whitespace */
2334 s++;
2335
2336 if (is_STRICT_VERSION(s,&errstr)) {
2337 SV *ver = newSV(0);
2338 s = (char *)scan_version(s, ver, 0);
2339 version = newSVOP(OP_CONST, 0, ver);
2340 }
4e4da3ac
Z
2341 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2342 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2343 {
91152fc1
DG
2344 PL_bufptr = s;
2345 if (errstr)
2346 yyerror(errstr); /* version required */
2347 return s;
2348 }
2349
2350#ifdef PERL_MAD
2351 if (PL_madskills && !version) {
2352 sv_free(PL_nextwhite); /* let next token collect whitespace */
2353 PL_nextwhite = 0;
2354 s = SvPVX(PL_linestr) + startoff;
2355 }
2356#endif
2357 /* NOTE: The parser sees the package name and the VERSION swapped */
2358 start_force(PL_curforce);
2359 NEXTVAL_NEXTTOKE.opval = version;
2360 force_next(WORD);
2361
2362 return s;
2363}
2364
2365/*
ffb4593c
NT
2366 * S_tokeq
2367 * Tokenize a quoted string passed in as an SV. It finds the next
2368 * chunk, up to end of string or a backslash. It may make a new
2369 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2370 * turns \\ into \.
2371 */
2372
76e3520e 2373STATIC SV *
cea2e8a9 2374S_tokeq(pTHX_ SV *sv)
79072805 2375{
97aff369 2376 dVAR;
eb578fdb
KW
2377 char *s;
2378 char *send;
2379 char *d;
b3ac6de7
IZ
2380 STRLEN len = 0;
2381 SV *pv = sv;
79072805 2382
7918f24d
NC
2383 PERL_ARGS_ASSERT_TOKEQ;
2384
79072805 2385 if (!SvLEN(sv))
b3ac6de7 2386 goto finish;
79072805 2387
a0d0e21e 2388 s = SvPV_force(sv, len);
21a311ee 2389 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2390 goto finish;
463ee0b2 2391 send = s + len;
dcb21ed6
NC
2392 /* This is relying on the SV being "well formed" with a trailing '\0' */
2393 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2394 s++;
2395 if (s == send)
b3ac6de7 2396 goto finish;
79072805 2397 d = s;
be4731d2 2398 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2399 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2400 }
79072805
LW
2401 while (s < send) {
2402 if (*s == '\\') {
a0d0e21e 2403 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2404 s++; /* all that, just for this */
2405 }
2406 *d++ = *s++;
2407 }
2408 *d = '\0';
95a20fc0 2409 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2410 finish:
3280af22 2411 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2412 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2413 return sv;
2414}
2415
ffb4593c
NT
2416/*
2417 * Now come three functions related to double-quote context,
2418 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2419 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2420 * interact with PL_lex_state, and create fake ( ... ) argument lists
2421 * to handle functions and concatenation.
ecd24171
DM
2422 * For example,
2423 * "foo\lbar"
2424 * is tokenised as
2425 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2426 */
2427
2428/*
2429 * S_sublex_start
6154021b 2430 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2431 *
2432 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2433 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2434 *
2435 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2436 *
2437 * Everything else becomes a FUNC.
2438 *
2439 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2440 * had an OP_CONST or OP_READLINE). This just sets us up for a
2441 * call to S_sublex_push().
2442 */
2443
76e3520e 2444STATIC I32
cea2e8a9 2445S_sublex_start(pTHX)
79072805 2446{
97aff369 2447 dVAR;
eb578fdb 2448 const I32 op_type = pl_yylval.ival;
79072805
LW
2449
2450 if (op_type == OP_NULL) {
6154021b 2451 pl_yylval.opval = PL_lex_op;
5f66b61c 2452 PL_lex_op = NULL;
79072805
LW
2453 return THING;
2454 }
2455 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2456 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2457
2458 if (SvTYPE(sv) == SVt_PVIV) {
2459 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2460 STRLEN len;
96a5add6 2461 const char * const p = SvPV_const(sv, len);
740cce10 2462 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2463 SvREFCNT_dec(sv);
2464 sv = nsv;
4e553d73 2465 }
6154021b 2466 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2467 PL_lex_stuff = NULL;
6f33ba73
RGS
2468 /* Allow <FH> // "foo" */
2469 if (op_type == OP_READLINE)
2470 PL_expect = XTERMORDORDOR;
79072805
LW
2471 return THING;
2472 }
e3f73d4e
RGS
2473 else if (op_type == OP_BACKTICK && PL_lex_op) {
2474 /* readpipe() vas overriden */
2475 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2476 pl_yylval.opval = PL_lex_op;
9b201d7d 2477 PL_lex_op = NULL;
e3f73d4e
RGS
2478 PL_lex_stuff = NULL;
2479 return THING;
2480 }
79072805 2481
3280af22 2482 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2483 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2484 PL_sublex_info.sub_op = PL_lex_op;
2485 PL_lex_state = LEX_INTERPPUSH;
55497cff 2486
3280af22
NIS
2487 PL_expect = XTERM;
2488 if (PL_lex_op) {
6154021b 2489 pl_yylval.opval = PL_lex_op;
5f66b61c 2490 PL_lex_op = NULL;
55497cff
PP
2491 return PMFUNC;
2492 }
2493 else
2494 return FUNC;
2495}
2496
ffb4593c
NT
2497/*
2498 * S_sublex_push
2499 * Create a new scope to save the lexing state. The scope will be
2500 * ended in S_sublex_done. Returns a '(', starting the function arguments
2501 * to the uc, lc, etc. found before.
2502 * Sets PL_lex_state to LEX_INTERPCONCAT.
2503 */
2504
76e3520e 2505STATIC I32
cea2e8a9 2506S_sublex_push(pTHX)
55497cff 2507{
27da23d5 2508 dVAR;
78a635de 2509 LEXSHARED *shared;
f46d017c 2510 ENTER;
55497cff 2511
3280af22 2512 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2513 SAVEBOOL(PL_lex_dojoin);
3280af22 2514 SAVEI32(PL_lex_brackets);
78cdf107 2515 SAVEI32(PL_lex_allbrackets);
b27dce25 2516 SAVEI32(PL_lex_formbrack);
78cdf107 2517 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2518 SAVEI32(PL_lex_casemods);
2519 SAVEI32(PL_lex_starts);
651b5b28 2520 SAVEI8(PL_lex_state);
7cc34111 2521 SAVESPTR(PL_lex_repl);
7766f137 2522 SAVEVPTR(PL_lex_inpat);
98246f1e 2523 SAVEI16(PL_lex_inwhat);
57843af0 2524 SAVECOPLINE(PL_curcop);
3280af22 2525 SAVEPPTR(PL_bufptr);
8452ff4b 2526 SAVEPPTR(PL_bufend);
3280af22
NIS
2527 SAVEPPTR(PL_oldbufptr);
2528 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2529 SAVEPPTR(PL_last_lop);
2530 SAVEPPTR(PL_last_uni);
3280af22
NIS
2531 SAVEPPTR(PL_linestart);
2532 SAVESPTR(PL_linestr);
8edd5f42
RGS
2533 SAVEGENERICPV(PL_lex_brackstack);
2534 SAVEGENERICPV(PL_lex_casestack);
78a635de 2535 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2536 SAVEBOOL(PL_parser->lex_re_reparsing);
3280af22 2537
99bd9d90 2538 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2539 scopes to find the body of the here-doc. So we put PL_linestr and
2540 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2541 */
60f40a38
FC
2542 PL_parser->lex_shared->ls_linestr = PL_linestr;
2543 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2544
3280af22 2545 PL_linestr = PL_lex_stuff;
7cc34111 2546 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2547 PL_lex_stuff = NULL;
7cc34111 2548 PL_sublex_info.repl = NULL;
3280af22 2549
9cbb5ea2
GS
2550 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2551 = SvPVX(PL_linestr);
3280af22 2552 PL_bufend += SvCUR(PL_linestr);
bd61b366 2553 PL_last_lop = PL_last_uni = NULL;
3280af22 2554 SAVEFREESV(PL_linestr);
4dc843bc 2555 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2556
2557 PL_lex_dojoin = FALSE;
b27dce25 2558 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2559 PL_lex_allbrackets = 0;
2560 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2561 Newx(PL_lex_brackstack, 120, char);
2562 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2563 PL_lex_casemods = 0;
2564 *PL_lex_casestack = '\0';
2565 PL_lex_starts = 0;
2566 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2567 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
78a635de
FC
2568
2569 Newxz(shared, 1, LEXSHARED);
2570 shared->ls_prev = PL_parser->lex_shared;
2571 PL_parser->lex_shared = shared;
3280af22
NIS
2572
2573 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2574 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2575 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2576 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2577 else
5f66b61c 2578 PL_lex_inpat = NULL;
79072805 2579
3a54fd60
DM
2580 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2581 PL_in_eval &= ~EVAL_RE_REPARSING;
2582
55497cff 2583 return '(';
79072805
LW
2584}
2585
ffb4593c
NT
2586/*
2587 * S_sublex_done
2588 * Restores lexer state after a S_sublex_push.
2589 */
2590
76e3520e 2591STATIC I32
cea2e8a9 2592S_sublex_done(pTHX)
79072805 2593{
27da23d5 2594 dVAR;
3280af22 2595 if (!PL_lex_starts++) {
396482e1 2596 SV * const sv = newSVpvs("");
9aa983d2
JH
2597 if (SvUTF8(PL_linestr))
2598 SvUTF8_on(sv);
3280af22 2599 PL_expect = XOPERATOR;
6154021b 2600 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2601 return THING;
2602 }
2603
3280af22
NIS
2604 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2605 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2606 return yylex();
79072805
LW
2607 }
2608
ffb4593c 2609 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2610 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2611 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2612 PL_linestr = PL_lex_repl;
2613 PL_lex_inpat = 0;
2614 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2615 PL_bufend += SvCUR(PL_linestr);
bd61b366 2616 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2617 PL_lex_dojoin = FALSE;
2618 PL_lex_brackets = 0;
78cdf107
Z
2619 PL_lex_allbrackets = 0;
2620 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2621 PL_lex_casemods = 0;
2622 *PL_lex_casestack = '\0';
2623 PL_lex_starts = 0;
25da4f38 2624 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2625 PL_lex_state = LEX_INTERPNORMAL;
2626 PL_lex_starts++;
e9fa98b2
HS
2627 /* we don't clear PL_lex_repl here, so that we can check later
2628 whether this is an evalled subst; that means we rely on the
2629 logic to ensure sublex_done() is called again only via the
2630 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2631 }
e9fa98b2 2632 else {
3280af22 2633 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2634 PL_lex_repl = NULL;
e9fa98b2 2635 }
79072805 2636 return ',';
ffed7fef
LW
2637 }
2638 else {
5db06880
NC
2639#ifdef PERL_MAD
2640 if (PL_madskills) {
cd81e915
NC
2641 if (PL_thiswhite) {
2642 if (!PL_endwhite)
6b29d1f5 2643 PL_endwhite = newSVpvs("");
cd81e915
NC
2644 sv_catsv(PL_endwhite, PL_thiswhite);
2645 PL_thiswhite = 0;
2646 }
2647 if (PL_thistoken)
76f68e9b 2648 sv_setpvs(PL_thistoken,"");
5db06880 2649 else
cd81e915 2650 PL_realtokenstart = -1;
5db06880
NC
2651 }
2652#endif
f46d017c 2653 LEAVE;
3280af22
NIS
2654 PL_bufend = SvPVX(PL_linestr);
2655 PL_bufend += SvCUR(PL_linestr);
2656 PL_expect = XOPERATOR;
09bef843 2657 PL_sublex_info.sub_inwhat = 0;
79072805 2658 return ')';
ffed7fef
LW
2659 }
2660}
2661
6f613c73
KW
2662PERL_STATIC_INLINE SV*
2663S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2664{
140b12ad
KW
2665 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2666 * interior, hence to the "}". Finds what the name resolves to, returning
2667 * an SV* containing it; NULL if no valid one found */
2668
dd2b1b72 2669 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2670
0c415a79
KW
2671 HV * table;
2672 SV **cvp;
2673 SV *cv;
2674 SV *rv;
2675 HV *stash;
2676 const U8* first_bad_char_loc;
2677 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2678
6f613c73
KW
2679 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2680
107160e2
KW
2681 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2682 e - backslash_ptr,
2683 &first_bad_char_loc))
2684 {
2685 /* If warnings are on, this will print a more detailed analysis of what
2686 * is wrong than the error message below */
2687 utf8n_to_uvuni(first_bad_char_loc,
2688 e - ((char *) first_bad_char_loc),
2689 NULL, 0);
2690
2691 /* We deliberately don't try to print the malformed character, which
2692 * might not print very well; it also may be just the first of many
2693 * malformations, so don't print what comes after it */
2694 yyerror(Perl_form(aTHX_
2695 "Malformed UTF-8 character immediately after '%.*s'",
2696 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2697 return NULL;
2698 }
2699
2700 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2701 /* include the <}> */
2702 e - backslash_ptr + 1);
6f613c73 2703 if (! SvPOK(res)) {
b6407c49 2704 SvREFCNT_dec_NN(res);
6f613c73
KW
2705 return NULL;
2706 }
2707
0c415a79
KW
2708 /* See if the charnames handler is the Perl core's, and if so, we can skip
2709 * the validation needed for a user-supplied one, as Perl's does its own
2710 * validation. */
2711 table = GvHV(PL_hintgv); /* ^H */
2712 cvp = hv_fetchs(table, "charnames", FALSE);
67a057d6
FC
2713 if (cvp && (cv = *cvp) && SvROK(cv) && ((rv = SvRV(cv)) != NULL)
2714 && SvTYPE(rv) == SVt_PVCV && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2715 {
2716 const char * const name = HvNAME(stash);
2717 if strEQ(name, "_charnames") {
2718 return res;
2719 }
2720 }
2721
bde9e88d
KW
2722 /* Here, it isn't Perl's charname handler. We can't rely on a
2723 * user-supplied handler to validate the input name. For non-ut8 input,
2724 * look to see that the first character is legal. Then loop through the
2725 * rest checking that each is a continuation */
6f613c73 2726
b6ba1137
KW
2727 /* This code needs to be sync'ed with a regex in _charnames.pm which does
2728 * the same thing */
2729
b6ba1137 2730 if (! UTF) {
bde9e88d 2731 if (! isALPHAU(*s)) {
b6ba1137
KW
2732 goto bad_charname;
2733 }
bde9e88d
KW
2734 s++;
2735 while (s < e) {
2736 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2737 goto bad_charname;
2738 }
3036c853 2739 if (*s == ' ' && *(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2740 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2741 "A sequence of multiple spaces in a charnames "
2742 "alias definition is deprecated");
bd299e29 2743 }
bde9e88d 2744 s++;
b6ba1137 2745 }
3036c853 2746 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2747 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2748 "Trailing white-space in a charnames alias "
2749 "definition is deprecated");
bd299e29 2750 }
b6ba1137
KW
2751 }
2752 else {
bde9e88d
KW
2753 /* Similarly for utf8. For invariants can check directly; for other
2754 * Latin1, can calculate their code point and check; otherwise use a
2755 * swash */
2756 if (UTF8_IS_INVARIANT(*s)) {
2757 if (! isALPHAU(*s)) {
140b12ad
KW
2758 goto bad_charname;
2759 }
bde9e88d
KW
2760 s++;
2761 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2762 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1))))) {
b6ba1137 2763 goto bad_charname;
6f613c73 2764 }
bde9e88d 2765 s += 2;
6f613c73 2766 }
bde9e88d
KW
2767 else {
2768 if (! PL_utf8_charname_begin) {
2769 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2770 PL_utf8_charname_begin = _core_swash_init("utf8",
2771 "_Perl_Charname_Begin",
2772 &PL_sv_undef,
2773 1, 0, NULL, &flags);
2774 }
2775 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2776 goto bad_charname;
2777 }
2778 s += UTF8SKIP(s);
2779 }
2780
2781 while (s < e) {
2782 if (UTF8_IS_INVARIANT(*s)) {
2783 if (! isCHARNAME_CONT(*s)) {
2784 goto bad_charname;
2785 }
3036c853
FC
2786 if (*s == ' ' && *(s-1) == ' '
2787 && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2788 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2789 "A sequence of multiple spaces in a charnam"
2790 "es alias definition is deprecated");
bd299e29 2791 }
bde9e88d
KW
2792 s++;
2793 }
2794 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2795 if (! isCHARNAME_CONT(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s,
2796 *(s+1)))))
2797 {
2798 goto bad_charname;
2799 }
2800 s += 2;
2801 }
2802 else {
2803 if (! PL_utf8_charname_continue) {
2804 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2805 PL_utf8_charname_continue = _core_swash_init("utf8",
2806 "_Perl_Charname_Continue",
2807 &PL_sv_undef,
2808 1, 0, NULL, &flags);
2809 }
2810 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2811 goto bad_charname;
2812 }
2813 s += UTF8SKIP(s);
6f613c73
KW
2814 }
2815 }
3036c853 2816 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2817 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2818 "Trailing white-space in a charnames alias "
2819 "definition is deprecated");
bd299e29 2820 }
6f613c73
KW
2821 }
2822
94ca1619 2823 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2824 const U8* first_bad_char_loc;
2825 STRLEN len;
2826 const char* const str = SvPV_const(res, len);
2827 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2828 /* If warnings are on, this will print a more detailed analysis of
2829 * what is wrong than the error message below */
2830 utf8n_to_uvuni(first_bad_char_loc,
2831 (char *) first_bad_char_loc - str,
2832 NULL, 0);
2833
2834 /* We deliberately don't try to print the malformed character,
2835 * which might not print very well; it also may be just the first
2836 * of many malformations, so don't print what comes after it */
2837 yyerror_pv(
2838 Perl_form(aTHX_
2839 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2840 (int) (e - backslash_ptr + 1), backslash_ptr,
2841 (int) ((char *) first_bad_char_loc - str), str
2842 ),
2843 SVf_UTF8);
2844 return NULL;
2845 }
2846 }
140b12ad 2847
bde9e88d 2848 return res;
140b12ad 2849
bde9e88d
KW
2850 bad_charname: {
2851 int bad_char_size = ((UTF) ? UTF8SKIP(s) : 1);
2852
2853 /* The final %.*s makes sure that should the trailing NUL be missing
2854 * that this print won't run off the end of the string */
2855 yyerror_pv(
2856 Perl_form(aTHX_
2857 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2858 (int)(s - backslash_ptr + bad_char_size), backslash_ptr,
2859 (int)(e - s + bad_char_size), s + bad_char_size
2860 ),
2861 UTF ? SVf_UTF8 : 0);
2862 return NULL;
2863 }
6f613c73
KW
2864}
2865
02aa26ce
NT
2866/*
2867 scan_const
2868
9da1dd8f
DM
2869 Extracts the next constant part of a pattern, double-quoted string,
2870 or transliteration. This is terrifying code.
2871
2872 For example, in parsing the double-quoted string "ab\x63$d", it would
2873 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2874
94def140 2875 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2876 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2877 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2878
94def140
ST
2879 Returns a pointer to the character scanned up to. If this is
2880 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2881 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2882 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2883 by looking at the next characters herself.
2884
02aa26ce 2885 In patterns:
9da1dd8f 2886 expand:
537124e4
KW
2887 \N{FOO} => \N{U+hex_for_character_FOO}
2888 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2889
2890 pass through:
2891 all other \-char, including \N and \N{ apart from \N{ABC}
2892
2893 stops on:
2894 @ and $ where it appears to be a var, but not for $ as tail anchor
2895 \l \L \u \U \Q \E
2896 (?{ or (??{
2897
02aa26ce
NT
2898
2899 In transliterations:
2900 characters are VERY literal, except for - not at the start or end
94def140
ST
2901 of the string, which indicates a range. If the range is in bytes,
2902 scan_const expands the range to the full set of intermediate
2903 characters. If the range is in utf8, the hyphen is replaced with
2904 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2905
2906 In double-quoted strings:
2907 backslashes:
2908 double-quoted style: \r and \n
ff3f963a 2909 constants: \x31, etc.
94def140 2910 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2911 case and quoting: \U \Q \E
2912 stops on @ and $
2913
2914 scan_const does *not* construct ops to handle interpolated strings.
2915 It stops processing as soon as it finds an embedded $ or @ variable
2916 and leaves it to the caller to work out what's going on.
2917
94def140
ST
2918 embedded arrays (whether in pattern or not) could be:
2919 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2920
2921 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2922
2923 $ in pattern could be $foo or could be tail anchor. Assumption:
2924 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2925 followed by one of "()| \r\n\t"
02aa26ce 2926
9da1dd8f 2927 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2928
2929 The structure of the code is
2930 while (there's a character to process) {
94def140
ST
2931 handle transliteration ranges
2932 skip regexp comments /(?#comment)/ and codes /(?{code})/
2933 skip #-initiated comments in //x patterns
2934 check for embedded arrays
02aa26ce
NT
2935 check for embedded scalars
2936 if (backslash) {
94def140 2937 deprecate \1 in substitution replacements
02aa26ce
NT
2938 handle string-changing backslashes \l \U \Q \E, etc.
2939 switch (what was escaped) {
94def140 2940 handle \- in a transliteration (becomes a literal -)
ff3f963a 2941 if a pattern and not \N{, go treat as regular character
94def140
ST
2942 handle \132 (octal characters)
2943 handle \x15 and \x{1234} (hex characters)
ff3f963a 2944 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2945 handle \cV (control characters)
2946 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2947 } (end switch)
77a135fe 2948 continue
02aa26ce 2949 } (end if backslash)
77a135fe 2950 handle regular character
02aa26ce 2951 } (end while character to read)
4e553d73 2952
02aa26ce
NT
2953*/
2954
76e3520e 2955STATIC char *
cea2e8a9 2956S_scan_const(pTHX_ char *start)
79072805 2957{
97aff369 2958 dVAR;
eb578fdb 2959 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2960 SV *sv = newSV(send - start); /* sv for the constant. See
2961 note below on sizing. */
eb578fdb
KW
2962 char *s = start; /* start of the constant */
2963 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2964 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2965 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2966 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2967 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2968 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2969 to be UTF8? But, this can
2970 show as true when the source
2971 isn't utf8, as for example
2972 when it is entirely composed
2973 of hex constants */
6f613c73 2974 SV *res; /* result from charnames */
77a135fe
KW
2975
2976 /* Note on sizing: The scanned constant is placed into sv, which is
2977 * initialized by newSV() assuming one byte of output for every byte of
2978 * input. This routine expects newSV() to allocate an extra byte for a
2979 * trailing NUL, which this routine will append if it gets to the end of
2980 * the input. There may be more bytes of input than output (eg., \N{LATIN
2981 * CAPITAL LETTER A}), or more output than input if the constant ends up
2982 * recoded to utf8, but each time a construct is found that might increase
2983 * the needed size, SvGROW() is called. Its size parameter each time is
2984 * based on the best guess estimate at the time, namely the length used so
2985 * far, plus the length the current construct will occupy, plus room for
2986 * the trailing NUL, plus one byte for every input byte still unscanned */
2987
c3320c2a
KW
2988 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2989 before set */
4c3a8340
ST
2990#ifdef EBCDIC
2991 UV literal_endpoint = 0;
e294cc5d 2992 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2993#endif
012bcf8d 2994
7918f24d
NC
2995 PERL_ARGS_ASSERT_SCAN_CONST;
2996
bb16bae8 2997 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2998 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2999 /* If we are doing a trans and we know we want UTF8 set expectation */
3000 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
3001 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3002 }
3003
b899e89d
FC
3004 /* Protect sv from errors and fatal warnings. */
3005 ENTER_with_name("scan_const");
3006 SAVEFREESV(sv);
2b9d42f0 3007
79072805 3008 while (s < send || dorange) {
ff3f963a 3009
02aa26ce 3010 /* get transliterations out of the way (they're most literal) */
3280af22 3011 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3012 /* expand a range A-Z to the full set of characters. AIE! */
79072805 3013 if (dorange) {
1ba5c669
JH
3014 I32 i; /* current expanded character */
3015 I32 min; /* first character in range */
3016 I32 max; /* last character in range */
02aa26ce 3017
e294cc5d
JH
3018#ifdef EBCDIC
3019 UV uvmax = 0;
3020#endif
3021
3022 if (has_utf8
3023#ifdef EBCDIC
3024 && !native_range
3025#endif
1953db30 3026 ) {
9d4ba2ae 3027 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
3028 char *e = d++;
3029 while (e-- > c)
3030 *(e + 1) = *e;
25716404 3031 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
3032 /* mark the range as done, and continue */
3033 dorange = FALSE;
3034 didrange = TRUE;
3035 continue;
3036 }
2b9d42f0 3037
95a20fc0 3038 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
3039#ifdef EBCDIC
3040 SvGROW(sv,
3041 SvLEN(sv) + (has_utf8 ?
3042 (512 - UTF_CONTINUATION_MARK +
3043 UNISKIP(0x100))
3044 : 256));
3045 /* How many two-byte within 0..255: 128 in UTF-8,
3046 * 96 in UTF-8-mod. */
3047#else
9cbb5ea2 3048 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 3049#endif
9cbb5ea2 3050 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
3051#ifdef EBCDIC
3052 if (has_utf8) {
3053 int j;
3054 for (j = 0; j <= 1; j++) {
3055 char * const c = (char*)utf8_hop((U8*)d, -1);
3056 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
3057 if (j)
3058 min = (U8)uv;
3059 else if (uv < 256)
3060 max = (U8)uv;
3061 else {
3062 max = (U8)0xff; /* only to \xff */
3063 uvmax = uv; /* \x{100} to uvmax */
3064 }
3065 d = c; /* eat endpoint chars */
3066 }
3067 }
3068 else {
3069#endif
3070 d -= 2; /* eat the first char and the - */
3071 min = (U8)*d; /* first char in range */
3072 max = (U8)d[1]; /* last char in range */
3073#ifdef EBCDIC
3074 }
3075#endif
8ada0baa 3076
c2e66d9e 3077 if (min > max) {
01ec43d0 3078 Perl_croak(aTHX_
d1573ac7 3079 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 3080 (char)min, (char)max);
c2e66d9e
GS
3081 }
3082
c7f1f016 3083#ifdef EBCDIC
4c3a8340
ST
3084 if (literal_endpoint == 2 &&
3085 ((isLOWER(min) && isLOWER(max)) ||
3086 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
3087 if (isLOWER(min)) {
3088 for (i = min; i <= max; i++)
3089 if (isLOWER(i))
db42d148 3090 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
3091 } else {
3092 for (i = min; i <= max; i++)
3093 if (isUPPER(i))
db42d148 3094 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
3095 }
3096 }
3097 else
3098#endif
3099 for (i = min; i <= max; i++)
e294cc5d
JH
3100#ifdef EBCDIC
3101 if (has_utf8) {
3102 const U8 ch = (U8)NATIVE_TO_UTF(i);
3103 if (UNI_IS_INVARIANT(ch))
3104 *d++ = (U8)i;
3105 else {
3106 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
3107 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
3108 }
3109 }
3110 else
3111#endif
3112 *d++ = (char)i;
3113
3114#ifdef EBCDIC
3115 if (uvmax) {
3116 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
3117 if (uvmax > 0x101)
3118 *d++ = (char)UTF_TO_NATIVE(0xff);
3119 if (uvmax > 0x100)
3120 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
3121 }
3122#endif
02aa26ce
NT
3123
3124 /* mark the range as done, and continue */
79072805 3125 dorange = FALSE;
01ec43d0 3126 didrange = TRUE;
4c3a8340
ST
3127#ifdef EBCDIC
3128 literal_endpoint = 0;
3129#endif
79072805 3130 continue;
4e553d73 3131 }
02aa26ce
NT
3132
3133 /* range begins (ignore - as first or last char) */
79072805 3134 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 3135 if (didrange) {
1fafa243 3136 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 3137 }
e294cc5d
JH
3138 if (has_utf8
3139#ifdef EBCDIC
3140 && !native_range
3141#endif
3142 ) {
25716404 3143 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
3144 s++;
3145 continue;
3146 }
79072805
LW
3147 dorange = TRUE;
3148 s++;
01ec43d0
GS
3149 }
3150 else {
3151 didrange = FALSE;
4c3a8340
ST
3152#ifdef EBCDIC
3153 literal_endpoint = 0;
e294cc5d 3154 native_range = TRUE;
4c3a8340 3155#endif
01ec43d0 3156 }
79072805 3157 }
02aa26ce
NT
3158
3159 /* if we get here, we're not doing a transliteration */
3160
e4a2df84
DM
3161 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3162 char *s1 = s-1;
3163 int esc = 0;
3164 while (s1 >= start && *s1-- == '\\')
3165 esc = !esc;
3166 if (!esc)
3167 in_charclass = TRUE;
3168 }
2866decb 3169
e4a2df84
DM
3170 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3171 char *s1 = s-1;
3172 int esc = 0;
3173 while (s1 >= start && *s1-- == '\\')
3174 esc = !esc;
3175 if (!esc)
3176 in_charclass = FALSE;
3177 }
2866decb 3178
9da1dd8f
DM
3179 /* skip for regexp comments /(?#comment)/, except for the last
3180 * char, which will be done separately.
3181 * Stop on (?{..}) and friends */
3182
3280af22 3183 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 3184 if (s[2] == '#') {
e994fd66 3185 while (s+1 < send && *s != ')')
db42d148 3186 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 3187 }
2866decb 3188 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
3189 ( s[2] == '{' /* This should match regcomp.c */
3190 || (s[2] == '?' && s[3] == '{')))
155aba94 3191 {
9da1dd8f 3192 break;
cc6b7395 3193 }
748a9306 3194 }
02aa26ce
NT
3195
3196 /* likewise skip #-initiated comments in //x patterns */
3280af22 3197 else if (*s == '#' && PL_lex_inpat &&
73134a2e 3198 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3199 while (s+1 < send && *s != '\n')
db42d148 3200 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 3201 }
02aa26ce 3202
9da1dd8f
DM
3203 /* no further processing of single-quoted regex */
3204 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3205 goto default_action;
3206
5d1d4326 3207 /* check for embedded arrays
da6eedaa 3208 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3209 */
1749ea0d 3210 else if (*s == '@' && s[1]) {
8a2bca12 3211 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3212 break;
3213 if (strchr(":'{$", s[1]))
3214 break;
3215 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3216 break; /* in regexp, neither @+ nor @- are interpolated */
3217 }
02aa26ce
NT
3218
3219 /* check for embedded scalars. only stop if we're sure it's a
3220 variable.
3221 */
79072805 3222 else if (*s == '$') {
3280af22 3223 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3224 break;
77772344 3225 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3226 if (s[1] == '\\') {
3227 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3228 "Possible unintended interpolation of $\\ in regex");
77772344 3229 }
79072805 3230 break; /* in regexp, $ might be tail anchor */
77772344 3231 }
79072805 3232 }
02aa26ce 3233
2b9d42f0
NIS
3234 /* End of else if chain - OP_TRANS rejoin rest */
3235
02aa26ce 3236 /* backslashes */
79072805 3237 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3238 char* e; /* Can be used for ending '}', etc. */
3239
79072805 3240 s++;
02aa26ce 3241
7d0fc23c
KW
3242 /* warn on \1 - \9 in substitution replacements, but note that \11
3243 * is an octal; and \19 is \1 followed by '9' */
3280af22 3244 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3245 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3246 {
a2a5de95 3247 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3248 *--s = '$';
3249 break;
3250 }
02aa26ce
NT
3251
3252 /* string-change backslash escapes */
838f2281 3253 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3254 --s;
3255 break;
3256 }
ff3f963a
KW
3257 /* In a pattern, process \N, but skip any other backslash escapes.
3258 * This is because we don't want to translate an escape sequence
3259 * into a meta symbol and have the regex compiler use the meta
3260 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3261 * in spite of this, we do have to process \N here while the proper
3262 * charnames handler is in scope. See bugs #56444 and #62056.
3263 * There is a complication because \N in a pattern may also stand
3264 * for 'match a non-nl', and not mean a charname, in which case its
3265 * processing should be deferred to the regex compiler. To be a
3266 * charname it must be followed immediately by a '{', and not look
3267 * like \N followed by a curly quantifier, i.e., not something like
3268 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3269 * quantifier */
3270 else if (PL_lex_inpat
3271 && (*s != 'N'
3272 || s[1] != '{'
4d68ffa0 3273 || regcurly(s + 1, FALSE)))
ff3f963a 3274 {
cc74c5bd
ST
3275 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3276 goto default_action;
3277 }
02aa26ce 3278
79072805 3279 switch (*s) {
02aa26ce
NT
3280
3281 /* quoted - in transliterations */
79072805 3282 case '-':
3280af22 3283 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3284 *d++ = *s++;
3285 continue;
3286 }
3287 /* FALL THROUGH */
3288 default:
11b8faa4 3289 {
15861f94 3290 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3291 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3292 "Unrecognized escape \\%c passed through",
3293 *s);
11b8faa4 3294 /* default action is to copy the quoted character */
f9a63242 3295 goto default_action;
11b8faa4 3296 }
02aa26ce 3297
632403cc 3298 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3299 case '0': case '1': case '2': case '3':
3300 case '4': case '5': case '6': case '7':
ba210ebe 3301 {
5e0a247b 3302 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3303 STRLEN len = 3;
77a135fe 3304 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe 3305 s += len;
5e0a247b
KW
3306 if (len < 3 && s < send && isDIGIT(*s)
3307 && ckWARN(WARN_MISC))
3308 {
3309 Perl_warner(aTHX_ packWARN(WARN_MISC),
3310 "%s", form_short_octal_warning(s, len));
3311 }
ba210ebe 3312 }
012bcf8d 3313 goto NUM_ESCAPE_INSERT;
02aa26ce 3314
f0a2b745
KW
3315 /* eg. \o{24} indicates the octal constant \024 */
3316 case 'o':
3317 {
454155d9 3318 const char* error;
f0a2b745 3319
00ce5563 3320 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3321 TRUE, /* Output warning */
3322 FALSE, /* Not strict */
17896a85
KW
3323 TRUE, /* Output warnings for
3324 non-portables */
80f4111b 3325 UTF);
454155d9 3326 if (! valid) {
f0a2b745
KW
3327 yyerror(error);
3328 continue;
3329 }
3330 goto NUM_ESCAPE_INSERT;
3331 }
3332
77a135fe 3333 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3334 case 'x':
a0481293 3335 {
a0481293 3336 const char* error;
355860ce 3337
00ce5563 3338 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3339 TRUE, /* Output warning */
3340 FALSE, /* Not strict */
17896a85
KW
3341 TRUE, /* Output warnings for
3342 non-portables */
80f4111b 3343 UTF);
a0481293
KW
3344 if (! valid) {
3345 yyerror(error);
355860ce 3346 continue;
ba210ebe 3347 }
012bcf8d
GS
3348 }
3349
3350 NUM_ESCAPE_INSERT:
ff3f963a
KW
3351 /* Insert oct or hex escaped character. There will always be
3352 * enough room in sv since such escapes will be longer than any
3353 * UTF-8 sequence they can end up as, except if they force us
3354 * to recode the rest of the string into utf8 */
ba7cea30 3355
77a135fe 3356 /* Here uv is the ordinal of the next character being added in
ff3f963a 3357 * unicode (converted from native). */
77a135fe 3358 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3359 if (!has_utf8 && uv > 255) {
77a135fe
KW
3360 /* Might need to recode whatever we have accumulated so
3361 * far if it contains any chars variant in utf8 or
3362 * utf-ebcdic. */
3363
3364 SvCUR_set(sv, d - SvPVX_const(sv));
3365 SvPOK_on(sv);
3366 *d = '\0';
77a135fe 3367 /* See Note on sizing above. */
7bf79863
KW
3368 sv_utf8_upgrade_flags_grow(sv,
3369 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3370 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3371 d = SvPVX(sv) + SvCUR(sv);
3372 has_utf8 = TRUE;
012bcf8d
GS
3373 }
3374
77a135fe
KW
3375 if (has_utf8) {
3376 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3377 if (PL_lex_inwhat == OP_TRANS &&
3378 PL_sublex_info.sub_op) {
3379 PL_sublex_info.sub_op->op_private |=
3380 (PL_lex_repl ? OPpTRANS_FROM_UTF
3381 : OPpTRANS_TO_UTF);
f9a63242 3382 }
e294cc5d
JH
3383#ifdef EBCDIC
3384 if (uv > 255 && !dorange)
3385 native_range = FALSE;
3386#endif
012bcf8d 3387 }
a0ed51b3 3388 else {
012bcf8d 3389 *d++ = (char)uv;
a0ed51b3 3390 }
012bcf8d
GS
3391 }
3392 else {
c4d5f83a 3393 *d++ = (char) uv;
a0ed51b3 3394 }
79072805 3395 continue;
02aa26ce 3396
4a2d328f 3397 case 'N':
ff3f963a
KW
3398 /* In a non-pattern \N must be a named character, like \N{LATIN
3399 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3400 * mean to match a non-newline. For non-patterns, named
3401 * characters are converted to their string equivalents. In
3402 * patterns, named characters are not converted to their
3403 * ultimate forms for the same reasons that other escapes
3404 * aren't. Instead, they are converted to the \N{U+...} form
3405 * to get the value from the charnames that is in effect right
3406 * now, while preserving the fact that it was a named character
3407 * so that the regex compiler knows this */
3408
3409 /* This section of code doesn't generally use the
3410 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3411 * a close examination of this macro and determined it is a
3412 * no-op except on utfebcdic variant characters. Every
3413 * character generated by this that would normally need to be
3414 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3415 * needed, and would complicate use of copy(). XXX There are
3416 * other parts of this file where the macro is used
3417 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3418
3419 /* The structure of this section of code (besides checking for
3420 * errors and upgrading to utf8) is:
3421 * Further disambiguate between the two meanings of \N, and if
3422 * not a charname, go process it elsewhere
0a96133f
KW
3423 * If of form \N{U+...}, pass it through if a pattern;
3424 * otherwise convert to utf8
3425 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3426 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3427
3428 /* Here, s points to the 'N'; the test below is guaranteed to
3429 * succeed if we are being called on a pattern as we already
3430 * know from a test above that the next character is a '{'.
3431 * On a non-pattern \N must mean 'named sequence, which
3432 * requires braces */
3433 s++;
3434 if (*s != '{') {
3435 yyerror("Missing braces on \\N{}");
3436 continue;
3437 }
3438 s++;
3439
0a96133f 3440 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3441 if (! (e = strchr(s, '}'))) {
3442 if (! PL_lex_inpat) {
5777a3f7 3443 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3444 } else {
3445 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3446 }
0a96133f 3447 continue;
ff3f963a 3448 }
cddc7ef4 3449
ff3f963a 3450 /* Here it looks like a named character */
cddc7ef4 3451
ff3f963a
KW
3452 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3453 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3454 | PERL_SCAN_DISALLOW_PREFIX;
3455 STRLEN len;
3456
3457 /* For \N{U+...}, the '...' is a unicode value even on
3458 * EBCDIC machines */
3459 s += 2; /* Skip to next char after the 'U+' */
3460 len = e - s;
3461 uv = grok_hex(s, &len, &flags, NULL);
3462 if (len == 0 || len != (STRLEN)(e - s)) {
3463 yyerror("Invalid hexadecimal number in \\N{U+...}");
3464 s = e + 1;
3465 continue;
3466 }
3467
3468 if (PL_lex_inpat) {
3469
e2a7e165
KW
3470 /* On non-EBCDIC platforms, pass through to the regex
3471 * compiler unchanged. The reason we evaluated the
3472 * number above is to make sure there wasn't a syntax
3473 * error. But on EBCDIC we convert to native so
3474 * downstream code can continue to assume it's native
3475 */
ff3f963a 3476 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3477#ifdef EBCDIC
3478 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3479 and the \0 */
3480 "\\N{U+%X}",
3481 (unsigned int) UNI_TO_NATIVE(uv));
3482#else
ff3f963a
KW
3483 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3484 d += e - s + 1;
e2a7e165 3485#endif
ff3f963a
KW
3486 }
3487 else { /* Not a pattern: convert the hex to string */
3488
3489 /* If destination is not in utf8, unconditionally
3490 * recode it to be so. This is because \N{} implies
3491 * Unicode semantics, and scalars have to be in utf8
3492 * to guarantee those semantics */
3493 if (! has_utf8) {
3494 SvCUR_set(sv, d - SvPVX_const(sv));
3495 SvPOK_on(sv);
3496 *d = '\0';
3497 /* See Note on sizing above. */
3498 sv_utf8_upgrade_flags_grow(
3499 sv,
3500 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3501 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3502 d = SvPVX(sv) + SvCUR(sv);
3503 has_utf8 = TRUE;
3504 }
3505
3506 /* Add the string to the output */
3507 if (UNI_IS_INVARIANT(uv)) {
3508 *d++ = (char) uv;
3509 }
3510 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3511 }
3512 }
6f613c73
KW
3513 else /* Here is \N{NAME} but not \N{U+...}. */
3514 if ((res = get_and_check_backslash_N_name(s, e)))
3515 {
3516 STRLEN len;
3517 const char *str = SvPV_const(res, len);
3518 if (PL_lex_inpat) {
ff3f963a
KW
3519
3520 if (! len) { /* The name resolved to an empty string */
3521 Copy("\\N{}", d, 4, char);
3522 d += 4;
3523 }
3524 else {
3525 /* In order to not lose information for the regex
3526 * compiler, pass the result in the specially made
3527 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3528 * the code points in hex of each character
3529 * returned by charnames */
3530
3531 const char *str_end = str + len;
3b721c4f 3532 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3533
3534 if (! SvUTF8(res)) {
3535 /* For the non-UTF-8 case, we can determine the
3536 * exact length needed without having to parse
3537 * through the string. Each character takes up
3538 * 2 hex digits plus either a trailing dot or
3539 * the "}" */
3540 d = off + SvGROW(sv, off
3541 + 3 * len
3542 + 6 /* For the "\N{U+", and
3543 trailing NUL */
3544 + (STRLEN)(send - e));
3545 Copy("\\N{U+", d, 5, char);
3546 d += 5;
3547 while (str < str_end) {
3548 char hex_string[4];
3549 my_snprintf(hex_string, sizeof(hex_string),
3550 "%02X.", (U8) *str);
3551 Copy(hex_string, d, 3, char);
3552 d += 3;
3553 str++;
3554 }
3555 d--; /* We will overwrite below the final
3556 dot with a right brace */
3557 }
3558 else {
1953db30
KW
3559 STRLEN char_length; /* cur char's byte length */
3560
3561 /* and the number of bytes after this is
3562 * translated into hex digits */
3563 STRLEN output_length;
3564
3565 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3566 * for max('U+', '.'); and 1 for NUL */
3567 char hex_string[2 * UTF8_MAXBYTES + 5];
3568
3569 /* Get the first character of the result. */
3570 U32 uv = utf8n_to_uvuni((U8 *) str,
3571 len,
3572 &char_length,
3573 UTF8_ALLOW_ANYUV);
3574 /* Convert first code point to hex, including
3575 * the boiler plate before it. For all these,
3576 * we convert to native format so that
3577 * downstream code can continue to assume the
3578 * input is native */
3579 output_length =
3580 my_snprintf(hex_string, sizeof(hex_string),
3581 "\\N{U+%X",
3582 (unsigned int) UNI_TO_NATIVE(uv));
3583
3584 /* Make sure there is enough space to hold it */
3585 d = off + SvGROW(sv, off
3586 + output_length
3587 + (STRLEN)(send - e)
3588 + 2); /* '}' + NUL */
3589 /* And output it */
3590 Copy(hex_string, d, output_length, char);
3591 d += output_length;
3592
3593 /* For each subsequent character, append dot and
3594 * its ordinal in hex */
3595 while ((str += char_length) < str_end) {
3596 const STRLEN off = d - SvPVX_const(sv);
3597 U32 uv = utf8n_to_uvuni((U8 *) str,
3598 str_end - str,
3599 &char_length,
3600 UTF8_ALLOW_ANYUV);
3601 output_length =
3602 my_snprintf(hex_string,
3603 sizeof(hex_string),
3604 ".%X",
3605 (unsigned int) UNI_TO_NATIVE(uv));
3606
3607 d = off + SvGROW(sv, off
3608 + output_length
3609 + (STRLEN)(send - e)
3610 + 2); /* '}' + NUL */
3611 Copy(hex_string, d, output_length, char);
3612 d += output_length;
3613 }
94ca1619 3614 }
ff3f963a
KW
3615
3616 *d++ = '}'; /* Done. Add the trailing brace */
3617 }
3618 }
3619 else { /* Here, not in a pattern. Convert the name to a
3620 * string. */
3621
3622 /* If destination is not in utf8, unconditionally
3623 * recode it to be so. This is because \N{} implies
3624 * Unicode semantics, and scalars have to be in utf8
3625 * to guarantee those semantics */
3626 if (! has_utf8) {
3627 SvCUR_set(sv, d - SvPVX_const(sv));
3628 SvPOK_on(sv);
3629 *d = '\0';
3630 /* See Note on sizing above. */
3631 sv_utf8_upgrade_flags_grow(sv,
3632 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3633 len + (STRLEN)(send - s) + 1);
3634 d = SvPVX(sv) + SvCUR(sv);
3635 has_utf8 = TRUE;
3636 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3637
3638 /* See Note on sizing above. (NOTE: SvCUR() is not
3639 * set correctly here). */
3640 const STRLEN off = d - SvPVX_const(sv);
3641 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3642 }
3643 Copy(str, d, len, char);
3644 d += len;
423cee85 3645 }
6f613c73 3646
423cee85 3647 SvREFCNT_dec(res);
cb233ae3 3648
cb233ae3 3649 } /* End \N{NAME} */
ff3f963a
KW
3650#ifdef EBCDIC
3651 if (!dorange)
3652 native_range = FALSE; /* \N{} is defined to be Unicode */
3653#endif
3654 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3655 continue;
3656
02aa26ce 3657 /* \c is a control character */
79072805
LW
3658 case 'c':
3659 s++;
961ce445 3660 if (s < send) {
17a3df4c 3661 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3662 }
961ce445
RGS
3663 else {
3664 yyerror("Missing control char name in \\c");
3665 }
79072805 3666 continue;
02aa26ce
NT
3667
3668 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3669 case 'b':
db42d148 3670 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3671 break;
3672 case 'n':
db42d148 3673 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3674 break;
3675 case 'r':
db42d148 3676 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3677 break;
3678 case 'f':
db42d148 3679 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3680 break;
3681 case 't':
db42d148 3682 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3683 break;
34a3fe2a 3684 case 'e':
db42d148 3685 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3686 break;
3687 case 'a':
db42d148 3688 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3689 break;
02aa26ce
NT
3690 } /* end switch */
3691
79072805
LW
3692 s++;
3693 continue;
02aa26ce 3694 } /* end if (backslash) */
4c3a8340
ST
3695#ifdef EBCDIC
3696 else
3697 literal_endpoint++;
3698#endif
02aa26ce 3699
f9a63242 3700 default_action:
77a135fe
KW
3701 /* If we started with encoded form, or already know we want it,
3702 then encode the next character */
3703 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3704 STRLEN len = 1;
77a135fe
KW
3705
3706
3707 /* One might think that it is wasted effort in the case of the
3708 * source being utf8 (this_utf8 == TRUE) to take the next character
3709 * in the source, convert it to an unsigned value, and then convert
3710 * it back again. But the source has not been validated here. The
3711