This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Consolidate some PL_expect assignments
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
28=for apidoc AmU|yy_parser *|PL_parser
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
04e98a4d 41#include "dquote_static.c"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
55#define PL_lex_defer (PL_parser->lex_defer)
56#define PL_lex_dojoin (PL_parser->lex_dojoin)
57#define PL_lex_expect (PL_parser->lex_expect)
58#define PL_lex_formbrack (PL_parser->lex_formbrack)
59#define PL_lex_inpat (PL_parser->lex_inpat)
60#define PL_lex_inwhat (PL_parser->lex_inwhat)
61#define PL_lex_op (PL_parser->lex_op)
62#define PL_lex_repl (PL_parser->lex_repl)
63#define PL_lex_starts (PL_parser->lex_starts)
64#define PL_lex_stuff (PL_parser->lex_stuff)
65#define PL_multi_start (PL_parser->multi_start)
66#define PL_multi_open (PL_parser->multi_open)
67#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7 89
fb205e7a
DM
90# define PL_nexttoke (PL_parser->nexttoke)
91# define PL_nexttype (PL_parser->nexttype)
92# define PL_nextval (PL_parser->nextval)
199e78b7 93
a1894d81 94static const char* const ident_too_long = "Identifier too long";
8903cb82 95
9ded7720 96# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 97
a7aaec61
Z
98#define XENUMMASK 0x3f
99#define XFAKEEOF 0x40
100#define XFAKEBRACK 0x80
9059aa12 101
39e02b42
JH
102#ifdef USE_UTF8_SCRIPTS
103# define UTF (!IN_BYTES)
2b9d42f0 104#else
802a15e9 105# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 106#endif
a0ed51b3 107
b1fc3636
CJ
108/* The maximum number of characters preceding the unrecognized one to display */
109#define UNRECOGNIZED_PRECEDE_COUNT 10
110
61f0cdd9 111/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
112 * 1999-02-27 mjd-perl-patch@plover.com */
113#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
114
14bd96d0 115#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 116
9ff909cf
JH
117#define HEXFP_PEEK(s) \
118 (((s[0] == '.') && \
119 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
120 isALPHA_FOLD_EQ(s[0], 'p'))
121
ffb4593c
NT
122/* LEX_* are values for PL_lex_state, the state of the lexer.
123 * They are arranged oddly so that the guard on the switch statement
79072805 124 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
125 *
126 * These values refer to the various states within a sublex parse,
127 * i.e. within a double quotish string
79072805
LW
128 */
129
fb73857a
PP
130/* #define LEX_NOTPARSING 11 is done in perl.h. */
131
b6007c36
DM
132#define LEX_NORMAL 10 /* normal code (ie not within "...") */
133#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
134#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
135#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
136#define LEX_INTERPSTART 6 /* expecting the start of a $var */
137
138 /* at end of code, eg "$x" followed by: */
139#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
140#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
141
142#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
143 string or after \E, $foo, etc */
144#define LEX_INTERPCONST 2 /* NOT USED */
145#define LEX_FORMLINE 1 /* expecting a format line */
146#define LEX_KNOWNEXT 0 /* next token known; just return it */
147
79072805 148
bbf60fe6 149#ifdef DEBUGGING
27da23d5 150static const char* const lex_state_names[] = {
bbf60fe6
DM
151 "KNOWNEXT",
152 "FORMLINE",
153 "INTERPCONST",
154 "INTERPCONCAT",
155 "INTERPENDMAYBE",
156 "INTERPEND",
157 "INTERPSTART",
158 "INTERPPUSH",
159 "INTERPCASEMOD",
160 "INTERPNORMAL",
161 "NORMAL"
162};
163#endif
164
79072805 165#include "keywords.h"
fe14fcc3 166
ffb4593c
NT
167/* CLINE is a macro that ensures PL_copline has a sane value */
168
57843af0 169#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 170
29595ff2
NC
171# define SKIPSPACE0(s) skipspace(s)
172# define SKIPSPACE1(s) skipspace(s)
173# define SKIPSPACE2(s,tsv) skipspace(s)
174# define PEEKSPACE(s) skipspace(s)
29595ff2 175
ffb4593c
NT
176/*
177 * Convenience functions to return different tokens and prime the
9cbb5ea2 178 * lexer for the next token. They all take an argument.
ffb4593c
NT
179 *
180 * TOKEN : generic token (used for '(', DOLSHARP, etc)
181 * OPERATOR : generic operator
182 * AOPERATOR : assignment operator
183 * PREBLOCK : beginning the block after an if, while, foreach, ...
184 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
185 * PREREF : *EXPR where EXPR is not a simple identifier
186 * TERM : expression term
89f35911 187 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
188 * LOOPX : loop exiting command (goto, last, dump, etc)
189 * FTST : file test operator
190 * FUN0 : zero-argument function
7eb971ee 191 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 192 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
193 * BOop : bitwise or or xor
194 * BAop : bitwise and
195 * SHop : shift operator
196 * PWop : power operator
9cbb5ea2 197 * PMop : pattern-matching operator
ffb4593c 198 * Aop : addition-level operator
e4916dd1 199 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c
NT
200 * Mop : multiplication-level operator
201 * Eop : equality-testing operator
e5edeb50 202 * Rop : relational operator <= != gt
ffb4593c
NT
203 *
204 * Also see LOP and lop() below.
205 */
206
998054bd 207#ifdef DEBUGGING /* Serve -DT. */
704d4215 208# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 209#else
bbf60fe6 210# define REPORT(retval) (retval)
998054bd
SC
211#endif
212
bbf60fe6
DM
213#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
214#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
215#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
216#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
217#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
218#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
219#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 220#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
a49203fd 221#define LOOPX(f) return (PL_bufptr = force_word(s,WORD,TRUE,FALSE), \
7a61bf3c 222 pl_yylval.ival=f, \
a49203fd 223 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 224 REPORT((int)LOOPEX))
6154021b
RGS
225#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
226#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 227#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
228#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
229#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
230#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
231#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
232#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
233#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
234#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
e4916dd1 235#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
6154021b
RGS
236#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
237#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
238#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 239
a687059c
LW
240/* This bit of chicanery makes a unary function followed by
241 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
242 * The UNIDOR macro is for unary functions that can be followed by the //
243 * operator (such as C<shift // 0>).
a687059c 244 */
d68ce4ac 245#define UNI3(f,x,have_x) { \
6154021b 246 pl_yylval.ival = f; \
d68ce4ac 247 if (have_x) PL_expect = x; \
376fcdbf
AL
248 PL_bufptr = s; \
249 PL_last_uni = PL_oldbufptr; \
250 PL_last_lop_op = f; \
251 if (*s == '(') \
252 return REPORT( (int)FUNC1 ); \
29595ff2 253 s = PEEKSPACE(s); \
376fcdbf
AL
254 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
255 }
d68ce4ac
FC
256#define UNI(f) UNI3(f,XTERM,1)
257#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
258#define UNIPROTO(f,optional) { \
259 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
260 OPERATOR(f); \
261 }
a687059c 262
d68ce4ac 263#define UNIBRACK(f) UNI3(f,0,0)
79072805 264
9f68db38 265/* grandfather return to old style */
78cdf107
Z
266#define OLDLOP(f) \
267 do { \
268 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
269 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
270 pl_yylval.ival = (f); \
271 PL_expect = XTERM; \
272 PL_bufptr = s; \
273 return (int)LSTOP; \
274 } while(0)
79072805 275
83944c01
FC
276#define COPLINE_INC_WITH_HERELINES \
277 STMT_START { \
278 CopLINE_inc(PL_curcop); \
851b527a
FC
279 if (PL_parser->herelines) \
280 CopLINE(PL_curcop) += PL_parser->herelines, \
281 PL_parser->herelines = 0; \
83944c01 282 } STMT_END
ffdb8b16
FC
283/* Called after scan_str to update CopLINE(PL_curcop), but only when there
284 * is no sublex_push to follow. */
285#define COPLINE_SET_FROM_MULTI_END \
286 STMT_START { \
287 CopLINE_set(PL_curcop, PL_multi_end); \
288 if (PL_multi_end != PL_multi_start) \
851b527a 289 PL_parser->herelines = 0; \
ffdb8b16 290 } STMT_END
83944c01
FC
291
292
8fa7f367
JH
293#ifdef DEBUGGING
294
6154021b 295/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
296enum token_type {
297 TOKENTYPE_NONE,
298 TOKENTYPE_IVAL,
6154021b 299 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 300 TOKENTYPE_PVAL,
aeaef349 301 TOKENTYPE_OPVAL
bbf60fe6
DM
302};
303
6d4a66ac
NC
304static struct debug_tokens {
305 const int token;
306 enum token_type type;
307 const char *name;
308} const debug_tokens[] =
9041c2e3 309{
bbf60fe6
DM
310 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
311 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
312 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
313 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
314 { ARROW, TOKENTYPE_NONE, "ARROW" },
315 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
316 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
317 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
318 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
319 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 320 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
321 { DO, TOKENTYPE_NONE, "DO" },
322 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
323 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
324 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
325 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
326 { ELSE, TOKENTYPE_NONE, "ELSE" },
327 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
328 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
329 { FOR, TOKENTYPE_IVAL, "FOR" },
330 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
331 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
332 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
333 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
334 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 335 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
336 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
337 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
338 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 339 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
340 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
341 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 342 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
343 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
344 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
345 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
346 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
347 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
348 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
349 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
350 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
351 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
352 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
353 { OROP, TOKENTYPE_IVAL, "OROP" },
354 { OROR, TOKENTYPE_NONE, "OROR" },
355 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
356 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
357 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 358 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 359 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
360 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
361 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
362 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
363 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
364 { PREINC, TOKENTYPE_NONE, "PREINC" },
365 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 366 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
367 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
368 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 369 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
370 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
371 { SUB, TOKENTYPE_NONE, "SUB" },
372 { THING, TOKENTYPE_OPVAL, "THING" },
373 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
374 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
375 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
376 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
377 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
378 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 379 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
380 { WHILE, TOKENTYPE_IVAL, "WHILE" },
381 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 382 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 383 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
384};
385
6154021b 386/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 387
bbf60fe6 388STATIC int
704d4215 389S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 390{
7918f24d
NC
391 PERL_ARGS_ASSERT_TOKEREPORT;
392
bbf60fe6 393 if (DEBUG_T_TEST) {
bd61b366 394 const char *name = NULL;
bbf60fe6 395 enum token_type type = TOKENTYPE_NONE;
f54cb97a 396 const struct debug_tokens *p;
396482e1 397 SV* const report = newSVpvs("<== ");
bbf60fe6 398
f54cb97a 399 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
400 if (p->token == (int)rv) {
401 name = p->name;
402 type = p->type;
403 break;
404 }
405 }
406 if (name)
54667de8 407 Perl_sv_catpv(aTHX_ report, name);
74736ae6 408 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 409 {
bbf60fe6 410 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
411 if ((char)rv == 'p')
412 sv_catpvs(report, " (pending identifier)");
413 }
bbf60fe6 414 else if (!rv)
396482e1 415 sv_catpvs(report, "EOF");
bbf60fe6
DM
416 else
417 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
418 switch (type) {
419 case TOKENTYPE_NONE:
bbf60fe6
DM
420 break;
421 case TOKENTYPE_IVAL:
704d4215 422 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
423 break;
424 case TOKENTYPE_OPNUM:
425 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 426 PL_op_name[lvalp->ival]);
bbf60fe6
DM
427 break;
428 case TOKENTYPE_PVAL:
704d4215 429 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
430 break;
431 case TOKENTYPE_OPVAL:
704d4215 432 if (lvalp->opval) {
401441c0 433 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
434 PL_op_name[lvalp->opval->op_type]);
435 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 436 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 437 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
438 }
439
440 }
401441c0 441 else
396482e1 442 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
443 break;
444 }
b6007c36 445 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
446 };
447 return (int)rv;
998054bd
SC
448}
449
b6007c36
DM
450
451/* print the buffer with suitable escapes */
452
453STATIC void
15f169a1 454S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 455{
396482e1 456 SV* const tmp = newSVpvs("");
7918f24d
NC
457
458 PERL_ARGS_ASSERT_PRINTBUF;
459
5d37acd6 460 GCC_DIAG_IGNORE(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 461 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
5d37acd6 462 GCC_DIAG_RESTORE;
b6007c36
DM
463 SvREFCNT_dec(tmp);
464}
465
8fa7f367
JH
466#endif
467
8290c323
NC
468static int
469S_deprecate_commaless_var_list(pTHX) {
470 PL_expect = XTERM;
471 deprecate("comma-less variable list");
472 return REPORT(','); /* grandfather non-comma-format format */
473}
474
ffb4593c
NT
475/*
476 * S_ao
477 *
f393a21a
FC
478 * This subroutine looks for an '=' next to the operator that has just been
479 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
480 */
481
76e3520e 482STATIC int
cea2e8a9 483S_ao(pTHX_ int toketype)
a0d0e21e 484{
3280af22
NIS
485 if (*PL_bufptr == '=') {
486 PL_bufptr++;
a0d0e21e 487 if (toketype == ANDAND)
6154021b 488 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 489 else if (toketype == OROR)
6154021b 490 pl_yylval.ival = OP_ORASSIGN;
c963b151 491 else if (toketype == DORDOR)
6154021b 492 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
493 toketype = ASSIGNOP;
494 }
495 return toketype;
496}
497
ffb4593c
NT
498/*
499 * S_no_op
500 * When Perl expects an operator and finds something else, no_op
501 * prints the warning. It always prints "<something> found where
502 * operator expected. It prints "Missing semicolon on previous line?"
503 * if the surprise occurs at the start of the line. "do you need to
504 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
505 * where the compiler doesn't know if foo is a method call or a function.
506 * It prints "Missing operator before end of line" if there's nothing
507 * after the missing operator, or "... before <...>" if there is something
508 * after the missing operator.
509 */
510
76e3520e 511STATIC void
15f169a1 512S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 513{
9d4ba2ae
AL
514 char * const oldbp = PL_bufptr;
515 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 516
7918f24d
NC
517 PERL_ARGS_ASSERT_NO_OP;
518
1189a94a
GS
519 if (!s)
520 s = oldbp;
07c798fb 521 else
1189a94a 522 PL_bufptr = s;
734ab321 523 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
524 if (ckWARN_d(WARN_SYNTAX)) {
525 if (is_first)
526 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
527 "\t(Missing semicolon on previous line?)\n");
528 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 529 const char *t;
8a2bca12 530 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 531 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 532 NOOP;
56da5a46
RGS
533 if (t < PL_bufptr && isSPACE(*t))
534 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
535 "\t(Do you need to predeclare %"UTF8f"?)\n",
536 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
537 }
538 else {
539 assert(s >= oldbp);
540 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
541 "\t(Missing operator before %"UTF8f"?)\n",
542 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 543 }
07c798fb 544 }
3280af22 545 PL_bufptr = oldbp;
8990e307
LW
546}
547
ffb4593c
NT
548/*
549 * S_missingterm
550 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 551 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
552 * If we're in a delimited string and the delimiter is a control
553 * character, it's reformatted into a two-char sequence like ^C.
554 * This is fatal.
555 */
556
76e3520e 557STATIC void
cea2e8a9 558S_missingterm(pTHX_ char *s)
8990e307
LW
559{
560 char tmpbuf[3];
561 char q;
562 if (s) {
9d4ba2ae 563 char * const nl = strrchr(s,'\n');
d2719217 564 if (nl)
8990e307
LW
565 *nl = '\0';
566 }
ca8b19a7 567 else if ((U8) PL_multi_close < 32) {
8990e307 568 *tmpbuf = '^';
585ec06d 569 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
570 tmpbuf[2] = '\0';
571 s = tmpbuf;
572 }
573 else {
eb160463 574 *tmpbuf = (char)PL_multi_close;
8990e307
LW
575 tmpbuf[1] = '\0';
576 s = tmpbuf;
577 }
578 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 579 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 580}
79072805 581
dd0ac2b9
FC
582#include "feature.h"
583
0d863452 584/*
0d863452
RH
585 * Check whether the named feature is enabled.
586 */
26ea9e12 587bool
3fff3427 588Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 589{
4a731d7b 590 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
591
592 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
593
594 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 595
26ea9e12
NC
596 if (namelen > MAX_FEATURE_LEN)
597 return FALSE;
3fff3427 598 memcpy(&he_name[8], name, namelen);
7d69d4a6 599
c8ca97b0
NC
600 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
601 REFCOUNTED_HE_EXISTS));
0d863452
RH
602}
603
ffb4593c 604/*
9cbb5ea2
GS
605 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
606 * utf16-to-utf8-reversed.
ffb4593c
NT
607 */
608
c39cd008
GS
609#ifdef PERL_CR_FILTER
610static void
611strip_return(SV *sv)
612{
eb578fdb
KW
613 const char *s = SvPVX_const(sv);
614 const char * const e = s + SvCUR(sv);
7918f24d
NC
615
616 PERL_ARGS_ASSERT_STRIP_RETURN;
617
c39cd008
GS
618 /* outer loop optimized to do nothing if there are no CR-LFs */
619 while (s < e) {
620 if (*s++ == '\r' && *s == '\n') {
621 /* hit a CR-LF, need to copy the rest */
eb578fdb 622 char *d = s - 1;
c39cd008
GS
623 *d++ = *s++;
624 while (s < e) {
625 if (*s == '\r' && s[1] == '\n')
626 s++;
627 *d++ = *s++;
628 }
629 SvCUR(sv) -= s - d;
630 return;
631 }
632 }
633}
a868473f 634
76e3520e 635STATIC I32
c39cd008 636S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 637{
f54cb97a 638 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
639 if (count > 0 && !maxlen)
640 strip_return(sv);
641 return count;
a868473f
NIS
642}
643#endif
644
ffb4593c 645/*
8eaa0acf
Z
646=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
647
648Creates and initialises a new lexer/parser state object, supplying
649a context in which to lex and parse from a new source of Perl code.
650A pointer to the new state object is placed in L</PL_parser>. An entry
651is made on the save stack so that upon unwinding the new state object
652will be destroyed and the former value of L</PL_parser> will be restored.
653Nothing else need be done to clean up the parsing context.
654
655The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
656non-null, provides a string (in SV form) containing code to be parsed.
657A copy of the string is made, so subsequent modification of I<line>
658does not affect parsing. I<rsfp>, if non-null, provides an input stream
659from which code will be read to be parsed. If both are non-null, the
660code in I<line> comes first and must consist of complete lines of input,
661and I<rsfp> supplies the remainder of the source.
662
e368b3bd
FC
663The I<flags> parameter is reserved for future use. Currently it is only
664used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
665
666=cut
667*/
ffb4593c 668
27fcb6ee 669/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
670 can share filters with the current parser.
671 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
672 caller, hence isn't owned by the parser, so shouldn't be closed on parser
673 destruction. This is used to handle the case of defaulting to reading the
674 script from the standard input because no filename was given on the command
675 line (without getting confused by situation where STDIN has been closed, so
676 the script handle is opened on fd 0) */
27fcb6ee 677
a0d0e21e 678void
8eaa0acf 679Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 680{
6ef55633 681 const char *s = NULL;
5486870f 682 yy_parser *parser, *oparser;
60d63348 683 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 684 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
685
686 /* create and initialise a parser */
687
199e78b7 688 Newxz(parser, 1, yy_parser);
5486870f 689 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
690 PL_parser = parser;
691
28ac2b49
Z
692 parser->stack = NULL;
693 parser->ps = NULL;
694 parser->stack_size = 0;
acdf0a21 695
e3abe207
DM
696 /* on scope exit, free this parser and restore any outer one */
697 SAVEPARSER(parser);
7c4baf47 698 parser->saved_curcop = PL_curcop;
e3abe207 699
acdf0a21 700 /* initialise lexer state */
8990e307 701
fb205e7a 702 parser->nexttoke = 0;
ca4cfd28 703 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 704 parser->copline = parser->preambling = NOLINE;
5afb0a62 705 parser->lex_state = LEX_NORMAL;
c2598295 706 parser->expect = XSTATE;
2f9285f8 707 parser->rsfp = rsfp;
27fcb6ee
FC
708 parser->rsfp_filters =
709 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
710 ? NULL
711 : MUTABLE_AV(SvREFCNT_inc(
712 oparser->rsfp_filters
713 ? oparser->rsfp_filters
714 : (oparser->rsfp_filters = newAV())
715 ));
2f9285f8 716
199e78b7
DM
717 Newx(parser->lex_brackstack, 120, char);
718 Newx(parser->lex_casestack, 12, char);
719 *parser->lex_casestack = '\0';
d794b522 720 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 721
10efb74f 722 if (line) {
0528fd32 723 STRLEN len;
10efb74f 724 s = SvPV_const(line, len);
0abcdfa4
FC
725 parser->linestr = flags & LEX_START_COPIED
726 ? SvREFCNT_inc_simple_NN(line)
727 : newSVpvn_flags(s, len, SvUTF8(line));
bf1b738b 728 sv_catpvn(parser->linestr, "\n;", rsfp ? 1 : 2);
0abcdfa4 729 } else {
bf1b738b 730 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 731 }
f06b5848
DM
732 parser->oldoldbufptr =
733 parser->oldbufptr =
734 parser->bufptr =
735 parser->linestart = SvPVX(parser->linestr);
736 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
737 parser->last_lop = parser->last_uni = NULL;
b54f893d
KW
738
739 assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
740 |LEX_DONT_CLOSE_RSFP));
741 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
742 |LEX_DONT_CLOSE_RSFP));
737c24fc 743
60d63348 744 parser->in_pod = parser->filtered = 0;
79072805 745}
a687059c 746
e3abe207
DM
747
748/* delete a parser object */
749
750void
751Perl_parser_free(pTHX_ const yy_parser *parser)
752{
7918f24d
NC
753 PERL_ARGS_ASSERT_PARSER_FREE;
754
7c4baf47 755 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
756 SvREFCNT_dec(parser->linestr);
757
87606032 758 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 759 PerlIO_clearerr(parser->rsfp);
799361c3
SH
760 else if (parser->rsfp && (!parser->old_parser ||
761 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 762 PerlIO_close(parser->rsfp);
5486870f 763 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
764 SvREFCNT_dec(parser->lex_stuff);
765 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
766
767 Safefree(parser->lex_brackstack);
768 Safefree(parser->lex_casestack);
769 Safefree(parser->lex_shared);
770 PL_parser = parser->old_parser;
771 Safefree(parser);
772}
773
774void
775Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
776{
3ac7ff8f 777 I32 nexttoke = parser->nexttoke;
3ac7ff8f 778 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 779 while (nexttoke--) {
3ac7ff8f
FC
780 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
781 && parser->nextval[nexttoke].opval
782 && parser->nextval[nexttoke].opval->op_slabbed
783 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 784 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
785 parser->nextval[nexttoke].opval = NULL;
786 }
3ce3dcd9 787 }
e3abe207
DM
788}
789
790
ffb4593c 791/*
f0e67a1d
Z
792=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
793
794Buffer scalar containing the chunk currently under consideration of the
795text currently being lexed. This is always a plain string scalar (for
796which C<SvPOK> is true). It is not intended to be used as a scalar by
797normal scalar means; instead refer to the buffer directly by the pointer
798variables described below.
799
800The lexer maintains various C<char*> pointers to things in the
801C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
802reallocated, all of these pointers must be updated. Don't attempt to
803do this manually, but rather use L</lex_grow_linestr> if you need to
804reallocate the buffer.
805
806The content of the text chunk in the buffer is commonly exactly one
807complete line of input, up to and including a newline terminator,
808but there are situations where it is otherwise. The octets of the
809buffer may be intended to be interpreted as either UTF-8 or Latin-1.
810The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
811flag on this scalar, which may disagree with it.
812
813For direct examination of the buffer, the variable
814L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
815lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
816of these pointers is usually preferable to examination of the scalar
817through normal scalar means.
818
819=for apidoc AmxU|char *|PL_parser-E<gt>bufend
820
821Direct pointer to the end of the chunk of text currently being lexed, the
822end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 823+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
824always located at the end of the buffer, and does not count as part of
825the buffer's contents.
826
827=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
828
829Points to the current position of lexing inside the lexer buffer.
830Characters around this point may be freely examined, within
831the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
832L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
833interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
834
835Lexing code (whether in the Perl core or not) moves this pointer past
836the characters that it consumes. It is also expected to perform some
837bookkeeping whenever a newline character is consumed. This movement
838can be more conveniently performed by the function L</lex_read_to>,
839which handles newlines appropriately.
840
841Interpretation of the buffer's octets can be abstracted out by
842using the slightly higher-level functions L</lex_peek_unichar> and
843L</lex_read_unichar>.
844
845=for apidoc AmxU|char *|PL_parser-E<gt>linestart
846
847Points to the start of the current line inside the lexer buffer.
848This is useful for indicating at which column an error occurred, and
849not much else. This must be updated by any lexing code that consumes
850a newline; the function L</lex_read_to> handles this detail.
851
852=cut
853*/
854
855/*
856=for apidoc Amx|bool|lex_bufutf8
857
858Indicates whether the octets in the lexer buffer
859(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
860of Unicode characters. If not, they should be interpreted as Latin-1
861characters. This is analogous to the C<SvUTF8> flag for scalars.
862
863In UTF-8 mode, it is not guaranteed that the lexer buffer actually
864contains valid UTF-8. Lexing code must be robust in the face of invalid
865encoding.
866
867The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
868is significant, but not the whole story regarding the input character
869encoding. Normally, when a file is being read, the scalar contains octets
870and its C<SvUTF8> flag is off, but the octets should be interpreted as
871UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
872however, the scalar may have the C<SvUTF8> flag on, and in this case its
873octets should be interpreted as UTF-8 unless the C<use bytes> pragma
874is in effect. This logic may change in the future; use this function
875instead of implementing the logic yourself.
876
877=cut
878*/
879
880bool
881Perl_lex_bufutf8(pTHX)
882{
883 return UTF;
884}
885
886/*
887=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
888
889Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
6602b933 890at least I<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
891pointer to the reallocated buffer. This is necessary before making
892any direct modification of the buffer that would increase its length.
893L</lex_stuff_pvn> provides a more convenient way to insert text into
894the buffer.
895
896Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
897this function updates all of the lexer's variables that point directly
898into the buffer.
899
900=cut
901*/
902
903char *
904Perl_lex_grow_linestr(pTHX_ STRLEN len)
905{
906 SV *linestr;
907 char *buf;
908 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 909 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
910 linestr = PL_parser->linestr;
911 buf = SvPVX(linestr);
912 if (len <= SvLEN(linestr))
913 return buf;
914 bufend_pos = PL_parser->bufend - buf;
915 bufptr_pos = PL_parser->bufptr - buf;
916 oldbufptr_pos = PL_parser->oldbufptr - buf;
917 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
918 linestart_pos = PL_parser->linestart - buf;
919 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
920 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
921 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
922 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 923
f0e67a1d 924 buf = sv_grow(linestr, len);
c7641931 925
f0e67a1d
Z
926 PL_parser->bufend = buf + bufend_pos;
927 PL_parser->bufptr = buf + bufptr_pos;
928 PL_parser->oldbufptr = buf + oldbufptr_pos;
929 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
930 PL_parser->linestart = buf + linestart_pos;
931 if (PL_parser->last_uni)
932 PL_parser->last_uni = buf + last_uni_pos;
933 if (PL_parser->last_lop)
934 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
935 if (PL_parser->lex_shared->re_eval_start)
936 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
937 return buf;
938}
939
940/*
83aa740e 941=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
942
943Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
944immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
945reallocating the buffer if necessary. This means that lexing code that
946runs later will see the characters as if they had appeared in the input.
947It is not recommended to do this as part of normal parsing, and most
948uses of this facility run the risk of the inserted characters being
949interpreted in an unintended manner.
950
951The string to be inserted is represented by I<len> octets starting
952at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
953according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
954The characters are recoded for the lexer buffer, according to how the
955buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 956to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
957function is more convenient.
958
959=cut
960*/
961
962void
83aa740e 963Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 964{
749123ff 965 dVAR;
f0e67a1d
Z
966 char *bufptr;
967 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
968 if (flags & ~(LEX_STUFF_UTF8))
969 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
970 if (UTF) {
971 if (flags & LEX_STUFF_UTF8) {
972 goto plain_copy;
973 } else {
54d004e8 974 STRLEN highhalf = 0; /* Count of variants */
83aa740e 975 const char *p, *e = pv+len;
54d004e8
KW
976 for (p = pv; p != e; p++) {
977 if (! UTF8_IS_INVARIANT(*p)) {
978 highhalf++;
979 }
980 }
f0e67a1d
Z
981 if (!highhalf)
982 goto plain_copy;
983 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
984 bufptr = PL_parser->bufptr;
985 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
986 SvCUR_set(PL_parser->linestr,
987 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
988 PL_parser->bufend += len+highhalf;
989 for (p = pv; p != e; p++) {
990 U8 c = (U8)*p;
54d004e8
KW
991 if (! UTF8_IS_INVARIANT(c)) {
992 *bufptr++ = UTF8_TWO_BYTE_HI(c);
993 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
994 } else {
995 *bufptr++ = (char)c;
996 }
997 }
998 }
999 } else {
1000 if (flags & LEX_STUFF_UTF8) {
1001 STRLEN highhalf = 0;
83aa740e 1002 const char *p, *e = pv+len;
f0e67a1d
Z
1003 for (p = pv; p != e; p++) {
1004 U8 c = (U8)*p;
54d004e8 1005 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1006 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1007 "non-Latin-1 character into Latin-1 input");
54d004e8 1008 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1009 p++;
1010 highhalf++;
54d004e8 1011 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1012 /* malformed UTF-8 */
1013 ENTER;
1014 SAVESPTR(PL_warnhook);
1015 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1016 utf8n_to_uvchr((U8*)p, e-p, NULL, 0);
f0e67a1d
Z
1017 LEAVE;
1018 }
1019 }
1020 if (!highhalf)
1021 goto plain_copy;
1022 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1023 bufptr = PL_parser->bufptr;
1024 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1025 SvCUR_set(PL_parser->linestr,
1026 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1027 PL_parser->bufend += len-highhalf;
54d004e8
KW
1028 p = pv;
1029 while (p < e) {
1030 if (UTF8_IS_INVARIANT(*p)) {
1031 *bufptr++ = *p;
1032 p++;
f0e67a1d 1033 }
54d004e8
KW
1034 else {
1035 assert(p < e -1 );
94bb8c36 1036 *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1037 p += 2;
1038 }
f0e67a1d
Z
1039 }
1040 } else {
54d004e8 1041 plain_copy:
f0e67a1d
Z
1042 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1043 bufptr = PL_parser->bufptr;
1044 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1045 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1046 PL_parser->bufend += len;
1047 Copy(pv, bufptr, len, char);
1048 }
1049 }
1050}
1051
1052/*
9dcc53ea
Z
1053=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1054
1055Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1056immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1057reallocating the buffer if necessary. This means that lexing code that
1058runs later will see the characters as if they had appeared in the input.
1059It is not recommended to do this as part of normal parsing, and most
1060uses of this facility run the risk of the inserted characters being
1061interpreted in an unintended manner.
1062
1063The string to be inserted is represented by octets starting at I<pv>
1064and continuing to the first nul. These octets are interpreted as either
1065UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1066in I<flags>. The characters are recoded for the lexer buffer, according
1067to how the buffer is currently being interpreted (L</lex_bufutf8>).
1068If it is not convenient to nul-terminate a string to be inserted, the
1069L</lex_stuff_pvn> function is more appropriate.
1070
1071=cut
1072*/
1073
1074void
1075Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1076{
1077 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1078 lex_stuff_pvn(pv, strlen(pv), flags);
1079}
1080
1081/*
f0e67a1d
Z
1082=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1083
1084Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1085immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1086reallocating the buffer if necessary. This means that lexing code that
1087runs later will see the characters as if they had appeared in the input.
1088It is not recommended to do this as part of normal parsing, and most
1089uses of this facility run the risk of the inserted characters being
1090interpreted in an unintended manner.
1091
1092The string to be inserted is the string value of I<sv>. The characters
1093are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1094being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1095not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1096need to construct a scalar.
1097
1098=cut
1099*/
1100
1101void
1102Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1103{
1104 char *pv;
1105 STRLEN len;
1106 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1107 if (flags)
1108 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1109 pv = SvPV(sv, len);
1110 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1111}
1112
1113/*
1114=for apidoc Amx|void|lex_unstuff|char *ptr
1115
1116Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1117I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1118This hides the discarded text from any lexing code that runs later,
1119as if the text had never appeared.
1120
1121This is not the normal way to consume lexed text. For that, use
1122L</lex_read_to>.
1123
1124=cut
1125*/
1126
1127void
1128Perl_lex_unstuff(pTHX_ char *ptr)
1129{
1130 char *buf, *bufend;
1131 STRLEN unstuff_len;
1132 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1133 buf = PL_parser->bufptr;
1134 if (ptr < buf)
1135 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1136 if (ptr == buf)
1137 return;
1138 bufend = PL_parser->bufend;
1139 if (ptr > bufend)
1140 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1141 unstuff_len = ptr - buf;
1142 Move(ptr, buf, bufend+1-ptr, char);
1143 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1144 PL_parser->bufend = bufend - unstuff_len;
1145}
1146
1147/*
1148=for apidoc Amx|void|lex_read_to|char *ptr
1149
1150Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1151to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1152performing the correct bookkeeping whenever a newline character is passed.
1153This is the normal way to consume lexed text.
1154
1155Interpretation of the buffer's octets can be abstracted out by
1156using the slightly higher-level functions L</lex_peek_unichar> and
1157L</lex_read_unichar>.
1158
1159=cut
1160*/
1161
1162void
1163Perl_lex_read_to(pTHX_ char *ptr)
1164{
1165 char *s;
1166 PERL_ARGS_ASSERT_LEX_READ_TO;
1167 s = PL_parser->bufptr;
1168 if (ptr < s || ptr > PL_parser->bufend)
1169 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1170 for (; s != ptr; s++)
1171 if (*s == '\n') {
83944c01 1172 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1173 PL_parser->linestart = s+1;
1174 }
1175 PL_parser->bufptr = ptr;
1176}
1177
1178/*
1179=for apidoc Amx|void|lex_discard_to|char *ptr
1180
1181Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1182up to I<ptr>. The remaining content of the buffer will be moved, and
1183all pointers into the buffer updated appropriately. I<ptr> must not
1184be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1185it is not permitted to discard text that has yet to be lexed.
1186
1187Normally it is not necessarily to do this directly, because it suffices to
1188use the implicit discarding behaviour of L</lex_next_chunk> and things
1189based on it. However, if a token stretches across multiple lines,
1f317c95 1190and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1191that purpose, then after completion of the token it would be wise to
1192explicitly discard the now-unneeded earlier lines, to avoid future
1193multi-line tokens growing the buffer without bound.
1194
1195=cut
1196*/
1197
1198void
1199Perl_lex_discard_to(pTHX_ char *ptr)
1200{
1201 char *buf;
1202 STRLEN discard_len;
1203 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1204 buf = SvPVX(PL_parser->linestr);
1205 if (ptr < buf)
1206 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1207 if (ptr == buf)
1208 return;
1209 if (ptr > PL_parser->bufptr)
1210 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1211 discard_len = ptr - buf;
1212 if (PL_parser->oldbufptr < ptr)
1213 PL_parser->oldbufptr = ptr;
1214 if (PL_parser->oldoldbufptr < ptr)
1215 PL_parser->oldoldbufptr = ptr;
1216 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1217 PL_parser->last_uni = NULL;
1218 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1219 PL_parser->last_lop = NULL;
1220 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1221 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1222 PL_parser->bufend -= discard_len;
1223 PL_parser->bufptr -= discard_len;
1224 PL_parser->oldbufptr -= discard_len;
1225 PL_parser->oldoldbufptr -= discard_len;
1226 if (PL_parser->last_uni)
1227 PL_parser->last_uni -= discard_len;
1228 if (PL_parser->last_lop)
1229 PL_parser->last_lop -= discard_len;
1230}
1231
1232/*
1233=for apidoc Amx|bool|lex_next_chunk|U32 flags
1234
1235Reads in the next chunk of text to be lexed, appending it to
1236L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1237looked to the end of the current chunk and wants to know more. It is
1238usual, but not necessary, for lexing to have consumed the entirety of
1239the current chunk at this time.
1240
1241If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1242chunk (i.e., the current chunk has been entirely consumed), normally the
1243current chunk will be discarded at the same time that the new chunk is
1244read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1245will not be discarded. If the current chunk has not been entirely
1246consumed, then it will not be discarded regardless of the flag.
1247
1248Returns true if some new text was added to the buffer, or false if the
1249buffer has reached the end of the input text.
1250
1251=cut
1252*/
1253
1254#define LEX_FAKE_EOF 0x80000000
112d1284 1255#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1256
1257bool
1258Perl_lex_next_chunk(pTHX_ U32 flags)
1259{
1260 SV *linestr;
1261 char *buf;
1262 STRLEN old_bufend_pos, new_bufend_pos;
1263 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1264 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1265 bool got_some_for_debugger = 0;
f0e67a1d 1266 bool got_some;
112d1284 1267 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1268 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1269 linestr = PL_parser->linestr;
1270 buf = SvPVX(linestr);
1271 if (!(flags & LEX_KEEP_PREVIOUS) &&
1272 PL_parser->bufptr == PL_parser->bufend) {
1273 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1274 linestart_pos = 0;
1275 if (PL_parser->last_uni != PL_parser->bufend)
1276 PL_parser->last_uni = NULL;
1277 if (PL_parser->last_lop != PL_parser->bufend)
1278 PL_parser->last_lop = NULL;
1279 last_uni_pos = last_lop_pos = 0;
1280 *buf = 0;
1281 SvCUR(linestr) = 0;
1282 } else {
1283 old_bufend_pos = PL_parser->bufend - buf;
1284 bufptr_pos = PL_parser->bufptr - buf;
1285 oldbufptr_pos = PL_parser->oldbufptr - buf;
1286 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1287 linestart_pos = PL_parser->linestart - buf;
1288 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1289 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1290 }
1291 if (flags & LEX_FAKE_EOF) {
1292 goto eof;
60d63348 1293 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1294 got_some = 0;
1295 } else if (filter_gets(linestr, old_bufend_pos)) {
1296 got_some = 1;
17cc9359 1297 got_some_for_debugger = 1;
112d1284
FC
1298 } else if (flags & LEX_NO_TERM) {
1299 got_some = 0;
f0e67a1d 1300 } else {
580561a3
Z
1301 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1302 sv_setpvs(linestr, "");
f0e67a1d
Z
1303 eof:
1304 /* End of real input. Close filehandle (unless it was STDIN),
1305 * then add implicit termination.
1306 */
87606032 1307 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1308 PerlIO_clearerr(PL_parser->rsfp);
1309 else if (PL_parser->rsfp)
1310 (void)PerlIO_close(PL_parser->rsfp);
1311 PL_parser->rsfp = NULL;
60d63348 1312 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1313 if (!PL_in_eval && PL_minus_p) {
1314 sv_catpvs(linestr,
1315 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1316 PL_minus_n = PL_minus_p = 0;
1317 } else if (!PL_in_eval && PL_minus_n) {
1318 sv_catpvs(linestr, /*{*/";}");
1319 PL_minus_n = 0;
1320 } else
1321 sv_catpvs(linestr, ";");
1322 got_some = 1;
1323 }
1324 buf = SvPVX(linestr);
1325 new_bufend_pos = SvCUR(linestr);
1326 PL_parser->bufend = buf + new_bufend_pos;
1327 PL_parser->bufptr = buf + bufptr_pos;
1328 PL_parser->oldbufptr = buf + oldbufptr_pos;
1329 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1330 PL_parser->linestart = buf + linestart_pos;
1331 if (PL_parser->last_uni)
1332 PL_parser->last_uni = buf + last_uni_pos;
1333 if (PL_parser->last_lop)
1334 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1335 if (PL_parser->preambling != NOLINE) {
1336 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1337 PL_parser->preambling = NOLINE;
1338 }
17cc9359 1339 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1340 PL_curstash != PL_debstash) {
1341 /* debugger active and we're not compiling the debugger code,
1342 * so store the line into the debugger's array of lines
1343 */
1344 update_debugger_info(NULL, buf+old_bufend_pos,
1345 new_bufend_pos-old_bufend_pos);
1346 }
1347 return got_some;
1348}
1349
1350/*
1351=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1352
1353Looks ahead one (Unicode) character in the text currently being lexed.
1354Returns the codepoint (unsigned integer value) of the next character,
1355or -1 if lexing has reached the end of the input text. To consume the
1356peeked character, use L</lex_read_unichar>.
1357
1358If the next character is in (or extends into) the next chunk of input
1359text, the next chunk will be read in. Normally the current chunk will be
1360discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1361then the current chunk will not be discarded.
1362
1363If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1364is encountered, an exception is generated.
1365
1366=cut
1367*/
1368
1369I32
1370Perl_lex_peek_unichar(pTHX_ U32 flags)
1371{
749123ff 1372 dVAR;
f0e67a1d
Z
1373 char *s, *bufend;
1374 if (flags & ~(LEX_KEEP_PREVIOUS))
1375 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1376 s = PL_parser->bufptr;
1377 bufend = PL_parser->bufend;
1378 if (UTF) {
1379 U8 head;
1380 I32 unichar;
1381 STRLEN len, retlen;
1382 if (s == bufend) {
1383 if (!lex_next_chunk(flags))
1384 return -1;
1385 s = PL_parser->bufptr;
1386 bufend = PL_parser->bufend;
1387 }
1388 head = (U8)*s;
54d004e8 1389 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1390 return head;
54d004e8
KW
1391 if (UTF8_IS_START(head)) {
1392 len = UTF8SKIP(&head);
f0e67a1d
Z
1393 while ((STRLEN)(bufend-s) < len) {
1394 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1395 break;
1396 s = PL_parser->bufptr;
1397 bufend = PL_parser->bufend;
1398 }
1399 }
c80e42f3 1400 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d
Z
1401 if (retlen == (STRLEN)-1) {
1402 /* malformed UTF-8 */
1403 ENTER;
1404 SAVESPTR(PL_warnhook);
1405 PL_warnhook = PERL_WARNHOOK_FATAL;
c80e42f3 1406 utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0);
f0e67a1d
Z
1407 LEAVE;
1408 }
1409 return unichar;
1410 } else {
1411 if (s == bufend) {
1412 if (!lex_next_chunk(flags))
1413 return -1;
1414 s = PL_parser->bufptr;
1415 }
1416 return (U8)*s;
1417 }
1418}
1419
1420/*
1421=for apidoc Amx|I32|lex_read_unichar|U32 flags
1422
1423Reads the next (Unicode) character in the text currently being lexed.
1424Returns the codepoint (unsigned integer value) of the character read,
1425and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1426if lexing has reached the end of the input text. To non-destructively
1427examine the next character, use L</lex_peek_unichar> instead.
1428
1429If the next character is in (or extends into) the next chunk of input
1430text, the next chunk will be read in. Normally the current chunk will be
1431discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1432then the current chunk will not be discarded.
1433
1434If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1435is encountered, an exception is generated.
1436
1437=cut
1438*/
1439
1440I32
1441Perl_lex_read_unichar(pTHX_ U32 flags)
1442{
1443 I32 c;
1444 if (flags & ~(LEX_KEEP_PREVIOUS))
1445 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1446 c = lex_peek_unichar(flags);
1447 if (c != -1) {
1448 if (c == '\n')
83944c01 1449 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1450 if (UTF)
1451 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1452 else
1453 ++(PL_parser->bufptr);
f0e67a1d
Z
1454 }
1455 return c;
1456}
1457
1458/*
1459=for apidoc Amx|void|lex_read_space|U32 flags
1460
1461Reads optional spaces, in Perl style, in the text currently being
1462lexed. The spaces may include ordinary whitespace characters and
1463Perl-style comments. C<#line> directives are processed if encountered.
1464L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1465at a non-space character (or the end of the input text).
1466
1467If spaces extend into the next chunk of input text, the next chunk will
1468be read in. Normally the current chunk will be discarded at the same
1469time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1470chunk will not be discarded.
1471
1472=cut
1473*/
1474
21791330 1475#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1476#define LEX_NO_NEXT_CHUNK 0x80000000
1477
f0e67a1d
Z
1478void
1479Perl_lex_read_space(pTHX_ U32 flags)
1480{
1481 char *s, *bufend;
21791330 1482 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1483 bool need_incline = 0;
21791330 1484 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1485 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1486 s = PL_parser->bufptr;
1487 bufend = PL_parser->bufend;
1488 while (1) {
1489 char c = *s;
1490 if (c == '#') {
1491 do {
1492 c = *++s;
1493 } while (!(c == '\n' || (c == 0 && s == bufend)));
1494 } else if (c == '\n') {
1495 s++;
21791330
FC
1496 if (can_incline) {
1497 PL_parser->linestart = s;
1498 if (s == bufend)
1499 need_incline = 1;
1500 else
1501 incline(s);
1502 }
f0e67a1d
Z
1503 } else if (isSPACE(c)) {
1504 s++;
1505 } else if (c == 0 && s == bufend) {
1506 bool got_more;
65c68e17 1507 line_t l;
f0998909
Z
1508 if (flags & LEX_NO_NEXT_CHUNK)
1509 break;
f0e67a1d 1510 PL_parser->bufptr = s;
65c68e17 1511 l = CopLINE(PL_curcop);
851b527a 1512 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1513 got_more = lex_next_chunk(flags);
65c68e17 1514 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1515 s = PL_parser->bufptr;
1516 bufend = PL_parser->bufend;
1517 if (!got_more)
1518 break;
21791330 1519 if (can_incline && need_incline && PL_parser->rsfp) {
f0e67a1d
Z
1520 incline(s);
1521 need_incline = 0;
1522 }
1523 } else {
1524 break;
1525 }
1526 }
f0e67a1d
Z
1527 PL_parser->bufptr = s;
1528}
1529
1530/*
fe788d6b
PM
1531
1532=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1533
1534This function performs syntax checking on a prototype, C<proto>.
1535If C<warn> is true, any illegal characters or mismatched brackets
1536will trigger illegalproto warnings, declaring that they were
1537detected in the prototype for C<name>.
1538
1539The return value is C<true> if this is a valid prototype, and
1540C<false> if it is not, regardless of whether C<warn> was C<true> or
1541C<false>.
1542
1543Note that C<NULL> is a valid C<proto> and will always return C<true>.
1544
1545=cut
1546
1547 */
1548
1549bool
1550Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1551{
1552 STRLEN len, origlen;
1553 char *p = proto ? SvPV(proto, len) : NULL;
1554 bool bad_proto = FALSE;
1555 bool in_brackets = FALSE;
1556 bool after_slash = FALSE;
1557 char greedy_proto = ' ';
1558 bool proto_after_greedy_proto = FALSE;
1559 bool must_be_last = FALSE;
1560 bool underscore = FALSE;
f791a21a 1561 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1562
1563 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1564
1565 if (!proto)
1566 return TRUE;
1567
1568 origlen = len;
1569 for (; len--; p++) {
1570 if (!isSPACE(*p)) {
1571 if (must_be_last)
1572 proto_after_greedy_proto = TRUE;
f791a21a
PM
1573 if (underscore) {
1574 if (!strchr(";@%", *p))
1575 bad_proto_after_underscore = TRUE;
1576 underscore = FALSE;
1577 }
fe788d6b
PM
1578 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1579 bad_proto = TRUE;
1580 }
1581 else {
fe788d6b
PM
1582 if (*p == '[')
1583 in_brackets = TRUE;
1584 else if (*p == ']')
1585 in_brackets = FALSE;
1586 else if ((*p == '@' || *p == '%') &&
1587 !after_slash &&
1588 !in_brackets ) {
1589 must_be_last = TRUE;
1590 greedy_proto = *p;
1591 }
1592 else if (*p == '_')
f791a21a 1593 underscore = TRUE;
fe788d6b
PM
1594 }
1595 if (*p == '\\')
1596 after_slash = TRUE;
1597 else
1598 after_slash = FALSE;
1599 }
1600 }
1601
1602 if (warn) {
b54d603d 1603 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1604 p -= origlen;
b54d603d
PM
1605 p = SvUTF8(proto)
1606 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1607 origlen, UNI_DISPLAY_ISPRINT)
1608 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1609
fe788d6b
PM
1610 if (proto_after_greedy_proto)
1611 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1612 "Prototype after '%c' for %"SVf" : %s",
1613 greedy_proto, SVfARG(name), p);
50278ed0
PM
1614 if (in_brackets)
1615 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1616 "Missing ']' in prototype for %"SVf" : %s",
1617 SVfARG(name), p);
b54d603d 1618 if (bad_proto)
fe788d6b 1619 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
f791a21a
PM
1620 "Illegal character in prototype for %"SVf" : %s",
1621 SVfARG(name), p);
1622 if (bad_proto_after_underscore)
1623 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1624 "Illegal character after '_' in prototype for %"SVf" : %s",
1625 SVfARG(name), p);
fe788d6b
PM
1626 }
1627
1628 return (! (proto_after_greedy_proto || bad_proto) );
1629}
1630
1631/*
ffb4593c
NT
1632 * S_incline
1633 * This subroutine has nothing to do with tilting, whether at windmills
1634 * or pinball tables. Its name is short for "increment line". It
57843af0 1635 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1636 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1637 * # line 500 "foo.pm"
1638 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1639 */
1640
76e3520e 1641STATIC void
d9095cec 1642S_incline(pTHX_ const char *s)
463ee0b2 1643{
d9095cec
NC
1644 const char *t;
1645 const char *n;
1646 const char *e;
8818d409 1647 line_t line_num;
463ee0b2 1648
7918f24d
NC
1649 PERL_ARGS_ASSERT_INCLINE;
1650
83944c01 1651 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1652 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1653 && s+1 == PL_bufend && *s == ';') {
1654 /* fake newline in string eval */
1655 CopLINE_dec(PL_curcop);
1656 return;
1657 }
463ee0b2
LW
1658 if (*s++ != '#')
1659 return;
d4c19fe8
AL
1660 while (SPACE_OR_TAB(*s))
1661 s++;
73659bf1
GS
1662 if (strnEQ(s, "line", 4))
1663 s += 4;
1664 else
1665 return;
084592ab 1666 if (SPACE_OR_TAB(*s))
73659bf1 1667 s++;
4e553d73 1668 else
73659bf1 1669 return;
d4c19fe8
AL
1670 while (SPACE_OR_TAB(*s))
1671 s++;
463ee0b2
LW
1672 if (!isDIGIT(*s))
1673 return;
d4c19fe8 1674
463ee0b2
LW
1675 n = s;
1676 while (isDIGIT(*s))
1677 s++;
07714eb4 1678 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1679 return;
bf4acbe4 1680 while (SPACE_OR_TAB(*s))
463ee0b2 1681 s++;
73659bf1 1682 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1683 s++;
73659bf1
GS
1684 e = t + 1;
1685 }
463ee0b2 1686 else {
c35e046a
AL
1687 t = s;
1688 while (!isSPACE(*t))
1689 t++;
73659bf1 1690 e = t;
463ee0b2 1691 }
bf4acbe4 1692 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1693 e++;
1694 if (*e != '\n' && *e != '\0')
1695 return; /* false alarm */
1696
96e440d2 1697 line_num = grok_atou(n, &e) - 1;
8818d409 1698
f4dd75d9 1699 if (t - s > 0) {
d9095cec 1700 const STRLEN len = t - s;
3df32bda 1701
d36ee5be 1702 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1703 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1704 * to *{"::_<newfilename"} */
44867030
NC
1705 /* However, the long form of evals is only turned on by the
1706 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1707 GV * const cfgv = CopFILEGV(PL_curcop);
1708 if (cfgv) {
38bd7ad8
FC
1709 char smallbuf[128];
1710 STRLEN tmplen2 = len;
44867030 1711 char *tmpbuf2;
449dd039 1712 GV *gv2;
44867030
NC
1713
1714 if (tmplen2 + 2 <= sizeof smallbuf)
1715 tmpbuf2 = smallbuf;
1716 else
1717 Newx(tmpbuf2, tmplen2 + 2, char);
1718
38bd7ad8
FC
1719 tmpbuf2[0] = '_';
1720 tmpbuf2[1] = '<';
44867030
NC
1721
1722 memcpy(tmpbuf2 + 2, s, tmplen2);
1723 tmplen2 += 2;
1724
8a5ee598 1725 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1726 if (!isGV(gv2)) {
8a5ee598 1727 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1728 /* adjust ${"::_<newfilename"} to store the new file name */
1729 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1730 /* The line number may differ. If that is the case,
1731 alias the saved lines that are in the array.
1732 Otherwise alias the whole array. */
1733 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1734 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1735 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1736 }
38bd7ad8
FC
1737 else if (GvAV(cfgv)) {
1738 AV * const av = GvAV(cfgv);
8818d409
FC
1739 const I32 start = CopLINE(PL_curcop)+1;
1740 I32 items = AvFILLp(av) - start;
1741 if (items > 0) {
1742 AV * const av2 = GvAVn(gv2);
1743 SV **svp = AvARRAY(av) + start;
1744 I32 l = (I32)line_num+1;
1745 while (items--)
1746 av_store(av2, l++, SvREFCNT_inc(*svp++));
1747 }
1748 }
e5527e4b 1749 }
44867030
NC
1750
1751 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1752 }
e66cf94c 1753 }
05ec9bb3 1754 CopFILE_free(PL_curcop);
449dd039 1755 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1756 }
8818d409 1757 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1758}
1759
21791330
FC
1760#define skipspace(s) skipspace_flags(s, 0)
1761
29595ff2 1762
80a702cd 1763STATIC void
15f169a1 1764S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1765{
1766 AV *av = CopFILEAVx(PL_curcop);
1767 if (av) {
7f1c3e8c
FC
1768 SV * sv;
1769 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1770 else {
1771 sv = *av_fetch(av, 0, 1);
1772 SvUPGRADE(sv, SVt_PVMG);
1773 }
1774 if (!SvPOK(sv)) sv_setpvs(sv,"");
5fa550fb 1775 if (orig_sv)
7f1c3e8c 1776 sv_catsv(sv, orig_sv);
5fa550fb 1777 else
7f1c3e8c
FC
1778 sv_catpvn(sv, buf, len);
1779 if (!SvIOK(sv)) {
1780 (void)SvIOK_on(sv);
1781 SvIV_set(sv, 0);
1782 }
1783 if (PL_parser->preambling == NOLINE)
1784 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1785 }
1786}
1787
ffb4593c
NT
1788/*
1789 * S_skipspace
1790 * Called to gobble the appropriate amount and type of whitespace.
1791 * Skips comments as well.
1792 */
1793
76e3520e 1794STATIC char *
21791330 1795S_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1796{
21791330 1797 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1798 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1799 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1800 s++;
f0e67a1d
Z
1801 } else {
1802 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1803 PL_bufptr = s;
21791330 1804 lex_read_space(flags | LEX_KEEP_PREVIOUS |
f0998909
Z
1805 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1806 LEX_NO_NEXT_CHUNK : 0));
3280af22 1807 s = PL_bufptr;
f0e67a1d
Z
1808 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1809 if (PL_linestart > PL_bufptr)
1810 PL_bufptr = PL_linestart;
1811 return s;
463ee0b2 1812 }
5db06880 1813 return s;
a687059c 1814}
378cc40b 1815
ffb4593c
NT
1816/*
1817 * S_check_uni
1818 * Check the unary operators to ensure there's no ambiguity in how they're
1819 * used. An ambiguous piece of code would be:
1820 * rand + 5
1821 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1822 * the +5 is its argument.
1823 */
1824
76e3520e 1825STATIC void
cea2e8a9 1826S_check_uni(pTHX)
ba106d47 1827{
d4c19fe8
AL
1828 const char *s;
1829 const char *t;
2f3197b3 1830
3280af22 1831 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1832 return;
3280af22
NIS
1833 while (isSPACE(*PL_last_uni))
1834 PL_last_uni++;
c35e046a 1835 s = PL_last_uni;
8a2bca12 1836 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1837 s++;
3280af22 1838 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1839 return;
6136c704 1840
9b387841
NC
1841 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1842 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1843 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1844}
1845
ffb4593c
NT
1846/*
1847 * LOP : macro to build a list operator. Its behaviour has been replaced
1848 * with a subroutine, S_lop() for which LOP is just another name.
1849 */
1850
a0d0e21e
LW
1851#define LOP(f,x) return lop(f,x,s)
1852
ffb4593c
NT
1853/*
1854 * S_lop
1855 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1856 * - if we have a next token, then it's a list operator (no parens) for
1857 * which the next token has already been parsed; e.g.,
1858 * sort foo @args
1859 * sort foo (@args)
ffb4593c
NT
1860 * - if the next thing is an opening paren, then it's a function
1861 * - else it's a list operator
1862 */
1863
76e3520e 1864STATIC I32
a0be28da 1865S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1866{
7918f24d
NC
1867 PERL_ARGS_ASSERT_LOP;
1868
6154021b 1869 pl_yylval.ival = f;
35c8bce7 1870 CLINE;
3280af22
NIS
1871 PL_bufptr = s;
1872 PL_last_lop = PL_oldbufptr;
eb160463 1873 PL_last_lop_op = (OPCODE)f;
3280af22 1874 if (PL_nexttoke)
78cdf107 1875 goto lstop;
19f1898a 1876 PL_expect = x;
79072805 1877 if (*s == '(')
bbf60fe6 1878 return REPORT(FUNC);
29595ff2 1879 s = PEEKSPACE(s);
79072805 1880 if (*s == '(')
bbf60fe6 1881 return REPORT(FUNC);
78cdf107
Z
1882 else {
1883 lstop:
1884 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1885 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1886 return REPORT(LSTOP);
78cdf107 1887 }
79072805
LW
1888}
1889
ffb4593c
NT
1890/*
1891 * S_force_next
9cbb5ea2 1892 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1893 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1894 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
1895 * will need to set PL_nextval[] and possibly PL_expect to ensure
1896 * the lexer handles the token correctly.
ffb4593c
NT
1897 */
1898
4e553d73 1899STATIC void
cea2e8a9 1900S_force_next(pTHX_ I32 type)
79072805 1901{
704d4215
GG
1902#ifdef DEBUGGING
1903 if (DEBUG_T_TEST) {
1904 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1905 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1906 }
1907#endif
3280af22
NIS
1908 PL_nexttype[PL_nexttoke] = type;
1909 PL_nexttoke++;
1910 if (PL_lex_state != LEX_KNOWNEXT) {
1911 PL_lex_defer = PL_lex_state;
1912 PL_lex_expect = PL_expect;
1913 PL_lex_state = LEX_KNOWNEXT;
79072805
LW
1914 }
1915}
1916
89f35911
FC
1917/*
1918 * S_postderef
1919 *
1920 * This subroutine handles postfix deref syntax after the arrow has already
1921 * been emitted. @* $* etc. are emitted as two separate token right here.
1922 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
1923 * only the first, leaving yylex to find the next.
89f35911
FC
1924 */
1925
1926static int
ff25e5db 1927S_postderef(pTHX_ int const funny, char const next)
89f35911 1928{
ff25e5db 1929 assert(funny == DOLSHARP || strchr("$@%&*", funny));
89f35911
FC
1930 assert(strchr("*[{", next));
1931 if (next == '*') {
1932 PL_expect = XOPERATOR;
cc624add 1933 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
ff25e5db 1934 assert('@' == funny || '$' == funny || DOLSHARP == funny);
cc624add 1935 PL_lex_state = LEX_INTERPEND;
cc624add
FC
1936 force_next(POSTJOIN);
1937 }
89f35911
FC
1938 force_next(next);
1939 PL_bufptr+=2;
1940 }
1941 else {
760ca746
FC
1942 if ('@' == funny && PL_lex_state == LEX_INTERPNORMAL
1943 && !PL_lex_brackets)
1944 PL_lex_dojoin = 2;
89f35911
FC
1945 PL_expect = XOPERATOR;
1946 PL_bufptr++;
1947 }
1948 return funny;
1949}
1950
28ac2b49
Z
1951void
1952Perl_yyunlex(pTHX)
1953{
a7aaec61
Z
1954 int yyc = PL_parser->yychar;
1955 if (yyc != YYEMPTY) {
1956 if (yyc) {
a7aaec61
Z
1957 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1958 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1959 PL_lex_allbrackets--;
a7aaec61 1960 PL_lex_brackets--;
78cdf107
Z
1961 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1962 } else if (yyc == '('/*)*/) {
1963 PL_lex_allbrackets--;
1964 yyc |= (2<<24);
a7aaec61
Z
1965 }
1966 force_next(yyc);
1967 }
28ac2b49
Z
1968 PL_parser->yychar = YYEMPTY;
1969 }
1970}
1971
d0a148a6 1972STATIC SV *
15f169a1 1973S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1974{
740cce10 1975 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1976 !IN_BYTES
1977 && UTF
1978 && !is_ascii_string((const U8*)start, len)
740cce10 1979 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1980 return sv;
1981}
1982
ffb4593c
NT
1983/*
1984 * S_force_word
1985 * When the lexer knows the next thing is a word (for instance, it has
1986 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1987 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1988 * lookahead.
ffb4593c
NT
1989 *
1990 * Arguments:
b1b65b59 1991 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1992 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1993 * int check_keyword : if true, Perl checks to make sure the word isn't
1994 * a keyword (do this if the word is a label, e.g. goto FOO)
1995 * int allow_pack : if true, : characters will also be allowed (require,
1996 * use, etc. do this)
9cbb5ea2 1997 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1998 */
1999
76e3520e 2000STATIC char *
345b3785 2001S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2002{
eb578fdb 2003 char *s;
463ee0b2 2004 STRLEN len;
4e553d73 2005
7918f24d
NC
2006 PERL_ARGS_ASSERT_FORCE_WORD;
2007
29595ff2 2008 start = SKIPSPACE1(start);
463ee0b2 2009 s = start;
7e2040f0 2010 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2011 (allow_pack && *s == ':') )
a0d0e21e 2012 {
3280af22 2013 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2014 if (check_keyword) {
2015 char *s2 = PL_tokenbuf;
2016 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2017 s2 += 6, len -= 6;
2018 if (keyword(s2, len, 0))
463ee0b2 2019 return start;
01b5ef50 2020 }
463ee0b2 2021 if (token == METHOD) {
29595ff2 2022 s = SKIPSPACE1(s);
463ee0b2 2023 if (*s == '(')
3280af22 2024 PL_expect = XTERM;
463ee0b2 2025 else {
3280af22 2026 PL_expect = XOPERATOR;
463ee0b2 2027 }
79072805 2028 }
9ded7720 2029 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2030 = (OP*)newSVOP(OP_CONST,0,
2031 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2032 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2033 force_next(token);
2034 }
2035 return s;
2036}
2037
ffb4593c
NT
2038/*
2039 * S_force_ident
9cbb5ea2 2040 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2041 * text only contains the "foo" portion. The first argument is a pointer
2042 * to the "foo", and the second argument is the type symbol to prefix.
2043 * Forces the next token to be a "WORD".
9cbb5ea2 2044 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2045 */
2046
76e3520e 2047STATIC void
5aaab254 2048S_force_ident(pTHX_ const char *s, int kind)
79072805 2049{
7918f24d
NC
2050 PERL_ARGS_ASSERT_FORCE_IDENT;
2051
c9b48522
DD
2052 if (s[0]) {
2053 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2054 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2055 UTF ? SVf_UTF8 : 0));
9ded7720 2056 NEXTVAL_NEXTTOKE.opval = o;
79072805 2057 force_next(WORD);
748a9306 2058 if (kind) {
11343788 2059 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2060 /* XXX see note in pp_entereval() for why we forgo typo
2061 warnings if the symbol must be introduced in an eval.
2062 GSAR 96-10-12 */
90e5519e 2063 gv_fetchpvn_flags(s, len,
728847b1
BF
2064 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2065 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2066 kind == '$' ? SVt_PV :
2067 kind == '@' ? SVt_PVAV :
2068 kind == '%' ? SVt_PVHV :
a0d0e21e 2069 SVt_PVGV
90e5519e 2070 );
748a9306 2071 }
79072805
LW
2072 }
2073}
2074
3f33d153
FC
2075static void
2076S_force_ident_maybe_lex(pTHX_ char pit)
2077{
3f33d153
FC
2078 NEXTVAL_NEXTTOKE.ival = pit;
2079 force_next('p');
2080}
2081
1571675a
GS
2082NV
2083Perl_str_to_version(pTHX_ SV *sv)
2084{
2085 NV retval = 0.0;
2086 NV nshift = 1.0;
2087 STRLEN len;
cfd0369c 2088 const char *start = SvPV_const(sv,len);
9d4ba2ae 2089 const char * const end = start + len;
504618e9 2090 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2091
2092 PERL_ARGS_ASSERT_STR_TO_VERSION;
2093
1571675a 2094 while (start < end) {
ba210ebe 2095 STRLEN skip;
1571675a
GS
2096 UV n;
2097 if (utf)
9041c2e3 2098 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2099 else {
2100 n = *(U8*)start;
2101 skip = 1;
2102 }
2103 retval += ((NV)n)/nshift;
2104 start += skip;
2105 nshift *= 1000;
2106 }
2107 return retval;
2108}
2109
4e553d73 2110/*
ffb4593c
NT
2111 * S_force_version
2112 * Forces the next token to be a version number.
e759cc13
RGS
2113 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2114 * and if "guessing" is TRUE, then no new token is created (and the caller
2115 * must use an alternative parsing method).
ffb4593c
NT
2116 */
2117
76e3520e 2118STATIC char *
e759cc13 2119S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2120{
5f66b61c 2121 OP *version = NULL;
44dcb63b 2122 char *d;
89bfa8cd 2123
7918f24d
NC
2124 PERL_ARGS_ASSERT_FORCE_VERSION;
2125
29595ff2 2126 s = SKIPSPACE1(s);
89bfa8cd 2127
44dcb63b 2128 d = s;
dd629d5b 2129 if (*d == 'v')
44dcb63b 2130 d++;
44dcb63b 2131 if (isDIGIT(*d)) {
e759cc13
RGS
2132 while (isDIGIT(*d) || *d == '_' || *d == '.')
2133 d++;
4e4da3ac 2134 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2135 SV *ver;
6154021b
RGS
2136 s = scan_num(s, &pl_yylval);
2137 version = pl_yylval.opval;
dd629d5b
GS
2138 ver = cSVOPx(version)->op_sv;
2139 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2140 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2141 SvNV_set(ver, str_to_version(ver));
1571675a 2142 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2143 }
89bfa8cd 2144 }
5db06880 2145 else if (guessing) {
e759cc13 2146 return s;
5db06880 2147 }
89bfa8cd
PP
2148 }
2149
2150 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2151 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2152 force_next(WORD);
89bfa8cd 2153
e759cc13 2154 return s;
89bfa8cd
PP
2155}
2156
ffb4593c 2157/*
91152fc1
DG
2158 * S_force_strict_version
2159 * Forces the next token to be a version number using strict syntax rules.
2160 */
2161
2162STATIC char *
2163S_force_strict_version(pTHX_ char *s)
2164{
91152fc1 2165 OP *version = NULL;
91152fc1
DG
2166 const char *errstr = NULL;
2167
2168 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2169
2170 while (isSPACE(*s)) /* leading whitespace */
2171 s++;
2172
2173 if (is_STRICT_VERSION(s,&errstr)) {
2174 SV *ver = newSV(0);
2175 s = (char *)scan_version(s, ver, 0);
2176 version = newSVOP(OP_CONST, 0, ver);
2177 }
4e4da3ac
Z
2178 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2179 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2180 {
91152fc1
DG
2181 PL_bufptr = s;
2182 if (errstr)
2183 yyerror(errstr); /* version required */
2184 return s;
2185 }
2186
91152fc1 2187 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1
DG
2188 NEXTVAL_NEXTTOKE.opval = version;
2189 force_next(WORD);
2190
2191 return s;
2192}
2193
2194/*
ffb4593c
NT
2195 * S_tokeq
2196 * Tokenize a quoted string passed in as an SV. It finds the next
2197 * chunk, up to end of string or a backslash. It may make a new
2198 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2199 * turns \\ into \.
2200 */
2201
76e3520e 2202STATIC SV *
cea2e8a9 2203S_tokeq(pTHX_ SV *sv)
79072805 2204{
eb578fdb
KW
2205 char *s;
2206 char *send;
2207 char *d;
b3ac6de7 2208 SV *pv = sv;
79072805 2209
7918f24d
NC
2210 PERL_ARGS_ASSERT_TOKEQ;
2211
279b35ad
FC
2212 assert (SvPOK(sv));
2213 assert (SvLEN(sv));
2214 assert (!SvIsCOW(sv));
307ed071 2215 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2216 goto finish;
279b35ad
FC
2217 s = SvPVX(sv);
2218 send = SvEND(sv);
dcb21ed6
NC
2219 /* This is relying on the SV being "well formed" with a trailing '\0' */
2220 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2221 s++;
2222 if (s == send)
b3ac6de7 2223 goto finish;
79072805 2224 d = s;
be4731d2 2225 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2226 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2227 SVs_TEMP | SvUTF8(sv));
be4731d2 2228 }
79072805
LW
2229 while (s < send) {
2230 if (*s == '\\') {
a0d0e21e 2231 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2232 s++; /* all that, just for this */
2233 }
2234 *d++ = *s++;
2235 }
2236 *d = '\0';
95a20fc0 2237 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2238 finish:
3280af22 2239 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2240 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2241 return sv;
2242}
2243
ffb4593c
NT
2244/*
2245 * Now come three functions related to double-quote context,
2246 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2247 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2248 * interact with PL_lex_state, and create fake ( ... ) argument lists
2249 * to handle functions and concatenation.
ecd24171
DM
2250 * For example,
2251 * "foo\lbar"
2252 * is tokenised as
2253 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2254 */
2255
2256/*
2257 * S_sublex_start
6154021b 2258 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2259 *
2260 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2261 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2262 *
2263 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2264 *
2265 * Everything else becomes a FUNC.
2266 *
2267 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2268 * had an OP_CONST or OP_READLINE). This just sets us up for a
2269 * call to S_sublex_push().
2270 */
2271
76e3520e 2272STATIC I32
cea2e8a9 2273S_sublex_start(pTHX)
79072805 2274{
eb578fdb 2275 const I32 op_type = pl_yylval.ival;
79072805
LW
2276
2277 if (op_type == OP_NULL) {
6154021b 2278 pl_yylval.opval = PL_lex_op;
5f66b61c 2279 PL_lex_op = NULL;
79072805
LW
2280 return THING;
2281 }
466112bb 2282 if (op_type == OP_CONST) {
3280af22 2283 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2284
2285 if (SvTYPE(sv) == SVt_PVIV) {
2286 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2287 STRLEN len;
96a5add6 2288 const char * const p = SvPV_const(sv, len);
740cce10 2289 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2290 SvREFCNT_dec(sv);
2291 sv = nsv;
4e553d73 2292 }
6154021b 2293 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2294 PL_lex_stuff = NULL;
79072805
LW
2295 return THING;
2296 }
2297
3280af22 2298 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2299 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2300 PL_sublex_info.sub_op = PL_lex_op;
2301 PL_lex_state = LEX_INTERPPUSH;
55497cff 2302
3280af22
NIS
2303 PL_expect = XTERM;
2304 if (PL_lex_op) {
6154021b 2305 pl_yylval.opval = PL_lex_op;
5f66b61c 2306 PL_lex_op = NULL;
55497cff
PP
2307 return PMFUNC;
2308 }
2309 else
2310 return FUNC;
2311}
2312
ffb4593c
NT
2313/*
2314 * S_sublex_push
2315 * Create a new scope to save the lexing state. The scope will be
2316 * ended in S_sublex_done. Returns a '(', starting the function arguments
2317 * to the uc, lc, etc. found before.
2318 * Sets PL_lex_state to LEX_INTERPCONCAT.
2319 */
2320
76e3520e 2321STATIC I32
cea2e8a9 2322S_sublex_push(pTHX)
55497cff 2323{
78a635de 2324 LEXSHARED *shared;
801d32ac 2325 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2326 ENTER;
55497cff 2327
3280af22 2328 PL_lex_state = PL_sublex_info.super_state;
cc624add 2329 SAVEI8(PL_lex_dojoin);
3280af22 2330 SAVEI32(PL_lex_brackets);
78cdf107 2331 SAVEI32(PL_lex_allbrackets);
b27dce25 2332 SAVEI32(PL_lex_formbrack);
78cdf107 2333 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2334 SAVEI32(PL_lex_casemods);
2335 SAVEI32(PL_lex_starts);
651b5b28 2336 SAVEI8(PL_lex_state);
7cc34111 2337 SAVESPTR(PL_lex_repl);
7766f137 2338 SAVEVPTR(PL_lex_inpat);
98246f1e 2339 SAVEI16(PL_lex_inwhat);
ffdb8b16 2340 if (is_heredoc)
b42366d4 2341 {
ffdb8b16 2342 SAVECOPLINE(PL_curcop);
b42366d4 2343 SAVEI32(PL_multi_end);
851b527a
FC
2344 SAVEI32(PL_parser->herelines);
2345 PL_parser->herelines = 0;
b42366d4
FC
2346 }
2347 SAVEI8(PL_multi_close);
3280af22 2348 SAVEPPTR(PL_bufptr);
8452ff4b 2349 SAVEPPTR(PL_bufend);
3280af22
NIS
2350 SAVEPPTR(PL_oldbufptr);
2351 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2352 SAVEPPTR(PL_last_lop);
2353 SAVEPPTR(PL_last_uni);
3280af22
NIS
2354 SAVEPPTR(PL_linestart);
2355 SAVESPTR(PL_linestr);
8edd5f42
RGS
2356 SAVEGENERICPV(PL_lex_brackstack);
2357 SAVEGENERICPV(PL_lex_casestack);
78a635de 2358 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2359 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2360 SAVEI32(PL_copline);
3280af22 2361
99bd9d90 2362 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2363 scopes to find the body of the here-doc. So we put PL_linestr and
2364 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2365 */
60f40a38
FC
2366 PL_parser->lex_shared->ls_linestr = PL_linestr;
2367 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2368
3280af22 2369 PL_linestr = PL_lex_stuff;
7cc34111 2370 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2371 PL_lex_stuff = NULL;
7cc34111 2372 PL_sublex_info.repl = NULL;
3280af22 2373
9cbb5ea2
GS
2374 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2375 = SvPVX(PL_linestr);
3280af22 2376 PL_bufend += SvCUR(PL_linestr);
bd61b366 2377 PL_last_lop = PL_last_uni = NULL;
3280af22 2378 SAVEFREESV(PL_linestr);
4dc843bc 2379 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2380
2381 PL_lex_dojoin = FALSE;
b27dce25 2382 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2383 PL_lex_allbrackets = 0;
2384 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2385 Newx(PL_lex_brackstack, 120, char);
2386 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2387 PL_lex_casemods = 0;
2388 *PL_lex_casestack = '\0';
2389 PL_lex_starts = 0;
2390 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2391 if (is_heredoc)
6ddcf93b 2392 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2393 PL_copline = NOLINE;
78a635de
FC
2394
2395 Newxz(shared, 1, LEXSHARED);
2396 shared->ls_prev = PL_parser->lex_shared;
2397 PL_parser->lex_shared = shared;
3280af22
NIS
2398
2399 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2400 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2401 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2402 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2403 else
5f66b61c 2404 PL_lex_inpat = NULL;
79072805 2405
3a54fd60
DM
2406 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2407 PL_in_eval &= ~EVAL_RE_REPARSING;
2408
55497cff 2409 return '(';
79072805
LW
2410}
2411
ffb4593c
NT
2412/*
2413 * S_sublex_done
2414 * Restores lexer state after a S_sublex_push.
2415 */
2416
76e3520e 2417STATIC I32
cea2e8a9 2418S_sublex_done(pTHX)
79072805 2419{
3280af22 2420 if (!PL_lex_starts++) {
396482e1 2421 SV * const sv = newSVpvs("");
9aa983d2
JH
2422 if (SvUTF8(PL_linestr))
2423 SvUTF8_on(sv);
3280af22 2424 PL_expect = XOPERATOR;
6154021b 2425 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2426 return THING;
2427 }
2428
3280af22
NIS
2429 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2430 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2431 return yylex();
79072805
LW
2432 }
2433
ffb4593c 2434 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2435 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2436 if (PL_lex_repl) {
2437 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2438 PL_linestr = PL_lex_repl;
2439 PL_lex_inpat = 0;
2440 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2441 PL_bufend += SvCUR(PL_linestr);
bd61b366 2442 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2443 PL_lex_dojoin = FALSE;
2444 PL_lex_brackets = 0;
78cdf107
Z
2445 PL_lex_allbrackets = 0;
2446 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2447 PL_lex_casemods = 0;
2448 *PL_lex_casestack = '\0';
2449 PL_lex_starts = 0;
25da4f38 2450 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2451 PL_lex_state = LEX_INTERPNORMAL;
2452 PL_lex_starts++;
e9fa98b2
HS
2453 /* we don't clear PL_lex_repl here, so that we can check later
2454 whether this is an evalled subst; that means we rely on the
2455 logic to ensure sublex_done() is called again only via the
2456 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2457 }
e9fa98b2 2458 else {
3280af22 2459 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2460 PL_lex_repl = NULL;
e9fa98b2 2461 }
ffdb8b16
FC
2462 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2463 CopLINE(PL_curcop) +=
2464 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xpad_cop_seq.xlow
851b527a
FC
2465 + PL_parser->herelines;
2466 PL_parser->herelines = 0;
ffdb8b16 2467 }
79072805 2468 return ',';
ffed7fef
LW
2469 }
2470 else {
b42366d4 2471 const line_t l = CopLINE(PL_curcop);
f46d017c 2472 LEAVE;
b42366d4 2473 if (PL_multi_close == '<')
851b527a 2474 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2475 PL_bufend = SvPVX(PL_linestr);
2476 PL_bufend += SvCUR(PL_linestr);
2477 PL_expect = XOPERATOR;
09bef843 2478 PL_sublex_info.sub_inwhat = 0;
79072805 2479 return ')';
ffed7fef
LW
2480 }
2481}
2482
6f613c73
KW
2483PERL_STATIC_INLINE SV*
2484S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2485{
140b12ad
KW
2486 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2487 * interior, hence to the "}". Finds what the name resolves to, returning
2488 * an SV* containing it; NULL if no valid one found */
2489
dd2b1b72 2490 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2491
0c415a79
KW
2492 HV * table;
2493 SV **cvp;
2494 SV *cv;
2495 SV *rv;
2496 HV *stash;
2497 const U8* first_bad_char_loc;
2498 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2499
6f613c73
KW
2500 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2501
107160e2
KW
2502 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2503 e - backslash_ptr,
2504 &first_bad_char_loc))
2505 {
2506 /* If warnings are on, this will print a more detailed analysis of what
2507 * is wrong than the error message below */
c80e42f3 2508 utf8n_to_uvchr(first_bad_char_loc,
107160e2
KW
2509 e - ((char *) first_bad_char_loc),
2510 NULL, 0);
2511
2512 /* We deliberately don't try to print the malformed character, which
2513 * might not print very well; it also may be just the first of many
2514 * malformations, so don't print what comes after it */
2515 yyerror(Perl_form(aTHX_
2516 "Malformed UTF-8 character immediately after '%.*s'",
2517 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2518 return NULL;
2519 }
2520
2521 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2522 /* include the <}> */
2523 e - backslash_ptr + 1);
6f613c73 2524 if (! SvPOK(res)) {
b6407c49 2525 SvREFCNT_dec_NN(res);
6f613c73
KW
2526 return NULL;
2527 }
2528
0c415a79
KW
2529 /* See if the charnames handler is the Perl core's, and if so, we can skip
2530 * the validation needed for a user-supplied one, as Perl's does its own
2531 * validation. */
2532 table = GvHV(PL_hintgv); /* ^H */
2533 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2534 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2535 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2536 {
2537 const char * const name = HvNAME(stash);
6a642c21
FC
2538 if (HvNAMELEN(stash) == sizeof("_charnames")-1
2539 && strEQ(name, "_charnames")) {
0c415a79
KW
2540 return res;
2541 }
2542 }
2543
bde9e88d
KW
2544 /* Here, it isn't Perl's charname handler. We can't rely on a
2545 * user-supplied handler to validate the input name. For non-ut8 input,
2546 * look to see that the first character is legal. Then loop through the
2547 * rest checking that each is a continuation */
6f613c73 2548
36897d64
KW
2549 /* This code makes the reasonable assumption that the only Latin1-range
2550 * characters that begin a character name alias are alphabetic, otherwise
2551 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2552
b6ba1137 2553 if (! UTF) {
bde9e88d 2554 if (! isALPHAU(*s)) {
b6ba1137
KW
2555 goto bad_charname;
2556 }
bde9e88d
KW
2557 s++;
2558 while (s < e) {
2559 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2560 goto bad_charname;
2561 }
2d8eb851
KW
2562 if (*s == ' ' && *(s-1) == ' ') {
2563 goto multi_spaces;
bd299e29 2564 }
df758df2
KW
2565 if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
2566 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2567 "NO-BREAK SPACE in a charnames "
2568 "alias definition is deprecated");
2569 }
bde9e88d 2570 s++;
b6ba1137
KW
2571 }
2572 }
2573 else {
bde9e88d
KW
2574 /* Similarly for utf8. For invariants can check directly; for other
2575 * Latin1, can calculate their code point and check; otherwise use a
2576 * swash */
2577 if (UTF8_IS_INVARIANT(*s)) {
2578 if (! isALPHAU(*s)) {
140b12ad
KW
2579 goto bad_charname;
2580 }
bde9e88d
KW
2581 s++;
2582 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2583 if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2584 goto bad_charname;
6f613c73 2585 }
bde9e88d 2586 s += 2;
6f613c73 2587 }
bde9e88d
KW
2588 else {
2589 if (! PL_utf8_charname_begin) {
2590 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2591 PL_utf8_charname_begin = _core_swash_init("utf8",
2592 "_Perl_Charname_Begin",
2593 &PL_sv_undef,
2594 1, 0, NULL, &flags);
2595 }
2596 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2597 goto bad_charname;
2598 }
2599 s += UTF8SKIP(s);
2600 }
2601
2602 while (s < e) {
2603 if (UTF8_IS_INVARIANT(*s)) {
2604 if (! isCHARNAME_CONT(*s)) {
2605 goto bad_charname;
2606 }
2d8eb851
KW
2607 if (*s == ' ' && *(s-1) == ' ') {
2608 goto multi_spaces;
bd299e29 2609 }
bde9e88d
KW
2610 s++;
2611 }
2612 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
94bb8c36 2613 if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2614 {
2615 goto bad_charname;
2616 }
df758df2
KW
2617 if (*s == *NBSP_UTF8
2618 && *(s+1) == *(NBSP_UTF8+1)
2619 && ckWARN_d(WARN_DEPRECATED))
2620 {
2621 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2622 "NO-BREAK SPACE in a charnames "
2623 "alias definition is deprecated");
2624 }
bde9e88d
KW
2625 s += 2;
2626 }
2627 else {
2628 if (! PL_utf8_charname_continue) {
2629 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2630 PL_utf8_charname_continue = _core_swash_init("utf8",
2631 "_Perl_Charname_Continue",
2632 &PL_sv_undef,
2633 1, 0, NULL, &flags);
2634 }
2635 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2636 goto bad_charname;
2637 }
2638 s += UTF8SKIP(s);
6f613c73
KW
2639 }
2640 }
2d8eb851
KW
2641 }
2642 if (*(s-1) == ' ') {
2643 yyerror_pv(
2644 Perl_form(aTHX_
2645 "charnames alias definitions may not contain trailing "
2646 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
2647 (int)(s - backslash_ptr + 1), backslash_ptr,
2648 (int)(e - s + 1), s + 1
2649 ),
2650 UTF ? SVf_UTF8 : 0);
2651 return NULL;
6f613c73
KW
2652 }
2653
94ca1619 2654 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2655 const U8* first_bad_char_loc;
2656 STRLEN len;
2657 const char* const str = SvPV_const(res, len);
2658 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2659 /* If warnings are on, this will print a more detailed analysis of
2660 * what is wrong than the error message below */
c80e42f3 2661 utf8n_to_uvchr(first_bad_char_loc,
bde9e88d
KW
2662 (char *) first_bad_char_loc - str,
2663 NULL, 0);
2664
2665 /* We deliberately don't try to print the malformed character,
2666 * which might not print very well; it also may be just the first
2667 * of many malformations, so don't print what comes after it */
2668 yyerror_pv(
2669 Perl_form(aTHX_
2670 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2671 (int) (e - backslash_ptr + 1), backslash_ptr,
2672 (int) ((char *) first_bad_char_loc - str), str
2673 ),
2674 SVf_UTF8);
2675 return NULL;
2676 }
2677 }
140b12ad 2678
bde9e88d 2679 return res;
140b12ad 2680
bde9e88d 2681 bad_charname: {
bde9e88d
KW
2682
2683 /* The final %.*s makes sure that should the trailing NUL be missing
2684 * that this print won't run off the end of the string */
2685 yyerror_pv(
2686 Perl_form(aTHX_
2687 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2d8eb851
KW
2688 (int)(s - backslash_ptr + 1), backslash_ptr,
2689 (int)(e - s + 1), s + 1
bde9e88d
KW
2690 ),
2691 UTF ? SVf_UTF8 : 0);
2692 return NULL;
2693 }
2d8eb851
KW
2694
2695 multi_spaces:
2696 yyerror_pv(
2697 Perl_form(aTHX_
2698 "charnames alias definitions may not contain a sequence of "
2699 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
2700 (int)(s - backslash_ptr + 1), backslash_ptr,
2701 (int)(e - s + 1), s + 1
2702 ),
2703 UTF ? SVf_UTF8 : 0);
2704 return NULL;
6f613c73
KW
2705}
2706
02aa26ce
NT
2707/*
2708 scan_const
2709
9da1dd8f
DM
2710 Extracts the next constant part of a pattern, double-quoted string,
2711 or transliteration. This is terrifying code.
2712
2713 For example, in parsing the double-quoted string "ab\x63$d", it would
2714 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2715
94def140 2716 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2717 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2718 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2719
94def140
ST
2720 Returns a pointer to the character scanned up to. If this is
2721 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2722 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2723 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2724 by looking at the next characters herself.
2725
02aa26ce 2726 In patterns:
9da1dd8f 2727 expand:
537124e4
KW
2728 \N{FOO} => \N{U+hex_for_character_FOO}
2729 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2730
2731 pass through:
2732 all other \-char, including \N and \N{ apart from \N{ABC}
2733
2734 stops on:
2735 @ and $ where it appears to be a var, but not for $ as tail anchor
2736 \l \L \u \U \Q \E
2737 (?{ or (??{
2738
02aa26ce
NT
2739
2740 In transliterations:
2741 characters are VERY literal, except for - not at the start or end
94def140
ST
2742 of the string, which indicates a range. If the range is in bytes,
2743 scan_const expands the range to the full set of intermediate
2744 characters. If the range is in utf8, the hyphen is replaced with
2745 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2746
2747 In double-quoted strings:
2748 backslashes:
2749 double-quoted style: \r and \n
ff3f963a 2750 constants: \x31, etc.
94def140 2751 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2752 case and quoting: \U \Q \E
2753 stops on @ and $
2754
2755 scan_const does *not* construct ops to handle interpolated strings.
2756 It stops processing as soon as it finds an embedded $ or @ variable
2757 and leaves it to the caller to work out what's going on.
2758
94def140
ST
2759 embedded arrays (whether in pattern or not) could be:
2760 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2761
2762 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2763
2764 $ in pattern could be $foo or could be tail anchor. Assumption:
2765 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2766 followed by one of "()| \r\n\t"
02aa26ce 2767
9da1dd8f 2768 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2769
2770 The structure of the code is
2771 while (there's a character to process) {
94def140
ST
2772 handle transliteration ranges
2773 skip regexp comments /(?#comment)/ and codes /(?{code})/
2774 skip #-initiated comments in //x patterns
2775 check for embedded arrays
02aa26ce
NT
2776 check for embedded scalars
2777 if (backslash) {
94def140 2778 deprecate \1 in substitution replacements
02aa26ce
NT
2779 handle string-changing backslashes \l \U \Q \E, etc.
2780 switch (what was escaped) {
94def140 2781 handle \- in a transliteration (becomes a literal -)
ff3f963a 2782 if a pattern and not \N{, go treat as regular character
94def140
ST
2783 handle \132 (octal characters)
2784 handle \x15 and \x{1234} (hex characters)
ff3f963a 2785 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2786 handle \cV (control characters)
2787 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2788 } (end switch)
77a135fe 2789 continue
02aa26ce 2790 } (end if backslash)
77a135fe 2791 handle regular character
02aa26ce 2792 } (end while character to read)
4e553d73 2793
02aa26ce
NT
2794*/
2795
76e3520e 2796STATIC char *
cea2e8a9 2797S_scan_const(pTHX_ char *start)
79072805 2798{
eb578fdb 2799 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
2800 SV *sv = newSV(send - start); /* sv for the constant. See note below
2801 on sizing. */
eb578fdb
KW
2802 char *s = start; /* start of the constant */
2803 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
2804 bool dorange = FALSE; /* are we in a translit range? */
2805 bool didrange = FALSE; /* did we just finish a range? */
2806 bool in_charclass = FALSE; /* within /[...]/ */
2807 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2808 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
2809 UTF8? But, this can show as true
2810 when the source isn't utf8, as for
2811 example when it is entirely composed
2812 of hex constants */
6f613c73 2813 SV *res; /* result from charnames */
77a135fe
KW
2814
2815 /* Note on sizing: The scanned constant is placed into sv, which is
2816 * initialized by newSV() assuming one byte of output for every byte of
2817 * input. This routine expects newSV() to allocate an extra byte for a
2818 * trailing NUL, which this routine will append if it gets to the end of
2819 * the input. There may be more bytes of input than output (eg., \N{LATIN
2820 * CAPITAL LETTER A}), or more output than input if the constant ends up
2821 * recoded to utf8, but each time a construct is found that might increase
2822 * the needed size, SvGROW() is called. Its size parameter each time is
2823 * based on the best guess estimate at the time, namely the length used so
2824 * far, plus the length the current construct will occupy, plus room for
2825 * the trailing NUL, plus one byte for every input byte still unscanned */
2826
c3320c2a
KW
2827 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
2828 before set */
4c3a8340
ST
2829#ifdef EBCDIC
2830 UV literal_endpoint = 0;
e294cc5d 2831 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2832#endif
012bcf8d 2833
7918f24d
NC
2834 PERL_ARGS_ASSERT_SCAN_CONST;
2835
bb16bae8 2836 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2837 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2838 /* If we are doing a trans and we know we want UTF8 set expectation */
2839 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2840 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2841 }
2842
b899e89d
FC
2843 /* Protect sv from errors and fatal warnings. */
2844 ENTER_with_name("scan_const");
2845 SAVEFREESV(sv);
2b9d42f0 2846
79072805 2847 while (s < send || dorange) {
ff3f963a 2848
02aa26ce 2849 /* get transliterations out of the way (they're most literal) */
3280af22 2850 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2851 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2852 if (dorange) {
1ba5c669
JH
2853 I32 i; /* current expanded character */
2854 I32 min; /* first character in range */
2855 I32 max; /* last character in range */
02aa26ce 2856
e294cc5d
JH
2857#ifdef EBCDIC
2858 UV uvmax = 0;
2859#endif
2860
2861 if (has_utf8
2862#ifdef EBCDIC
2863 && !native_range
2864#endif
1953db30 2865 ) {
9d4ba2ae 2866 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2867 char *e = d++;
2868 while (e-- > c)
2869 *(e + 1) = *e;
e7214ce8 2870 *c = (char) ILLEGAL_UTF8_BYTE;
8973db79
JH
2871 /* mark the range as done, and continue */
2872 dorange = FALSE;
2873 didrange = TRUE;
2874 continue;
2875 }
2b9d42f0 2876
95a20fc0 2877 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2878#ifdef EBCDIC
2879 SvGROW(sv,
dc023dbb
KW
2880 SvLEN(sv) + ((has_utf8)
2881 ? (512 - UTF_CONTINUATION_MARK
2882 + UNISKIP(0x100))
e294cc5d
JH
2883 : 256));
2884 /* How many two-byte within 0..255: 128 in UTF-8,
2885 * 96 in UTF-8-mod. */
2886#else
9cbb5ea2 2887 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2888#endif
9cbb5ea2 2889 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2890#ifdef EBCDIC
2891 if (has_utf8) {
2892 int j;
2893 for (j = 0; j <= 1; j++) {
2894 char * const c = (char*)utf8_hop((U8*)d, -1);
2895 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2896 if (j)
2897 min = (U8)uv;
2898 else if (uv < 256)
2899 max = (U8)uv;
2900 else {
2901 max = (U8)0xff; /* only to \xff */
2902 uvmax = uv; /* \x{100} to uvmax */
2903 }
2904 d = c; /* eat endpoint chars */
2905 }
2906 }
2907 else {
2908#endif
2909 d -= 2; /* eat the first char and the - */
2910 min = (U8)*d; /* first char in range */
2911 max = (U8)d[1]; /* last char in range */
2912#ifdef EBCDIC
2913 }
2914#endif
8ada0baa 2915
c2e66d9e 2916 if (min > max) {
01ec43d0 2917 Perl_croak(aTHX_
d1573ac7 2918 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2919 (char)min, (char)max);
c2e66d9e
GS
2920 }
2921
c7f1f016 2922#ifdef EBCDIC
dc023dbb
KW
2923 /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
2924 * any subsets of these ranges into individual characters */
4c3a8340 2925 if (literal_endpoint == 2 &&
c286a9a6 2926 ((isLOWER_A(min) && isLOWER_A(max)) ||
75e9e7bf
KW
2927 (isUPPER_A(min) && isUPPER_A(max))))
2928 {
2929 for (i = min; i <= max; i++) {
2930 if (isALPHA_A(i))
2931 *d++ = i;
8ada0baa
JH
2932 }
2933 }
2934 else
2935#endif
2936 for (i = min; i <= max; i++)
e294cc5d
JH
2937#ifdef EBCDIC
2938 if (has_utf8) {
55d09dc8 2939 append_utf8_from_native_byte(i, &d);
e294cc5d
JH
2940 }
2941 else
2942#endif
2943 *d++ = (char)i;
2944
2945#ifdef EBCDIC
2946 if (uvmax) {
2947 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2948 if (uvmax > 0x101)
e7214ce8 2949 *d++ = (char) ILLEGAL_UTF8_BYTE;
e294cc5d
JH
2950 if (uvmax > 0x100)
2951 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2952 }
2953#endif
02aa26ce
NT
2954
2955 /* mark the range as done, and continue */
79072805 2956 dorange = FALSE;
01ec43d0 2957 didrange = TRUE;
4c3a8340
ST
2958#ifdef EBCDIC
2959 literal_endpoint = 0;
2960#endif
79072805 2961 continue;
4e553d73 2962 }
02aa26ce
NT
2963
2964 /* range begins (ignore - as first or last char) */
79072805 2965 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2966 if (didrange) {
1fafa243 2967 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2968 }
e294cc5d
JH
2969 if (has_utf8
2970#ifdef EBCDIC
2971 && !native_range
2972#endif
2973 ) {
e7214ce8 2974 *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2975 s++;
2976 continue;
2977 }
79072805
LW
2978 dorange = TRUE;
2979 s++;
01ec43d0
GS
2980 }
2981 else {
2982 didrange = FALSE;
4c3a8340
ST
2983#ifdef EBCDIC
2984 literal_endpoint = 0;
e294cc5d 2985 native_range = TRUE;
4c3a8340 2986#endif
01ec43d0 2987 }
79072805 2988 }
02aa26ce
NT
2989
2990 /* if we get here, we're not doing a transliteration */
2991
e4a2df84
DM
2992 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2993 char *s1 = s-1;
2994 int esc = 0;
2995 while (s1 >= start && *s1-- == '\\')
2996 esc = !esc;
2997 if (!esc)
2998 in_charclass = TRUE;
2999 }
2866decb 3000
e4a2df84
DM
3001 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3002 char *s1 = s-1;
3003 int esc = 0;
3004 while (s1 >= start && *s1-- == '\\')
3005 esc = !esc;
3006 if (!esc)
3007 in_charclass = FALSE;
3008 }
2866decb 3009
9da1dd8f
DM
3010 /* skip for regexp comments /(?#comment)/, except for the last
3011 * char, which will be done separately.
3012 * Stop on (?{..}) and friends */
3013
c30fc27b 3014 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3015 if (s[2] == '#') {
e994fd66 3016 while (s+1 < send && *s != ')')
5ff03569 3017 *d++ = *s++;
155aba94 3018 }
c30fc27b 3019 else if (!PL_lex_casemods &&
d3cec5e5
DM
3020 ( s[2] == '{' /* This should match regcomp.c */
3021 || (s[2] == '?' && s[3] == '{')))
155aba94 3022 {
9da1dd8f 3023 break;
cc6b7395 3024 }
748a9306 3025 }
02aa26ce
NT
3026
3027 /* likewise skip #-initiated comments in //x patterns */
c30fc27b 3028 else if (*s == '#' && PL_lex_inpat && !in_charclass &&
73134a2e 3029 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3030 while (s+1 < send && *s != '\n')
5ff03569 3031 *d++ = *s++;
748a9306 3032 }
02aa26ce 3033
9da1dd8f
DM
3034 /* no further processing of single-quoted regex */
3035 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3036 goto default_action;
3037
5d1d4326 3038 /* check for embedded arrays
da6eedaa 3039 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3040 */
1749ea0d 3041 else if (*s == '@' && s[1]) {
8a2bca12 3042 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
ST
3043 break;
3044 if (strchr(":'{$", s[1]))
3045 break;
3046 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3047 break; /* in regexp, neither @+ nor @- are interpolated */
3048 }
02aa26ce
NT
3049
3050 /* check for embedded scalars. only stop if we're sure it's a
3051 variable.
3052 */
79072805 3053 else if (*s == '$') {
3280af22 3054 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3055 break;
77772344 3056 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3057 if (s[1] == '\\') {
3058 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3059 "Possible unintended interpolation of $\\ in regex");
77772344 3060 }
79072805 3061 break; /* in regexp, $ might be tail anchor */
77772344 3062 }
79072805 3063 }
02aa26ce 3064
2b9d42f0
NIS
3065 /* End of else if chain - OP_TRANS rejoin rest */
3066
02aa26ce 3067 /* backslashes */
79072805 3068 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3069 char* e; /* Can be used for ending '}', etc. */
3070
79072805 3071 s++;
02aa26ce 3072
7d0fc23c
KW
3073 /* warn on \1 - \9 in substitution replacements, but note that \11
3074 * is an octal; and \19 is \1 followed by '9' */
3280af22 3075 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3076 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3077 {
c782d7ee 3078 /* diag_listed_as: \%d better written as $%d */
a2a5de95 3079 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3080 *--s = '$';
3081 break;
3082 }
02aa26ce
NT
3083
3084 /* string-change backslash escapes */
838f2281 3085 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3086 --s;
3087 break;
3088 }
ff3f963a
KW
3089 /* In a pattern, process \N, but skip any other backslash escapes.
3090 * This is because we don't want to translate an escape sequence
3091 * into a meta symbol and have the regex compiler use the meta
3092 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3093 * in spite of this, we do have to process \N here while the proper
3094 * charnames handler is in scope. See bugs #56444 and #62056.
3095 * There is a complication because \N in a pattern may also stand
3096 * for 'match a non-nl', and not mean a charname, in which case its
3097 * processing should be deferred to the regex compiler. To be a
3098 * charname it must be followed immediately by a '{', and not look
3099 * like \N followed by a curly quantifier, i.e., not something like
3100 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3101 * quantifier */
3102 else if (PL_lex_inpat
3103 && (*s != 'N'
3104 || s[1] != '{'
412f55bb 3105 || regcurly(s + 1)))
ff3f963a 3106 {
4d73d076 3107 *d++ = '\\';
cc74c5bd
ST
3108 goto default_action;
3109 }
02aa26ce 3110
79072805 3111 switch (*s) {
02aa26ce
NT
3112
3113 /* quoted - in transliterations */
79072805 3114 case '-':
3280af22 3115 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3116 *d++ = *s++;
3117 continue;
3118 }
924ba076 3119 /* FALLTHROUGH */
79072805 3120 default:
11b8faa4 3121 {
15861f94 3122 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3123 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3124 "Unrecognized escape \\%c passed through",
3125 *s);
11b8faa4 3126 /* default action is to copy the quoted character */
f9a63242 3127 goto default_action;
11b8faa4 3128 }
02aa26ce 3129
632403cc 3130 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3131 case '0': case '1': case '2': case '3':
3132 case '4': case '5': case '6': case '7':
ba210ebe 3133 {
5e0a247b 3134 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3135 STRLEN len = 3;
06972766 3136 uv = grok_oct(s, &len, &flags, NULL);
ba210ebe 3137 s += len;
5e0a247b
KW
3138 if (len < 3 && s < send && isDIGIT(*s)
3139 && ckWARN(WARN_MISC))
3140 {
3141 Perl_warner(aTHX_ packWARN(WARN_MISC),
3142 "%s", form_short_octal_warning(s, len));
3143 }
ba210ebe 3144 }
012bcf8d 3145 goto NUM_ESCAPE_INSERT;
02aa26ce 3146
f0a2b745
KW
3147 /* eg. \o{24} indicates the octal constant \024 */
3148 case 'o':
3149 {
454155d9 3150 const char* error;
f0a2b745 3151
00ce5563 3152 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3153 TRUE, /* Output warning */
3154 FALSE, /* Not strict */
17896a85
KW
3155 TRUE, /* Output warnings for
3156 non-portables */
80f4111b 3157 UTF);
454155d9 3158 if (! valid) {
f0a2b745
KW
3159 yyerror(error);
3160 continue;
3161 }
3162 goto NUM_ESCAPE_INSERT;
3163 }
3164
77a135fe 3165 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3166 case 'x':
a0481293 3167 {
a0481293 3168 const char* error;
355860ce 3169
00ce5563 3170 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3171 TRUE, /* Output warning */
3172 FALSE, /* Not strict */
17896a85
KW
3173 TRUE, /* Output warnings for
3174 non-portables */
80f4111b 3175 UTF);
a0481293
KW
3176 if (! valid) {
3177 yyerror(error);
355860ce 3178 continue;
ba210ebe 3179 }
012bcf8d
GS
3180 }
3181
3182 NUM_ESCAPE_INSERT:
ff3f963a
KW
3183 /* Insert oct or hex escaped character. There will always be
3184 * enough room in sv since such escapes will be longer than any
3185 * UTF-8 sequence they can end up as, except if they force us
3186 * to recode the rest of the string into utf8 */
ba7cea30 3187
06972766 3188 /* Here uv is the ordinal of the next character being added */
6f2d5cbc 3189 if (!UVCHR_IS_INVARIANT(uv)) {
9aa983d2 3190 if (!has_utf8 && uv > 255) {
77a135fe
KW
3191 /* Might need to recode whatever we have accumulated so
3192 * far if it contains any chars variant in utf8 or
3193 * utf-ebcdic. */
3194
3195 SvCUR_set(sv, d - SvPVX_const(sv));
3196 SvPOK_on(sv);
3197 *d = '\0';
77a135fe 3198 /* See Note on sizing above. */
7bf79863
KW
3199 sv_utf8_upgrade_flags_grow(sv,
3200 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3201 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3202 d = SvPVX(sv) + SvCUR(sv);
3203 has_utf8 = TRUE;
012bcf8d
GS
3204 }
3205
77a135fe 3206 if (has_utf8) {
c80e42f3 3207 d = (char*)uvchr_to_utf8((U8*)d, uv);
f9a63242
JH
3208 if (PL_lex_inwhat == OP_TRANS &&
3209 PL_sublex_info.sub_op) {
3210 PL_sublex_info.sub_op->op_private |=
3211 (PL_lex_repl ? OPpTRANS_FROM_UTF
3212 : OPpTRANS_TO_UTF);
f9a63242 3213 }
e294cc5d
JH
3214#ifdef EBCDIC
3215 if (uv > 255 && !dorange)
3216 native_range = FALSE;
3217#endif
012bcf8d 3218 }
a0ed51b3 3219 else {
012bcf8d 3220 *d++ = (char)uv;
a0ed51b3 3221 }
012bcf8d
GS
3222 }
3223 else {
c4d5f83a 3224 *d++ = (char) uv;
a0ed51b3 3225 }
79072805 3226 continue;
02aa26ce 3227
4a2d328f 3228 case 'N':
ff3f963a
KW
3229 /* In a non-pattern \N must be a named character, like \N{LATIN
3230 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3231 * mean to match a non-newline. For non-patterns, named
3232 * characters are converted to their string equivalents. In
3233 * patterns, named characters are not converted to their
3234 * ultimate forms for the same reasons that other escapes
3235 * aren't. Instead, they are converted to the \N{U+...} form
3236 * to get the value from the charnames that is in effect right
3237 * now, while preserving the fact that it was a named character
3238 * so that the regex compiler knows this */
3239
ff3f963a
KW
3240 /* The structure of this section of code (besides checking for
3241 * errors and upgrading to utf8) is:
3242 * Further disambiguate between the two meanings of \N, and if
3243 * not a charname, go process it elsewhere
0a96133f
KW
3244 * If of form \N{U+...}, pass it through if a pattern;
3245 * otherwise convert to utf8
3246 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3247 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3248
3249 /* Here, s points to the 'N'; the test below is guaranteed to
3250 * succeed if we are being called on a pattern as we already
3251 * know from a test above that the next character is a '{'.
3252 * On a non-pattern \N must mean 'named sequence, which
3253 * requires braces */
3254 s++;
3255 if (*s != '{') {
3256 yyerror("Missing braces on \\N{}");
3257 continue;
3258 }
3259 s++;
3260
0a96133f 3261 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3262 if (! (e = strchr(s, '}'))) {
3263 if (! PL_lex_inpat) {
5777a3f7 3264 yyerror("Missing right brace on \\N{}");
0a96133f 3265 } else {
4407f1b8 3266 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
dbc0d4f2 3267 }
0a96133f 3268 continue;
ff3f963a 3269 }
cddc7ef4 3270
ff3f963a 3271 /* Here it looks like a named character */
cddc7ef4 3272
ff3f963a
KW
3273 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3274 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3275 | PERL_SCAN_DISALLOW_PREFIX;
3276 STRLEN len;
3277
3278 /* For \N{U+...}, the '...' is a unicode value even on
3279 * EBCDIC machines */
3280 s += 2; /* Skip to next char after the 'U+' */
3281 len = e - s;
3282 uv = grok_hex(s, &len, &flags, NULL);
3283 if (len == 0 || len != (STRLEN)(e - s)) {
3284 yyerror("Invalid hexadecimal number in \\N{U+...}");
3285 s = e + 1;
3286 continue;
3287 }
3288
3289 if (PL_lex_inpat) {
3290
e2a7e165
KW
3291 /* On non-EBCDIC platforms, pass through to the regex
3292 * compiler unchanged. The reason we evaluated the
3293 * number above is to make sure there wasn't a syntax
3294 * error. But on EBCDIC we convert to native so
3295 * downstream code can continue to assume it's native
3296 */
ff3f963a 3297 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3298#ifdef EBCDIC
3299 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3300 and the \0 */
3301 "\\N{U+%X}",
3302 (unsigned int) UNI_TO_NATIVE(uv));
3303#else
ff3f963a
KW
3304 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3305 d += e - s + 1;
e2a7e165 3306#endif
ff3f963a
KW
3307 }
3308 else { /* Not a pattern: convert the hex to string */
3309
3310 /* If destination is not in utf8, unconditionally
3311 * recode it to be so. This is because \N{} implies
3312 * Unicode semantics, and scalars have to be in utf8
3313 * to guarantee those semantics */
3314 if (! has_utf8) {
3315 SvCUR_set(sv, d - SvPVX_const(sv));
3316 SvPOK_on(sv);
3317 *d = '\0';
3318 /* See Note on sizing above. */
3319 sv_utf8_upgrade_flags_grow(
3320 sv,
3321 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3322 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3323 d = SvPVX(sv) + SvCUR(sv);
3324 has_utf8 = TRUE;
3325 }
3326
a46469e6 3327 /* Add the (Unicode) code point to the output. */
ff3f963a 3328 if (UNI_IS_INVARIANT(uv)) {
a46469e6 3329 *d++ = (char) LATIN1_TO_NATIVE(uv);
ff3f963a 3330 }
a46469e6
KW
3331 else {
3332 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv, 0);
3333 }
ff3f963a
KW
3334 }
3335 }
6f613c73
KW
3336 else /* Here is \N{NAME} but not \N{U+...}. */
3337 if ((res = get_and_check_backslash_N_name(s, e)))
3338 {
3339 STRLEN len;
3340 const char *str = SvPV_const(res, len);
3341 if (PL_lex_inpat) {
ff3f963a
KW
3342
3343 if (! len) { /* The name resolved to an empty string */
3344 Copy("\\N{}", d, 4, char);
3345 d += 4;
3346 }
3347 else {
3348 /* In order to not lose information for the regex
3349 * compiler, pass the result in the specially made
3350 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3351 * the code points in hex of each character
3352 * returned by charnames */
3353
3354 const char *str_end = str + len;
3b721c4f 3355 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3356
3357 if (! SvUTF8(res)) {
3358 /* For the non-UTF-8 case, we can determine the
3359 * exact length needed without having to parse
3360 * through the string. Each character takes up
3361 * 2 hex digits plus either a trailing dot or
3362 * the "}" */
3363 d = off + SvGROW(sv, off
3364 + 3 * len
3365 + 6 /* For the "\N{U+", and
3366 trailing NUL */
3367 + (STRLEN)(send - e));
3368 Copy("\\N{U+", d, 5, char);
3369 d += 5;
3370 while (str < str_end) {
3371 char hex_string[4];
e8549682 3372 int len =
51f14a05
JH
3373 my_snprintf(hex_string,
3374 sizeof(hex_string),
e8549682
JH
3375 "%02X.", (U8) *str);
3376 PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
94ca1619
KW
3377 Copy(hex_string, d, 3, char);
3378 d += 3;
3379 str++;
3380 }
3381 d--; /* We will overwrite below the final
3382 dot with a right brace */
3383 }
3384 else {
1953db30
KW
3385 STRLEN char_length; /* cur char's byte length */
3386
3387 /* and the number of bytes after this is
3388 * translated into hex digits */
3389 STRLEN output_length;
3390
3391 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3392 * for max('U+', '.'); and 1 for NUL */
3393 char hex_string[2 * UTF8_MAXBYTES + 5];
3394
3395 /* Get the first character of the result. */
a46469e6 3396 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3397 len,
3398 &char_length,
3399 UTF8_ALLOW_ANYUV);
3400 /* Convert first code point to hex, including
a46469e6 3401 * the boiler plate before it. */
1953db30
KW
3402 output_length =
3403 my_snprintf(hex_string, sizeof(hex_string),
a46469e6
KW
3404 "\\N{U+%X",
3405 (unsigned int) uv);
1953db30
KW
3406
3407 /* Make sure there is enough space to hold it */
3408 d = off + SvGROW(sv, off
3409 + output_length
3410 + (STRLEN)(send - e)
3411 + 2); /* '}' + NUL */
3412 /* And output it */
3413 Copy(hex_string, d, output_length, char);
3414 d += output_length;
3415
3416 /* For each subsequent character, append dot and
3417 * its ordinal in hex */
3418 while ((str += char_length) < str_end) {
3419 const STRLEN off = d - SvPVX_const(sv);
a46469e6 3420 U32 uv = utf8n_to_uvchr((U8 *) str,
1953db30
KW
3421 str_end - str,
3422 &char_length,
3423 UTF8_ALLOW_ANYUV);
3424 output_length =
3425 my_snprintf(hex_string,
a46469e6
KW
3426 sizeof(hex_string),
3427 ".%X",
3428 (unsigned int) uv);
1953db30
KW
3429
3430 d = off + SvGROW(sv, off
3431 + output_length
3432 + (STRLEN)(send - e)
3433 + 2); /* '}' + NUL */
3434 Copy(hex_string, d, output_length, char);
3435 d += output_length;
3436 }
94ca1619 3437 }
ff3f963a
KW
3438
3439 *d++ = '}'; /* Done. Add the trailing brace */
3440 }
3441 }
3442 else { /* Here, not in a pattern. Convert the name to a
3443 * string. */
3444
3445 /* If destination is not in utf8, unconditionally
3446 * recode it to be so. This is because \N{} implies
3447 * Unicode semantics, and scalars have to be in utf8
3448 * to guarantee those semantics */
3449 if (! has_utf8) {
3450 SvCUR_set(sv, d - SvPVX_const(sv));
3451 SvPOK_on(sv);
3452 *d = '\0';
3453 /* See Note on sizing above. */
3454 sv_utf8_upgrade_flags_grow(sv,
3455 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3456 len + (STRLEN)(send - s) + 1);
3457 d = SvPVX(sv) + SvCUR(sv);
3458 has_utf8 = TRUE;
3459 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3460
3461 /* See Note on sizing above. (NOTE: SvCUR() is not
3462 * set correctly here). */
3463 const STRLEN off = d - SvPVX_const(sv);
3464 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3465 }
7fc82458
KW
3466 if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */
3467 sv_utf8_upgrade(res);
3468 str = SvPV_const(res, len);
3469 }
ff3f963a
KW
3470 Copy(str, d, len, char);
3471 d += len;
423cee85 3472 }
6f613c73 3473
423cee85 3474 SvREFCNT_dec(res);
cb233ae3 3475
cb233ae3 3476 } /* End \N{NAME} */
ff3f963a
KW
3477#ifdef EBCDIC
3478 if (!dorange)
3479 native_range = FALSE; /* \N{} is defined to be Unicode */
3480#endif
3481 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3482 continue;
3483
02aa26ce 3484 /* \c is a control character */
79072805
LW
3485 case 'c':
3486 s++;
961ce445 3487 if (s < send) {
421e43ba 3488 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3489 }
961ce445
RGS
3490 else {
3491 yyerror("Missing control char name in \\c");
3492 }
79072805 3493 continue;
02aa26ce
NT
3494
3495 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3496 case 'b':
4d73d076 3497 *d++ = '\b';
79072805
LW
3498 break;
3499 case 'n':
4d73d076 3500 *d++ = '\n';
79072805
LW
3501 break;
3502 case 'r':
4d73d076 3503 *d++ = '\r';
79072805
LW
3504 break;
3505 case 'f':
4d73d076 3506 *d++ = '\f';
79072805
LW
3507 break;
3508 case 't':
4d73d076 3509 *d++ = '\t';
79072805 3510 break;
34a3fe2a 3511 case 'e':
da0334e8 3512 *d++ = ESC_NATIVE;
34a3fe2a
PP
3513 break;
3514 case 'a':
4d73d076 3515 *d++ = '\a';
79072805 3516 break;
02aa26ce
NT
3517 } /* end switch */
3518
79072805
LW
3519 s++;
3520 continue;
02aa26ce 3521 } /* end if (backslash) */
4c3a8340
ST
3522#ifdef EBCDIC
3523 else
3524 literal_endpoint++;
3525#endif
02aa26ce 3526
f9a63242 3527 default_action:
77a135fe
KW
3528 /* If we started with encoded form, or already know we want it,
3529 then encode the next character */
6f2d5cbc 3530 if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3531 STRLEN len = 1;
77a135fe
KW
3532
3533
3534 /* One might think that it is wasted effort in the case of the
3535 * source being utf8 (this_utf8 == TRUE) to take the next character
3536 * in the source, convert it to an unsigned value, and then convert
3537 * it back again. But the source has not been validated here. The
3538 * routine that does the conversion checks for errors like
3539 * malformed utf8 */
3540
233ca360
KW
3541 const UV nextuv = (this_utf8)
3542 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
3543 : (UV) ((U8) *s);
5aaebcb3 3544 const STRLEN need = UNISKIP(nextuv);
77a135fe
KW
3545 if (!has_utf8) {
3546 SvCUR_set(sv, d - SvPVX_const(sv));
3547 SvPOK_on(sv);
3548 *d = '\0';
77a135fe 3549 /* See Note on sizing above. */
7bf79863
KW
3550 sv_utf8_upgrade_flags_grow(sv,
3551 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3552 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3553 d = SvPVX(sv) + SvCUR(sv);
3554 has_utf8 = TRUE;
3555 } else if (need > len) {
3556 /* encoded value larger than old, may need extra space (NOTE:
3557 * SvCUR() is not set correctly here). See Note on sizing
3558 * above. */
9d4ba2ae 3559 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3560 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3561 }
77a135fe
KW
3562 s += len;
3563
5f66b61c 3564 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3565#ifdef EBCDIC
3566 if (uv > 255 && !dorange)
3567 native_range = FALSE;
3568#endif
2b9d42f0
NIS
3569 }
3570 else {
5ff03569 3571 *d++ = *s++;
2b9d42f0 3572 }
02aa26ce
NT
3573 } /* while loop to process each character */
3574
3575 /* terminate the string and set up the sv */
79072805 3576 *d = '\0';
95a20fc0 3577 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3578 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3579 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3580 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3581
79072805 3582 SvPOK_on(sv);
9f4817db 3583 if (PL_encoding && !has_utf8) {
d0063567
DK
3584 sv_recode_to_utf8(sv, PL_encoding);
3585 if (SvUTF8(sv))
3586 has_utf8 = TRUE;
9f4817db 3587 }
2b9d42f0 3588 if (has_utf8) {
7e2040f0 3589 SvUTF8_on(sv);
2b9d42f0 3590 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3591 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3592 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3593 }
3594 }
79072805 3595
02aa26ce 3596 /* shrink the sv if we allocated more than we used */
79072805 3597 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3598 SvPV_shrink_to_cur(sv);
79072805 3599 }
02aa26ce 3600
6154021b 3601 /* return the substring (via pl_yylval) only if we parsed anything */
f0feb466
FC
3602 if (s > start) {
3603 char *s2 = start;
ffdb8b16
FC
3604 for (; s2 < s; s2++) {
3605 if (*s2 == '\n')
3606 COPLINE_INC_WITH_HERELINES;
3607 }
b899e89d 3608 SvREFCNT_inc_simple_void_NN(sv);
4f3e2518
DM
3609 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
3610 && ! PL_parser->lex_re_reparsing)
3611 {
eb0d8d16
NC
3612 const char *const key = PL_lex_inpat ? "qr" : "q";
3613 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3614 const char *type;
3615 STRLEN typelen;
3616
3617 if (PL_lex_inwhat == OP_TRANS) {
3618 type = "tr";
3619 typelen = 2;
3620 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3621 type = "s";
3622 typelen = 1;
9da1dd8f
DM
3623 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3624 type = "q";
3625 typelen = 1;
eb0d8d16
NC
3626 } else {
3627 type = "qq";
3628 typelen = 2;
3629 }
3630
3631 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3632 type, typelen);
3633 }
6154021b 3634 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b899e89d
FC
3635 }
3636 LEAVE_with_name("scan_const");
79072805
LW
3637 return s;
3638}
3639
ffb4593c
NT
3640/* S_intuit_more
3641 * Returns TRUE if there's more to the expression (e.g., a subscript),
3642 * FALSE otherwise.
ffb4593c
NT
3643 *
3644 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3645 *
3646 * ->[ and ->{ return TRUE
ff25e5db 3647 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
ffb4593c
NT
3648 * { and [ outside a pattern are always subscripts, so return TRUE
3649 * if we're outside a pattern and it's not { or [, then return FALSE
3650 * if we're in a pattern and the first char is a {
3651 * {4,5} (any digits around the comma) returns FALSE
3652 * if we're in a pattern and the first char is a [
3653 * [] returns FALSE
3654 * [SOMETHING] has a funky algorithm to decide whether it's a
3655 * character class or not. It has to deal with things like
3656 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3657 * anything else returns TRUE
3658 */