This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Don’t mention pkg in proto warnings for lex subs
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
69#define PL_preambled (PL_parser->preambled)
70#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 71#define PL_linestr (PL_parser->linestr)
c2598295
DM
72#define PL_expect (PL_parser->expect)
73#define PL_copline (PL_parser->copline)
f06b5848
DM
74#define PL_bufptr (PL_parser->bufptr)
75#define PL_oldbufptr (PL_parser->oldbufptr)
76#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
77#define PL_linestart (PL_parser->linestart)
78#define PL_bufend (PL_parser->bufend)
79#define PL_last_uni (PL_parser->last_uni)
80#define PL_last_lop (PL_parser->last_lop)
81#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 82#define PL_lex_state (PL_parser->lex_state)
2f9285f8 83#define PL_rsfp (PL_parser->rsfp)
5486870f 84#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
85#define PL_in_my (PL_parser->in_my)
86#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 87#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 88#define PL_multi_end (PL_parser->multi_end)
13765c85 89#define PL_error_count (PL_parser->error_count)
199e78b7
DM
90
91#ifdef PERL_MAD
92# define PL_endwhite (PL_parser->endwhite)
93# define PL_faketokens (PL_parser->faketokens)
94# define PL_lasttoke (PL_parser->lasttoke)
95# define PL_nextwhite (PL_parser->nextwhite)
96# define PL_realtokenstart (PL_parser->realtokenstart)
97# define PL_skipwhite (PL_parser->skipwhite)
98# define PL_thisclose (PL_parser->thisclose)
99# define PL_thismad (PL_parser->thismad)
100# define PL_thisopen (PL_parser->thisopen)
101# define PL_thisstuff (PL_parser->thisstuff)
102# define PL_thistoken (PL_parser->thistoken)
103# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
104# define PL_thiswhite (PL_parser->thiswhite)
105# define PL_nexttoke (PL_parser->nexttoke)
106# define PL_curforce (PL_parser->curforce)
107#else
108# define PL_nexttoke (PL_parser->nexttoke)
109# define PL_nexttype (PL_parser->nexttype)
110# define PL_nextval (PL_parser->nextval)
199e78b7
DM
111#endif
112
60ac52eb
FC
113#define force_ident_maybe_lex(p) \
114 (PL_bufptr = s, S_force_ident_maybe_lex(aTHX_ p))
199e78b7 115
0bd48802 116static const char ident_too_long[] = "Identifier too long";
8903cb82 117
29595ff2 118#ifdef PERL_MAD
29595ff2 119# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 120# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 121#else
5db06880 122# define CURMAD(slot,sv)
9ded7720 123# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
124#endif
125
a7aaec61
Z
126#define XENUMMASK 0x3f
127#define XFAKEEOF 0x40
128#define XFAKEBRACK 0x80
9059aa12 129
39e02b42
JH
130#ifdef USE_UTF8_SCRIPTS
131# define UTF (!IN_BYTES)
2b9d42f0 132#else
802a15e9 133# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 134#endif
a0ed51b3 135
b1fc3636
CJ
136/* The maximum number of characters preceding the unrecognized one to display */
137#define UNRECOGNIZED_PRECEDE_COUNT 10
138
61f0cdd9 139/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
140 * 1999-02-27 mjd-perl-patch@plover.com */
141#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
142
bf4acbe4 143#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 144
ffb4593c
NT
145/* LEX_* are values for PL_lex_state, the state of the lexer.
146 * They are arranged oddly so that the guard on the switch statement
79072805 147 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
148 *
149 * These values refer to the various states within a sublex parse,
150 * i.e. within a double quotish string
79072805
LW
151 */
152
fb73857a
PP
153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
d68ce4ac 275#define UNI3(f,x,have_x) { \
6154021b 276 pl_yylval.ival = f; \
d68ce4ac 277 if (have_x) PL_expect = x; \
376fcdbf
AL
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
d68ce4ac
FC
286#define UNI(f) UNI3(f,XTERM,1)
287#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
288#define UNIPROTO(f,optional) { \
289 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
290 OPERATOR(f); \
291 }
a687059c 292
d68ce4ac 293#define UNIBRACK(f) UNI3(f,0,0)
79072805 294
9f68db38 295/* grandfather return to old style */
78cdf107
Z
296#define OLDLOP(f) \
297 do { \
298 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
299 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
300 pl_yylval.ival = (f); \
301 PL_expect = XTERM; \
302 PL_bufptr = s; \
303 return (int)LSTOP; \
304 } while(0)
79072805 305
83944c01
FC
306#define COPLINE_INC_WITH_HERELINES \
307 STMT_START { \
308 CopLINE_inc(PL_curcop); \
d794b522
FC
309 if (PL_parser->lex_shared->herelines) \
310 CopLINE(PL_curcop) += PL_parser->lex_shared->herelines, \
311 PL_parser->lex_shared->herelines = 0; \
83944c01
FC
312 } STMT_END
313
314
8fa7f367
JH
315#ifdef DEBUGGING
316
6154021b 317/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
318enum token_type {
319 TOKENTYPE_NONE,
320 TOKENTYPE_IVAL,
6154021b 321 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 322 TOKENTYPE_PVAL,
aeaef349 323 TOKENTYPE_OPVAL
bbf60fe6
DM
324};
325
6d4a66ac
NC
326static struct debug_tokens {
327 const int token;
328 enum token_type type;
329 const char *name;
330} const debug_tokens[] =
9041c2e3 331{
bbf60fe6
DM
332 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
333 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
334 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
335 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
336 { ARROW, TOKENTYPE_NONE, "ARROW" },
337 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
338 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
339 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
340 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
341 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 342 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
343 { DO, TOKENTYPE_NONE, "DO" },
344 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
345 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
346 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
347 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
348 { ELSE, TOKENTYPE_NONE, "ELSE" },
349 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
350 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
351 { FOR, TOKENTYPE_IVAL, "FOR" },
352 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
353 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
354 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
355 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
356 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 357 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
358 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
359 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
360 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 361 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
362 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
363 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 364 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
365 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
366 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
367 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
368 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
369 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
370 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
371 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
372 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
373 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
374 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
375 { OROP, TOKENTYPE_IVAL, "OROP" },
376 { OROR, TOKENTYPE_NONE, "OROR" },
377 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 378 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
379 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
380 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
381 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
382 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
383 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
384 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
385 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
386 { PREINC, TOKENTYPE_NONE, "PREINC" },
387 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 388 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
389 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
390 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 391 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
392 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
393 { SUB, TOKENTYPE_NONE, "SUB" },
394 { THING, TOKENTYPE_OPVAL, "THING" },
395 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
396 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
397 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
398 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
399 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
400 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 401 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
402 { WHILE, TOKENTYPE_IVAL, "WHILE" },
403 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 404 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 405 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
406};
407
6154021b 408/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 409
bbf60fe6 410STATIC int
704d4215 411S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 412{
97aff369 413 dVAR;
7918f24d
NC
414
415 PERL_ARGS_ASSERT_TOKEREPORT;
416
bbf60fe6 417 if (DEBUG_T_TEST) {
bd61b366 418 const char *name = NULL;
bbf60fe6 419 enum token_type type = TOKENTYPE_NONE;
f54cb97a 420 const struct debug_tokens *p;
396482e1 421 SV* const report = newSVpvs("<== ");
bbf60fe6 422
f54cb97a 423 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
424 if (p->token == (int)rv) {
425 name = p->name;
426 type = p->type;
427 break;
428 }
429 }
430 if (name)
54667de8 431 Perl_sv_catpv(aTHX_ report, name);
74736ae6 432 else if ((char)rv > ' ' && (char)rv <= '~')
bbf60fe6
DM
433 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
434 else if (!rv)
396482e1 435 sv_catpvs(report, "EOF");
bbf60fe6
DM
436 else
437 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
438 switch (type) {
439 case TOKENTYPE_NONE:
bbf60fe6
DM
440 break;
441 case TOKENTYPE_IVAL:
704d4215 442 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
443 break;
444 case TOKENTYPE_OPNUM:
445 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 446 PL_op_name[lvalp->ival]);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_PVAL:
704d4215 449 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
450 break;
451 case TOKENTYPE_OPVAL:
704d4215 452 if (lvalp->opval) {
401441c0 453 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
454 PL_op_name[lvalp->opval->op_type]);
455 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 456 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 457 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
458 }
459
460 }
401441c0 461 else
396482e1 462 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
463 break;
464 }
b6007c36 465 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
466 };
467 return (int)rv;
998054bd
SC
468}
469
b6007c36
DM
470
471/* print the buffer with suitable escapes */
472
473STATIC void
15f169a1 474S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 475{
396482e1 476 SV* const tmp = newSVpvs("");
7918f24d
NC
477
478 PERL_ARGS_ASSERT_PRINTBUF;
479
b6007c36
DM
480 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
481 SvREFCNT_dec(tmp);
482}
483
8fa7f367
JH
484#endif
485
8290c323
NC
486static int
487S_deprecate_commaless_var_list(pTHX) {
488 PL_expect = XTERM;
489 deprecate("comma-less variable list");
490 return REPORT(','); /* grandfather non-comma-format format */
491}
492
ffb4593c
NT
493/*
494 * S_ao
495 *
c963b151
BD
496 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
497 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
498 */
499
76e3520e 500STATIC int
cea2e8a9 501S_ao(pTHX_ int toketype)
a0d0e21e 502{
97aff369 503 dVAR;
3280af22
NIS
504 if (*PL_bufptr == '=') {
505 PL_bufptr++;
a0d0e21e 506 if (toketype == ANDAND)
6154021b 507 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 508 else if (toketype == OROR)
6154021b 509 pl_yylval.ival = OP_ORASSIGN;
c963b151 510 else if (toketype == DORDOR)
6154021b 511 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
512 toketype = ASSIGNOP;
513 }
514 return toketype;
515}
516
ffb4593c
NT
517/*
518 * S_no_op
519 * When Perl expects an operator and finds something else, no_op
520 * prints the warning. It always prints "<something> found where
521 * operator expected. It prints "Missing semicolon on previous line?"
522 * if the surprise occurs at the start of the line. "do you need to
523 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
524 * where the compiler doesn't know if foo is a method call or a function.
525 * It prints "Missing operator before end of line" if there's nothing
526 * after the missing operator, or "... before <...>" if there is something
527 * after the missing operator.
528 */
529
76e3520e 530STATIC void
15f169a1 531S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 532{
97aff369 533 dVAR;
9d4ba2ae
AL
534 char * const oldbp = PL_bufptr;
535 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 536
7918f24d
NC
537 PERL_ARGS_ASSERT_NO_OP;
538
1189a94a
GS
539 if (!s)
540 s = oldbp;
07c798fb 541 else
1189a94a 542 PL_bufptr = s;
734ab321 543 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
544 if (ckWARN_d(WARN_SYNTAX)) {
545 if (is_first)
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
547 "\t(Missing semicolon on previous line?)\n");
548 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 549 const char *t;
734ab321
BF
550 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
551 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 552 NOOP;
56da5a46
RGS
553 if (t < PL_bufptr && isSPACE(*t))
554 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
555 "\t(Do you need to predeclare %"SVf"?)\n",
556 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
557 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
558 }
559 else {
560 assert(s >= oldbp);
561 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
562 "\t(Missing operator before %"SVf"?)\n",
563 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
564 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 565 }
07c798fb 566 }
3280af22 567 PL_bufptr = oldbp;
8990e307
LW
568}
569
ffb4593c
NT
570/*
571 * S_missingterm
572 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 573 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
574 * If we're in a delimited string and the delimiter is a control
575 * character, it's reformatted into a two-char sequence like ^C.
576 * This is fatal.
577 */
578
76e3520e 579STATIC void
cea2e8a9 580S_missingterm(pTHX_ char *s)
8990e307 581{
97aff369 582 dVAR;
8990e307
LW
583 char tmpbuf[3];
584 char q;
585 if (s) {
9d4ba2ae 586 char * const nl = strrchr(s,'\n');
d2719217 587 if (nl)
8990e307
LW
588 *nl = '\0';
589 }
463559e7 590 else if (isCNTRL(PL_multi_close)) {
8990e307 591 *tmpbuf = '^';
585ec06d 592 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
593 tmpbuf[2] = '\0';
594 s = tmpbuf;
595 }
596 else {
eb160463 597 *tmpbuf = (char)PL_multi_close;
8990e307
LW
598 tmpbuf[1] = '\0';
599 s = tmpbuf;
600 }
601 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 602 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 603}
79072805 604
dd0ac2b9
FC
605#include "feature.h"
606
0d863452 607/*
0d863452
RH
608 * Check whether the named feature is enabled.
609 */
26ea9e12 610bool
3fff3427 611Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 612{
97aff369 613 dVAR;
4a731d7b 614 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
615
616 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
617
618 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 619
26ea9e12
NC
620 if (namelen > MAX_FEATURE_LEN)
621 return FALSE;
3fff3427 622 memcpy(&he_name[8], name, namelen);
7d69d4a6 623
c8ca97b0
NC
624 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
625 REFCOUNTED_HE_EXISTS));
0d863452
RH
626}
627
ffb4593c 628/*
9cbb5ea2
GS
629 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
630 * utf16-to-utf8-reversed.
ffb4593c
NT
631 */
632
c39cd008
GS
633#ifdef PERL_CR_FILTER
634static void
635strip_return(SV *sv)
636{
eb578fdb
KW
637 const char *s = SvPVX_const(sv);
638 const char * const e = s + SvCUR(sv);
7918f24d
NC
639
640 PERL_ARGS_ASSERT_STRIP_RETURN;
641
c39cd008
GS
642 /* outer loop optimized to do nothing if there are no CR-LFs */
643 while (s < e) {
644 if (*s++ == '\r' && *s == '\n') {
645 /* hit a CR-LF, need to copy the rest */
eb578fdb 646 char *d = s - 1;
c39cd008
GS
647 *d++ = *s++;
648 while (s < e) {
649 if (*s == '\r' && s[1] == '\n')
650 s++;
651 *d++ = *s++;
652 }
653 SvCUR(sv) -= s - d;
654 return;
655 }
656 }
657}
a868473f 658
76e3520e 659STATIC I32
c39cd008 660S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 661{
f54cb97a 662 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
663 if (count > 0 && !maxlen)
664 strip_return(sv);
665 return count;
a868473f
NIS
666}
667#endif
668
ffb4593c 669/*
8eaa0acf
Z
670=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
671
672Creates and initialises a new lexer/parser state object, supplying
673a context in which to lex and parse from a new source of Perl code.
674A pointer to the new state object is placed in L</PL_parser>. An entry
675is made on the save stack so that upon unwinding the new state object
676will be destroyed and the former value of L</PL_parser> will be restored.
677Nothing else need be done to clean up the parsing context.
678
679The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
680non-null, provides a string (in SV form) containing code to be parsed.
681A copy of the string is made, so subsequent modification of I<line>
682does not affect parsing. I<rsfp>, if non-null, provides an input stream
683from which code will be read to be parsed. If both are non-null, the
684code in I<line> comes first and must consist of complete lines of input,
685and I<rsfp> supplies the remainder of the source.
686
e368b3bd
FC
687The I<flags> parameter is reserved for future use. Currently it is only
688used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
689
690=cut
691*/
ffb4593c 692
27fcb6ee 693/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
694 can share filters with the current parser.
695 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
696 caller, hence isn't owned by the parser, so shouldn't be closed on parser
697 destruction. This is used to handle the case of defaulting to reading the
698 script from the standard input because no filename was given on the command
699 line (without getting confused by situation where STDIN has been closed, so
700 the script handle is opened on fd 0) */
27fcb6ee 701
a0d0e21e 702void
8eaa0acf 703Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 704{
97aff369 705 dVAR;
6ef55633 706 const char *s = NULL;
5486870f 707 yy_parser *parser, *oparser;
60d63348 708 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 709 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
710
711 /* create and initialise a parser */
712
199e78b7 713 Newxz(parser, 1, yy_parser);
5486870f 714 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
715 PL_parser = parser;
716
28ac2b49
Z
717 parser->stack = NULL;
718 parser->ps = NULL;
719 parser->stack_size = 0;
acdf0a21 720
e3abe207
DM
721 /* on scope exit, free this parser and restore any outer one */
722 SAVEPARSER(parser);
7c4baf47 723 parser->saved_curcop = PL_curcop;
e3abe207 724
acdf0a21 725 /* initialise lexer state */
8990e307 726
fb205e7a
DM
727#ifdef PERL_MAD
728 parser->curforce = -1;
729#else
730 parser->nexttoke = 0;
731#endif
ca4cfd28 732 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 733 parser->copline = NOLINE;
5afb0a62 734 parser->lex_state = LEX_NORMAL;
c2598295 735 parser->expect = XSTATE;
2f9285f8 736 parser->rsfp = rsfp;
27fcb6ee
FC
737 parser->rsfp_filters =
738 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
739 ? NULL
740 : MUTABLE_AV(SvREFCNT_inc(
741 oparser->rsfp_filters
742 ? oparser->rsfp_filters
743 : (oparser->rsfp_filters = newAV())
744 ));
2f9285f8 745
199e78b7
DM
746 Newx(parser->lex_brackstack, 120, char);
747 Newx(parser->lex_casestack, 12, char);
748 *parser->lex_casestack = '\0';
d794b522 749 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 750
10efb74f 751 if (line) {
0528fd32 752 STRLEN len;
10efb74f 753 s = SvPV_const(line, len);
0abcdfa4
FC
754 parser->linestr = flags & LEX_START_COPIED
755 ? SvREFCNT_inc_simple_NN(line)
756 : newSVpvn_flags(s, len, SvUTF8(line));
11076590 757 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
758 } else {
759 parser->linestr = newSVpvs("\n;");
8990e307 760 }
f06b5848
DM
761 parser->oldoldbufptr =
762 parser->oldbufptr =
763 parser->bufptr =
764 parser->linestart = SvPVX(parser->linestr);
765 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
766 parser->last_lop = parser->last_uni = NULL;
87606032
NC
767 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
768 |LEX_DONT_CLOSE_RSFP);
737c24fc 769
60d63348 770 parser->in_pod = parser->filtered = 0;
79072805 771}
a687059c 772
e3abe207
DM
773
774/* delete a parser object */
775
776void
777Perl_parser_free(pTHX_ const yy_parser *parser)
778{
7918f24d
NC
779 PERL_ARGS_ASSERT_PARSER_FREE;
780
7c4baf47 781 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
782 SvREFCNT_dec(parser->linestr);
783
87606032 784 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 785 PerlIO_clearerr(parser->rsfp);
799361c3
SH
786 else if (parser->rsfp && (!parser->old_parser ||
787 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 788 PerlIO_close(parser->rsfp);
5486870f 789 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 790
e3abe207
DM
791 Safefree(parser->lex_brackstack);
792 Safefree(parser->lex_casestack);
d794b522 793 Safefree(parser->lex_shared);
e3abe207
DM
794 PL_parser = parser->old_parser;
795 Safefree(parser);
796}
797
798
ffb4593c 799/*
f0e67a1d
Z
800=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
801
802Buffer scalar containing the chunk currently under consideration of the
803text currently being lexed. This is always a plain string scalar (for
804which C<SvPOK> is true). It is not intended to be used as a scalar by
805normal scalar means; instead refer to the buffer directly by the pointer
806variables described below.
807
808The lexer maintains various C<char*> pointers to things in the
809C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
810reallocated, all of these pointers must be updated. Don't attempt to
811do this manually, but rather use L</lex_grow_linestr> if you need to
812reallocate the buffer.
813
814The content of the text chunk in the buffer is commonly exactly one
815complete line of input, up to and including a newline terminator,
816but there are situations where it is otherwise. The octets of the
817buffer may be intended to be interpreted as either UTF-8 or Latin-1.
818The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
819flag on this scalar, which may disagree with it.
820
821For direct examination of the buffer, the variable
822L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
823lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
824of these pointers is usually preferable to examination of the scalar
825through normal scalar means.
826
827=for apidoc AmxU|char *|PL_parser-E<gt>bufend
828
829Direct pointer to the end of the chunk of text currently being lexed, the
830end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
831+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
832always located at the end of the buffer, and does not count as part of
833the buffer's contents.
834
835=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
836
837Points to the current position of lexing inside the lexer buffer.
838Characters around this point may be freely examined, within
839the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
840L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
841interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
842
843Lexing code (whether in the Perl core or not) moves this pointer past
844the characters that it consumes. It is also expected to perform some
845bookkeeping whenever a newline character is consumed. This movement
846can be more conveniently performed by the function L</lex_read_to>,
847which handles newlines appropriately.
848
849Interpretation of the buffer's octets can be abstracted out by
850using the slightly higher-level functions L</lex_peek_unichar> and
851L</lex_read_unichar>.
852
853=for apidoc AmxU|char *|PL_parser-E<gt>linestart
854
855Points to the start of the current line inside the lexer buffer.
856This is useful for indicating at which column an error occurred, and
857not much else. This must be updated by any lexing code that consumes
858a newline; the function L</lex_read_to> handles this detail.
859
860=cut
861*/
862
863/*
864=for apidoc Amx|bool|lex_bufutf8
865
866Indicates whether the octets in the lexer buffer
867(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
868of Unicode characters. If not, they should be interpreted as Latin-1
869characters. This is analogous to the C<SvUTF8> flag for scalars.
870
871In UTF-8 mode, it is not guaranteed that the lexer buffer actually
872contains valid UTF-8. Lexing code must be robust in the face of invalid
873encoding.
874
875The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
876is significant, but not the whole story regarding the input character
877encoding. Normally, when a file is being read, the scalar contains octets
878and its C<SvUTF8> flag is off, but the octets should be interpreted as
879UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
880however, the scalar may have the C<SvUTF8> flag on, and in this case its
881octets should be interpreted as UTF-8 unless the C<use bytes> pragma
882is in effect. This logic may change in the future; use this function
883instead of implementing the logic yourself.
884
885=cut
886*/
887
888bool
889Perl_lex_bufutf8(pTHX)
890{
891 return UTF;
892}
893
894/*
895=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
896
897Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
898at least I<len> octets (including terminating NUL). Returns a
899pointer to the reallocated buffer. This is necessary before making
900any direct modification of the buffer that would increase its length.
901L</lex_stuff_pvn> provides a more convenient way to insert text into
902the buffer.
903
904Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
905this function updates all of the lexer's variables that point directly
906into the buffer.
907
908=cut
909*/
910
911char *
912Perl_lex_grow_linestr(pTHX_ STRLEN len)
913{
914 SV *linestr;
915 char *buf;
916 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 917 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
918 linestr = PL_parser->linestr;
919 buf = SvPVX(linestr);
920 if (len <= SvLEN(linestr))
921 return buf;
922 bufend_pos = PL_parser->bufend - buf;
923 bufptr_pos = PL_parser->bufptr - buf;
924 oldbufptr_pos = PL_parser->oldbufptr - buf;
925 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
926 linestart_pos = PL_parser->linestart - buf;
927 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
928 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
929 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
930 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 931
f0e67a1d 932 buf = sv_grow(linestr, len);
c7641931 933
f0e67a1d
Z
934 PL_parser->bufend = buf + bufend_pos;
935 PL_parser->bufptr = buf + bufptr_pos;
936 PL_parser->oldbufptr = buf + oldbufptr_pos;
937 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
938 PL_parser->linestart = buf + linestart_pos;
939 if (PL_parser->last_uni)
940 PL_parser->last_uni = buf + last_uni_pos;
941 if (PL_parser->last_lop)
942 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
943 if (PL_parser->lex_shared->re_eval_start)
944 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
945 return buf;
946}
947
948/*
83aa740e 949=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
950
951Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
952immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
953reallocating the buffer if necessary. This means that lexing code that
954runs later will see the characters as if they had appeared in the input.
955It is not recommended to do this as part of normal parsing, and most
956uses of this facility run the risk of the inserted characters being
957interpreted in an unintended manner.
958
959The string to be inserted is represented by I<len> octets starting
960at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
961according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
962The characters are recoded for the lexer buffer, according to how the
963buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 964to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
965function is more convenient.
966
967=cut
968*/
969
970void
83aa740e 971Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 972{
749123ff 973 dVAR;
f0e67a1d
Z
974 char *bufptr;
975 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
976 if (flags & ~(LEX_STUFF_UTF8))
977 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
978 if (UTF) {
979 if (flags & LEX_STUFF_UTF8) {
980 goto plain_copy;
981 } else {
982 STRLEN highhalf = 0;
83aa740e 983 const char *p, *e = pv+len;
f0e67a1d
Z
984 for (p = pv; p != e; p++)
985 highhalf += !!(((U8)*p) & 0x80);
986 if (!highhalf)
987 goto plain_copy;
988 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
989 bufptr = PL_parser->bufptr;
990 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
991 SvCUR_set(PL_parser->linestr,
992 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
993 PL_parser->bufend += len+highhalf;
994 for (p = pv; p != e; p++) {
995 U8 c = (U8)*p;
996 if (c & 0x80) {
997 *bufptr++ = (char)(0xc0 | (c >> 6));
998 *bufptr++ = (char)(0x80 | (c & 0x3f));
999 } else {
1000 *bufptr++ = (char)c;
1001 }
1002 }
1003 }
1004 } else {
1005 if (flags & LEX_STUFF_UTF8) {
1006 STRLEN highhalf = 0;
83aa740e 1007 const char *p, *e = pv+len;
f0e67a1d
Z
1008 for (p = pv; p != e; p++) {
1009 U8 c = (U8)*p;
1010 if (c >= 0xc4) {
1011 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1012 "non-Latin-1 character into Latin-1 input");
1013 } else if (c >= 0xc2 && p+1 != e &&
1014 (((U8)p[1]) & 0xc0) == 0x80) {
1015 p++;
1016 highhalf++;
1017 } else if (c >= 0x80) {
1018 /* malformed UTF-8 */
1019 ENTER;
1020 SAVESPTR(PL_warnhook);
1021 PL_warnhook = PERL_WARNHOOK_FATAL;
1022 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1023 LEAVE;
1024 }
1025 }
1026 if (!highhalf)
1027 goto plain_copy;
1028 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1029 bufptr = PL_parser->bufptr;
1030 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1031 SvCUR_set(PL_parser->linestr,
1032 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1033 PL_parser->bufend += len-highhalf;
1034 for (p = pv; p != e; p++) {
1035 U8 c = (U8)*p;
1036 if (c & 0x80) {
1037 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1038 p++;
1039 } else {
1040 *bufptr++ = (char)c;
1041 }
1042 }
1043 } else {
1044 plain_copy:
1045 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1046 bufptr = PL_parser->bufptr;
1047 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1048 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1049 PL_parser->bufend += len;
1050 Copy(pv, bufptr, len, char);
1051 }
1052 }
1053}
1054
1055/*
9dcc53ea
Z
1056=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1057
1058Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1059immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1060reallocating the buffer if necessary. This means that lexing code that
1061runs later will see the characters as if they had appeared in the input.
1062It is not recommended to do this as part of normal parsing, and most
1063uses of this facility run the risk of the inserted characters being
1064interpreted in an unintended manner.
1065
1066The string to be inserted is represented by octets starting at I<pv>
1067and continuing to the first nul. These octets are interpreted as either
1068UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1069in I<flags>. The characters are recoded for the lexer buffer, according
1070to how the buffer is currently being interpreted (L</lex_bufutf8>).
1071If it is not convenient to nul-terminate a string to be inserted, the
1072L</lex_stuff_pvn> function is more appropriate.
1073
1074=cut
1075*/
1076
1077void
1078Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1079{
1080 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1081 lex_stuff_pvn(pv, strlen(pv), flags);
1082}
1083
1084/*
f0e67a1d
Z
1085=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1086
1087Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1088immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1089reallocating the buffer if necessary. This means that lexing code that
1090runs later will see the characters as if they had appeared in the input.
1091It is not recommended to do this as part of normal parsing, and most
1092uses of this facility run the risk of the inserted characters being
1093interpreted in an unintended manner.
1094
1095The string to be inserted is the string value of I<sv>. The characters
1096are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1097being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1098not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1099need to construct a scalar.
1100
1101=cut
1102*/
1103
1104void
1105Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1106{
1107 char *pv;
1108 STRLEN len;
1109 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1110 if (flags)
1111 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1112 pv = SvPV(sv, len);
1113 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1114}
1115
1116/*
1117=for apidoc Amx|void|lex_unstuff|char *ptr
1118
1119Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1120I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1121This hides the discarded text from any lexing code that runs later,
1122as if the text had never appeared.
1123
1124This is not the normal way to consume lexed text. For that, use
1125L</lex_read_to>.
1126
1127=cut
1128*/
1129
1130void
1131Perl_lex_unstuff(pTHX_ char *ptr)
1132{
1133 char *buf, *bufend;
1134 STRLEN unstuff_len;
1135 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1136 buf = PL_parser->bufptr;
1137 if (ptr < buf)
1138 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1139 if (ptr == buf)
1140 return;
1141 bufend = PL_parser->bufend;
1142 if (ptr > bufend)
1143 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1144 unstuff_len = ptr - buf;
1145 Move(ptr, buf, bufend+1-ptr, char);
1146 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1147 PL_parser->bufend = bufend - unstuff_len;
1148}
1149
1150/*
1151=for apidoc Amx|void|lex_read_to|char *ptr
1152
1153Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1154to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1155performing the correct bookkeeping whenever a newline character is passed.
1156This is the normal way to consume lexed text.
1157
1158Interpretation of the buffer's octets can be abstracted out by
1159using the slightly higher-level functions L</lex_peek_unichar> and
1160L</lex_read_unichar>.
1161
1162=cut
1163*/
1164
1165void
1166Perl_lex_read_to(pTHX_ char *ptr)
1167{
1168 char *s;
1169 PERL_ARGS_ASSERT_LEX_READ_TO;
1170 s = PL_parser->bufptr;
1171 if (ptr < s || ptr > PL_parser->bufend)
1172 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1173 for (; s != ptr; s++)
1174 if (*s == '\n') {
83944c01 1175 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1176 PL_parser->linestart = s+1;
1177 }
1178 PL_parser->bufptr = ptr;
1179}
1180
1181/*
1182=for apidoc Amx|void|lex_discard_to|char *ptr
1183
1184Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1185up to I<ptr>. The remaining content of the buffer will be moved, and
1186all pointers into the buffer updated appropriately. I<ptr> must not
1187be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1188it is not permitted to discard text that has yet to be lexed.
1189
1190Normally it is not necessarily to do this directly, because it suffices to
1191use the implicit discarding behaviour of L</lex_next_chunk> and things
1192based on it. However, if a token stretches across multiple lines,
1f317c95 1193and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1194that purpose, then after completion of the token it would be wise to
1195explicitly discard the now-unneeded earlier lines, to avoid future
1196multi-line tokens growing the buffer without bound.
1197
1198=cut
1199*/
1200
1201void
1202Perl_lex_discard_to(pTHX_ char *ptr)
1203{
1204 char *buf;
1205 STRLEN discard_len;
1206 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1207 buf = SvPVX(PL_parser->linestr);
1208 if (ptr < buf)
1209 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1210 if (ptr == buf)
1211 return;
1212 if (ptr > PL_parser->bufptr)
1213 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1214 discard_len = ptr - buf;
1215 if (PL_parser->oldbufptr < ptr)
1216 PL_parser->oldbufptr = ptr;
1217 if (PL_parser->oldoldbufptr < ptr)
1218 PL_parser->oldoldbufptr = ptr;
1219 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1220 PL_parser->last_uni = NULL;
1221 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1222 PL_parser->last_lop = NULL;
1223 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1224 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1225 PL_parser->bufend -= discard_len;
1226 PL_parser->bufptr -= discard_len;
1227 PL_parser->oldbufptr -= discard_len;
1228 PL_parser->oldoldbufptr -= discard_len;
1229 if (PL_parser->last_uni)
1230 PL_parser->last_uni -= discard_len;
1231 if (PL_parser->last_lop)
1232 PL_parser->last_lop -= discard_len;
1233}
1234
1235/*
1236=for apidoc Amx|bool|lex_next_chunk|U32 flags
1237
1238Reads in the next chunk of text to be lexed, appending it to
1239L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1240looked to the end of the current chunk and wants to know more. It is
1241usual, but not necessary, for lexing to have consumed the entirety of
1242the current chunk at this time.
1243
1244If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1245chunk (i.e., the current chunk has been entirely consumed), normally the
1246current chunk will be discarded at the same time that the new chunk is
1247read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1248will not be discarded. If the current chunk has not been entirely
1249consumed, then it will not be discarded regardless of the flag.
1250
1251Returns true if some new text was added to the buffer, or false if the
1252buffer has reached the end of the input text.
1253
1254=cut
1255*/
1256
1257#define LEX_FAKE_EOF 0x80000000
112d1284 1258#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1259
1260bool
1261Perl_lex_next_chunk(pTHX_ U32 flags)
1262{
1263 SV *linestr;
1264 char *buf;
1265 STRLEN old_bufend_pos, new_bufend_pos;
1266 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1267 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1268 bool got_some_for_debugger = 0;
f0e67a1d 1269 bool got_some;
112d1284 1270 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1271 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1272 linestr = PL_parser->linestr;
1273 buf = SvPVX(linestr);
1274 if (!(flags & LEX_KEEP_PREVIOUS) &&
1275 PL_parser->bufptr == PL_parser->bufend) {
1276 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1277 linestart_pos = 0;
1278 if (PL_parser->last_uni != PL_parser->bufend)
1279 PL_parser->last_uni = NULL;
1280 if (PL_parser->last_lop != PL_parser->bufend)
1281 PL_parser->last_lop = NULL;
1282 last_uni_pos = last_lop_pos = 0;
1283 *buf = 0;
1284 SvCUR(linestr) = 0;
1285 } else {
1286 old_bufend_pos = PL_parser->bufend - buf;
1287 bufptr_pos = PL_parser->bufptr - buf;
1288 oldbufptr_pos = PL_parser->oldbufptr - buf;
1289 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1290 linestart_pos = PL_parser->linestart - buf;
1291 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1292 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1293 }
1294 if (flags & LEX_FAKE_EOF) {
1295 goto eof;
60d63348 1296 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1297 got_some = 0;
1298 } else if (filter_gets(linestr, old_bufend_pos)) {
1299 got_some = 1;
17cc9359 1300 got_some_for_debugger = 1;
112d1284
FC
1301 } else if (flags & LEX_NO_TERM) {
1302 got_some = 0;
f0e67a1d 1303 } else {
580561a3
Z
1304 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1305 sv_setpvs(linestr, "");
f0e67a1d
Z
1306 eof:
1307 /* End of real input. Close filehandle (unless it was STDIN),
1308 * then add implicit termination.
1309 */
87606032 1310 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1311 PerlIO_clearerr(PL_parser->rsfp);
1312 else if (PL_parser->rsfp)
1313 (void)PerlIO_close(PL_parser->rsfp);
1314 PL_parser->rsfp = NULL;
60d63348 1315 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1316#ifdef PERL_MAD
1317 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1318 PL_faketokens = 1;
1319#endif
1320 if (!PL_in_eval && PL_minus_p) {
1321 sv_catpvs(linestr,
1322 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1323 PL_minus_n = PL_minus_p = 0;
1324 } else if (!PL_in_eval && PL_minus_n) {
1325 sv_catpvs(linestr, /*{*/";}");
1326 PL_minus_n = 0;
1327 } else
1328 sv_catpvs(linestr, ";");
1329 got_some = 1;
1330 }
1331 buf = SvPVX(linestr);
1332 new_bufend_pos = SvCUR(linestr);
1333 PL_parser->bufend = buf + new_bufend_pos;
1334 PL_parser->bufptr = buf + bufptr_pos;
1335 PL_parser->oldbufptr = buf + oldbufptr_pos;
1336 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1337 PL_parser->linestart = buf + linestart_pos;
1338 if (PL_parser->last_uni)
1339 PL_parser->last_uni = buf + last_uni_pos;
1340 if (PL_parser->last_lop)
1341 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1342 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1343 PL_curstash != PL_debstash) {
1344 /* debugger active and we're not compiling the debugger code,
1345 * so store the line into the debugger's array of lines
1346 */
1347 update_debugger_info(NULL, buf+old_bufend_pos,
1348 new_bufend_pos-old_bufend_pos);
1349 }
1350 return got_some;
1351}
1352
1353/*
1354=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1355
1356Looks ahead one (Unicode) character in the text currently being lexed.
1357Returns the codepoint (unsigned integer value) of the next character,
1358or -1 if lexing has reached the end of the input text. To consume the
1359peeked character, use L</lex_read_unichar>.
1360
1361If the next character is in (or extends into) the next chunk of input
1362text, the next chunk will be read in. Normally the current chunk will be
1363discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1364then the current chunk will not be discarded.
1365
1366If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1367is encountered, an exception is generated.
1368
1369=cut
1370*/
1371
1372I32
1373Perl_lex_peek_unichar(pTHX_ U32 flags)
1374{
749123ff 1375 dVAR;
f0e67a1d
Z
1376 char *s, *bufend;
1377 if (flags & ~(LEX_KEEP_PREVIOUS))
1378 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1379 s = PL_parser->bufptr;
1380 bufend = PL_parser->bufend;
1381 if (UTF) {
1382 U8 head;
1383 I32 unichar;
1384 STRLEN len, retlen;
1385 if (s == bufend) {
1386 if (!lex_next_chunk(flags))
1387 return -1;
1388 s = PL_parser->bufptr;
1389 bufend = PL_parser->bufend;
1390 }
1391 head = (U8)*s;
1392 if (!(head & 0x80))
1393 return head;
1394 if (head & 0x40) {
1395 len = PL_utf8skip[head];
1396 while ((STRLEN)(bufend-s) < len) {
1397 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1398 break;
1399 s = PL_parser->bufptr;
1400 bufend = PL_parser->bufend;
1401 }
1402 }
1403 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1404 if (retlen == (STRLEN)-1) {
1405 /* malformed UTF-8 */
1406 ENTER;
1407 SAVESPTR(PL_warnhook);
1408 PL_warnhook = PERL_WARNHOOK_FATAL;
1409 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1410 LEAVE;
1411 }
1412 return unichar;
1413 } else {
1414 if (s == bufend) {
1415 if (!lex_next_chunk(flags))
1416 return -1;
1417 s = PL_parser->bufptr;
1418 }
1419 return (U8)*s;
1420 }
1421}
1422
1423/*
1424=for apidoc Amx|I32|lex_read_unichar|U32 flags
1425
1426Reads the next (Unicode) character in the text currently being lexed.
1427Returns the codepoint (unsigned integer value) of the character read,
1428and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1429if lexing has reached the end of the input text. To non-destructively
1430examine the next character, use L</lex_peek_unichar> instead.
1431
1432If the next character is in (or extends into) the next chunk of input
1433text, the next chunk will be read in. Normally the current chunk will be
1434discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1435then the current chunk will not be discarded.
1436
1437If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1438is encountered, an exception is generated.
1439
1440=cut
1441*/
1442
1443I32
1444Perl_lex_read_unichar(pTHX_ U32 flags)
1445{
1446 I32 c;
1447 if (flags & ~(LEX_KEEP_PREVIOUS))
1448 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1449 c = lex_peek_unichar(flags);
1450 if (c != -1) {
1451 if (c == '\n')
83944c01 1452 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1453 if (UTF)
1454 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1455 else
1456 ++(PL_parser->bufptr);
f0e67a1d
Z
1457 }
1458 return c;
1459}
1460
1461/*
1462=for apidoc Amx|void|lex_read_space|U32 flags
1463
1464Reads optional spaces, in Perl style, in the text currently being
1465lexed. The spaces may include ordinary whitespace characters and
1466Perl-style comments. C<#line> directives are processed if encountered.
1467L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1468at a non-space character (or the end of the input text).
1469
1470If spaces extend into the next chunk of input text, the next chunk will
1471be read in. Normally the current chunk will be discarded at the same
1472time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1473chunk will not be discarded.
1474
1475=cut
1476*/
1477
f0998909
Z
1478#define LEX_NO_NEXT_CHUNK 0x80000000
1479
f0e67a1d
Z
1480void
1481Perl_lex_read_space(pTHX_ U32 flags)
1482{
1483 char *s, *bufend;
1484 bool need_incline = 0;
f0998909 1485 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1486 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1487#ifdef PERL_MAD
1488 if (PL_skipwhite) {
1489 sv_free(PL_skipwhite);
1490 PL_skipwhite = NULL;
1491 }
1492 if (PL_madskills)
1493 PL_skipwhite = newSVpvs("");
1494#endif /* PERL_MAD */
1495 s = PL_parser->bufptr;
1496 bufend = PL_parser->bufend;
1497 while (1) {
1498 char c = *s;
1499 if (c == '#') {
1500 do {
1501 c = *++s;
1502 } while (!(c == '\n' || (c == 0 && s == bufend)));
1503 } else if (c == '\n') {
1504 s++;
1505 PL_parser->linestart = s;
1506 if (s == bufend)
1507 need_incline = 1;
1508 else
1509 incline(s);
1510 } else if (isSPACE(c)) {
1511 s++;
1512 } else if (c == 0 && s == bufend) {
1513 bool got_more;
1514#ifdef PERL_MAD
1515 if (PL_madskills)
1516 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1517#endif /* PERL_MAD */
f0998909
Z
1518 if (flags & LEX_NO_NEXT_CHUNK)
1519 break;
f0e67a1d 1520 PL_parser->bufptr = s;
83944c01 1521 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1522 got_more = lex_next_chunk(flags);
1523 CopLINE_dec(PL_curcop);
1524 s = PL_parser->bufptr;
1525 bufend = PL_parser->bufend;
1526 if (!got_more)
1527 break;
1528 if (need_incline && PL_parser->rsfp) {
1529 incline(s);
1530 need_incline = 0;
1531 }
1532 } else {
1533 break;
1534 }
1535 }
1536#ifdef PERL_MAD
1537 if (PL_madskills)
1538 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1539#endif /* PERL_MAD */
1540 PL_parser->bufptr = s;
1541}
1542
1543/*
ffb4593c
NT
1544 * S_incline
1545 * This subroutine has nothing to do with tilting, whether at windmills
1546 * or pinball tables. Its name is short for "increment line". It
57843af0 1547 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1548 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1549 * # line 500 "foo.pm"
1550 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1551 */
1552
76e3520e 1553STATIC void
d9095cec 1554S_incline(pTHX_ const char *s)
463ee0b2 1555{
97aff369 1556 dVAR;
d9095cec
NC
1557 const char *t;
1558 const char *n;
1559 const char *e;
8818d409 1560 line_t line_num;
463ee0b2 1561
7918f24d
NC
1562 PERL_ARGS_ASSERT_INCLINE;
1563
83944c01 1564 COPLINE_INC_WITH_HERELINES;
463ee0b2
LW
1565 if (*s++ != '#')
1566 return;
d4c19fe8
AL
1567 while (SPACE_OR_TAB(*s))
1568 s++;
73659bf1
GS
1569 if (strnEQ(s, "line", 4))
1570 s += 4;
1571 else
1572 return;
084592ab 1573 if (SPACE_OR_TAB(*s))
73659bf1 1574 s++;
4e553d73 1575 else
73659bf1 1576 return;
d4c19fe8
AL
1577 while (SPACE_OR_TAB(*s))
1578 s++;
463ee0b2
LW
1579 if (!isDIGIT(*s))
1580 return;
d4c19fe8 1581
463ee0b2
LW
1582 n = s;
1583 while (isDIGIT(*s))
1584 s++;
07714eb4 1585 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1586 return;
bf4acbe4 1587 while (SPACE_OR_TAB(*s))
463ee0b2 1588 s++;
73659bf1 1589 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1590 s++;
73659bf1
GS
1591 e = t + 1;
1592 }
463ee0b2 1593 else {
c35e046a
AL
1594 t = s;
1595 while (!isSPACE(*t))
1596 t++;
73659bf1 1597 e = t;
463ee0b2 1598 }
bf4acbe4 1599 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1600 e++;
1601 if (*e != '\n' && *e != '\0')
1602 return; /* false alarm */
1603
8818d409
FC
1604 line_num = atoi(n)-1;
1605
f4dd75d9 1606 if (t - s > 0) {
d9095cec 1607 const STRLEN len = t - s;
19bad673
NC
1608 SV *const temp_sv = CopFILESV(PL_curcop);
1609 const char *cf;
1610 STRLEN tmplen;
1611
1612 if (temp_sv) {
1613 cf = SvPVX(temp_sv);
1614 tmplen = SvCUR(temp_sv);
1615 } else {
1616 cf = NULL;
1617 tmplen = 0;
1618 }
1619
d1299d44 1620 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1621 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1622 * to *{"::_<newfilename"} */
44867030
NC
1623 /* However, the long form of evals is only turned on by the
1624 debugger - usually they're "(eval %lu)" */
1625 char smallbuf[128];
1626 char *tmpbuf;
1627 GV **gvp;
d9095cec 1628 STRLEN tmplen2 = len;
798b63bc 1629 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1630 tmpbuf = smallbuf;
1631 else
2ae0db35 1632 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1633 tmpbuf[0] = '_';
1634 tmpbuf[1] = '<';
2ae0db35 1635 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1636 tmplen += 2;
8a5ee598
RGS
1637 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1638 if (gvp) {
44867030
NC
1639 char *tmpbuf2;
1640 GV *gv2;
1641
1642 if (tmplen2 + 2 <= sizeof smallbuf)
1643 tmpbuf2 = smallbuf;
1644 else
1645 Newx(tmpbuf2, tmplen2 + 2, char);
1646
1647 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1648 /* Either they malloc'd it, or we malloc'd it,
1649 so no prefix is present in ours. */
1650 tmpbuf2[0] = '_';
1651 tmpbuf2[1] = '<';
1652 }
1653
1654 memcpy(tmpbuf2 + 2, s, tmplen2);
1655 tmplen2 += 2;
1656
8a5ee598 1657 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1658 if (!isGV(gv2)) {
8a5ee598 1659 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1660 /* adjust ${"::_<newfilename"} to store the new file name */
1661 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1662 /* The line number may differ. If that is the case,
1663 alias the saved lines that are in the array.
1664 Otherwise alias the whole array. */
1665 if (CopLINE(PL_curcop) == line_num) {
1666 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1667 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1668 }
1669 else if (GvAV(*gvp)) {
1670 AV * const av = GvAV(*gvp);
1671 const I32 start = CopLINE(PL_curcop)+1;
1672 I32 items = AvFILLp(av) - start;
1673 if (items > 0) {
1674 AV * const av2 = GvAVn(gv2);
1675 SV **svp = AvARRAY(av) + start;
1676 I32 l = (I32)line_num+1;
1677 while (items--)
1678 av_store(av2, l++, SvREFCNT_inc(*svp++));
1679 }
1680 }
e5527e4b 1681 }
44867030
NC
1682
1683 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1684 }
e66cf94c 1685 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1686 }
05ec9bb3 1687 CopFILE_free(PL_curcop);
d9095cec 1688 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1689 }
8818d409 1690 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1691}
1692
29595ff2 1693#ifdef PERL_MAD
cd81e915 1694/* skip space before PL_thistoken */
29595ff2
NC
1695
1696STATIC char *
1697S_skipspace0(pTHX_ register char *s)
1698{
7918f24d
NC
1699 PERL_ARGS_ASSERT_SKIPSPACE0;
1700
29595ff2
NC
1701 s = skipspace(s);
1702 if (!PL_madskills)
1703 return s;
cd81e915
NC
1704 if (PL_skipwhite) {
1705 if (!PL_thiswhite)
6b29d1f5 1706 PL_thiswhite = newSVpvs("");
cd81e915
NC
1707 sv_catsv(PL_thiswhite, PL_skipwhite);
1708 sv_free(PL_skipwhite);
1709 PL_skipwhite = 0;
1710 }
1711 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1712 return s;
1713}
1714
cd81e915 1715/* skip space after PL_thistoken */
29595ff2
NC
1716
1717STATIC char *
1718S_skipspace1(pTHX_ register char *s)
1719{
d4c19fe8 1720 const char *start = s;
29595ff2
NC
1721 I32 startoff = start - SvPVX(PL_linestr);
1722
7918f24d
NC
1723 PERL_ARGS_ASSERT_SKIPSPACE1;
1724
29595ff2
NC
1725 s = skipspace(s);
1726 if (!PL_madskills)
1727 return s;
1728 start = SvPVX(PL_linestr) + startoff;
cd81e915 1729 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1730 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1731 PL_thistoken = newSVpvn(tstart, start - tstart);
1732 }
1733 PL_realtokenstart = -1;
1734 if (PL_skipwhite) {
1735 if (!PL_nextwhite)
6b29d1f5 1736 PL_nextwhite = newSVpvs("");
cd81e915
NC
1737 sv_catsv(PL_nextwhite, PL_skipwhite);
1738 sv_free(PL_skipwhite);
1739 PL_skipwhite = 0;
29595ff2
NC
1740 }
1741 return s;
1742}
1743
1744STATIC char *
1745S_skipspace2(pTHX_ register char *s, SV **svp)
1746{
c35e046a
AL
1747 char *start;
1748 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1749 const I32 startoff = s - SvPVX(PL_linestr);
1750
7918f24d
NC
1751 PERL_ARGS_ASSERT_SKIPSPACE2;
1752
29595ff2
NC
1753 s = skipspace(s);
1754 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1755 if (!PL_madskills || !svp)
1756 return s;
1757 start = SvPVX(PL_linestr) + startoff;
cd81e915 1758 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1759 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1760 PL_thistoken = newSVpvn(tstart, start - tstart);
1761 PL_realtokenstart = -1;
29595ff2 1762 }
cd81e915 1763 if (PL_skipwhite) {
29595ff2 1764 if (!*svp)
6b29d1f5 1765 *svp = newSVpvs("");
cd81e915
NC
1766 sv_setsv(*svp, PL_skipwhite);
1767 sv_free(PL_skipwhite);
1768 PL_skipwhite = 0;
29595ff2
NC
1769 }
1770
1771 return s;
1772}
1773#endif
1774
80a702cd 1775STATIC void
15f169a1 1776S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1777{
1778 AV *av = CopFILEAVx(PL_curcop);
1779 if (av) {
b9f83d2f 1780 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1781 if (orig_sv)
1782 sv_setsv(sv, orig_sv);
1783 else
1784 sv_setpvn(sv, buf, len);
80a702cd
RGS
1785 (void)SvIOK_on(sv);
1786 SvIV_set(sv, 0);
1787 av_store(av, (I32)CopLINE(PL_curcop), sv);
1788 }
1789}
1790
ffb4593c
NT
1791/*
1792 * S_skipspace
1793 * Called to gobble the appropriate amount and type of whitespace.
1794 * Skips comments as well.
1795 */
1796
76e3520e 1797STATIC char *
cea2e8a9 1798S_skipspace(pTHX_ register char *s)
a687059c 1799{
5db06880 1800#ifdef PERL_MAD
f0e67a1d
Z
1801 char *start = s;
1802#endif /* PERL_MAD */
7918f24d 1803 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1804#ifdef PERL_MAD
cd81e915
NC
1805 if (PL_skipwhite) {
1806 sv_free(PL_skipwhite);
f0e67a1d 1807 PL_skipwhite = NULL;
5db06880 1808 }
f0e67a1d 1809#endif /* PERL_MAD */
3280af22 1810 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1811 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1812 s++;
f0e67a1d
Z
1813 } else {
1814 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1815 PL_bufptr = s;
f0998909
Z
1816 lex_read_space(LEX_KEEP_PREVIOUS |
1817 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1818 LEX_NO_NEXT_CHUNK : 0));
3280af22 1819 s = PL_bufptr;
f0e67a1d
Z
1820 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1821 if (PL_linestart > PL_bufptr)
1822 PL_bufptr = PL_linestart;
1823 return s;
463ee0b2 1824 }
5db06880 1825#ifdef PERL_MAD
f0e67a1d
Z
1826 if (PL_madskills)
1827 PL_skipwhite = newSVpvn(start, s-start);
1828#endif /* PERL_MAD */
5db06880 1829 return s;
a687059c 1830}
378cc40b 1831
ffb4593c
NT
1832/*
1833 * S_check_uni
1834 * Check the unary operators to ensure there's no ambiguity in how they're
1835 * used. An ambiguous piece of code would be:
1836 * rand + 5
1837 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1838 * the +5 is its argument.
1839 */
1840
76e3520e 1841STATIC void
cea2e8a9 1842S_check_uni(pTHX)
ba106d47 1843{
97aff369 1844 dVAR;
d4c19fe8
AL
1845 const char *s;
1846 const char *t;
2f3197b3 1847
3280af22 1848 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1849 return;
3280af22
NIS
1850 while (isSPACE(*PL_last_uni))
1851 PL_last_uni++;
c35e046a
AL
1852 s = PL_last_uni;
1853 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1854 s++;
3280af22 1855 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1856 return;
6136c704 1857
9b387841
NC
1858 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1859 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1860 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1861}
1862
ffb4593c
NT
1863/*
1864 * LOP : macro to build a list operator. Its behaviour has been replaced
1865 * with a subroutine, S_lop() for which LOP is just another name.
1866 */
1867
a0d0e21e
LW
1868#define LOP(f,x) return lop(f,x,s)
1869
ffb4593c
NT
1870/*
1871 * S_lop
1872 * Build a list operator (or something that might be one). The rules:
1873 * - if we have a next token, then it's a list operator [why?]
1874 * - if the next thing is an opening paren, then it's a function
1875 * - else it's a list operator
1876 */
1877
76e3520e 1878STATIC I32
a0be28da 1879S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1880{
97aff369 1881 dVAR;
7918f24d
NC
1882
1883 PERL_ARGS_ASSERT_LOP;
1884
6154021b 1885 pl_yylval.ival = f;
35c8bce7 1886 CLINE;
3280af22
NIS
1887 PL_expect = x;
1888 PL_bufptr = s;
1889 PL_last_lop = PL_oldbufptr;
eb160463 1890 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1891#ifdef PERL_MAD
1892 if (PL_lasttoke)
78cdf107 1893 goto lstop;
5db06880 1894#else
3280af22 1895 if (PL_nexttoke)
78cdf107 1896 goto lstop;
5db06880 1897#endif
79072805 1898 if (*s == '(')
bbf60fe6 1899 return REPORT(FUNC);
29595ff2 1900 s = PEEKSPACE(s);
79072805 1901 if (*s == '(')
bbf60fe6 1902 return REPORT(FUNC);
78cdf107
Z
1903 else {
1904 lstop:
1905 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1906 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1907 return REPORT(LSTOP);
78cdf107 1908 }
79072805
LW
1909}
1910
5db06880
NC
1911#ifdef PERL_MAD
1912 /*
1913 * S_start_force
1914 * Sets up for an eventual force_next(). start_force(0) basically does
1915 * an unshift, while start_force(-1) does a push. yylex removes items
1916 * on the "pop" end.
1917 */
1918
1919STATIC void
1920S_start_force(pTHX_ int where)
1921{
1922 int i;
1923
cd81e915 1924 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1925 where = PL_lasttoke;
cd81e915
NC
1926 assert(PL_curforce < 0 || PL_curforce == where);
1927 if (PL_curforce != where) {
5db06880
NC
1928 for (i = PL_lasttoke; i > where; --i) {
1929 PL_nexttoke[i] = PL_nexttoke[i-1];
1930 }
1931 PL_lasttoke++;
1932 }
cd81e915 1933 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1934 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1935 PL_curforce = where;
1936 if (PL_nextwhite) {
5db06880 1937 if (PL_madskills)
6b29d1f5 1938 curmad('^', newSVpvs(""));
cd81e915 1939 CURMAD('_', PL_nextwhite);
5db06880
NC
1940 }
1941}
1942
1943STATIC void
1944S_curmad(pTHX_ char slot, SV *sv)
1945{
1946 MADPROP **where;
1947
1948 if (!sv)
1949 return;
cd81e915
NC
1950 if (PL_curforce < 0)
1951 where = &PL_thismad;
5db06880 1952 else
cd81e915 1953 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1954
cd81e915 1955 if (PL_faketokens)
76f68e9b 1956 sv_setpvs(sv, "");
5db06880
NC
1957 else {
1958 if (!IN_BYTES) {
1959 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1960 SvUTF8_on(sv);
1961 else if (PL_encoding) {
1962 sv_recode_to_utf8(sv, PL_encoding);
1963 }
1964 }
1965 }
1966
1967 /* keep a slot open for the head of the list? */
1968 if (slot != '_' && *where && (*where)->mad_key == '^') {
1969 (*where)->mad_key = slot;
daba3364 1970 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1971 (*where)->mad_val = (void*)sv;
1972 }
1973 else
1974 addmad(newMADsv(slot, sv), where, 0);
1975}
1976#else
b3f24c00
MHM
1977# define start_force(where) NOOP
1978# define curmad(slot, sv) NOOP
5db06880
NC
1979#endif
1980
ffb4593c
NT
1981/*
1982 * S_force_next
9cbb5ea2 1983 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1984 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1985 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1986 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1987 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1988 */
1989
4e553d73 1990STATIC void
cea2e8a9 1991S_force_next(pTHX_ I32 type)
79072805 1992{
97aff369 1993 dVAR;
704d4215
GG
1994#ifdef DEBUGGING
1995 if (DEBUG_T_TEST) {
1996 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1997 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1998 }
1999#endif
6c7ae946
FC
2000 /* Don’t let opslab_force_free snatch it */
2001 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
2002 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
2003 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
2004 }
5db06880 2005#ifdef PERL_MAD
cd81e915 2006 if (PL_curforce < 0)
5db06880 2007 start_force(PL_lasttoke);
cd81e915 2008 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2009 if (PL_lex_state != LEX_KNOWNEXT)
2010 PL_lex_defer = PL_lex_state;
2011 PL_lex_state = LEX_KNOWNEXT;
2012 PL_lex_expect = PL_expect;
cd81e915 2013 PL_curforce = -1;
5db06880 2014#else
3280af22
NIS
2015 PL_nexttype[PL_nexttoke] = type;
2016 PL_nexttoke++;
2017 if (PL_lex_state != LEX_KNOWNEXT) {
2018 PL_lex_defer = PL_lex_state;
2019 PL_lex_expect = PL_expect;
2020 PL_lex_state = LEX_KNOWNEXT;
79072805 2021 }
5db06880 2022#endif
79072805
LW
2023}
2024
28ac2b49
Z
2025void
2026Perl_yyunlex(pTHX)
2027{
a7aaec61
Z
2028 int yyc = PL_parser->yychar;
2029 if (yyc != YYEMPTY) {
2030 if (yyc) {
2031 start_force(-1);
2032 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2033 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2034 PL_lex_allbrackets--;
a7aaec61 2035 PL_lex_brackets--;
78cdf107
Z
2036 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2037 } else if (yyc == '('/*)*/) {
2038 PL_lex_allbrackets--;
2039 yyc |= (2<<24);
a7aaec61
Z
2040 }
2041 force_next(yyc);
2042 }
28ac2b49
Z
2043 PL_parser->yychar = YYEMPTY;
2044 }
2045}
2046
d0a148a6 2047STATIC SV *
15f169a1 2048S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2049{
97aff369 2050 dVAR;
740cce10 2051 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2052 !IN_BYTES
2053 && UTF
2054 && !is_ascii_string((const U8*)start, len)
740cce10 2055 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2056 return sv;
2057}
2058
ffb4593c
NT
2059/*
2060 * S_force_word
2061 * When the lexer knows the next thing is a word (for instance, it has
2062 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2063 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2064 * lookahead.
ffb4593c
NT
2065 *
2066 * Arguments:
b1b65b59 2067 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2068 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2069 * int check_keyword : if true, Perl checks to make sure the word isn't
2070 * a keyword (do this if the word is a label, e.g. goto FOO)
2071 * int allow_pack : if true, : characters will also be allowed (require,
2072 * use, etc. do this)
9cbb5ea2 2073 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2074 */
2075
76e3520e 2076STATIC char *
cea2e8a9 2077S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2078{
97aff369 2079 dVAR;
eb578fdb 2080 char *s;
463ee0b2 2081 STRLEN len;
4e553d73 2082
7918f24d
NC
2083 PERL_ARGS_ASSERT_FORCE_WORD;
2084
29595ff2 2085 start = SKIPSPACE1(start);
463ee0b2 2086 s = start;
7e2040f0 2087 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2088 (allow_pack && *s == ':') ||
15f0808c 2089 (allow_initial_tick && *s == '\'') )
a0d0e21e 2090 {
3280af22 2091 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2092 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2093 return start;
cd81e915 2094 start_force(PL_curforce);
5db06880
NC
2095 if (PL_madskills)
2096 curmad('X', newSVpvn(start,s-start));
463ee0b2 2097 if (token == METHOD) {
29595ff2 2098 s = SKIPSPACE1(s);
463ee0b2 2099 if (*s == '(')
3280af22 2100 PL_expect = XTERM;
463ee0b2 2101 else {
3280af22 2102 PL_expect = XOPERATOR;
463ee0b2 2103 }
79072805 2104 }
e74e6b3d 2105 if (PL_madskills)
63575281 2106 curmad('g', newSVpvs( "forced" ));
9ded7720 2107 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2108 = (OP*)newSVOP(OP_CONST,0,
2109 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2110 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2111 force_next(token);
2112 }
2113 return s;
2114}
2115
ffb4593c
NT
2116/*
2117 * S_force_ident
9cbb5ea2 2118 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2119 * text only contains the "foo" portion. The first argument is a pointer
2120 * to the "foo", and the second argument is the type symbol to prefix.
2121 * Forces the next token to be a "WORD".
9cbb5ea2 2122 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2123 */
2124
76e3520e 2125STATIC void
bfed75c6 2126S_force_ident(pTHX_ register const char *s, int kind)
79072805 2127{
97aff369 2128 dVAR;
7918f24d
NC
2129
2130 PERL_ARGS_ASSERT_FORCE_IDENT;
2131
c35e046a 2132 if (*s) {
90e5519e 2133 const STRLEN len = strlen(s);
728847b1
BF
2134 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2135 UTF ? SVf_UTF8 : 0));
cd81e915 2136 start_force(PL_curforce);
9ded7720 2137 NEXTVAL_NEXTTOKE.opval = o;
79072805 2138 force_next(WORD);
748a9306 2139 if (kind) {
11343788 2140 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2141 /* XXX see note in pp_entereval() for why we forgo typo
2142 warnings if the symbol must be introduced in an eval.
2143 GSAR 96-10-12 */
90e5519e 2144 gv_fetchpvn_flags(s, len,
728847b1
BF
2145 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2146 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2147 kind == '$' ? SVt_PV :
2148 kind == '@' ? SVt_PVAV :
2149 kind == '%' ? SVt_PVHV :
a0d0e21e 2150 SVt_PVGV
90e5519e 2151 );
748a9306 2152 }
79072805
LW
2153 }
2154}
2155
1571675a
GS
2156NV
2157Perl_str_to_version(pTHX_ SV *sv)
2158{
2159 NV retval = 0.0;
2160 NV nshift = 1.0;
2161 STRLEN len;
cfd0369c 2162 const char *start = SvPV_const(sv,len);
9d4ba2ae 2163 const char * const end = start + len;
504618e9 2164 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2165
2166 PERL_ARGS_ASSERT_STR_TO_VERSION;
2167
1571675a 2168 while (start < end) {
ba210ebe 2169 STRLEN skip;
1571675a
GS
2170 UV n;
2171 if (utf)
9041c2e3 2172 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2173 else {
2174 n = *(U8*)start;
2175 skip = 1;
2176 }
2177 retval += ((NV)n)/nshift;
2178 start += skip;
2179 nshift *= 1000;
2180 }
2181 return retval;
2182}
2183
4e553d73 2184/*
ffb4593c
NT
2185 * S_force_version
2186 * Forces the next token to be a version number.
e759cc13
RGS
2187 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2188 * and if "guessing" is TRUE, then no new token is created (and the caller
2189 * must use an alternative parsing method).
ffb4593c
NT
2190 */
2191
76e3520e 2192STATIC char *
e759cc13 2193S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2194{
97aff369 2195 dVAR;
5f66b61c 2196 OP *version = NULL;
44dcb63b 2197 char *d;
5db06880
NC
2198#ifdef PERL_MAD
2199 I32 startoff = s - SvPVX(PL_linestr);
2200#endif
89bfa8cd 2201
7918f24d
NC
2202 PERL_ARGS_ASSERT_FORCE_VERSION;
2203
29595ff2 2204 s = SKIPSPACE1(s);
89bfa8cd 2205
44dcb63b 2206 d = s;
dd629d5b 2207 if (*d == 'v')
44dcb63b 2208 d++;
44dcb63b 2209 if (isDIGIT(*d)) {
e759cc13
RGS
2210 while (isDIGIT(*d) || *d == '_' || *d == '.')
2211 d++;
5db06880
NC
2212#ifdef PERL_MAD
2213 if (PL_madskills) {
cd81e915 2214 start_force(PL_curforce);
5db06880
NC
2215 curmad('X', newSVpvn(s,d-s));
2216 }
2217#endif
4e4da3ac 2218 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2219 SV *ver;
8d08d9ba 2220#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2221 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2222 setlocale(LC_NUMERIC, "C");
8d08d9ba 2223#endif
6154021b 2224 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2225#ifdef USE_LOCALE_NUMERIC
2226 setlocale(LC_NUMERIC, loc);
909d3787 2227 Safefree(loc);
8d08d9ba 2228#endif
6154021b 2229 version = pl_yylval.opval;
dd629d5b
GS
2230 ver = cSVOPx(version)->op_sv;
2231 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2232 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2233 SvNV_set(ver, str_to_version(ver));
1571675a 2234 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2235 }
89bfa8cd 2236 }
5db06880
NC
2237 else if (guessing) {
2238#ifdef PERL_MAD
2239 if (PL_madskills) {
cd81e915
NC
2240 sv_free(PL_nextwhite); /* let next token collect whitespace */
2241 PL_nextwhite = 0;
5db06880
NC
2242 s = SvPVX(PL_linestr) + startoff;
2243 }
2244#endif
e759cc13 2245 return s;
5db06880 2246 }
89bfa8cd
PP
2247 }
2248
5db06880
NC
2249#ifdef PERL_MAD
2250 if (PL_madskills && !version) {
cd81e915
NC
2251 sv_free(PL_nextwhite); /* let next token collect whitespace */
2252 PL_nextwhite = 0;
5db06880
NC
2253 s = SvPVX(PL_linestr) + startoff;
2254 }
2255#endif
89bfa8cd 2256 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2257 start_force(PL_curforce);
9ded7720 2258 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2259 force_next(WORD);
89bfa8cd 2260
e759cc13 2261 return s;
89bfa8cd
PP
2262}
2263
ffb4593c 2264/*
91152fc1
DG
2265 * S_force_strict_version
2266 * Forces the next token to be a version number using strict syntax rules.
2267 */
2268
2269STATIC char *
2270S_force_strict_version(pTHX_ char *s)
2271{
2272 dVAR;
2273 OP *version = NULL;
2274#ifdef PERL_MAD
2275 I32 startoff = s - SvPVX(PL_linestr);
2276#endif
2277 const char *errstr = NULL;
2278
2279 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2280
2281 while (isSPACE(*s)) /* leading whitespace */
2282 s++;
2283
2284 if (is_STRICT_VERSION(s,&errstr)) {
2285 SV *ver = newSV(0);
2286 s = (char *)scan_version(s, ver, 0);
2287 version = newSVOP(OP_CONST, 0, ver);
2288 }
4e4da3ac
Z
2289 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2290 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2291 {
91152fc1
DG
2292 PL_bufptr = s;
2293 if (errstr)
2294 yyerror(errstr); /* version required */
2295 return s;
2296 }
2297
2298#ifdef PERL_MAD
2299 if (PL_madskills && !version) {
2300 sv_free(PL_nextwhite); /* let next token collect whitespace */
2301 PL_nextwhite = 0;
2302 s = SvPVX(PL_linestr) + startoff;
2303 }
2304#endif
2305 /* NOTE: The parser sees the package name and the VERSION swapped */
2306 start_force(PL_curforce);
2307 NEXTVAL_NEXTTOKE.opval = version;
2308 force_next(WORD);
2309
2310 return s;
2311}
2312
2313/*
ffb4593c
NT
2314 * S_tokeq
2315 * Tokenize a quoted string passed in as an SV. It finds the next
2316 * chunk, up to end of string or a backslash. It may make a new
2317 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2318 * turns \\ into \.
2319 */
2320
76e3520e 2321STATIC SV *
cea2e8a9 2322S_tokeq(pTHX_ SV *sv)
79072805 2323{
97aff369 2324 dVAR;
eb578fdb
KW
2325 char *s;
2326 char *send;
2327 char *d;
b3ac6de7
IZ
2328 STRLEN len = 0;
2329 SV *pv = sv;
79072805 2330
7918f24d
NC
2331 PERL_ARGS_ASSERT_TOKEQ;
2332
79072805 2333 if (!SvLEN(sv))
b3ac6de7 2334 goto finish;
79072805 2335
a0d0e21e 2336 s = SvPV_force(sv, len);
21a311ee 2337 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2338 goto finish;
463ee0b2 2339 send = s + len;
dcb21ed6
NC
2340 /* This is relying on the SV being "well formed" with a trailing '\0' */
2341 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2342 s++;
2343 if (s == send)
b3ac6de7 2344 goto finish;
79072805 2345 d = s;
be4731d2 2346 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2347 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2348 }
79072805
LW
2349 while (s < send) {
2350 if (*s == '\\') {
a0d0e21e 2351 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2352 s++; /* all that, just for this */
2353 }
2354 *d++ = *s++;
2355 }
2356 *d = '\0';
95a20fc0 2357 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2358 finish:
3280af22 2359 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2360 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2361 return sv;
2362}
2363
ffb4593c
NT
2364/*
2365 * Now come three functions related to double-quote context,
2366 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2367 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2368 * interact with PL_lex_state, and create fake ( ... ) argument lists
2369 * to handle functions and concatenation.
ecd24171
DM
2370 * For example,
2371 * "foo\lbar"
2372 * is tokenised as
2373 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2374 */
2375
2376/*
2377 * S_sublex_start
6154021b 2378 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2379 *
2380 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2381 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2382 *
2383 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2384 *
2385 * Everything else becomes a FUNC.
2386 *
2387 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2388 * had an OP_CONST or OP_READLINE). This just sets us up for a
2389 * call to S_sublex_push().
2390 */
2391
76e3520e 2392STATIC I32
cea2e8a9 2393S_sublex_start(pTHX)
79072805 2394{
97aff369 2395 dVAR;
eb578fdb 2396 const I32 op_type = pl_yylval.ival;
79072805
LW
2397
2398 if (op_type == OP_NULL) {
6154021b 2399 pl_yylval.opval = PL_lex_op;
5f66b61c 2400 PL_lex_op = NULL;
79072805
LW
2401 return THING;
2402 }
2403 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2404 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2405
2406 if (SvTYPE(sv) == SVt_PVIV) {
2407 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2408 STRLEN len;
96a5add6 2409 const char * const p = SvPV_const(sv, len);
740cce10 2410 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2411 SvREFCNT_dec(sv);
2412 sv = nsv;
4e553d73 2413 }
6154021b 2414 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2415 PL_lex_stuff = NULL;
6f33ba73
RGS
2416 /* Allow <FH> // "foo" */
2417 if (op_type == OP_READLINE)
2418 PL_expect = XTERMORDORDOR;
79072805
LW
2419 return THING;
2420 }
e3f73d4e
RGS
2421 else if (op_type == OP_BACKTICK && PL_lex_op) {
2422 /* readpipe() vas overriden */
2423 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2424 pl_yylval.opval = PL_lex_op;
9b201d7d 2425 PL_lex_op = NULL;
e3f73d4e
RGS
2426 PL_lex_stuff = NULL;
2427 return THING;
2428 }
79072805 2429
3280af22 2430 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2431 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2432 PL_sublex_info.sub_op = PL_lex_op;
2433 PL_lex_state = LEX_INTERPPUSH;
55497cff 2434
3280af22
NIS
2435 PL_expect = XTERM;
2436 if (PL_lex_op) {
6154021b 2437 pl_yylval.opval = PL_lex_op;
5f66b61c 2438 PL_lex_op = NULL;
55497cff
PP
2439 return PMFUNC;
2440 }
2441 else
2442 return FUNC;
2443}
2444
ffb4593c
NT
2445/*
2446 * S_sublex_push
2447 * Create a new scope to save the lexing state. The scope will be
2448 * ended in S_sublex_done. Returns a '(', starting the function arguments
2449 * to the uc, lc, etc. found before.
2450 * Sets PL_lex_state to LEX_INTERPCONCAT.
2451 */
2452
76e3520e 2453STATIC I32
cea2e8a9 2454S_sublex_push(pTHX)
55497cff 2455{
27da23d5 2456 dVAR;
78a635de 2457 LEXSHARED *shared;
f46d017c 2458 ENTER;
55497cff 2459
3280af22 2460 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2461 SAVEBOOL(PL_lex_dojoin);
3280af22 2462 SAVEI32(PL_lex_brackets);
78cdf107 2463 SAVEI32(PL_lex_allbrackets);
b27dce25 2464 SAVEI32(PL_lex_formbrack);
78cdf107 2465 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2466 SAVEI32(PL_lex_casemods);
2467 SAVEI32(PL_lex_starts);
651b5b28 2468 SAVEI8(PL_lex_state);
7cc34111 2469 SAVESPTR(PL_lex_repl);
7766f137 2470 SAVEVPTR(PL_lex_inpat);
98246f1e 2471 SAVEI16(PL_lex_inwhat);
57843af0 2472 SAVECOPLINE(PL_curcop);
3280af22 2473 SAVEPPTR(PL_bufptr);
8452ff4b 2474 SAVEPPTR(PL_bufend);
3280af22
NIS
2475 SAVEPPTR(PL_oldbufptr);
2476 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2477 SAVEPPTR(PL_last_lop);
2478 SAVEPPTR(PL_last_uni);
3280af22
NIS
2479 SAVEPPTR(PL_linestart);
2480 SAVESPTR(PL_linestr);
8edd5f42
RGS
2481 SAVEGENERICPV(PL_lex_brackstack);
2482 SAVEGENERICPV(PL_lex_casestack);
78a635de 2483 SAVEGENERICPV(PL_parser->lex_shared);
3280af22 2484
99bd9d90 2485 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2486 scopes to find the body of the here-doc. So we put PL_linestr and
2487 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2488 */
60f40a38
FC
2489 PL_parser->lex_shared->ls_linestr = PL_linestr;
2490 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2491
3280af22 2492 PL_linestr = PL_lex_stuff;
7cc34111 2493 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2494 PL_lex_stuff = NULL;
7cc34111 2495 PL_sublex_info.repl = NULL;
3280af22 2496
9cbb5ea2
GS
2497 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2498 = SvPVX(PL_linestr);
3280af22 2499 PL_bufend += SvCUR(PL_linestr);
bd61b366 2500 PL_last_lop = PL_last_uni = NULL;
3280af22 2501 SAVEFREESV(PL_linestr);
4dc843bc 2502 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2503
2504 PL_lex_dojoin = FALSE;
b27dce25 2505 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2506 PL_lex_allbrackets = 0;
2507 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2508 Newx(PL_lex_brackstack, 120, char);
2509 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2510 PL_lex_casemods = 0;
2511 *PL_lex_casestack = '\0';
2512 PL_lex_starts = 0;
2513 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2514 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
78a635de
FC
2515
2516 Newxz(shared, 1, LEXSHARED);
2517 shared->ls_prev = PL_parser->lex_shared;
2518 PL_parser->lex_shared = shared;
3280af22
NIS
2519
2520 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2521 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2522 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2523 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2524 else
5f66b61c 2525 PL_lex_inpat = NULL;
79072805 2526
55497cff 2527 return '(';
79072805
LW
2528}
2529
ffb4593c
NT
2530/*
2531 * S_sublex_done
2532 * Restores lexer state after a S_sublex_push.
2533 */
2534
76e3520e 2535STATIC I32
cea2e8a9 2536S_sublex_done(pTHX)
79072805 2537{
27da23d5 2538 dVAR;
3280af22 2539 if (!PL_lex_starts++) {
396482e1 2540 SV * const sv = newSVpvs("");
9aa983d2
JH
2541 if (SvUTF8(PL_linestr))
2542 SvUTF8_on(sv);
3280af22 2543 PL_expect = XOPERATOR;
6154021b 2544 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2545 return THING;
2546 }
2547
3280af22
NIS
2548 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2549 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2550 return yylex();
79072805
LW
2551 }
2552
ffb4593c 2553 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2554 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2555 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2556 PL_linestr = PL_lex_repl;
2557 PL_lex_inpat = 0;
2558 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2559 PL_bufend += SvCUR(PL_linestr);
bd61b366 2560 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2561 PL_lex_dojoin = FALSE;
2562 PL_lex_brackets = 0;
78cdf107
Z
2563 PL_lex_allbrackets = 0;
2564 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2565 PL_lex_casemods = 0;
2566 *PL_lex_casestack = '\0';
2567 PL_lex_starts = 0;
25da4f38 2568 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2569 PL_lex_state = LEX_INTERPNORMAL;
2570 PL_lex_starts++;
e9fa98b2
HS
2571 /* we don't clear PL_lex_repl here, so that we can check later
2572 whether this is an evalled subst; that means we rely on the
2573 logic to ensure sublex_done() is called again only via the
2574 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2575 }
e9fa98b2 2576 else {
3280af22 2577 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2578 PL_lex_repl = NULL;
e9fa98b2 2579 }
79072805 2580 return ',';
ffed7fef
LW
2581 }
2582 else {
5db06880
NC
2583#ifdef PERL_MAD
2584 if (PL_madskills) {
cd81e915
NC
2585 if (PL_thiswhite) {
2586 if (!PL_endwhite)
6b29d1f5 2587 PL_endwhite = newSVpvs("");
cd81e915
NC
2588 sv_catsv(PL_endwhite, PL_thiswhite);
2589 PL_thiswhite = 0;
2590 }
2591 if (PL_thistoken)
76f68e9b 2592 sv_setpvs(PL_thistoken,"");
5db06880 2593 else
cd81e915 2594 PL_realtokenstart = -1;
5db06880
NC
2595 }
2596#endif
f46d017c 2597 LEAVE;
3280af22
NIS
2598 PL_bufend = SvPVX(PL_linestr);
2599 PL_bufend += SvCUR(PL_linestr);
2600 PL_expect = XOPERATOR;
09bef843 2601 PL_sublex_info.sub_inwhat = 0;
79072805 2602 return ')';
ffed7fef
LW
2603 }
2604}
2605
02aa26ce
NT
2606/*
2607 scan_const
2608
9da1dd8f
DM
2609 Extracts the next constant part of a pattern, double-quoted string,
2610 or transliteration. This is terrifying code.
2611
2612 For example, in parsing the double-quoted string "ab\x63$d", it would
2613 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2614
94def140 2615 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2616 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2617 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2618
94def140
ST
2619 Returns a pointer to the character scanned up to. If this is
2620 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2621 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2622 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2623 by looking at the next characters herself.
2624
02aa26ce 2625 In patterns:
9da1dd8f
DM
2626 expand:
2627 \N{ABC} => \N{U+41.42.43}
2628
2629 pass through:
2630 all other \-char, including \N and \N{ apart from \N{ABC}
2631
2632 stops on:
2633 @ and $ where it appears to be a var, but not for $ as tail anchor
2634 \l \L \u \U \Q \E
2635 (?{ or (??{
2636
02aa26ce
NT
2637
2638 In transliterations:
2639 characters are VERY literal, except for - not at the start or end
94def140
ST
2640 of the string, which indicates a range. If the range is in bytes,
2641 scan_const expands the range to the full set of intermediate
2642 characters. If the range is in utf8, the hyphen is replaced with
2643 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2644
2645 In double-quoted strings:
2646 backslashes:
2647 double-quoted style: \r and \n
ff3f963a 2648 constants: \x31, etc.
94def140 2649 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2650 case and quoting: \U \Q \E
2651 stops on @ and $
2652
2653 scan_const does *not* construct ops to handle interpolated strings.
2654 It stops processing as soon as it finds an embedded $ or @ variable
2655 and leaves it to the caller to work out what's going on.
2656
94def140
ST
2657 embedded arrays (whether in pattern or not) could be:
2658 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2659
2660 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2661
2662 $ in pattern could be $foo or could be tail anchor. Assumption:
2663 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2664 followed by one of "()| \r\n\t"
02aa26ce 2665
9da1dd8f 2666 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2667
2668 The structure of the code is
2669 while (there's a character to process) {
94def140
ST
2670 handle transliteration ranges
2671 skip regexp comments /(?#comment)/ and codes /(?{code})/
2672 skip #-initiated comments in //x patterns
2673 check for embedded arrays
02aa26ce
NT
2674 check for embedded scalars
2675 if (backslash) {
94def140 2676 deprecate \1 in substitution replacements
02aa26ce
NT
2677 handle string-changing backslashes \l \U \Q \E, etc.
2678 switch (what was escaped) {
94def140 2679 handle \- in a transliteration (becomes a literal -)
ff3f963a 2680 if a pattern and not \N{, go treat as regular character
94def140
ST
2681 handle \132 (octal characters)
2682 handle \x15 and \x{1234} (hex characters)
ff3f963a 2683 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2684 handle \cV (control characters)
2685 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2686 } (end switch)
77a135fe 2687 continue
02aa26ce 2688 } (end if backslash)
77a135fe 2689 handle regular character
02aa26ce 2690 } (end while character to read)
4e553d73 2691
02aa26ce
NT
2692*/
2693
76e3520e 2694STATIC char *
cea2e8a9 2695S_scan_const(pTHX_ char *start)
79072805 2696{
97aff369 2697 dVAR;
eb578fdb 2698 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2699 SV *sv = newSV(send - start); /* sv for the constant. See
2700 note below on sizing. */
eb578fdb
KW
2701 char *s = start; /* start of the constant */
2702 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2703 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2704 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2705 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2706 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2707 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2708 to be UTF8? But, this can
2709 show as true when the source
2710 isn't utf8, as for example
2711 when it is entirely composed
2712 of hex constants */
2713
2714 /* Note on sizing: The scanned constant is placed into sv, which is
2715 * initialized by newSV() assuming one byte of output for every byte of
2716 * input. This routine expects newSV() to allocate an extra byte for a
2717 * trailing NUL, which this routine will append if it gets to the end of
2718 * the input. There may be more bytes of input than output (eg., \N{LATIN
2719 * CAPITAL LETTER A}), or more output than input if the constant ends up
2720 * recoded to utf8, but each time a construct is found that might increase
2721 * the needed size, SvGROW() is called. Its size parameter each time is
2722 * based on the best guess estimate at the time, namely the length used so
2723 * far, plus the length the current construct will occupy, plus room for
2724 * the trailing NUL, plus one byte for every input byte still unscanned */
2725
012bcf8d 2726 UV uv;
4c3a8340
ST
2727#ifdef EBCDIC
2728 UV literal_endpoint = 0;
e294cc5d 2729 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2730#endif
012bcf8d 2731
7918f24d
NC
2732 PERL_ARGS_ASSERT_SCAN_CONST;
2733
bb16bae8 2734 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2735 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2736 /* If we are doing a trans and we know we want UTF8 set expectation */
2737 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2738 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2739 }
2740
2741
79072805 2742 while (s < send || dorange) {
ff3f963a 2743
02aa26ce 2744 /* get transliterations out of the way (they're most literal) */
3280af22 2745 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2746 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2747 if (dorange) {
1ba5c669
JH
2748 I32 i; /* current expanded character */
2749 I32 min; /* first character in range */
2750 I32 max; /* last character in range */
02aa26ce 2751
e294cc5d
JH
2752#ifdef EBCDIC
2753 UV uvmax = 0;
2754#endif
2755
2756 if (has_utf8
2757#ifdef EBCDIC
2758 && !native_range
2759#endif
2760 ) {
9d4ba2ae 2761 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2762 char *e = d++;
2763 while (e-- > c)
2764 *(e + 1) = *e;
25716404 2765 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2766 /* mark the range as done, and continue */
2767 dorange = FALSE;
2768 didrange = TRUE;
2769 continue;
2770 }
2b9d42f0 2771
95a20fc0 2772 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2773#ifdef EBCDIC
2774 SvGROW(sv,
2775 SvLEN(sv) + (has_utf8 ?
2776 (512 - UTF_CONTINUATION_MARK +
2777 UNISKIP(0x100))
2778 : 256));
2779 /* How many two-byte within 0..255: 128 in UTF-8,
2780 * 96 in UTF-8-mod. */
2781#else
9cbb5ea2 2782 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2783#endif
9cbb5ea2 2784 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2785#ifdef EBCDIC
2786 if (has_utf8) {
2787 int j;
2788 for (j = 0; j <= 1; j++) {
2789 char * const c = (char*)utf8_hop((U8*)d, -1);
2790 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2791 if (j)
2792 min = (U8)uv;
2793 else if (uv < 256)
2794 max = (U8)uv;
2795 else {
2796 max = (U8)0xff; /* only to \xff */
2797 uvmax = uv; /* \x{100} to uvmax */
2798 }
2799 d = c; /* eat endpoint chars */
2800 }
2801 }
2802 else {
2803#endif
2804 d -= 2; /* eat the first char and the - */
2805 min = (U8)*d; /* first char in range */
2806 max = (U8)d[1]; /* last char in range */
2807#ifdef EBCDIC
2808 }
2809#endif
8ada0baa 2810
c2e66d9e 2811 if (min > max) {
4dc843bc 2812 SvREFCNT_dec(sv);
01ec43d0 2813 Perl_croak(aTHX_
d1573ac7 2814 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2815 (char)min, (char)max);
c2e66d9e
GS
2816 }
2817
c7f1f016 2818#ifdef EBCDIC
4c3a8340
ST
2819 if (literal_endpoint == 2 &&
2820 ((isLOWER(min) && isLOWER(max)) ||
2821 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2822 if (isLOWER(min)) {
2823 for (i = min; i <= max; i++)
2824 if (isLOWER(i))
db42d148 2825 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2826 } else {
2827 for (i = min; i <= max; i++)
2828 if (isUPPER(i))
db42d148 2829 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2830 }
2831 }
2832 else
2833#endif
2834 for (i = min; i <= max; i++)
e294cc5d
JH
2835#ifdef EBCDIC
2836 if (has_utf8) {
2837 const U8 ch = (U8)NATIVE_TO_UTF(i);
2838 if (UNI_IS_INVARIANT(ch))
2839 *d++ = (U8)i;
2840 else {
2841 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2842 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2843 }
2844 }
2845 else
2846#endif
2847 *d++ = (char)i;
2848
2849#ifdef EBCDIC
2850 if (uvmax) {
2851 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2852 if (uvmax > 0x101)
2853 *d++ = (char)UTF_TO_NATIVE(0xff);
2854 if (uvmax > 0x100)
2855 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2856 }
2857#endif
02aa26ce
NT
2858
2859 /* mark the range as done, and continue */
79072805 2860 dorange = FALSE;
01ec43d0 2861 didrange = TRUE;
4c3a8340
ST
2862#ifdef EBCDIC
2863 literal_endpoint = 0;
2864#endif
79072805 2865 continue;
4e553d73 2866 }
02aa26ce
NT
2867
2868 /* range begins (ignore - as first or last char) */
79072805 2869 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2870 if (didrange) {
4dc843bc 2871 SvREFCNT_dec(sv);
1fafa243 2872 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2873 }
e294cc5d
JH
2874 if (has_utf8
2875#ifdef EBCDIC
2876 && !native_range
2877#endif
2878 ) {
25716404 2879 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2880 s++;
2881 continue;
2882 }
79072805
LW
2883 dorange = TRUE;
2884 s++;
01ec43d0
GS
2885 }
2886 else {
2887 didrange = FALSE;
4c3a8340
ST
2888#ifdef EBCDIC
2889 literal_endpoint = 0;
e294cc5d 2890 native_range = TRUE;
4c3a8340 2891#endif
01ec43d0 2892 }
79072805 2893 }
02aa26ce
NT
2894
2895 /* if we get here, we're not doing a transliteration */
2896
e4a2df84
DM
2897 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2898 char *s1 = s-1;
2899 int esc = 0;
2900 while (s1 >= start && *s1-- == '\\')
2901 esc = !esc;
2902 if (!esc)
2903 in_charclass = TRUE;
2904 }
2866decb 2905
e4a2df84
DM
2906 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2907 char *s1 = s-1;
2908 int esc = 0;
2909 while (s1 >= start && *s1-- == '\\')
2910 esc = !esc;
2911 if (!esc)
2912 in_charclass = FALSE;
2913 }
2866decb 2914
9da1dd8f
DM
2915 /* skip for regexp comments /(?#comment)/, except for the last
2916 * char, which will be done separately.
2917 * Stop on (?{..}) and friends */
2918
3280af22 2919 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2920 if (s[2] == '#') {
e994fd66 2921 while (s+1 < send && *s != ')')
db42d148 2922 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2923 }
2866decb 2924 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2925 ( s[2] == '{' /* This should match regcomp.c */
2926 || (s[2] == '?' && s[3] == '{')))
155aba94 2927 {
9da1dd8f 2928 break;
cc6b7395 2929 }
748a9306 2930 }
02aa26ce
NT
2931
2932 /* likewise skip #-initiated comments in //x patterns */
3280af22 2933 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2934 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2935 while (s+1 < send && *s != '\n')
db42d148 2936 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2937 }
02aa26ce 2938
9da1dd8f
DM
2939 /* no further processing of single-quoted regex */
2940 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2941 goto default_action;
2942
5d1d4326 2943 /* check for embedded arrays
da6eedaa 2944 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2945 */
1749ea0d
ST
2946 else if (*s == '@' && s[1]) {
2947 if (isALNUM_lazy_if(s+1,UTF))
2948 break;
2949 if (strchr(":'{$", s[1]))
2950 break;
2951 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2952 break; /* in regexp, neither @+ nor @- are interpolated */
2953 }
02aa26ce
NT
2954
2955 /* check for embedded scalars. only stop if we're sure it's a
2956 variable.
2957 */
79072805 2958 else if (*s == '$') {
3280af22 2959 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2960 break;
77772344 2961 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2962 if (s[1] == '\\') {
2963 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2964 "Possible unintended interpolation of $\\ in regex");
77772344 2965 }
79072805 2966 break; /* in regexp, $ might be tail anchor */
77772344 2967 }
79072805 2968 }
02aa26ce 2969
2b9d42f0
NIS
2970 /* End of else if chain - OP_TRANS rejoin rest */
2971
02aa26ce 2972 /* backslashes */
79072805 2973 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2974 char* e; /* Can be used for ending '}', etc. */
2975
79072805 2976 s++;
02aa26ce 2977
7d0fc23c
KW
2978 /* warn on \1 - \9 in substitution replacements, but note that \11
2979 * is an octal; and \19 is \1 followed by '9' */
3280af22 2980 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2981 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2982 {
a2a5de95 2983 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2984 *--s = '$';
2985 break;
2986 }
02aa26ce
NT
2987
2988 /* string-change backslash escapes */
838f2281 2989 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
2990 --s;
2991 break;
2992 }
ff3f963a
KW
2993 /* In a pattern, process \N, but skip any other backslash escapes.
2994 * This is because we don't want to translate an escape sequence
2995 * into a meta symbol and have the regex compiler use the meta
2996 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2997 * in spite of this, we do have to process \N here while the proper
2998 * charnames handler is in scope. See bugs #56444 and #62056.
2999 * There is a complication because \N in a pattern may also stand
3000 * for 'match a non-nl', and not mean a charname, in which case its
3001 * processing should be deferred to the regex compiler. To be a
3002 * charname it must be followed immediately by a '{', and not look
3003 * like \N followed by a curly quantifier, i.e., not something like
3004 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3005 * quantifier */
3006 else if (PL_lex_inpat
3007 && (*s != 'N'
3008 || s[1] != '{'
3009 || regcurly(s + 1)))
3010 {
cc74c5bd
ST
3011 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3012 goto default_action;
3013 }
02aa26ce 3014
79072805 3015 switch (*s) {
02aa26ce
NT
3016
3017 /* quoted - in transliterations */
79072805 3018 case '-':
3280af22 3019 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3020 *d++ = *s++;
3021 continue;
3022 }
3023 /* FALL THROUGH */
3024 default:
11b8faa4 3025 {
e4ca4584 3026 if ((isALNUMC(*s)))
a2a5de95
NC
3027 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3028 "Unrecognized escape \\%c passed through",
3029 *s);
11b8faa4 3030 /* default action is to copy the quoted character */
f9a63242 3031 goto default_action;
11b8faa4 3032 }
02aa26ce 3033
632403cc 3034 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3035 case '0': case '1': case '2': case '3':
3036 case '4': case '5': case '6': case '7':
ba210ebe 3037 {
53305cf1
NC
3038 I32 flags = 0;
3039 STRLEN len = 3;
77a135fe 3040 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3041 s += len;
3042 }
012bcf8d 3043 goto NUM_ESCAPE_INSERT;
02aa26ce 3044
f0a2b745
KW
3045 /* eg. \o{24} indicates the octal constant \024 */
3046 case 'o':
3047 {
3048 STRLEN len;
454155d9 3049 const char* error;
f0a2b745 3050
454155d9 3051 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3052 s += len;
454155d9 3053 if (! valid) {
f0a2b745
KW
3054 yyerror(error);
3055 continue;
3056 }
3057 goto NUM_ESCAPE_INSERT;
3058 }
3059
77a135fe 3060 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3061 case 'x':
a0481293 3062 {
53305cf1 3063 STRLEN len;
a0481293 3064 const char* error;
355860ce 3065
a0481293
KW
3066 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3067 s += len;
3068 if (! valid) {
3069 yyerror(error);
355860ce 3070 continue;
ba210ebe 3071 }
012bcf8d
GS
3072 }
3073
3074 NUM_ESCAPE_INSERT:
ff3f963a
KW
3075 /* Insert oct or hex escaped character. There will always be
3076 * enough room in sv since such escapes will be longer than any
3077 * UTF-8 sequence they can end up as, except if they force us
3078 * to recode the rest of the string into utf8 */
ba7cea30 3079
77a135fe 3080 /* Here uv is the ordinal of the next character being added in
ff3f963a 3081 * unicode (converted from native). */
77a135fe 3082 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3083 if (!has_utf8 && uv > 255) {
77a135fe
KW
3084 /* Might need to recode whatever we have accumulated so
3085 * far if it contains any chars variant in utf8 or
3086 * utf-ebcdic. */
3087
3088 SvCUR_set(sv, d - SvPVX_const(sv));
3089 SvPOK_on(sv);
3090 *d = '\0';
77a135fe 3091 /* See Note on sizing above. */
7bf79863
KW
3092 sv_utf8_upgrade_flags_grow(sv,
3093 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3094 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3095 d = SvPVX(sv) + SvCUR(sv);
3096 has_utf8 = TRUE;
012bcf8d
GS
3097 }
3098
77a135fe
KW
3099 if (has_utf8) {
3100 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3101 if (PL_lex_inwhat == OP_TRANS &&
3102 PL_sublex_info.sub_op) {
3103 PL_sublex_info.sub_op->op_private |=
3104 (PL_lex_repl ? OPpTRANS_FROM_UTF
3105 : OPpTRANS_TO_UTF);
f9a63242 3106 }
e294cc5d
JH
3107#ifdef EBCDIC
3108 if (uv > 255 && !dorange)
3109 native_range = FALSE;
3110#endif
012bcf8d 3111 }
a0ed51b3 3112 else {
012bcf8d 3113 *d++ = (char)uv;
a0ed51b3 3114 }
012bcf8d
GS
3115 }
3116 else {
c4d5f83a 3117 *d++ = (char) uv;
a0ed51b3 3118 }
79072805 3119 continue;
02aa26ce 3120
4a2d328f 3121 case 'N':
ff3f963a
KW
3122 /* In a non-pattern \N must be a named character, like \N{LATIN
3123 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3124 * mean to match a non-newline. For non-patterns, named
3125 * characters are converted to their string equivalents. In
3126 * patterns, named characters are not converted to their
3127 * ultimate forms for the same reasons that other escapes
3128 * aren't. Instead, they are converted to the \N{U+...} form
3129 * to get the value from the charnames that is in effect right
3130 * now, while preserving the fact that it was a named character
3131 * so that the regex compiler knows this */
3132
3133 /* This section of code doesn't generally use the
3134 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3135 * a close examination of this macro and determined it is a
3136 * no-op except on utfebcdic variant characters. Every
3137 * character generated by this that would normally need to be
3138 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3139 * needed, and would complicate use of copy(). XXX There are
3140 * other parts of this file where the macro is used
3141 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3142
3143 /* The structure of this section of code (besides checking for
3144 * errors and upgrading to utf8) is:
3145 * Further disambiguate between the two meanings of \N, and if
3146 * not a charname, go process it elsewhere
0a96133f
KW
3147 * If of form \N{U+...}, pass it through if a pattern;
3148 * otherwise convert to utf8
3149 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3150 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3151
3152 /* Here, s points to the 'N'; the test below is guaranteed to
3153 * succeed if we are being called on a pattern as we already
3154 * know from a test above that the next character is a '{'.
3155 * On a non-pattern \N must mean 'named sequence, which
3156 * requires braces */
3157 s++;
3158 if (*s != '{') {
3159 yyerror("Missing braces on \\N{}");
3160 continue;
3161 }
3162 s++;
3163
0a96133f 3164 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3165 if (! (e = strchr(s, '}'))) {
3166 if (! PL_lex_inpat) {
5777a3f7 3167 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3168 } else {
3169 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3170 }
0a96133f 3171 continue;
ff3f963a 3172 }
cddc7ef4 3173
ff3f963a 3174 /* Here it looks like a named character */
cddc7ef4 3175
ff3f963a
KW
3176 if (PL_lex_inpat) {
3177
3178 /* XXX This block is temporary code. \N{} implies that the
3179 * pattern is to have Unicode semantics, and therefore
3180 * currently has to be encoded in utf8. By putting it in
3181 * utf8 now, we save a whole pass in the regular expression
3182 * compiler. Once that code is changed so Unicode
3183 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3184 * block should be removed. However, the code that parses
3185 * the output of this would have to be changed to not
3186 * necessarily expect utf8 */
ff3f963a 3187 if (!has_utf8) {
77a135fe 3188 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3189 SvPOK_on(sv);
e4f3eed8 3190 *d = '\0';
77a135fe 3191 /* See Note on sizing above. */
7bf79863 3192 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3193 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3194 /* 5 = '\N{' + cur char + NUL */
3195 (STRLEN)(send - s) + 5);
f08d6ad9 3196 d = SvPVX(sv) + SvCUR(sv);
89491803 3197 has_utf8 = TRUE;
ff3f963a
KW
3198 }
3199 }
423cee85 3200
ff3f963a
KW
3201 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3202 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3203 | PERL_SCAN_DISALLOW_PREFIX;
3204 STRLEN len;
3205
3206 /* For \N{U+...}, the '...' is a unicode value even on
3207 * EBCDIC machines */
3208 s += 2; /* Skip to next char after the 'U+' */
3209 len = e - s;
3210 uv = grok_hex(s, &len, &flags, NULL);
3211 if (len == 0 || len != (STRLEN)(e - s)) {
3212 yyerror("Invalid hexadecimal number in \\N{U+...}");
3213 s = e + 1;
3214 continue;
3215 }
3216
3217 if (PL_lex_inpat) {
3218
e2a7e165
KW
3219 /* On non-EBCDIC platforms, pass through to the regex
3220 * compiler unchanged. The reason we evaluated the
3221 * number above is to make sure there wasn't a syntax
3222 * error. But on EBCDIC we convert to native so
3223 * downstream code can continue to assume it's native
3224 */
ff3f963a 3225 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3226#ifdef EBCDIC
3227 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3228 and the \0 */
3229 "\\N{U+%X}",
3230 (unsigned int) UNI_TO_NATIVE(uv));
3231#else
ff3f963a
KW
3232 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3233 d += e - s + 1;
e2a7e165 3234#endif
ff3f963a
KW
3235 }
3236 else { /* Not a pattern: convert the hex to string */
3237
3238 /* If destination is not in utf8, unconditionally
3239 * recode it to be so. This is because \N{} implies
3240 * Unicode semantics, and scalars have to be in utf8
3241 * to guarantee those semantics */
3242 if (! has_utf8) {
3243 SvCUR_set(sv, d - SvPVX_const(sv));
3244 SvPOK_on(sv);
3245 *d = '\0';
3246 /* See Note on sizing above. */
3247 sv_utf8_upgrade_flags_grow(
3248 sv,
3249 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3250 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3251 d = SvPVX(sv) + SvCUR(sv);
3252 has_utf8 = TRUE;
3253 }
3254
3255 /* Add the string to the output */
3256 if (UNI_IS_INVARIANT(uv)) {
3257 *d++ = (char) uv;
3258 }
3259 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3260 }
3261 }
3262 else { /* Here is \N{NAME} but not \N{U+...}. */
3263
3264 SV *res; /* result from charnames */
3265 const char *str; /* the string in 'res' */
3266 STRLEN len; /* its length */
3267
3268 /* Get the value for NAME */
3269 res = newSVpvn(s, e - s);
3270 res = new_constant( NULL, 0, "charnames",
3271 /* includes all of: \N{...} */
3272 res, NULL, s - 3, e - s + 4 );
3273
3274 /* Most likely res will be in utf8 already since the
3275 * standard charnames uses pack U, but a custom translator
3276 * can leave it otherwise, so make sure. XXX This can be
3277 * revisited to not have charnames use utf8 for characters
3278 * that don't need it when regexes don't have to be in utf8
3279 * for Unicode semantics. If doing so, remember EBCDIC */
3280 sv_utf8_upgrade(res);
3281 str = SvPV_const(res, len);
3282
3283 /* Don't accept malformed input */
3284 if (! is_utf8_string((U8 *) str, len)) {
3285 yyerror("Malformed UTF-8 returned by \\N");
3286 }
3287 else if (PL_lex_inpat) {
3288
3289 if (! len) { /* The name resolved to an empty string */
3290 Copy("\\N{}", d, 4, char);
3291 d += 4;
3292 }
3293 else {
3294 /* In order to not lose information for the regex
3295 * compiler, pass the result in the specially made
3296 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3297 * the code points in hex of each character
3298 * returned by charnames */
3299
3300 const char *str_end = str + len;
3301 STRLEN char_length; /* cur char's byte length */
3302 STRLEN output_length; /* and the number of bytes
3303 after this is translated
3304 into hex digits */
3305 const STRLEN off = d - SvPVX_const(sv);
3306
3307 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3308 * max('U+', '.'); and 1 for NUL */
3309 char hex_string[2 * UTF8_MAXBYTES + 5];
3310
3311 /* Get the first character of the result. */
3312 U32 uv = utf8n_to_uvuni((U8 *) str,
3313 len,
3314 &char_length,
3315 UTF8_ALLOW_ANYUV);
3316
3317 /* The call to is_utf8_string() above hopefully
3318 * guarantees that there won't be an error. But
3319 * it's easy here to make sure. The function just
3320 * above warns and returns 0 if invalid utf8, but
3321 * it can also return 0 if the input is validly a
3322 * NUL. Disambiguate */
3323 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3324 uv = UNICODE_REPLACEMENT;
3325 }
3326
3327 /* Convert first code point to hex, including the
e2a7e165
KW
3328 * boiler plate before it. For all these, we
3329 * convert to native format so that downstream code
3330 * can continue to assume the input is native */
78c35590 3331 output_length =
3353de27 3332 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3333 "\\N{U+%X",
3334 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3335
3336 /* Make sure there is enough space to hold it */
3337 d = off + SvGROW(sv, off
3338 + output_length
3339 + (STRLEN)(send - e)
3340 + 2); /* '}' + NUL */
3341 /* And output it */
3342 Copy(hex_string, d, output_length, char);
3343 d += output_length;
3344
3345 /* For each subsequent character, append dot and
3346 * its ordinal in hex */
3347 while ((str += char_length) < str_end) {
3348 const STRLEN off = d - SvPVX_const(sv);
3349 U32 uv = utf8n_to_uvuni((U8 *) str,
3350 str_end - str,
3351 &char_length,
3352 UTF8_ALLOW_ANYUV);
3353 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3354 uv = UNICODE_REPLACEMENT;
3355 }
3356
78c35590 3357 output_length =
3353de27 3358 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3359 ".%X",
3360 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3361
3362 d = off + SvGROW(sv, off
3363 + output_length
3364 + (STRLEN)(send - e)
3365 + 2); /* '}' + NUL */
3366 Copy(hex_string, d, output_length, char);
3367 d += output_length;
3368 }
3369
3370 *d++ = '}'; /* Done. Add the trailing brace */
3371 }
3372 }
3373 else { /* Here, not in a pattern. Convert the name to a
3374 * string. */
3375
3376 /* If destination is not in utf8, unconditionally
3377 * recode it to be so. This is because \N{} implies
3378 * Unicode semantics, and scalars have to be in utf8
3379 * to guarantee those semantics */
3380 if (! has_utf8) {
3381 SvCUR_set(sv, d - SvPVX_const(sv));
3382 SvPOK_on(sv);
3383 *d = '\0';
3384 /* See Note on sizing above. */
3385 sv_utf8_upgrade_flags_grow(sv,
3386 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3387 len + (STRLEN)(send - s) + 1);
3388 d = SvPVX(sv) + SvCUR(sv);
3389 has_utf8 = TRUE;
3390 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3391
3392 /* See Note on sizing above. (NOTE: SvCUR() is not
3393 * set correctly here). */
3394 const STRLEN off = d - SvPVX_const(sv);
3395 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3396 }
3397 Copy(str, d, len, char);
3398 d += len;
423cee85 3399 }
423cee85 3400 SvREFCNT_dec(res);
cb233ae3
KW
3401
3402 /* Deprecate non-approved name syntax */
3403 if (ckWARN_d(WARN_DEPRECATED)) {
3404 bool problematic = FALSE;
3405 char* i = s;
3406
3407 /* For non-ut8 input, look to see that the first
3408 * character is an alpha, then loop through the rest
3409 * checking that each is a continuation */
3410 if (! this_utf8) {
3411 if (! isALPHAU(*i)) problematic = TRUE;
3412 else for (i = s + 1; i < e; i++) {
3413 if (isCHARNAME_CONT(*i)) continue;
3414 problematic = TRUE;
3415 break;
3416 }
3417 }
3418 else {
3419 /* Similarly for utf8. For invariants can check
3420 * directly. We accept anything above the latin1
3421 * range because it is immaterial to Perl if it is
3422 * correct or not, and is expensive to check. But
3423 * it is fairly easy in the latin1 range to convert
3424 * the variants into a single character and check
3425 * those */
3426 if (UTF8_IS_INVARIANT(*i)) {
3427 if (! isALPHAU(*i)) problematic = TRUE;
3428 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3429 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3430 *(i+1)))))
3431 {
3432 problematic = TRUE;
3433 }
3434 }
3435 if (! problematic) for (i = s + UTF8SKIP(s);
3436 i < e;
3437 i+= UTF8SKIP(i))
3438 {
3439 if (UTF8_IS_INVARIANT(*i)) {
3440 if (isCHARNAME_CONT(*i)) continue;
3441 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3442 continue;
3443 } else if (isCHARNAME_CONT(
3444 UNI_TO_NATIVE(
81c14aa2 3445 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3446 {
3447 continue;
3448 }
3449 problematic = TRUE;
3450 break;
3451 }
3452 }
3453 if (problematic) {
6e1bad6c
KW
3454 /* The e-i passed to the final %.*s makes sure that
3455 * should the trailing NUL be missing that this
3456 * print won't run off the end of the string */
cb233ae3 3457 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3458 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3459 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3460 }
3461 }
3462 } /* End \N{NAME} */
ff3f963a
KW
3463#ifdef EBCDIC
3464 if (!dorange)
3465 native_range = FALSE; /* \N{} is defined to be Unicode */
3466#endif
3467 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3468 continue;
3469
02aa26ce 3470 /* \c is a control character */
79072805
LW
3471 case 'c':
3472 s++;
961ce445 3473 if (s < send) {
17a3df4c 3474 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3475 }
961ce445
RGS
3476 else {
3477 yyerror("Missing control char name in \\c");
3478 }
79072805 3479 continue;
02aa26ce
NT
3480
3481 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3482 case 'b':
db42d148 3483 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3484 break;
3485 case 'n':
db42d148 3486 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3487 break;
3488 case 'r':
db42d148 3489 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3490 break;
3491 case 'f':
db42d148 3492 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3493 break;
3494 case 't':
db42d148 3495 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3496 break;
34a3fe2a 3497 case 'e':
db42d148 3498 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3499 break;
3500 case 'a':
db42d148 3501 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3502 break;
02aa26ce
NT
3503 } /* end switch */
3504
79072805
LW
3505 s++;
3506 continue;
02aa26ce 3507 } /* end if (backslash) */
4c3a8340
ST
3508#ifdef EBCDIC
3509 else
3510 literal_endpoint++;
3511#endif
02aa26ce 3512
f9a63242 3513 default_action:
77a135fe
KW
3514 /* If we started with encoded form, or already know we want it,
3515 then encode the next character */
3516 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3517 STRLEN len = 1;
77a135fe
KW
3518
3519
3520 /* One might think that it is wasted effort in the case of the
3521 * source being utf8 (this_utf8 == TRUE) to take the next character
3522 * in the source, convert it to an unsigned value, and then convert
3523 * it back again. But the source has not been validated here. The
3524 * routine that does the conversion checks for errors like
3525 * malformed utf8 */
3526
5f66b61c
AL
3527 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3528 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3529 if (!has_utf8) {
3530 SvCUR_set(sv, d - SvPVX_const(sv));
3531 SvPOK_on(sv);
3532 *d = '\0';
77a135fe 3533 /* See Note on sizing above. */
7bf79863
KW
3534 sv_utf8_upgrade_flags_grow(sv,
3535 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3536 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3537 d = SvPVX(sv) + SvCUR(sv);
3538 has_utf8 = TRUE;
3539 } else if (need > len) {
3540 /* encoded value larger than old, may need extra space (NOTE:
3541 * SvCUR() is not set correctly here). See Note on sizing
3542 * above. */
9d4ba2ae 3543 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3544 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3545 }
77a135fe
KW
3546 s += len;
3547
5f66b61c 3548 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3549#ifdef EBCDIC
3550 if (uv > 255 && !dorange)
3551 native_range = FALSE;
3552#endif
2b9d42f0
NIS
3553 }
3554 else {
3555 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3556 }
02aa26ce
NT
3557 } /* while loop to process each character */
3558
3559 /* terminate the string and set up the sv */
79072805 3560 *d = '\0';
95a20fc0 3561 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3562 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3563 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3564 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3565
79072805 3566 SvPOK_on(sv);
9f4817db 3567 if (PL_encoding && !has_utf8) {
d0063567
DK
3568 sv_recode_to_utf8(sv, PL_encoding);
3569 if (SvUTF8(sv))
3570 has_utf8 = TRUE;
9f4817db 3571 }
2b9d42f0 3572 if (has_utf8) {
7e2040f0 3573 SvUTF8_on(sv);
2b9d42f0 3574 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3575 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3576 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3577 }
3578 }
79072805 3579
02aa26ce 3580 /* shrink the sv if we allocated more than we used */
79072805 3581 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3582 SvPV_shrink_to_cur(sv);
79072805 3583 }
02aa26ce 3584
6154021b 3585 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3586 if (s > PL_bufptr) {
eb0d8d16
NC
3587 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3588 const char *const key = PL_lex_inpat ? "qr" : "q";
3589 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3590 const char *type;
3591 STRLEN typelen;
3592
3593 if (PL_lex_inwhat == OP_TRANS) {
3594 type = "tr";
3595 typelen = 2;
3596 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3597 type = "s";
3598 typelen = 1;
9da1dd8f
DM
3599 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3600 type = "q";
3601 typelen = 1;
eb0d8d16
NC
3602 } else {
3603 type = "qq";
3604 typelen = 2;
3605 }
3606
3607 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3608 type, typelen);
3609 }
6154021b 3610 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3611 } else
8990e307 3612 SvREFCNT_dec(sv);
79072805
LW
3613 return s;
3614}
3615
ffb4593c
NT
3616/* S_intuit_more
3617 * Returns TRUE if there's more to the expression (e.g., a subscript),
3618 * FALSE otherwise.
ffb4593c
NT
3619 *
3620 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3621 *
3622 * ->[ and ->{ return TRUE
3623 * { and [ outside a pattern are always subscripts, so return TRUE
3624 * if we're outside a pattern and it's not { or [, then return FALSE
3625 * if we're in a pattern and the first char is a {
3626 * {4,5} (any digits around the comma) returns FALSE
3627 * if we're in a pattern and the first char is a [
3628 * [] returns FALSE
3629 * [SOMETHING] has a funky algorithm to decide whether it's a
3630 * character class or not. It has to deal with things like
3631 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3632 * anything else returns TRUE
3633 */
3634
9cbb5ea2
GS
3635/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3636
76e3520e 3637STATIC int
cea2e8a9 3638S_intuit_more(pTHX_ register char *s)
79072805 3639{
97aff369 3640 dVAR;
7918f24d
NC
3641
3642 PERL_ARGS_ASSERT_INTUIT_MORE;
3643
3280af22 3644 if (PL_lex_brackets)
79072805
LW
3645 return TRUE;
3646 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3647 return TRUE;
3648 if (*s != '{' && *s != '[')
3649 return FALSE;
3280af22 3650 if (!PL_lex_inpat)
79072805
LW
3651 return TRUE;
3652
3653 /* In a pattern, so maybe we have {n,m}. */
3654 if (*s == '{') {
b3155d95 3655 if (regcurly(s)) {
79072805 3656 return FALSE;
b3155d95 3657 }
79072805 3658 return TRUE;
79072805
LW
3659 }
3660
3661 /* On the other hand, maybe we have a character class */
3662
3663 s++;
3664 if (*s == ']' || *s == '^')
3665 return FALSE;
3666 else {
ffb4593c 3667 /* this is terrifying, and it works */
79072805
LW
3668 int weight = 2; /* let's weigh the evidence */
3669 char seen[256];
f27ffc4a 3670 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3671 const char * const send = strchr(s,']');
3280af22 3672 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3673
3674 if (!send) /* has to be an expression */
3675 return TRUE;
3676
3677 Zero(seen,256,char);
3678 if (*s == '$')
3679 weight -= 3;
3680 else if (isDIGIT(*s)) {
3681 if (s[1] != ']') {
3682 if (isDIGIT(s[1]) && s[2] == ']')
3683 weight -= 10;
3684 }
3685 else
3686 weight -= 100;
3687 }
3688 for (; s < send; s++) {
3689 last_un_char = un_char;
3690 un_char = (unsigned char)*s;
3691 switch (*s) {
3692 case '@':
3693 case '&':
3694 case '$':
3695 weight -= seen[un_char] * 10;
7e2040f0 3696 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3697 int len;
8903cb82 3698 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3699 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3700 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3701 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3702 weight -= 100;
3703 else
3704 weight -= 10;
3705 }
3706 else if (*s == '$' && s[1] &&
93a17b20
LW
3707 strchr("[#!%*<>()-=",s[1])) {
3708 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3709 weight -= 10;
3710 else
3711 weight -= 1;
3712 }
3713 break;
3714 case '\\':
3715 un_char = 254;
3716 if (s[1]) {
93a17b20 3717 if (strchr("wds]",s[1]))
79072805 3718 weight += 100;
10edeb5d 3719 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3720 weight += 1;
93a17b20 3721 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3722 weight += 40;
3723 else if (isDIGIT(s[1])) {
3724 weight += 40;
3725 while (s[1] && isDIGIT(s[1]))
3726 s++;
3727 }
3728 }
3729 else
3730 weight += 100;
3731 break;
3732 case '-':
3733 if (s[1] == '\\')
3734 weight += 50;
93a17b20 3735 if (strchr("aA01! ",last_un_char))
79072805 3736 weight += 30;
93a17b20 3737 if (strchr("zZ79~",s[1]))
79072805 3738 weight += 30;
f27ffc4a
GS
3739 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))