This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Stop unterminated here-docs from leaking memory
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805 150 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
151 *
152 * These values refer to the various states within a sublex parse,
153 * i.e. within a double quotish string
79072805
LW
154 */
155
fb73857a
PP
156/* #define LEX_NOTPARSING 11 is done in perl.h. */
157
b6007c36
DM
158#define LEX_NORMAL 10 /* normal code (ie not within "...") */
159#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
160#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
161#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
162#define LEX_INTERPSTART 6 /* expecting the start of a $var */
163
164 /* at end of code, eg "$x" followed by: */
165#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
166#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
167
168#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
169 string or after \E, $foo, etc */
170#define LEX_INTERPCONST 2 /* NOT USED */
171#define LEX_FORMLINE 1 /* expecting a format line */
172#define LEX_KNOWNEXT 0 /* next token known; just return it */
173
79072805 174
bbf60fe6 175#ifdef DEBUGGING
27da23d5 176static const char* const lex_state_names[] = {
bbf60fe6
DM
177 "KNOWNEXT",
178 "FORMLINE",
179 "INTERPCONST",
180 "INTERPCONCAT",
181 "INTERPENDMAYBE",
182 "INTERPEND",
183 "INTERPSTART",
184 "INTERPPUSH",
185 "INTERPCASEMOD",
186 "INTERPNORMAL",
187 "NORMAL"
188};
189#endif
190
79072805
LW
191#ifdef ff_next
192#undef ff_next
d48672a2
LW
193#endif
194
79072805 195#include "keywords.h"
fe14fcc3 196
ffb4593c
NT
197/* CLINE is a macro that ensures PL_copline has a sane value */
198
ae986130
LW
199#ifdef CLINE
200#undef CLINE
201#endif
57843af0 202#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 203
5db06880 204#ifdef PERL_MAD
29595ff2
NC
205# define SKIPSPACE0(s) skipspace0(s)
206# define SKIPSPACE1(s) skipspace1(s)
207# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
208# define PEEKSPACE(s) skipspace2(s,0)
209#else
210# define SKIPSPACE0(s) skipspace(s)
211# define SKIPSPACE1(s) skipspace(s)
212# define SKIPSPACE2(s,tsv) skipspace(s)
213# define PEEKSPACE(s) skipspace(s)
214#endif
215
ffb4593c
NT
216/*
217 * Convenience functions to return different tokens and prime the
9cbb5ea2 218 * lexer for the next token. They all take an argument.
ffb4593c
NT
219 *
220 * TOKEN : generic token (used for '(', DOLSHARP, etc)
221 * OPERATOR : generic operator
222 * AOPERATOR : assignment operator
223 * PREBLOCK : beginning the block after an if, while, foreach, ...
224 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
225 * PREREF : *EXPR where EXPR is not a simple identifier
226 * TERM : expression term
227 * LOOPX : loop exiting command (goto, last, dump, etc)
228 * FTST : file test operator
229 * FUN0 : zero-argument function
7eb971ee 230 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 231 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
232 * BOop : bitwise or or xor
233 * BAop : bitwise and
234 * SHop : shift operator
235 * PWop : power operator
9cbb5ea2 236 * PMop : pattern-matching operator
ffb4593c
NT
237 * Aop : addition-level operator
238 * Mop : multiplication-level operator
239 * Eop : equality-testing operator
e5edeb50 240 * Rop : relational operator <= != gt
ffb4593c
NT
241 *
242 * Also see LOP and lop() below.
243 */
244
998054bd 245#ifdef DEBUGGING /* Serve -DT. */
704d4215 246# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 247#else
bbf60fe6 248# define REPORT(retval) (retval)
998054bd
SC
249#endif
250
bbf60fe6
DM
251#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
252#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
253#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
254#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
255#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
256#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
257#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
258#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
259#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
260#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 261#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
262#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
263#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
264#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
265#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
266#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
267#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
268#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
269#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
270#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
271#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 272
a687059c
LW
273/* This bit of chicanery makes a unary function followed by
274 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
275 * The UNIDOR macro is for unary functions that can be followed by the //
276 * operator (such as C<shift // 0>).
a687059c 277 */
d68ce4ac 278#define UNI3(f,x,have_x) { \
6154021b 279 pl_yylval.ival = f; \
d68ce4ac 280 if (have_x) PL_expect = x; \
376fcdbf
AL
281 PL_bufptr = s; \
282 PL_last_uni = PL_oldbufptr; \
283 PL_last_lop_op = f; \
284 if (*s == '(') \
285 return REPORT( (int)FUNC1 ); \
29595ff2 286 s = PEEKSPACE(s); \
376fcdbf
AL
287 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
288 }
d68ce4ac
FC
289#define UNI(f) UNI3(f,XTERM,1)
290#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
291#define UNIPROTO(f,optional) { \
292 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
293 OPERATOR(f); \
294 }
a687059c 295
d68ce4ac 296#define UNIBRACK(f) UNI3(f,0,0)
79072805 297
9f68db38 298/* grandfather return to old style */
78cdf107
Z
299#define OLDLOP(f) \
300 do { \
301 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
302 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
303 pl_yylval.ival = (f); \
304 PL_expect = XTERM; \
305 PL_bufptr = s; \
306 return (int)LSTOP; \
307 } while(0)
79072805 308
83944c01
FC
309#define COPLINE_INC_WITH_HERELINES \
310 STMT_START { \
311 CopLINE_inc(PL_curcop); \
312 if (PL_parser->herelines) \
313 CopLINE(PL_curcop) += PL_parser->herelines, \
314 PL_parser->herelines = 0; \
315 } STMT_END
316
317
8fa7f367
JH
318#ifdef DEBUGGING
319
6154021b 320/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
321enum token_type {
322 TOKENTYPE_NONE,
323 TOKENTYPE_IVAL,
6154021b 324 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 325 TOKENTYPE_PVAL,
aeaef349 326 TOKENTYPE_OPVAL
bbf60fe6
DM
327};
328
6d4a66ac
NC
329static struct debug_tokens {
330 const int token;
331 enum token_type type;
332 const char *name;
333} const debug_tokens[] =
9041c2e3 334{
bbf60fe6
DM
335 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
336 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
337 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
338 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
339 { ARROW, TOKENTYPE_NONE, "ARROW" },
340 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
341 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
342 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
343 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
344 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 345 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
346 { DO, TOKENTYPE_NONE, "DO" },
347 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
348 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
349 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
350 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
351 { ELSE, TOKENTYPE_NONE, "ELSE" },
352 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
353 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
354 { FOR, TOKENTYPE_IVAL, "FOR" },
355 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
356 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
357 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
358 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
359 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 360 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
361 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
362 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
363 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 364 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
365 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
366 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 367 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
368 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
369 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
370 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
371 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
372 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
373 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
374 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
375 { MY, TOKENTYPE_IVAL, "MY" },
376 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
377 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
378 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
379 { OROP, TOKENTYPE_IVAL, "OROP" },
380 { OROR, TOKENTYPE_NONE, "OROR" },
381 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 382 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
383 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
384 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
385 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
386 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
387 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
388 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
389 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
390 { PREINC, TOKENTYPE_NONE, "PREINC" },
391 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 392 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
393 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
394 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 395 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
396 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
397 { SUB, TOKENTYPE_NONE, "SUB" },
398 { THING, TOKENTYPE_OPVAL, "THING" },
399 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
400 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
401 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
402 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
403 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
404 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 405 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
406 { WHILE, TOKENTYPE_IVAL, "WHILE" },
407 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 408 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 409 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
410};
411
6154021b 412/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 413
bbf60fe6 414STATIC int
704d4215 415S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 416{
97aff369 417 dVAR;
7918f24d
NC
418
419 PERL_ARGS_ASSERT_TOKEREPORT;
420
bbf60fe6 421 if (DEBUG_T_TEST) {
bd61b366 422 const char *name = NULL;
bbf60fe6 423 enum token_type type = TOKENTYPE_NONE;
f54cb97a 424 const struct debug_tokens *p;
396482e1 425 SV* const report = newSVpvs("<== ");
bbf60fe6 426
f54cb97a 427 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
428 if (p->token == (int)rv) {
429 name = p->name;
430 type = p->type;
431 break;
432 }
433 }
434 if (name)
54667de8 435 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
436 else if ((char)rv > ' ' && (char)rv < '~')
437 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
438 else if (!rv)
396482e1 439 sv_catpvs(report, "EOF");
bbf60fe6
DM
440 else
441 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
442 switch (type) {
443 case TOKENTYPE_NONE:
bbf60fe6
DM
444 break;
445 case TOKENTYPE_IVAL:
704d4215 446 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_OPNUM:
449 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 450 PL_op_name[lvalp->ival]);
bbf60fe6
DM
451 break;
452 case TOKENTYPE_PVAL:
704d4215 453 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
454 break;
455 case TOKENTYPE_OPVAL:
704d4215 456 if (lvalp->opval) {
401441c0 457 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
458 PL_op_name[lvalp->opval->op_type]);
459 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 460 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 461 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
462 }
463
464 }
401441c0 465 else
396482e1 466 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
467 break;
468 }
b6007c36 469 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
470 };
471 return (int)rv;
998054bd
SC
472}
473
b6007c36
DM
474
475/* print the buffer with suitable escapes */
476
477STATIC void
15f169a1 478S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 479{
396482e1 480 SV* const tmp = newSVpvs("");
7918f24d
NC
481
482 PERL_ARGS_ASSERT_PRINTBUF;
483
b6007c36
DM
484 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
485 SvREFCNT_dec(tmp);
486}
487
8fa7f367
JH
488#endif
489
8290c323
NC
490static int
491S_deprecate_commaless_var_list(pTHX) {
492 PL_expect = XTERM;
493 deprecate("comma-less variable list");
494 return REPORT(','); /* grandfather non-comma-format format */
495}
496
ffb4593c
NT
497/*
498 * S_ao
499 *
c963b151
BD
500 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
501 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
502 */
503
76e3520e 504STATIC int
cea2e8a9 505S_ao(pTHX_ int toketype)
a0d0e21e 506{
97aff369 507 dVAR;
3280af22
NIS
508 if (*PL_bufptr == '=') {
509 PL_bufptr++;
a0d0e21e 510 if (toketype == ANDAND)
6154021b 511 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 512 else if (toketype == OROR)
6154021b 513 pl_yylval.ival = OP_ORASSIGN;
c963b151 514 else if (toketype == DORDOR)
6154021b 515 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
516 toketype = ASSIGNOP;
517 }
518 return toketype;
519}
520
ffb4593c
NT
521/*
522 * S_no_op
523 * When Perl expects an operator and finds something else, no_op
524 * prints the warning. It always prints "<something> found where
525 * operator expected. It prints "Missing semicolon on previous line?"
526 * if the surprise occurs at the start of the line. "do you need to
527 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
528 * where the compiler doesn't know if foo is a method call or a function.
529 * It prints "Missing operator before end of line" if there's nothing
530 * after the missing operator, or "... before <...>" if there is something
531 * after the missing operator.
532 */
533
76e3520e 534STATIC void
15f169a1 535S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 536{
97aff369 537 dVAR;
9d4ba2ae
AL
538 char * const oldbp = PL_bufptr;
539 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 540
7918f24d
NC
541 PERL_ARGS_ASSERT_NO_OP;
542
1189a94a
GS
543 if (!s)
544 s = oldbp;
07c798fb 545 else
1189a94a 546 PL_bufptr = s;
734ab321 547 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
548 if (ckWARN_d(WARN_SYNTAX)) {
549 if (is_first)
550 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551 "\t(Missing semicolon on previous line?)\n");
552 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 553 const char *t;
734ab321
BF
554 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
555 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 556 NOOP;
56da5a46
RGS
557 if (t < PL_bufptr && isSPACE(*t))
558 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
559 "\t(Do you need to predeclare %"SVf"?)\n",
560 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
561 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
562 }
563 else {
564 assert(s >= oldbp);
565 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
566 "\t(Missing operator before %"SVf"?)\n",
567 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
568 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 569 }
07c798fb 570 }
3280af22 571 PL_bufptr = oldbp;
8990e307
LW
572}
573
ffb4593c
NT
574/*
575 * S_missingterm
576 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 577 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
578 * If we're in a delimited string and the delimiter is a control
579 * character, it's reformatted into a two-char sequence like ^C.
580 * This is fatal.
581 */
582
76e3520e 583STATIC void
cea2e8a9 584S_missingterm(pTHX_ char *s)
8990e307 585{
97aff369 586 dVAR;
8990e307
LW
587 char tmpbuf[3];
588 char q;
589 if (s) {
9d4ba2ae 590 char * const nl = strrchr(s,'\n');
d2719217 591 if (nl)
8990e307
LW
592 *nl = '\0';
593 }
463559e7 594 else if (isCNTRL(PL_multi_close)) {
8990e307 595 *tmpbuf = '^';
585ec06d 596 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
597 tmpbuf[2] = '\0';
598 s = tmpbuf;
599 }
600 else {
eb160463 601 *tmpbuf = (char)PL_multi_close;
8990e307
LW
602 tmpbuf[1] = '\0';
603 s = tmpbuf;
604 }
605 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 606 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 607}
79072805 608
dd0ac2b9
FC
609#include "feature.h"
610
0d863452 611/*
0d863452
RH
612 * Check whether the named feature is enabled.
613 */
26ea9e12 614bool
3fff3427 615Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 616{
97aff369 617 dVAR;
4a731d7b 618 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
619
620 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
621
622 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 623
26ea9e12
NC
624 if (namelen > MAX_FEATURE_LEN)
625 return FALSE;
3fff3427 626 memcpy(&he_name[8], name, namelen);
7d69d4a6 627
c8ca97b0
NC
628 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
629 REFCOUNTED_HE_EXISTS));
0d863452
RH
630}
631
ffb4593c 632/*
9cbb5ea2
GS
633 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
634 * utf16-to-utf8-reversed.
ffb4593c
NT
635 */
636
c39cd008
GS
637#ifdef PERL_CR_FILTER
638static void
639strip_return(SV *sv)
640{
eb578fdb
KW
641 const char *s = SvPVX_const(sv);
642 const char * const e = s + SvCUR(sv);
7918f24d
NC
643
644 PERL_ARGS_ASSERT_STRIP_RETURN;
645
c39cd008
GS
646 /* outer loop optimized to do nothing if there are no CR-LFs */
647 while (s < e) {
648 if (*s++ == '\r' && *s == '\n') {
649 /* hit a CR-LF, need to copy the rest */
eb578fdb 650 char *d = s - 1;
c39cd008
GS
651 *d++ = *s++;
652 while (s < e) {
653 if (*s == '\r' && s[1] == '\n')
654 s++;
655 *d++ = *s++;
656 }
657 SvCUR(sv) -= s - d;
658 return;
659 }
660 }
661}
a868473f 662
76e3520e 663STATIC I32
c39cd008 664S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 665{
f54cb97a 666 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
667 if (count > 0 && !maxlen)
668 strip_return(sv);
669 return count;
a868473f
NIS
670}
671#endif
672
ffb4593c 673/*
8eaa0acf
Z
674=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
675
676Creates and initialises a new lexer/parser state object, supplying
677a context in which to lex and parse from a new source of Perl code.
678A pointer to the new state object is placed in L</PL_parser>. An entry
679is made on the save stack so that upon unwinding the new state object
680will be destroyed and the former value of L</PL_parser> will be restored.
681Nothing else need be done to clean up the parsing context.
682
683The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
684non-null, provides a string (in SV form) containing code to be parsed.
685A copy of the string is made, so subsequent modification of I<line>
686does not affect parsing. I<rsfp>, if non-null, provides an input stream
687from which code will be read to be parsed. If both are non-null, the
688code in I<line> comes first and must consist of complete lines of input,
689and I<rsfp> supplies the remainder of the source.
690
e368b3bd
FC
691The I<flags> parameter is reserved for future use. Currently it is only
692used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
693
694=cut
695*/
ffb4593c 696
27fcb6ee 697/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
698 can share filters with the current parser.
699 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
700 caller, hence isn't owned by the parser, so shouldn't be closed on parser
701 destruction. This is used to handle the case of defaulting to reading the
702 script from the standard input because no filename was given on the command
703 line (without getting confused by situation where STDIN has been closed, so
704 the script handle is opened on fd 0) */
27fcb6ee 705
a0d0e21e 706void
8eaa0acf 707Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 708{
97aff369 709 dVAR;
6ef55633 710 const char *s = NULL;
5486870f 711 yy_parser *parser, *oparser;
60d63348 712 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 713 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
714
715 /* create and initialise a parser */
716
199e78b7 717 Newxz(parser, 1, yy_parser);
5486870f 718 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
719 PL_parser = parser;
720
28ac2b49
Z
721 parser->stack = NULL;
722 parser->ps = NULL;
723 parser->stack_size = 0;
acdf0a21 724
e3abe207
DM
725 /* on scope exit, free this parser and restore any outer one */
726 SAVEPARSER(parser);
7c4baf47 727 parser->saved_curcop = PL_curcop;
e3abe207 728
acdf0a21 729 /* initialise lexer state */
8990e307 730
fb205e7a
DM
731#ifdef PERL_MAD
732 parser->curforce = -1;
733#else
734 parser->nexttoke = 0;
735#endif
ca4cfd28 736 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 737 parser->copline = NOLINE;
5afb0a62 738 parser->lex_state = LEX_NORMAL;
c2598295 739 parser->expect = XSTATE;
2f9285f8 740 parser->rsfp = rsfp;
27fcb6ee
FC
741 parser->rsfp_filters =
742 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
743 ? NULL
744 : MUTABLE_AV(SvREFCNT_inc(
745 oparser->rsfp_filters
746 ? oparser->rsfp_filters
747 : (oparser->rsfp_filters = newAV())
748 ));
2f9285f8 749
199e78b7
DM
750 Newx(parser->lex_brackstack, 120, char);
751 Newx(parser->lex_casestack, 12, char);
752 *parser->lex_casestack = '\0';
02b34bbe 753
10efb74f 754 if (line) {
0528fd32 755 STRLEN len;
10efb74f 756 s = SvPV_const(line, len);
0abcdfa4
FC
757 parser->linestr = flags & LEX_START_COPIED
758 ? SvREFCNT_inc_simple_NN(line)
759 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 760 if (!len || s[len-1] != ';')
bdc0bf6f 761 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
762 } else {
763 parser->linestr = newSVpvs("\n;");
8990e307 764 }
f06b5848
DM
765 parser->oldoldbufptr =
766 parser->oldbufptr =
767 parser->bufptr =
768 parser->linestart = SvPVX(parser->linestr);
769 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
770 parser->last_lop = parser->last_uni = NULL;
87606032
NC
771 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
772 |LEX_DONT_CLOSE_RSFP);
737c24fc 773
60d63348 774 parser->in_pod = parser->filtered = 0;
79072805 775}
a687059c 776
e3abe207
DM
777
778/* delete a parser object */
779
780void
781Perl_parser_free(pTHX_ const yy_parser *parser)
782{
7918f24d
NC
783 PERL_ARGS_ASSERT_PARSER_FREE;
784
7c4baf47 785 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
786 SvREFCNT_dec(parser->linestr);
787
87606032 788 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 789 PerlIO_clearerr(parser->rsfp);
799361c3
SH
790 else if (parser->rsfp && (!parser->old_parser ||
791 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 792 PerlIO_close(parser->rsfp);
5486870f 793 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 794
e3abe207
DM
795 Safefree(parser->lex_brackstack);
796 Safefree(parser->lex_casestack);
797 PL_parser = parser->old_parser;
798 Safefree(parser);
799}
800
801
ffb4593c 802/*
f0e67a1d
Z
803=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
804
805Buffer scalar containing the chunk currently under consideration of the
806text currently being lexed. This is always a plain string scalar (for
807which C<SvPOK> is true). It is not intended to be used as a scalar by
808normal scalar means; instead refer to the buffer directly by the pointer
809variables described below.
810
811The lexer maintains various C<char*> pointers to things in the
812C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
813reallocated, all of these pointers must be updated. Don't attempt to
814do this manually, but rather use L</lex_grow_linestr> if you need to
815reallocate the buffer.
816
817The content of the text chunk in the buffer is commonly exactly one
818complete line of input, up to and including a newline terminator,
819but there are situations where it is otherwise. The octets of the
820buffer may be intended to be interpreted as either UTF-8 or Latin-1.
821The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
822flag on this scalar, which may disagree with it.
823
824For direct examination of the buffer, the variable
825L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
826lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
827of these pointers is usually preferable to examination of the scalar
828through normal scalar means.
829
830=for apidoc AmxU|char *|PL_parser-E<gt>bufend
831
832Direct pointer to the end of the chunk of text currently being lexed, the
833end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
834+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
835always located at the end of the buffer, and does not count as part of
836the buffer's contents.
837
838=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
839
840Points to the current position of lexing inside the lexer buffer.
841Characters around this point may be freely examined, within
842the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
843L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
844interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
845
846Lexing code (whether in the Perl core or not) moves this pointer past
847the characters that it consumes. It is also expected to perform some
848bookkeeping whenever a newline character is consumed. This movement
849can be more conveniently performed by the function L</lex_read_to>,
850which handles newlines appropriately.
851
852Interpretation of the buffer's octets can be abstracted out by
853using the slightly higher-level functions L</lex_peek_unichar> and
854L</lex_read_unichar>.
855
856=for apidoc AmxU|char *|PL_parser-E<gt>linestart
857
858Points to the start of the current line inside the lexer buffer.
859This is useful for indicating at which column an error occurred, and
860not much else. This must be updated by any lexing code that consumes
861a newline; the function L</lex_read_to> handles this detail.
862
863=cut
864*/
865
866/*
867=for apidoc Amx|bool|lex_bufutf8
868
869Indicates whether the octets in the lexer buffer
870(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
871of Unicode characters. If not, they should be interpreted as Latin-1
872characters. This is analogous to the C<SvUTF8> flag for scalars.
873
874In UTF-8 mode, it is not guaranteed that the lexer buffer actually
875contains valid UTF-8. Lexing code must be robust in the face of invalid
876encoding.
877
878The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
879is significant, but not the whole story regarding the input character
880encoding. Normally, when a file is being read, the scalar contains octets
881and its C<SvUTF8> flag is off, but the octets should be interpreted as
882UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
883however, the scalar may have the C<SvUTF8> flag on, and in this case its
884octets should be interpreted as UTF-8 unless the C<use bytes> pragma
885is in effect. This logic may change in the future; use this function
886instead of implementing the logic yourself.
887
888=cut
889*/
890
891bool
892Perl_lex_bufutf8(pTHX)
893{
894 return UTF;
895}
896
897/*
898=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
899
900Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
901at least I<len> octets (including terminating NUL). Returns a
902pointer to the reallocated buffer. This is necessary before making
903any direct modification of the buffer that would increase its length.
904L</lex_stuff_pvn> provides a more convenient way to insert text into
905the buffer.
906
907Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
908this function updates all of the lexer's variables that point directly
909into the buffer.
910
911=cut
912*/
913
914char *
915Perl_lex_grow_linestr(pTHX_ STRLEN len)
916{
917 SV *linestr;
918 char *buf;
919 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 920 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
921 linestr = PL_parser->linestr;
922 buf = SvPVX(linestr);
923 if (len <= SvLEN(linestr))
924 return buf;
925 bufend_pos = PL_parser->bufend - buf;
926 bufptr_pos = PL_parser->bufptr - buf;
927 oldbufptr_pos = PL_parser->oldbufptr - buf;
928 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
929 linestart_pos = PL_parser->linestart - buf;
930 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
931 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
c7641931
DM
932 re_eval_start_pos = PL_sublex_info.re_eval_start ?
933 PL_sublex_info.re_eval_start - buf : 0;
934
f0e67a1d 935 buf = sv_grow(linestr, len);
c7641931 936
f0e67a1d
Z
937 PL_parser->bufend = buf + bufend_pos;
938 PL_parser->bufptr = buf + bufptr_pos;
939 PL_parser->oldbufptr = buf + oldbufptr_pos;
940 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
941 PL_parser->linestart = buf + linestart_pos;
942 if (PL_parser->last_uni)
943 PL_parser->last_uni = buf + last_uni_pos;
944 if (PL_parser->last_lop)
945 PL_parser->last_lop = buf + last_lop_pos;
c7641931
DM
946 if (PL_sublex_info.re_eval_start)
947 PL_sublex_info.re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
948 return buf;
949}
950
951/*
83aa740e 952=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
953
954Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
955immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
956reallocating the buffer if necessary. This means that lexing code that
957runs later will see the characters as if they had appeared in the input.
958It is not recommended to do this as part of normal parsing, and most
959uses of this facility run the risk of the inserted characters being
960interpreted in an unintended manner.
961
962The string to be inserted is represented by I<len> octets starting
963at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
964according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
965The characters are recoded for the lexer buffer, according to how the
966buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 967to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
968function is more convenient.
969
970=cut
971*/
972
973void
83aa740e 974Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 975{
749123ff 976 dVAR;
f0e67a1d
Z
977 char *bufptr;
978 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
979 if (flags & ~(LEX_STUFF_UTF8))
980 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
981 if (UTF) {
982 if (flags & LEX_STUFF_UTF8) {
983 goto plain_copy;
984 } else {
985 STRLEN highhalf = 0;
83aa740e 986 const char *p, *e = pv+len;
f0e67a1d
Z
987 for (p = pv; p != e; p++)
988 highhalf += !!(((U8)*p) & 0x80);
989 if (!highhalf)
990 goto plain_copy;
991 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
992 bufptr = PL_parser->bufptr;
993 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
994 SvCUR_set(PL_parser->linestr,
995 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
996 PL_parser->bufend += len+highhalf;
997 for (p = pv; p != e; p++) {
998 U8 c = (U8)*p;
999 if (c & 0x80) {
1000 *bufptr++ = (char)(0xc0 | (c >> 6));
1001 *bufptr++ = (char)(0x80 | (c & 0x3f));
1002 } else {
1003 *bufptr++ = (char)c;
1004 }
1005 }
1006 }
1007 } else {
1008 if (flags & LEX_STUFF_UTF8) {
1009 STRLEN highhalf = 0;
83aa740e 1010 const char *p, *e = pv+len;
f0e67a1d
Z
1011 for (p = pv; p != e; p++) {
1012 U8 c = (U8)*p;
1013 if (c >= 0xc4) {
1014 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1015 "non-Latin-1 character into Latin-1 input");
1016 } else if (c >= 0xc2 && p+1 != e &&
1017 (((U8)p[1]) & 0xc0) == 0x80) {
1018 p++;
1019 highhalf++;
1020 } else if (c >= 0x80) {
1021 /* malformed UTF-8 */
1022 ENTER;
1023 SAVESPTR(PL_warnhook);
1024 PL_warnhook = PERL_WARNHOOK_FATAL;
1025 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1026 LEAVE;
1027 }
1028 }
1029 if (!highhalf)
1030 goto plain_copy;
1031 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1032 bufptr = PL_parser->bufptr;
1033 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1034 SvCUR_set(PL_parser->linestr,
1035 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1036 PL_parser->bufend += len-highhalf;
1037 for (p = pv; p != e; p++) {
1038 U8 c = (U8)*p;
1039 if (c & 0x80) {
1040 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1041 p++;
1042 } else {
1043 *bufptr++ = (char)c;
1044 }
1045 }
1046 } else {
1047 plain_copy:
1048 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1049 bufptr = PL_parser->bufptr;
1050 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1051 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1052 PL_parser->bufend += len;
1053 Copy(pv, bufptr, len, char);
1054 }
1055 }
1056}
1057
1058/*
9dcc53ea
Z
1059=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1060
1061Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1062immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1063reallocating the buffer if necessary. This means that lexing code that
1064runs later will see the characters as if they had appeared in the input.
1065It is not recommended to do this as part of normal parsing, and most
1066uses of this facility run the risk of the inserted characters being
1067interpreted in an unintended manner.
1068
1069The string to be inserted is represented by octets starting at I<pv>
1070and continuing to the first nul. These octets are interpreted as either
1071UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1072in I<flags>. The characters are recoded for the lexer buffer, according
1073to how the buffer is currently being interpreted (L</lex_bufutf8>).
1074If it is not convenient to nul-terminate a string to be inserted, the
1075L</lex_stuff_pvn> function is more appropriate.
1076
1077=cut
1078*/
1079
1080void
1081Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1082{
1083 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1084 lex_stuff_pvn(pv, strlen(pv), flags);
1085}
1086
1087/*
f0e67a1d
Z
1088=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1089
1090Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1091immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1092reallocating the buffer if necessary. This means that lexing code that
1093runs later will see the characters as if they had appeared in the input.
1094It is not recommended to do this as part of normal parsing, and most
1095uses of this facility run the risk of the inserted characters being
1096interpreted in an unintended manner.
1097
1098The string to be inserted is the string value of I<sv>. The characters
1099are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1100being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1101not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1102need to construct a scalar.
1103
1104=cut
1105*/
1106
1107void
1108Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1109{
1110 char *pv;
1111 STRLEN len;
1112 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1113 if (flags)
1114 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1115 pv = SvPV(sv, len);
1116 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1117}
1118
1119/*
1120=for apidoc Amx|void|lex_unstuff|char *ptr
1121
1122Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1123I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1124This hides the discarded text from any lexing code that runs later,
1125as if the text had never appeared.
1126
1127This is not the normal way to consume lexed text. For that, use
1128L</lex_read_to>.
1129
1130=cut
1131*/
1132
1133void
1134Perl_lex_unstuff(pTHX_ char *ptr)
1135{
1136 char *buf, *bufend;
1137 STRLEN unstuff_len;
1138 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1139 buf = PL_parser->bufptr;
1140 if (ptr < buf)
1141 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1142 if (ptr == buf)
1143 return;
1144 bufend = PL_parser->bufend;
1145 if (ptr > bufend)
1146 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1147 unstuff_len = ptr - buf;
1148 Move(ptr, buf, bufend+1-ptr, char);
1149 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1150 PL_parser->bufend = bufend - unstuff_len;
1151}
1152
1153/*
1154=for apidoc Amx|void|lex_read_to|char *ptr
1155
1156Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1157to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1158performing the correct bookkeeping whenever a newline character is passed.
1159This is the normal way to consume lexed text.
1160
1161Interpretation of the buffer's octets can be abstracted out by
1162using the slightly higher-level functions L</lex_peek_unichar> and
1163L</lex_read_unichar>.
1164
1165=cut
1166*/
1167
1168void
1169Perl_lex_read_to(pTHX_ char *ptr)
1170{
1171 char *s;
1172 PERL_ARGS_ASSERT_LEX_READ_TO;
1173 s = PL_parser->bufptr;
1174 if (ptr < s || ptr > PL_parser->bufend)
1175 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1176 for (; s != ptr; s++)
1177 if (*s == '\n') {
83944c01 1178 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1179 PL_parser->linestart = s+1;
1180 }
1181 PL_parser->bufptr = ptr;
1182}
1183
1184/*
1185=for apidoc Amx|void|lex_discard_to|char *ptr
1186
1187Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1188up to I<ptr>. The remaining content of the buffer will be moved, and
1189all pointers into the buffer updated appropriately. I<ptr> must not
1190be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1191it is not permitted to discard text that has yet to be lexed.
1192
1193Normally it is not necessarily to do this directly, because it suffices to
1194use the implicit discarding behaviour of L</lex_next_chunk> and things
1195based on it. However, if a token stretches across multiple lines,
1f317c95 1196and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1197that purpose, then after completion of the token it would be wise to
1198explicitly discard the now-unneeded earlier lines, to avoid future
1199multi-line tokens growing the buffer without bound.
1200
1201=cut
1202*/
1203
1204void
1205Perl_lex_discard_to(pTHX_ char *ptr)
1206{
1207 char *buf;
1208 STRLEN discard_len;
1209 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1210 buf = SvPVX(PL_parser->linestr);
1211 if (ptr < buf)
1212 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1213 if (ptr == buf)
1214 return;
1215 if (ptr > PL_parser->bufptr)
1216 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1217 discard_len = ptr - buf;
1218 if (PL_parser->oldbufptr < ptr)
1219 PL_parser->oldbufptr = ptr;
1220 if (PL_parser->oldoldbufptr < ptr)
1221 PL_parser->oldoldbufptr = ptr;
1222 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1223 PL_parser->last_uni = NULL;
1224 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1225 PL_parser->last_lop = NULL;
1226 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1227 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1228 PL_parser->bufend -= discard_len;
1229 PL_parser->bufptr -= discard_len;
1230 PL_parser->oldbufptr -= discard_len;
1231 PL_parser->oldoldbufptr -= discard_len;
1232 if (PL_parser->last_uni)
1233 PL_parser->last_uni -= discard_len;
1234 if (PL_parser->last_lop)
1235 PL_parser->last_lop -= discard_len;
1236}
1237
1238/*
1239=for apidoc Amx|bool|lex_next_chunk|U32 flags
1240
1241Reads in the next chunk of text to be lexed, appending it to
1242L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1243looked to the end of the current chunk and wants to know more. It is
1244usual, but not necessary, for lexing to have consumed the entirety of
1245the current chunk at this time.
1246
1247If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1248chunk (i.e., the current chunk has been entirely consumed), normally the
1249current chunk will be discarded at the same time that the new chunk is
1250read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1251will not be discarded. If the current chunk has not been entirely
1252consumed, then it will not be discarded regardless of the flag.
1253
1254Returns true if some new text was added to the buffer, or false if the
1255buffer has reached the end of the input text.
1256
1257=cut
1258*/
1259
1260#define LEX_FAKE_EOF 0x80000000
112d1284 1261#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1262
1263bool
1264Perl_lex_next_chunk(pTHX_ U32 flags)
1265{
1266 SV *linestr;
1267 char *buf;
1268 STRLEN old_bufend_pos, new_bufend_pos;
1269 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1270 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1271 bool got_some_for_debugger = 0;
f0e67a1d 1272 bool got_some;
112d1284 1273 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1274 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1275 linestr = PL_parser->linestr;
1276 buf = SvPVX(linestr);
1277 if (!(flags & LEX_KEEP_PREVIOUS) &&
1278 PL_parser->bufptr == PL_parser->bufend) {
1279 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1280 linestart_pos = 0;
1281 if (PL_parser->last_uni != PL_parser->bufend)
1282 PL_parser->last_uni = NULL;
1283 if (PL_parser->last_lop != PL_parser->bufend)
1284 PL_parser->last_lop = NULL;
1285 last_uni_pos = last_lop_pos = 0;
1286 *buf = 0;
1287 SvCUR(linestr) = 0;
1288 } else {
1289 old_bufend_pos = PL_parser->bufend - buf;
1290 bufptr_pos = PL_parser->bufptr - buf;
1291 oldbufptr_pos = PL_parser->oldbufptr - buf;
1292 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1293 linestart_pos = PL_parser->linestart - buf;
1294 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1295 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1296 }
1297 if (flags & LEX_FAKE_EOF) {
1298 goto eof;
60d63348 1299 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1300 got_some = 0;
1301 } else if (filter_gets(linestr, old_bufend_pos)) {
1302 got_some = 1;
17cc9359 1303 got_some_for_debugger = 1;
112d1284
FC
1304 } else if (flags & LEX_NO_TERM) {
1305 got_some = 0;
f0e67a1d 1306 } else {
580561a3
Z
1307 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1308 sv_setpvs(linestr, "");
f0e67a1d
Z
1309 eof:
1310 /* End of real input. Close filehandle (unless it was STDIN),
1311 * then add implicit termination.
1312 */
87606032 1313 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1314 PerlIO_clearerr(PL_parser->rsfp);
1315 else if (PL_parser->rsfp)
1316 (void)PerlIO_close(PL_parser->rsfp);
1317 PL_parser->rsfp = NULL;
60d63348 1318 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1319#ifdef PERL_MAD
1320 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1321 PL_faketokens = 1;
1322#endif
1323 if (!PL_in_eval && PL_minus_p) {
1324 sv_catpvs(linestr,
1325 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1326 PL_minus_n = PL_minus_p = 0;
1327 } else if (!PL_in_eval && PL_minus_n) {
1328 sv_catpvs(linestr, /*{*/";}");
1329 PL_minus_n = 0;
1330 } else
1331 sv_catpvs(linestr, ";");
1332 got_some = 1;
1333 }
1334 buf = SvPVX(linestr);
1335 new_bufend_pos = SvCUR(linestr);
1336 PL_parser->bufend = buf + new_bufend_pos;
1337 PL_parser->bufptr = buf + bufptr_pos;
1338 PL_parser->oldbufptr = buf + oldbufptr_pos;
1339 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1340 PL_parser->linestart = buf + linestart_pos;
1341 if (PL_parser->last_uni)
1342 PL_parser->last_uni = buf + last_uni_pos;
1343 if (PL_parser->last_lop)
1344 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1345 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1346 PL_curstash != PL_debstash) {
1347 /* debugger active and we're not compiling the debugger code,
1348 * so store the line into the debugger's array of lines
1349 */
1350 update_debugger_info(NULL, buf+old_bufend_pos,
1351 new_bufend_pos-old_bufend_pos);
1352 }
1353 return got_some;
1354}
1355
1356/*
1357=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1358
1359Looks ahead one (Unicode) character in the text currently being lexed.
1360Returns the codepoint (unsigned integer value) of the next character,
1361or -1 if lexing has reached the end of the input text. To consume the
1362peeked character, use L</lex_read_unichar>.
1363
1364If the next character is in (or extends into) the next chunk of input
1365text, the next chunk will be read in. Normally the current chunk will be
1366discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1367then the current chunk will not be discarded.
1368
1369If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1370is encountered, an exception is generated.
1371
1372=cut
1373*/
1374
1375I32
1376Perl_lex_peek_unichar(pTHX_ U32 flags)
1377{
749123ff 1378 dVAR;
f0e67a1d
Z
1379 char *s, *bufend;
1380 if (flags & ~(LEX_KEEP_PREVIOUS))
1381 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1382 s = PL_parser->bufptr;
1383 bufend = PL_parser->bufend;
1384 if (UTF) {
1385 U8 head;
1386 I32 unichar;
1387 STRLEN len, retlen;
1388 if (s == bufend) {
1389 if (!lex_next_chunk(flags))
1390 return -1;
1391 s = PL_parser->bufptr;
1392 bufend = PL_parser->bufend;
1393 }
1394 head = (U8)*s;
1395 if (!(head & 0x80))
1396 return head;
1397 if (head & 0x40) {
1398 len = PL_utf8skip[head];
1399 while ((STRLEN)(bufend-s) < len) {
1400 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1401 break;
1402 s = PL_parser->bufptr;
1403 bufend = PL_parser->bufend;
1404 }
1405 }
1406 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1407 if (retlen == (STRLEN)-1) {
1408 /* malformed UTF-8 */
1409 ENTER;
1410 SAVESPTR(PL_warnhook);
1411 PL_warnhook = PERL_WARNHOOK_FATAL;
1412 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1413 LEAVE;
1414 }
1415 return unichar;
1416 } else {
1417 if (s == bufend) {
1418 if (!lex_next_chunk(flags))
1419 return -1;
1420 s = PL_parser->bufptr;
1421 }
1422 return (U8)*s;
1423 }
1424}
1425
1426/*
1427=for apidoc Amx|I32|lex_read_unichar|U32 flags
1428
1429Reads the next (Unicode) character in the text currently being lexed.
1430Returns the codepoint (unsigned integer value) of the character read,
1431and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1432if lexing has reached the end of the input text. To non-destructively
1433examine the next character, use L</lex_peek_unichar> instead.
1434
1435If the next character is in (or extends into) the next chunk of input
1436text, the next chunk will be read in. Normally the current chunk will be
1437discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1438then the current chunk will not be discarded.
1439
1440If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1441is encountered, an exception is generated.
1442
1443=cut
1444*/
1445
1446I32
1447Perl_lex_read_unichar(pTHX_ U32 flags)
1448{
1449 I32 c;
1450 if (flags & ~(LEX_KEEP_PREVIOUS))
1451 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1452 c = lex_peek_unichar(flags);
1453 if (c != -1) {
1454 if (c == '\n')
83944c01 1455 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1456 if (UTF)
1457 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1458 else
1459 ++(PL_parser->bufptr);
f0e67a1d
Z
1460 }
1461 return c;
1462}
1463
1464/*
1465=for apidoc Amx|void|lex_read_space|U32 flags
1466
1467Reads optional spaces, in Perl style, in the text currently being
1468lexed. The spaces may include ordinary whitespace characters and
1469Perl-style comments. C<#line> directives are processed if encountered.
1470L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1471at a non-space character (or the end of the input text).
1472
1473If spaces extend into the next chunk of input text, the next chunk will
1474be read in. Normally the current chunk will be discarded at the same
1475time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1476chunk will not be discarded.
1477
1478=cut
1479*/
1480
f0998909
Z
1481#define LEX_NO_NEXT_CHUNK 0x80000000
1482
f0e67a1d
Z
1483void
1484Perl_lex_read_space(pTHX_ U32 flags)
1485{
1486 char *s, *bufend;
1487 bool need_incline = 0;
f0998909 1488 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1489 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1490#ifdef PERL_MAD
1491 if (PL_skipwhite) {
1492 sv_free(PL_skipwhite);
1493 PL_skipwhite = NULL;
1494 }
1495 if (PL_madskills)
1496 PL_skipwhite = newSVpvs("");
1497#endif /* PERL_MAD */
1498 s = PL_parser->bufptr;
1499 bufend = PL_parser->bufend;
1500 while (1) {
1501 char c = *s;
1502 if (c == '#') {
1503 do {
1504 c = *++s;
1505 } while (!(c == '\n' || (c == 0 && s == bufend)));
1506 } else if (c == '\n') {
1507 s++;
1508 PL_parser->linestart = s;
1509 if (s == bufend)
1510 need_incline = 1;
1511 else
1512 incline(s);
1513 } else if (isSPACE(c)) {
1514 s++;
1515 } else if (c == 0 && s == bufend) {
1516 bool got_more;
1517#ifdef PERL_MAD
1518 if (PL_madskills)
1519 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1520#endif /* PERL_MAD */
f0998909
Z
1521 if (flags & LEX_NO_NEXT_CHUNK)
1522 break;
f0e67a1d 1523 PL_parser->bufptr = s;
83944c01 1524 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1525 got_more = lex_next_chunk(flags);
1526 CopLINE_dec(PL_curcop);
1527 s = PL_parser->bufptr;
1528 bufend = PL_parser->bufend;
1529 if (!got_more)
1530 break;
1531 if (need_incline && PL_parser->rsfp) {
1532 incline(s);
1533 need_incline = 0;
1534 }
1535 } else {
1536 break;
1537 }
1538 }
1539#ifdef PERL_MAD
1540 if (PL_madskills)
1541 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1542#endif /* PERL_MAD */
1543 PL_parser->bufptr = s;
1544}
1545
1546/*
ffb4593c
NT
1547 * S_incline
1548 * This subroutine has nothing to do with tilting, whether at windmills
1549 * or pinball tables. Its name is short for "increment line". It
57843af0 1550 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1551 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1552 * # line 500 "foo.pm"
1553 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1554 */
1555
76e3520e 1556STATIC void
d9095cec 1557S_incline(pTHX_ const char *s)
463ee0b2 1558{
97aff369 1559 dVAR;
d9095cec
NC
1560 const char *t;
1561 const char *n;
1562 const char *e;
8818d409 1563 line_t line_num;
463ee0b2 1564
7918f24d
NC
1565 PERL_ARGS_ASSERT_INCLINE;
1566
83944c01 1567 COPLINE_INC_WITH_HERELINES;
463ee0b2
LW
1568 if (*s++ != '#')
1569 return;
d4c19fe8
AL
1570 while (SPACE_OR_TAB(*s))
1571 s++;
73659bf1
GS
1572 if (strnEQ(s, "line", 4))
1573 s += 4;
1574 else
1575 return;
084592ab 1576 if (SPACE_OR_TAB(*s))
73659bf1 1577 s++;
4e553d73 1578 else
73659bf1 1579 return;
d4c19fe8
AL
1580 while (SPACE_OR_TAB(*s))
1581 s++;
463ee0b2
LW
1582 if (!isDIGIT(*s))
1583 return;
d4c19fe8 1584
463ee0b2
LW
1585 n = s;
1586 while (isDIGIT(*s))
1587 s++;
07714eb4 1588 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1589 return;
bf4acbe4 1590 while (SPACE_OR_TAB(*s))
463ee0b2 1591 s++;
73659bf1 1592 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1593 s++;
73659bf1
GS
1594 e = t + 1;
1595 }
463ee0b2 1596 else {
c35e046a
AL
1597 t = s;
1598 while (!isSPACE(*t))
1599 t++;
73659bf1 1600 e = t;
463ee0b2 1601 }
bf4acbe4 1602 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1603 e++;
1604 if (*e != '\n' && *e != '\0')
1605 return; /* false alarm */
1606
8818d409
FC
1607 line_num = atoi(n)-1;
1608
f4dd75d9 1609 if (t - s > 0) {
d9095cec 1610 const STRLEN len = t - s;
19bad673
NC
1611 SV *const temp_sv = CopFILESV(PL_curcop);
1612 const char *cf;
1613 STRLEN tmplen;
1614
1615 if (temp_sv) {
1616 cf = SvPVX(temp_sv);
1617 tmplen = SvCUR(temp_sv);
1618 } else {
1619 cf = NULL;
1620 tmplen = 0;
1621 }
1622
d1299d44 1623 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1624 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1625 * to *{"::_<newfilename"} */
44867030
NC
1626 /* However, the long form of evals is only turned on by the
1627 debugger - usually they're "(eval %lu)" */
1628 char smallbuf[128];
1629 char *tmpbuf;
1630 GV **gvp;
d9095cec 1631 STRLEN tmplen2 = len;
798b63bc 1632 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1633 tmpbuf = smallbuf;
1634 else
2ae0db35 1635 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1636 tmpbuf[0] = '_';
1637 tmpbuf[1] = '<';
2ae0db35 1638 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1639 tmplen += 2;
8a5ee598
RGS
1640 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1641 if (gvp) {
44867030
NC
1642 char *tmpbuf2;
1643 GV *gv2;
1644
1645 if (tmplen2 + 2 <= sizeof smallbuf)
1646 tmpbuf2 = smallbuf;
1647 else
1648 Newx(tmpbuf2, tmplen2 + 2, char);
1649
1650 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1651 /* Either they malloc'd it, or we malloc'd it,
1652 so no prefix is present in ours. */
1653 tmpbuf2[0] = '_';
1654 tmpbuf2[1] = '<';
1655 }
1656
1657 memcpy(tmpbuf2 + 2, s, tmplen2);
1658 tmplen2 += 2;
1659
8a5ee598 1660 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1661 if (!isGV(gv2)) {
8a5ee598 1662 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1663 /* adjust ${"::_<newfilename"} to store the new file name */
1664 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1665 /* The line number may differ. If that is the case,
1666 alias the saved lines that are in the array.
1667 Otherwise alias the whole array. */
1668 if (CopLINE(PL_curcop) == line_num) {
1669 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1670 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1671 }
1672 else if (GvAV(*gvp)) {
1673 AV * const av = GvAV(*gvp);
1674 const I32 start = CopLINE(PL_curcop)+1;
1675 I32 items = AvFILLp(av) - start;
1676 if (items > 0) {
1677 AV * const av2 = GvAVn(gv2);
1678 SV **svp = AvARRAY(av) + start;
1679 I32 l = (I32)line_num+1;
1680 while (items--)
1681 av_store(av2, l++, SvREFCNT_inc(*svp++));
1682 }
1683 }
e5527e4b 1684 }
44867030
NC
1685
1686 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1687 }
e66cf94c 1688 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1689 }
05ec9bb3 1690 CopFILE_free(PL_curcop);
d9095cec 1691 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1692 }
8818d409 1693 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1694}
1695
29595ff2 1696#ifdef PERL_MAD
cd81e915 1697/* skip space before PL_thistoken */
29595ff2
NC
1698
1699STATIC char *
1700S_skipspace0(pTHX_ register char *s)
1701{
7918f24d
NC
1702 PERL_ARGS_ASSERT_SKIPSPACE0;
1703
29595ff2
NC
1704 s = skipspace(s);
1705 if (!PL_madskills)
1706 return s;
cd81e915
NC
1707 if (PL_skipwhite) {
1708 if (!PL_thiswhite)
6b29d1f5 1709 PL_thiswhite = newSVpvs("");
cd81e915
NC
1710 sv_catsv(PL_thiswhite, PL_skipwhite);
1711 sv_free(PL_skipwhite);
1712 PL_skipwhite = 0;
1713 }
1714 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1715 return s;
1716}
1717
cd81e915 1718/* skip space after PL_thistoken */
29595ff2
NC
1719
1720STATIC char *
1721S_skipspace1(pTHX_ register char *s)
1722{
d4c19fe8 1723 const char *start = s;
29595ff2
NC
1724 I32 startoff = start - SvPVX(PL_linestr);
1725
7918f24d
NC
1726 PERL_ARGS_ASSERT_SKIPSPACE1;
1727
29595ff2
NC
1728 s = skipspace(s);
1729 if (!PL_madskills)
1730 return s;
1731 start = SvPVX(PL_linestr) + startoff;
cd81e915 1732 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1733 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1734 PL_thistoken = newSVpvn(tstart, start - tstart);
1735 }
1736 PL_realtokenstart = -1;
1737 if (PL_skipwhite) {
1738 if (!PL_nextwhite)
6b29d1f5 1739 PL_nextwhite = newSVpvs("");
cd81e915
NC
1740 sv_catsv(PL_nextwhite, PL_skipwhite);
1741 sv_free(PL_skipwhite);
1742 PL_skipwhite = 0;
29595ff2
NC
1743 }
1744 return s;
1745}
1746
1747STATIC char *
1748S_skipspace2(pTHX_ register char *s, SV **svp)
1749{
c35e046a
AL
1750 char *start;
1751 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1752 const I32 startoff = s - SvPVX(PL_linestr);
1753
7918f24d
NC
1754 PERL_ARGS_ASSERT_SKIPSPACE2;
1755
29595ff2
NC
1756 s = skipspace(s);
1757 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1758 if (!PL_madskills || !svp)
1759 return s;
1760 start = SvPVX(PL_linestr) + startoff;
cd81e915 1761 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1762 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1763 PL_thistoken = newSVpvn(tstart, start - tstart);
1764 PL_realtokenstart = -1;
29595ff2 1765 }
cd81e915 1766 if (PL_skipwhite) {
29595ff2 1767 if (!*svp)
6b29d1f5 1768 *svp = newSVpvs("");
cd81e915
NC
1769 sv_setsv(*svp, PL_skipwhite);
1770 sv_free(PL_skipwhite);
1771 PL_skipwhite = 0;
29595ff2
NC
1772 }
1773
1774 return s;
1775}
1776#endif
1777
80a702cd 1778STATIC void
15f169a1 1779S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1780{
1781 AV *av = CopFILEAVx(PL_curcop);
1782 if (av) {
b9f83d2f 1783 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1784 if (orig_sv)
1785 sv_setsv(sv, orig_sv);
1786 else
1787 sv_setpvn(sv, buf, len);
80a702cd
RGS
1788 (void)SvIOK_on(sv);
1789 SvIV_set(sv, 0);
1790 av_store(av, (I32)CopLINE(PL_curcop), sv);
1791 }
1792}
1793
ffb4593c
NT
1794/*
1795 * S_skipspace
1796 * Called to gobble the appropriate amount and type of whitespace.
1797 * Skips comments as well.
1798 */
1799
76e3520e 1800STATIC char *
cea2e8a9 1801S_skipspace(pTHX_ register char *s)
a687059c 1802{
5db06880 1803#ifdef PERL_MAD
f0e67a1d
Z
1804 char *start = s;
1805#endif /* PERL_MAD */
7918f24d 1806 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1807#ifdef PERL_MAD
cd81e915
NC
1808 if (PL_skipwhite) {
1809 sv_free(PL_skipwhite);
f0e67a1d 1810 PL_skipwhite = NULL;
5db06880 1811 }
f0e67a1d 1812#endif /* PERL_MAD */
3280af22 1813 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1814 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1815 s++;
f0e67a1d
Z
1816 } else {
1817 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1818 PL_bufptr = s;
f0998909
Z
1819 lex_read_space(LEX_KEEP_PREVIOUS |
1820 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1821 LEX_NO_NEXT_CHUNK : 0));
3280af22 1822 s = PL_bufptr;
f0e67a1d
Z
1823 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1824 if (PL_linestart > PL_bufptr)
1825 PL_bufptr = PL_linestart;
1826 return s;
463ee0b2 1827 }
5db06880 1828#ifdef PERL_MAD
f0e67a1d
Z
1829 if (PL_madskills)
1830 PL_skipwhite = newSVpvn(start, s-start);
1831#endif /* PERL_MAD */
5db06880 1832 return s;
a687059c 1833}
378cc40b 1834
ffb4593c
NT
1835/*
1836 * S_check_uni
1837 * Check the unary operators to ensure there's no ambiguity in how they're
1838 * used. An ambiguous piece of code would be:
1839 * rand + 5
1840 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1841 * the +5 is its argument.
1842 */
1843
76e3520e 1844STATIC void
cea2e8a9 1845S_check_uni(pTHX)
ba106d47 1846{
97aff369 1847 dVAR;
d4c19fe8
AL
1848 const char *s;
1849 const char *t;
2f3197b3 1850
3280af22 1851 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1852 return;
3280af22
NIS
1853 while (isSPACE(*PL_last_uni))
1854 PL_last_uni++;
c35e046a
AL
1855 s = PL_last_uni;
1856 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1857 s++;
3280af22 1858 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1859 return;
6136c704 1860
9b387841
NC
1861 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1862 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1863 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1864}
1865
ffb4593c
NT
1866/*
1867 * LOP : macro to build a list operator. Its behaviour has been replaced
1868 * with a subroutine, S_lop() for which LOP is just another name.
1869 */
1870
a0d0e21e
LW
1871#define LOP(f,x) return lop(f,x,s)
1872
ffb4593c
NT
1873/*
1874 * S_lop
1875 * Build a list operator (or something that might be one). The rules:
1876 * - if we have a next token, then it's a list operator [why?]
1877 * - if the next thing is an opening paren, then it's a function
1878 * - else it's a list operator
1879 */
1880
76e3520e 1881STATIC I32
a0be28da 1882S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1883{
97aff369 1884 dVAR;
7918f24d
NC
1885
1886 PERL_ARGS_ASSERT_LOP;
1887
6154021b 1888 pl_yylval.ival = f;
35c8bce7 1889 CLINE;
3280af22
NIS
1890 PL_expect = x;
1891 PL_bufptr = s;
1892 PL_last_lop = PL_oldbufptr;
eb160463 1893 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1894#ifdef PERL_MAD
1895 if (PL_lasttoke)
78cdf107 1896 goto lstop;
5db06880 1897#else
3280af22 1898 if (PL_nexttoke)
78cdf107 1899 goto lstop;
5db06880 1900#endif
79072805 1901 if (*s == '(')
bbf60fe6 1902 return REPORT(FUNC);
29595ff2 1903 s = PEEKSPACE(s);
79072805 1904 if (*s == '(')
bbf60fe6 1905 return REPORT(FUNC);
78cdf107
Z
1906 else {
1907 lstop:
1908 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1909 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1910 return REPORT(LSTOP);
78cdf107 1911 }
79072805
LW
1912}
1913
5db06880
NC
1914#ifdef PERL_MAD
1915 /*
1916 * S_start_force
1917 * Sets up for an eventual force_next(). start_force(0) basically does
1918 * an unshift, while start_force(-1) does a push. yylex removes items
1919 * on the "pop" end.
1920 */
1921
1922STATIC void
1923S_start_force(pTHX_ int where)
1924{
1925 int i;
1926
cd81e915 1927 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1928 where = PL_lasttoke;
cd81e915
NC
1929 assert(PL_curforce < 0 || PL_curforce == where);
1930 if (PL_curforce != where) {
5db06880
NC
1931 for (i = PL_lasttoke; i > where; --i) {
1932 PL_nexttoke[i] = PL_nexttoke[i-1];
1933 }
1934 PL_lasttoke++;
1935 }
cd81e915 1936 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1937 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1938 PL_curforce = where;
1939 if (PL_nextwhite) {
5db06880 1940 if (PL_madskills)
6b29d1f5 1941 curmad('^', newSVpvs(""));
cd81e915 1942 CURMAD('_', PL_nextwhite);
5db06880
NC
1943 }
1944}
1945
1946STATIC void
1947S_curmad(pTHX_ char slot, SV *sv)
1948{
1949 MADPROP **where;
1950
1951 if (!sv)
1952 return;
cd81e915
NC
1953 if (PL_curforce < 0)
1954 where = &PL_thismad;
5db06880 1955 else
cd81e915 1956 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1957
cd81e915 1958 if (PL_faketokens)
76f68e9b 1959 sv_setpvs(sv, "");
5db06880
NC
1960 else {
1961 if (!IN_BYTES) {
1962 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1963 SvUTF8_on(sv);
1964 else if (PL_encoding) {
1965 sv_recode_to_utf8(sv, PL_encoding);
1966 }
1967 }
1968 }
1969
1970 /* keep a slot open for the head of the list? */
1971 if (slot != '_' && *where && (*where)->mad_key == '^') {
1972 (*where)->mad_key = slot;
daba3364 1973 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1974 (*where)->mad_val = (void*)sv;
1975 }
1976 else
1977 addmad(newMADsv(slot, sv), where, 0);
1978}
1979#else
b3f24c00
MHM
1980# define start_force(where) NOOP
1981# define curmad(slot, sv) NOOP
5db06880
NC
1982#endif
1983
ffb4593c
NT
1984/*
1985 * S_force_next
9cbb5ea2 1986 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1987 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1988 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1989 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1990 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1991 */
1992
4e553d73 1993STATIC void
cea2e8a9 1994S_force_next(pTHX_ I32 type)
79072805 1995{
97aff369 1996 dVAR;
704d4215
GG
1997#ifdef DEBUGGING
1998 if (DEBUG_T_TEST) {
1999 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2000 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2001 }
2002#endif
6c7ae946
FC
2003 /* Don’t let opslab_force_free snatch it */
2004 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
2005 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
2006 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
2007 }
5db06880 2008#ifdef PERL_MAD
cd81e915 2009 if (PL_curforce < 0)
5db06880 2010 start_force(PL_lasttoke);
cd81e915 2011 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2012 if (PL_lex_state != LEX_KNOWNEXT)
2013 PL_lex_defer = PL_lex_state;
2014 PL_lex_state = LEX_KNOWNEXT;
2015 PL_lex_expect = PL_expect;
cd81e915 2016 PL_curforce = -1;
5db06880 2017#else
3280af22
NIS
2018 PL_nexttype[PL_nexttoke] = type;
2019 PL_nexttoke++;
2020 if (PL_lex_state != LEX_KNOWNEXT) {
2021 PL_lex_defer = PL_lex_state;
2022 PL_lex_expect = PL_expect;
2023 PL_lex_state = LEX_KNOWNEXT;
79072805 2024 }
5db06880 2025#endif
79072805
LW
2026}
2027
28ac2b49
Z
2028void
2029Perl_yyunlex(pTHX)
2030{
a7aaec61
Z
2031 int yyc = PL_parser->yychar;
2032 if (yyc != YYEMPTY) {
2033 if (yyc) {
2034 start_force(-1);
2035 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2036 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2037 PL_lex_allbrackets--;
a7aaec61 2038 PL_lex_brackets--;
78cdf107
Z
2039 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2040 } else if (yyc == '('/*)*/) {
2041 PL_lex_allbrackets--;
2042 yyc |= (2<<24);
a7aaec61
Z
2043 }
2044 force_next(yyc);
2045 }
28ac2b49
Z
2046 PL_parser->yychar = YYEMPTY;
2047 }
2048}
2049
d0a148a6 2050STATIC SV *
15f169a1 2051S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2052{
97aff369 2053 dVAR;
740cce10 2054 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2055 !IN_BYTES
2056 && UTF
2057 && !is_ascii_string((const U8*)start, len)
740cce10 2058 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2059 return sv;
2060}
2061
ffb4593c
NT
2062/*
2063 * S_force_word
2064 * When the lexer knows the next thing is a word (for instance, it has
2065 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2066 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2067 * lookahead.
ffb4593c
NT
2068 *
2069 * Arguments:
b1b65b59 2070 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2071 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2072 * int check_keyword : if true, Perl checks to make sure the word isn't
2073 * a keyword (do this if the word is a label, e.g. goto FOO)
2074 * int allow_pack : if true, : characters will also be allowed (require,
2075 * use, etc. do this)
9cbb5ea2 2076 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2077 */
2078
76e3520e 2079STATIC char *
cea2e8a9 2080S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2081{
97aff369 2082 dVAR;
eb578fdb 2083 char *s;
463ee0b2 2084 STRLEN len;
4e553d73 2085
7918f24d
NC
2086 PERL_ARGS_ASSERT_FORCE_WORD;
2087
29595ff2 2088 start = SKIPSPACE1(start);
463ee0b2 2089 s = start;
7e2040f0 2090 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2091 (allow_pack && *s == ':') ||
15f0808c 2092 (allow_initial_tick && *s == '\'') )
a0d0e21e 2093 {
3280af22 2094 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2095 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2096 return start;
cd81e915 2097 start_force(PL_curforce);
5db06880
NC
2098 if (PL_madskills)
2099 curmad('X', newSVpvn(start,s-start));
463ee0b2 2100 if (token == METHOD) {
29595ff2 2101 s = SKIPSPACE1(s);
463ee0b2 2102 if (*s == '(')
3280af22 2103 PL_expect = XTERM;
463ee0b2 2104 else {
3280af22 2105 PL_expect = XOPERATOR;
463ee0b2 2106 }
79072805 2107 }
e74e6b3d 2108 if (PL_madskills)
63575281 2109 curmad('g', newSVpvs( "forced" ));
9ded7720 2110 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2111 = (OP*)newSVOP(OP_CONST,0,
2112 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2113 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2114 force_next(token);
2115 }
2116 return s;
2117}
2118
ffb4593c
NT
2119/*
2120 * S_force_ident
9cbb5ea2 2121 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2122 * text only contains the "foo" portion. The first argument is a pointer
2123 * to the "foo", and the second argument is the type symbol to prefix.
2124 * Forces the next token to be a "WORD".
9cbb5ea2 2125 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2126 */
2127
76e3520e 2128STATIC void
bfed75c6 2129S_force_ident(pTHX_ register const char *s, int kind)
79072805 2130{
97aff369 2131 dVAR;
7918f24d
NC
2132
2133 PERL_ARGS_ASSERT_FORCE_IDENT;
2134
c35e046a 2135 if (*s) {
90e5519e 2136 const STRLEN len = strlen(s);
728847b1
BF
2137 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2138 UTF ? SVf_UTF8 : 0));
cd81e915 2139 start_force(PL_curforce);
9ded7720 2140 NEXTVAL_NEXTTOKE.opval = o;
79072805 2141 force_next(WORD);
748a9306 2142 if (kind) {
11343788 2143 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2144 /* XXX see note in pp_entereval() for why we forgo typo
2145 warnings if the symbol must be introduced in an eval.
2146 GSAR 96-10-12 */
90e5519e 2147 gv_fetchpvn_flags(s, len,
728847b1
BF
2148 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2149 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2150 kind == '$' ? SVt_PV :
2151 kind == '@' ? SVt_PVAV :
2152 kind == '%' ? SVt_PVHV :
a0d0e21e 2153 SVt_PVGV
90e5519e 2154 );
748a9306 2155 }
79072805
LW
2156 }
2157}
2158
1571675a
GS
2159NV
2160Perl_str_to_version(pTHX_ SV *sv)
2161{
2162 NV retval = 0.0;
2163 NV nshift = 1.0;
2164 STRLEN len;
cfd0369c 2165 const char *start = SvPV_const(sv,len);
9d4ba2ae 2166 const char * const end = start + len;
504618e9 2167 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2168
2169 PERL_ARGS_ASSERT_STR_TO_VERSION;
2170
1571675a 2171 while (start < end) {
ba210ebe 2172 STRLEN skip;
1571675a
GS
2173 UV n;
2174 if (utf)
9041c2e3 2175 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2176 else {
2177 n = *(U8*)start;
2178 skip = 1;
2179 }
2180 retval += ((NV)n)/nshift;
2181 start += skip;
2182 nshift *= 1000;
2183 }
2184 return retval;
2185}
2186
4e553d73 2187/*
ffb4593c
NT
2188 * S_force_version
2189 * Forces the next token to be a version number.
e759cc13
RGS
2190 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2191 * and if "guessing" is TRUE, then no new token is created (and the caller
2192 * must use an alternative parsing method).
ffb4593c
NT
2193 */
2194
76e3520e 2195STATIC char *
e759cc13 2196S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2197{
97aff369 2198 dVAR;
5f66b61c 2199 OP *version = NULL;
44dcb63b 2200 char *d;
5db06880
NC
2201#ifdef PERL_MAD
2202 I32 startoff = s - SvPVX(PL_linestr);
2203#endif
89bfa8cd 2204
7918f24d
NC
2205 PERL_ARGS_ASSERT_FORCE_VERSION;
2206
29595ff2 2207 s = SKIPSPACE1(s);
89bfa8cd 2208
44dcb63b 2209 d = s;
dd629d5b 2210 if (*d == 'v')
44dcb63b 2211 d++;
44dcb63b 2212 if (isDIGIT(*d)) {
e759cc13
RGS
2213 while (isDIGIT(*d) || *d == '_' || *d == '.')
2214 d++;
5db06880
NC
2215#ifdef PERL_MAD
2216 if (PL_madskills) {
cd81e915 2217 start_force(PL_curforce);
5db06880
NC
2218 curmad('X', newSVpvn(s,d-s));
2219 }
2220#endif
4e4da3ac 2221 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2222 SV *ver;
8d08d9ba 2223#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2224 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2225 setlocale(LC_NUMERIC, "C");
8d08d9ba 2226#endif
6154021b 2227 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2228#ifdef USE_LOCALE_NUMERIC
2229 setlocale(LC_NUMERIC, loc);
909d3787 2230 Safefree(loc);
8d08d9ba 2231#endif
6154021b 2232 version = pl_yylval.opval;
dd629d5b
GS
2233 ver = cSVOPx(version)->op_sv;
2234 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2235 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2236 SvNV_set(ver, str_to_version(ver));
1571675a 2237 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2238 }
89bfa8cd 2239 }
5db06880
NC
2240 else if (guessing) {
2241#ifdef PERL_MAD
2242 if (PL_madskills) {
cd81e915
NC
2243 sv_free(PL_nextwhite); /* let next token collect whitespace */
2244 PL_nextwhite = 0;
5db06880
NC
2245 s = SvPVX(PL_linestr) + startoff;
2246 }
2247#endif
e759cc13 2248 return s;
5db06880 2249 }
89bfa8cd
PP
2250 }
2251
5db06880
NC
2252#ifdef PERL_MAD
2253 if (PL_madskills && !version) {
cd81e915
NC
2254 sv_free(PL_nextwhite); /* let next token collect whitespace */
2255 PL_nextwhite = 0;
5db06880
NC
2256 s = SvPVX(PL_linestr) + startoff;
2257 }
2258#endif
89bfa8cd 2259 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2260 start_force(PL_curforce);
9ded7720 2261 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2262 force_next(WORD);
89bfa8cd 2263
e759cc13 2264 return s;
89bfa8cd
PP
2265}
2266
ffb4593c 2267/*
91152fc1
DG
2268 * S_force_strict_version
2269 * Forces the next token to be a version number using strict syntax rules.
2270 */
2271
2272STATIC char *
2273S_force_strict_version(pTHX_ char *s)
2274{
2275 dVAR;
2276 OP *version = NULL;
2277#ifdef PERL_MAD
2278 I32 startoff = s - SvPVX(PL_linestr);
2279#endif
2280 const char *errstr = NULL;
2281
2282 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2283
2284 while (isSPACE(*s)) /* leading whitespace */
2285 s++;
2286
2287 if (is_STRICT_VERSION(s,&errstr)) {
2288 SV *ver = newSV(0);
2289 s = (char *)scan_version(s, ver, 0);
2290 version = newSVOP(OP_CONST, 0, ver);
2291 }
4e4da3ac
Z
2292 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2293 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2294 {
91152fc1
DG
2295 PL_bufptr = s;
2296 if (errstr)
2297 yyerror(errstr); /* version required */
2298 return s;
2299 }
2300
2301#ifdef PERL_MAD
2302 if (PL_madskills && !version) {
2303 sv_free(PL_nextwhite); /* let next token collect whitespace */
2304 PL_nextwhite = 0;
2305 s = SvPVX(PL_linestr) + startoff;
2306 }
2307#endif
2308 /* NOTE: The parser sees the package name and the VERSION swapped */
2309 start_force(PL_curforce);
2310 NEXTVAL_NEXTTOKE.opval = version;
2311 force_next(WORD);
2312
2313 return s;
2314}
2315
2316/*
ffb4593c
NT
2317 * S_tokeq
2318 * Tokenize a quoted string passed in as an SV. It finds the next
2319 * chunk, up to end of string or a backslash. It may make a new
2320 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2321 * turns \\ into \.
2322 */
2323
76e3520e 2324STATIC SV *
cea2e8a9 2325S_tokeq(pTHX_ SV *sv)
79072805 2326{
97aff369 2327 dVAR;
eb578fdb
KW
2328 char *s;
2329 char *send;
2330 char *d;
b3ac6de7
IZ
2331 STRLEN len = 0;
2332 SV *pv = sv;
79072805 2333
7918f24d
NC
2334 PERL_ARGS_ASSERT_TOKEQ;
2335
79072805 2336 if (!SvLEN(sv))
b3ac6de7 2337 goto finish;
79072805 2338
a0d0e21e 2339 s = SvPV_force(sv, len);
21a311ee 2340 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2341 goto finish;
463ee0b2 2342 send = s + len;
dcb21ed6
NC
2343 /* This is relying on the SV being "well formed" with a trailing '\0' */
2344 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2345 s++;
2346 if (s == send)
b3ac6de7 2347 goto finish;
79072805 2348 d = s;
be4731d2 2349 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2350 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2351 }
79072805
LW
2352 while (s < send) {
2353 if (*s == '\\') {
a0d0e21e 2354 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2355 s++; /* all that, just for this */
2356 }
2357 *d++ = *s++;
2358 }
2359 *d = '\0';
95a20fc0 2360 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2361 finish:
3280af22 2362 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2363 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2364 return sv;
2365}
2366
ffb4593c
NT
2367/*
2368 * Now come three functions related to double-quote context,
2369 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2370 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2371 * interact with PL_lex_state, and create fake ( ... ) argument lists
2372 * to handle functions and concatenation.
ecd24171
DM
2373 * For example,
2374 * "foo\lbar"
2375 * is tokenised as
2376 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2377 */
2378
2379/*
2380 * S_sublex_start
6154021b 2381 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2382 *
2383 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2384 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2385 *
2386 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2387 *
2388 * Everything else becomes a FUNC.
2389 *
2390 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2391 * had an OP_CONST or OP_READLINE). This just sets us up for a
2392 * call to S_sublex_push().
2393 */
2394
76e3520e 2395STATIC I32
cea2e8a9 2396S_sublex_start(pTHX)
79072805 2397{
97aff369 2398 dVAR;
eb578fdb 2399 const I32 op_type = pl_yylval.ival;
79072805
LW
2400
2401 if (op_type == OP_NULL) {
6154021b 2402 pl_yylval.opval = PL_lex_op;
5f66b61c 2403 PL_lex_op = NULL;
79072805
LW
2404 return THING;
2405 }
2406 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2407 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2408
2409 if (SvTYPE(sv) == SVt_PVIV) {
2410 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2411 STRLEN len;
96a5add6 2412 const char * const p = SvPV_const(sv, len);
740cce10 2413 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2414 SvREFCNT_dec(sv);
2415 sv = nsv;
4e553d73 2416 }
6154021b 2417 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2418 PL_lex_stuff = NULL;
6f33ba73
RGS
2419 /* Allow <FH> // "foo" */
2420 if (op_type == OP_READLINE)
2421 PL_expect = XTERMORDORDOR;
79072805
LW
2422 return THING;
2423 }
e3f73d4e
RGS
2424 else if (op_type == OP_BACKTICK && PL_lex_op) {
2425 /* readpipe() vas overriden */
2426 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2427 pl_yylval.opval = PL_lex_op;
9b201d7d 2428 PL_lex_op = NULL;
e3f73d4e
RGS
2429 PL_lex_stuff = NULL;
2430 return THING;
2431 }
79072805 2432
3280af22 2433 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2434 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2435 PL_sublex_info.sub_op = PL_lex_op;
2436 PL_lex_state = LEX_INTERPPUSH;
55497cff 2437
3280af22
NIS
2438 PL_expect = XTERM;
2439 if (PL_lex_op) {
6154021b 2440 pl_yylval.opval = PL_lex_op;
5f66b61c 2441 PL_lex_op = NULL;
55497cff
PP
2442 return PMFUNC;
2443 }
2444 else
2445 return FUNC;
2446}
2447
ffb4593c
NT
2448/*
2449 * S_sublex_push
2450 * Create a new scope to save the lexing state. The scope will be
2451 * ended in S_sublex_done. Returns a '(', starting the function arguments
2452 * to the uc, lc, etc. found before.
2453 * Sets PL_lex_state to LEX_INTERPCONCAT.
2454 */
2455
76e3520e 2456STATIC I32
cea2e8a9 2457S_sublex_push(pTHX)
55497cff 2458{
27da23d5 2459 dVAR;
f46d017c 2460 ENTER;
55497cff 2461
3280af22 2462 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2463 SAVEBOOL(PL_lex_dojoin);
3280af22 2464 SAVEI32(PL_lex_brackets);
78cdf107 2465 SAVEI32(PL_lex_allbrackets);
b27dce25 2466 SAVEI32(PL_lex_formbrack);
78cdf107 2467 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2468 SAVEI32(PL_lex_casemods);
2469 SAVEI32(PL_lex_starts);
651b5b28 2470 SAVEI8(PL_lex_state);
7cc34111 2471 SAVESPTR(PL_lex_repl);
9da1dd8f 2472 SAVEPPTR(PL_sublex_info.re_eval_start);
db444266 2473 SAVESPTR(PL_sublex_info.re_eval_str);
565b52df 2474 SAVEPPTR(PL_sublex_info.super_bufptr);
7766f137 2475 SAVEVPTR(PL_lex_inpat);
98246f1e 2476 SAVEI16(PL_lex_inwhat);
57843af0 2477 SAVECOPLINE(PL_curcop);
3280af22 2478 SAVEPPTR(PL_bufptr);
8452ff4b 2479 SAVEPPTR(PL_bufend);
3280af22
NIS
2480 SAVEPPTR(PL_oldbufptr);
2481 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2482 SAVEPPTR(PL_last_lop);
2483 SAVEPPTR(PL_last_uni);
3280af22
NIS
2484 SAVEPPTR(PL_linestart);
2485 SAVESPTR(PL_linestr);
8edd5f42
RGS
2486 SAVEGENERICPV(PL_lex_brackstack);
2487 SAVEGENERICPV(PL_lex_casestack);
3280af22 2488
99bd9d90
FC
2489 /* The here-doc parser needs to be able to peek into outer lexing
2490 scopes to find the body of the here-doc. We use SvIVX(PL_linestr)
2491 to store the outer PL_bufptr and SvNVX to store the outer
2492 PL_linestr. Since SvIVX already means something else, we use
2493 PL_sublex_info.super_bufptr for the innermost scope (the one we are
2494 now entering), and a localised SvIVX for outer scopes.
2495 */
2496 SvUPGRADE(PL_linestr, SVt_PVIV);
2497 /* A null super_bufptr means the outer lexing scope is not peekable,
2498 because it is a single line from an input stream. */
2499 SAVEIV(SvIVX(PL_linestr));
2500 SvIVX(PL_linestr) = PTR2IV(PL_sublex_info.super_bufptr);
2501 PL_sublex_info.super_bufptr =
2502 (SvTYPE(PL_linestr) < SVt_PVNV || !SvNVX(PL_linestr))
2503 && (PL_rsfp || PL_parser->filtered)
2504 ? NULL
2505 : PL_bufptr;
2506 SvUPGRADE(PL_lex_stuff, SVt_PVNV);
2507 SvNVX(PL_lex_stuff) = PTR2NV(PL_linestr);
2508
3280af22 2509 PL_linestr = PL_lex_stuff;
7cc34111 2510 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2511 PL_lex_stuff = NULL;
7cc34111 2512 PL_sublex_info.repl = NULL;
9da1dd8f 2513 PL_sublex_info.re_eval_start = NULL;
db444266 2514 PL_sublex_info.re_eval_str = NULL;
3280af22 2515
9cbb5ea2
GS
2516 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2517 = SvPVX(PL_linestr);
3280af22 2518 PL_bufend += SvCUR(PL_linestr);
bd61b366 2519 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2520 SAVEFREESV(PL_linestr);
2521
2522 PL_lex_dojoin = FALSE;
b27dce25 2523 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2524 PL_lex_allbrackets = 0;
2525 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2526 Newx(PL_lex_brackstack, 120, char);
2527 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2528 PL_lex_casemods = 0;
2529 *PL_lex_casestack = '\0';
2530 PL_lex_starts = 0;
2531 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2532 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2533
2534 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2535 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2536 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2537 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2538 else
5f66b61c 2539 PL_lex_inpat = NULL;
79072805 2540
55497cff 2541 return '(';
79072805
LW
2542}
2543
ffb4593c
NT
2544/*
2545 * S_sublex_done
2546 * Restores lexer state after a S_sublex_push.
2547 */
2548
76e3520e 2549STATIC I32
cea2e8a9 2550S_sublex_done(pTHX)
79072805 2551{
27da23d5 2552 dVAR;
3280af22 2553 if (!PL_lex_starts++) {
396482e1 2554 SV * const sv = newSVpvs("");
9aa983d2
JH
2555 if (SvUTF8(PL_linestr))
2556 SvUTF8_on(sv);
3280af22 2557 PL_expect = XOPERATOR;
6154021b 2558 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2559 return THING;
2560 }
2561
3280af22
NIS
2562 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2563 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2564 return yylex();
79072805
LW
2565 }
2566
ffb4593c 2567 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2568 assert(PL_lex_inwhat != OP_TRANSR);
3280af22 2569 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
99bd9d90
FC
2570 SvUPGRADE(PL_lex_repl, SVt_PVNV);
2571 SvNVX(PL_lex_repl) = SvNVX(PL_linestr);
3280af22
NIS
2572 PL_linestr = PL_lex_repl;
2573 PL_lex_inpat = 0;
2574 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2575 PL_bufend += SvCUR(PL_linestr);
bd61b366 2576 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2577 SAVEFREESV(PL_linestr);
2578 PL_lex_dojoin = FALSE;
2579 PL_lex_brackets = 0;
78cdf107
Z
2580 PL_lex_allbrackets = 0;
2581 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2582 PL_lex_casemods = 0;
2583 *PL_lex_casestack = '\0';
2584 PL_lex_starts = 0;
25da4f38 2585 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2586 PL_lex_state = LEX_INTERPNORMAL;
2587 PL_lex_starts++;
e9fa98b2
HS
2588 /* we don't clear PL_lex_repl here, so that we can check later
2589 whether this is an evalled subst; that means we rely on the
2590 logic to ensure sublex_done() is called again only via the
2591 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2592 }
e9fa98b2 2593 else {
3280af22 2594 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2595 PL_lex_repl = NULL;
e9fa98b2 2596 }
79072805 2597 return ',';
ffed7fef
LW
2598 }
2599 else {
5db06880
NC
2600#ifdef PERL_MAD
2601 if (PL_madskills) {
cd81e915
NC
2602 if (PL_thiswhite) {
2603 if (!PL_endwhite)
6b29d1f5 2604 PL_endwhite = newSVpvs("");
cd81e915
NC
2605 sv_catsv(PL_endwhite, PL_thiswhite);
2606 PL_thiswhite = 0;
2607 }
2608 if (PL_thistoken)
76f68e9b 2609 sv_setpvs(PL_thistoken,"");
5db06880 2610 else
cd81e915 2611 PL_realtokenstart = -1;
5db06880
NC
2612 }
2613#endif
f46d017c 2614 LEAVE;
3280af22
NIS
2615 PL_bufend = SvPVX(PL_linestr);
2616 PL_bufend += SvCUR(PL_linestr);
2617 PL_expect = XOPERATOR;
09bef843 2618 PL_sublex_info.sub_inwhat = 0;
79072805 2619 return ')';
ffed7fef
LW
2620 }
2621}
2622
02aa26ce
NT
2623/*
2624 scan_const
2625
9da1dd8f
DM
2626 Extracts the next constant part of a pattern, double-quoted string,
2627 or transliteration. This is terrifying code.
2628
2629 For example, in parsing the double-quoted string "ab\x63$d", it would
2630 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2631
94def140 2632 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2633 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2634 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2635
94def140
ST
2636 Returns a pointer to the character scanned up to. If this is
2637 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2638 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2639 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2640 by looking at the next characters herself.
2641
02aa26ce 2642 In patterns:
9da1dd8f
DM
2643 expand:
2644 \N{ABC} => \N{U+41.42.43}
2645
2646 pass through:
2647 all other \-char, including \N and \N{ apart from \N{ABC}
2648
2649 stops on:
2650 @ and $ where it appears to be a var, but not for $ as tail anchor
2651 \l \L \u \U \Q \E
2652 (?{ or (??{
2653
02aa26ce
NT
2654
2655 In transliterations:
2656 characters are VERY literal, except for - not at the start or end
94def140
ST
2657 of the string, which indicates a range. If the range is in bytes,
2658 scan_const expands the range to the full set of intermediate
2659 characters. If the range is in utf8, the hyphen is replaced with
2660 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2661
2662 In double-quoted strings:
2663 backslashes:
2664 double-quoted style: \r and \n
ff3f963a 2665 constants: \x31, etc.
94def140 2666 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2667 case and quoting: \U \Q \E
2668 stops on @ and $
2669
2670 scan_const does *not* construct ops to handle interpolated strings.
2671 It stops processing as soon as it finds an embedded $ or @ variable
2672 and leaves it to the caller to work out what's going on.
2673
94def140
ST
2674 embedded arrays (whether in pattern or not) could be:
2675 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2676
2677 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2678
2679 $ in pattern could be $foo or could be tail anchor. Assumption:
2680 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2681 followed by one of "()| \r\n\t"
02aa26ce 2682
9da1dd8f 2683 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2684
2685 The structure of the code is
2686 while (there's a character to process) {
94def140
ST
2687 handle transliteration ranges
2688 skip regexp comments /(?#comment)/ and codes /(?{code})/
2689 skip #-initiated comments in //x patterns
2690 check for embedded arrays
02aa26ce
NT
2691 check for embedded scalars
2692 if (backslash) {
94def140 2693 deprecate \1 in substitution replacements
02aa26ce
NT
2694 handle string-changing backslashes \l \U \Q \E, etc.
2695 switch (what was escaped) {
94def140 2696 handle \- in a transliteration (becomes a literal -)
ff3f963a 2697 if a pattern and not \N{, go treat as regular character
94def140
ST
2698 handle \132 (octal characters)
2699 handle \x15 and \x{1234} (hex characters)
ff3f963a 2700 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2701 handle \cV (control characters)
2702 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2703 } (end switch)
77a135fe 2704 continue
02aa26ce 2705 } (end if backslash)
77a135fe 2706 handle regular character
02aa26ce 2707 } (end while character to read)
4e553d73 2708
02aa26ce
NT
2709*/
2710
76e3520e 2711STATIC char *
cea2e8a9 2712S_scan_const(pTHX_ char *start)
79072805 2713{
97aff369 2714 dVAR;
eb578fdb 2715 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2716 SV *sv = newSV(send - start); /* sv for the constant. See
2717 note below on sizing. */
eb578fdb
KW
2718 char *s = start; /* start of the constant */
2719 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2720 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2721 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2722 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2723 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2724 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2725 to be UTF8? But, this can
2726 show as true when the source
2727 isn't utf8, as for example
2728 when it is entirely composed
2729 of hex constants */
2730
2731 /* Note on sizing: The scanned constant is placed into sv, which is
2732 * initialized by newSV() assuming one byte of output for every byte of
2733 * input. This routine expects newSV() to allocate an extra byte for a
2734 * trailing NUL, which this routine will append if it gets to the end of
2735 * the input. There may be more bytes of input than output (eg., \N{LATIN
2736 * CAPITAL LETTER A}), or more output than input if the constant ends up
2737 * recoded to utf8, but each time a construct is found that might increase
2738 * the needed size, SvGROW() is called. Its size parameter each time is
2739 * based on the best guess estimate at the time, namely the length used so
2740 * far, plus the length the current construct will occupy, plus room for
2741 * the trailing NUL, plus one byte for every input byte still unscanned */
2742
012bcf8d 2743 UV uv;
4c3a8340
ST
2744#ifdef EBCDIC
2745 UV literal_endpoint = 0;
e294cc5d 2746 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2747#endif
012bcf8d 2748
7918f24d
NC
2749 PERL_ARGS_ASSERT_SCAN_CONST;
2750
bb16bae8 2751 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2752 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2753 /* If we are doing a trans and we know we want UTF8 set expectation */
2754 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2755 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2756 }
2757
2758
79072805 2759 while (s < send || dorange) {
ff3f963a 2760
02aa26ce 2761 /* get transliterations out of the way (they're most literal) */
3280af22 2762 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2763 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2764 if (dorange) {
1ba5c669
JH
2765 I32 i; /* current expanded character */
2766 I32 min; /* first character in range */
2767 I32 max; /* last character in range */
02aa26ce 2768
e294cc5d
JH
2769#ifdef EBCDIC
2770 UV uvmax = 0;
2771#endif
2772
2773 if (has_utf8
2774#ifdef EBCDIC
2775 && !native_range
2776#endif
2777 ) {
9d4ba2ae 2778 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2779 char *e = d++;
2780 while (e-- > c)
2781 *(e + 1) = *e;
25716404 2782 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2783 /* mark the range as done, and continue */
2784 dorange = FALSE;
2785 didrange = TRUE;
2786 continue;
2787 }
2b9d42f0 2788
95a20fc0 2789 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2790#ifdef EBCDIC
2791 SvGROW(sv,
2792 SvLEN(sv) + (has_utf8 ?
2793 (512 - UTF_CONTINUATION_MARK +
2794 UNISKIP(0x100))
2795 : 256));
2796 /* How many two-byte within 0..255: 128 in UTF-8,
2797 * 96 in UTF-8-mod. */
2798#else
9cbb5ea2 2799 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2800#endif
9cbb5ea2 2801 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2802#ifdef EBCDIC
2803 if (has_utf8) {
2804 int j;
2805 for (j = 0; j <= 1; j++) {
2806 char * const c = (char*)utf8_hop((U8*)d, -1);
2807 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2808 if (j)
2809 min = (U8)uv;
2810 else if (uv < 256)
2811 max = (U8)uv;
2812 else {
2813 max = (U8)0xff; /* only to \xff */
2814 uvmax = uv; /* \x{100} to uvmax */
2815 }
2816 d = c; /* eat endpoint chars */
2817 }
2818 }
2819 else {
2820#endif
2821 d -= 2; /* eat the first char and the - */
2822 min = (U8)*d; /* first char in range */
2823 max = (U8)d[1]; /* last char in range */
2824#ifdef EBCDIC
2825 }
2826#endif
8ada0baa 2827
c2e66d9e 2828 if (min > max) {
01ec43d0 2829 Perl_croak(aTHX_
d1573ac7 2830 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2831 (char)min, (char)max);
c2e66d9e
GS
2832 }
2833
c7f1f016 2834#ifdef EBCDIC
4c3a8340
ST
2835 if (literal_endpoint == 2 &&
2836 ((isLOWER(min) && isLOWER(max)) ||
2837 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2838 if (isLOWER(min)) {
2839 for (i = min; i <= max; i++)
2840 if (isLOWER(i))
db42d148 2841 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2842 } else {
2843 for (i = min; i <= max; i++)
2844 if (isUPPER(i))
db42d148 2845 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2846 }
2847 }
2848 else
2849#endif
2850 for (i = min; i <= max; i++)
e294cc5d
JH
2851#ifdef EBCDIC
2852 if (has_utf8) {
2853 const U8 ch = (U8)NATIVE_TO_UTF(i);
2854 if (UNI_IS_INVARIANT(ch))
2855 *d++ = (U8)i;
2856 else {
2857 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2858 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2859 }
2860 }
2861 else
2862#endif
2863 *d++ = (char)i;
2864
2865#ifdef EBCDIC
2866 if (uvmax) {
2867 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2868 if (uvmax > 0x101)
2869 *d++ = (char)UTF_TO_NATIVE(0xff);
2870 if (uvmax > 0x100)
2871 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2872 }
2873#endif
02aa26ce
NT
2874
2875 /* mark the range as done, and continue */
79072805 2876 dorange = FALSE;
01ec43d0 2877 didrange = TRUE;
4c3a8340
ST
2878#ifdef EBCDIC
2879 literal_endpoint = 0;
2880#endif
79072805 2881 continue;
4e553d73 2882 }
02aa26ce
NT
2883
2884 /* range begins (ignore - as first or last char) */
79072805 2885 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2886 if (didrange) {
1fafa243 2887 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2888 }
e294cc5d
JH
2889 if (has_utf8
2890#ifdef EBCDIC
2891 && !native_range
2892#endif
2893 ) {
25716404 2894 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2895 s++;
2896 continue;
2897 }
79072805
LW
2898 dorange = TRUE;
2899 s++;
01ec43d0
GS
2900 }
2901 else {
2902 didrange = FALSE;
4c3a8340
ST
2903#ifdef EBCDIC
2904 literal_endpoint = 0;
e294cc5d 2905 native_range = TRUE;
4c3a8340 2906#endif
01ec43d0 2907 }
79072805 2908 }
02aa26ce
NT
2909
2910 /* if we get here, we're not doing a transliteration */
2911
e4a2df84
DM
2912 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2913 char *s1 = s-1;
2914 int esc = 0;
2915 while (s1 >= start && *s1-- == '\\')
2916 esc = !esc;
2917 if (!esc)
2918 in_charclass = TRUE;
2919 }
2866decb 2920
e4a2df84
DM
2921 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2922 char *s1 = s-1;
2923 int esc = 0;
2924 while (s1 >= start && *s1-- == '\\')
2925 esc = !esc;
2926 if (!esc)
2927 in_charclass = FALSE;
2928 }
2866decb 2929
9da1dd8f
DM
2930 /* skip for regexp comments /(?#comment)/, except for the last
2931 * char, which will be done separately.
2932 * Stop on (?{..}) and friends */
2933
3280af22 2934 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2935 if (s[2] == '#') {
e994fd66 2936 while (s+1 < send && *s != ')')
db42d148 2937 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2938 }
2866decb 2939 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2940 ( s[2] == '{' /* This should match regcomp.c */
2941 || (s[2] == '?' && s[3] == '{')))
155aba94 2942 {
9da1dd8f 2943 break;
cc6b7395 2944 }
748a9306 2945 }
02aa26ce
NT
2946
2947 /* likewise skip #-initiated comments in //x patterns */
3280af22 2948 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2949 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2950 while (s+1 < send && *s != '\n')
db42d148 2951 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2952 }
02aa26ce 2953
9da1dd8f
DM
2954 /* no further processing of single-quoted regex */
2955 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2956 goto default_action;
2957
5d1d4326 2958 /* check for embedded arrays
da6eedaa 2959 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2960 */
1749ea0d
ST
2961 else if (*s == '@' && s[1]) {
2962 if (isALNUM_lazy_if(s+1,UTF))
2963 break;
2964 if (strchr(":'{$", s[1]))
2965 break;
2966 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2967 break; /* in regexp, neither @+ nor @- are interpolated */
2968 }
02aa26ce
NT
2969
2970 /* check for embedded scalars. only stop if we're sure it's a
2971 variable.
2972 */
79072805 2973 else if (*s == '$') {
3280af22 2974 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2975 break;
77772344 2976 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2977 if (s[1] == '\\') {
2978 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2979 "Possible unintended interpolation of $\\ in regex");
77772344 2980 }
79072805 2981 break; /* in regexp, $ might be tail anchor */
77772344 2982 }
79072805 2983 }
02aa26ce 2984
2b9d42f0
NIS
2985 /* End of else if chain - OP_TRANS rejoin rest */
2986
02aa26ce 2987 /* backslashes */
79072805 2988 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2989 char* e; /* Can be used for ending '}', etc. */
2990
79072805 2991 s++;
02aa26ce 2992
7d0fc23c
KW
2993 /* warn on \1 - \9 in substitution replacements, but note that \11
2994 * is an octal; and \19 is \1 followed by '9' */
3280af22 2995 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2996 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2997 {
a2a5de95 2998 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2999 *--s = '$';
3000 break;
3001 }
02aa26ce
NT
3002
3003 /* string-change backslash escapes */
838f2281 3004 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3005 --s;
3006 break;
3007 }
ff3f963a
KW
3008 /* In a pattern, process \N, but skip any other backslash escapes.
3009 * This is because we don't want to translate an escape sequence
3010 * into a meta symbol and have the regex compiler use the meta
3011 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3012 * in spite of this, we do have to process \N here while the proper
3013 * charnames handler is in scope. See bugs #56444 and #62056.
3014 * There is a complication because \N in a pattern may also stand
3015 * for 'match a non-nl', and not mean a charname, in which case its
3016 * processing should be deferred to the regex compiler. To be a
3017 * charname it must be followed immediately by a '{', and not look
3018 * like \N followed by a curly quantifier, i.e., not something like
3019 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3020 * quantifier */
3021 else if (PL_lex_inpat
3022 && (*s != 'N'
3023 || s[1] != '{'
3024 || regcurly(s + 1)))
3025 {
cc74c5bd
ST
3026 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3027 goto default_action;
3028 }
02aa26ce 3029
79072805 3030 switch (*s) {
02aa26ce
NT
3031
3032 /* quoted - in transliterations */
79072805 3033 case '-':
3280af22 3034 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3035 *d++ = *s++;
3036 continue;
3037 }
3038 /* FALL THROUGH */
3039 default:
11b8faa4 3040 {
e4ca4584 3041 if ((isALNUMC(*s)))
a2a5de95
NC
3042 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3043 "Unrecognized escape \\%c passed through",
3044 *s);
11b8faa4 3045 /* default action is to copy the quoted character */
f9a63242 3046 goto default_action;
11b8faa4 3047 }
02aa26ce 3048
632403cc 3049 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3050 case '0': case '1': case '2': case '3':
3051 case '4': case '5': case '6': case '7':
ba210ebe 3052 {
53305cf1
NC
3053 I32 flags = 0;
3054 STRLEN len = 3;
77a135fe 3055 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3056 s += len;
3057 }
012bcf8d 3058 goto NUM_ESCAPE_INSERT;
02aa26ce 3059
f0a2b745
KW
3060 /* eg. \o{24} indicates the octal constant \024 */
3061 case 'o':
3062 {
3063 STRLEN len;
454155d9 3064 const char* error;
f0a2b745 3065
454155d9 3066 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3067 s += len;
454155d9 3068 if (! valid) {
f0a2b745
KW
3069 yyerror(error);
3070 continue;
3071 }
3072 goto NUM_ESCAPE_INSERT;
3073 }
3074
77a135fe 3075 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3076 case 'x':
a0481293 3077 {
53305cf1 3078 STRLEN len;
a0481293 3079 const char* error;
355860ce 3080
a0481293
KW
3081 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3082 s += len;
3083 if (! valid) {
3084 yyerror(error);
355860ce 3085 continue;
ba210ebe 3086 }
012bcf8d
GS
3087 }
3088
3089 NUM_ESCAPE_INSERT:
ff3f963a
KW
3090 /* Insert oct or hex escaped character. There will always be
3091 * enough room in sv since such escapes will be longer than any
3092 * UTF-8 sequence they can end up as, except if they force us
3093 * to recode the rest of the string into utf8 */
ba7cea30 3094
77a135fe 3095 /* Here uv is the ordinal of the next character being added in
ff3f963a 3096 * unicode (converted from native). */
77a135fe 3097 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3098 if (!has_utf8 && uv > 255) {
77a135fe
KW
3099 /* Might need to recode whatever we have accumulated so
3100 * far if it contains any chars variant in utf8 or
3101 * utf-ebcdic. */
3102
3103 SvCUR_set(sv, d - SvPVX_const(sv));
3104 SvPOK_on(sv);
3105 *d = '\0';
77a135fe 3106 /* See Note on sizing above. */
7bf79863
KW
3107 sv_utf8_upgrade_flags_grow(sv,
3108 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3109 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3110 d = SvPVX(sv) + SvCUR(sv);
3111 has_utf8 = TRUE;
012bcf8d
GS
3112 }
3113
77a135fe
KW
3114 if (has_utf8) {
3115 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3116 if (PL_lex_inwhat == OP_TRANS &&
3117 PL_sublex_info.sub_op) {
3118 PL_sublex_info.sub_op->op_private |=
3119 (PL_lex_repl ? OPpTRANS_FROM_UTF
3120 : OPpTRANS_TO_UTF);
f9a63242 3121 }
e294cc5d
JH
3122#ifdef EBCDIC
3123 if (uv > 255 && !dorange)
3124 native_range = FALSE;
3125#endif
012bcf8d 3126 }
a0ed51b3 3127 else {
012bcf8d 3128 *d++ = (char)uv;
a0ed51b3 3129 }
012bcf8d
GS
3130 }
3131 else {
c4d5f83a 3132 *d++ = (char) uv;
a0ed51b3 3133 }
79072805 3134 continue;
02aa26ce 3135
4a2d328f 3136 case 'N':
ff3f963a
KW
3137 /* In a non-pattern \N must be a named character, like \N{LATIN
3138 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3139 * mean to match a non-newline. For non-patterns, named
3140 * characters are converted to their string equivalents. In
3141 * patterns, named characters are not converted to their
3142 * ultimate forms for the same reasons that other escapes
3143 * aren't. Instead, they are converted to the \N{U+...} form
3144 * to get the value from the charnames that is in effect right
3145 * now, while preserving the fact that it was a named character
3146 * so that the regex compiler knows this */
3147
3148 /* This section of code doesn't generally use the
3149 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3150 * a close examination of this macro and determined it is a
3151 * no-op except on utfebcdic variant characters. Every
3152 * character generated by this that would normally need to be
3153 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3154 * needed, and would complicate use of copy(). XXX There are
3155 * other parts of this file where the macro is used
3156 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3157
3158 /* The structure of this section of code (besides checking for
3159 * errors and upgrading to utf8) is:
3160 * Further disambiguate between the two meanings of \N, and if
3161 * not a charname, go process it elsewhere
0a96133f
KW
3162 * If of form \N{U+...}, pass it through if a pattern;
3163 * otherwise convert to utf8
3164 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3165 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3166
3167 /* Here, s points to the 'N'; the test below is guaranteed to
3168 * succeed if we are being called on a pattern as we already
3169 * know from a test above that the next character is a '{'.
3170 * On a non-pattern \N must mean 'named sequence, which
3171 * requires braces */
3172 s++;
3173 if (*s != '{') {
3174 yyerror("Missing braces on \\N{}");
3175 continue;
3176 }
3177 s++;
3178
0a96133f 3179 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3180 if (! (e = strchr(s, '}'))) {
3181 if (! PL_lex_inpat) {
5777a3f7 3182 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3183 } else {
3184 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3185 }
0a96133f 3186 continue;
ff3f963a 3187 }
cddc7ef4 3188
ff3f963a 3189 /* Here it looks like a named character */
cddc7ef4 3190
ff3f963a
KW
3191 if (PL_lex_inpat) {
3192
3193 /* XXX This block is temporary code. \N{} implies that the
3194 * pattern is to have Unicode semantics, and therefore
3195 * currently has to be encoded in utf8. By putting it in
3196 * utf8 now, we save a whole pass in the regular expression
3197 * compiler. Once that code is changed so Unicode
3198 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3199 * block should be removed. However, the code that parses
3200 * the output of this would have to be changed to not
3201 * necessarily expect utf8 */
ff3f963a 3202 if (!has_utf8) {
77a135fe 3203 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3204 SvPOK_on(sv);
e4f3eed8 3205 *d = '\0';
77a135fe 3206 /* See Note on sizing above. */
7bf79863 3207 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3208 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3209 /* 5 = '\N{' + cur char + NUL */
3210 (STRLEN)(send - s) + 5);
f08d6ad9 3211 d = SvPVX(sv) + SvCUR(sv);
89491803 3212 has_utf8 = TRUE;
ff3f963a
KW
3213 }
3214 }
423cee85 3215
ff3f963a
KW
3216 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3217 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3218 | PERL_SCAN_DISALLOW_PREFIX;
3219 STRLEN len;
3220
3221 /* For \N{U+...}, the '...' is a unicode value even on
3222 * EBCDIC machines */
3223 s += 2; /* Skip to next char after the 'U+' */
3224 len = e - s;
3225 uv = grok_hex(s, &len, &flags, NULL);
3226 if (len == 0 || len != (STRLEN)(e - s)) {
3227 yyerror("Invalid hexadecimal number in \\N{U+...}");
3228 s = e + 1;
3229 continue;
3230 }
3231
3232 if (PL_lex_inpat) {
3233
e2a7e165
KW
3234 /* On non-EBCDIC platforms, pass through to the regex
3235 * compiler unchanged. The reason we evaluated the
3236 * number above is to make sure there wasn't a syntax
3237 * error. But on EBCDIC we convert to native so
3238 * downstream code can continue to assume it's native
3239 */
ff3f963a 3240 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3241#ifdef EBCDIC
3242 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3243 and the \0 */
3244 "\\N{U+%X}",
3245 (unsigned int) UNI_TO_NATIVE(uv));
3246#else
ff3f963a
KW
3247 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3248 d += e - s + 1;
e2a7e165 3249#endif
ff3f963a
KW
3250 }
3251 else { /* Not a pattern: convert the hex to string */
3252
3253 /* If destination is not in utf8, unconditionally
3254 * recode it to be so. This is because \N{} implies
3255 * Unicode semantics, and scalars have to be in utf8
3256 * to guarantee those semantics */
3257 if (! has_utf8) {
3258 SvCUR_set(sv, d - SvPVX_const(sv));
3259 SvPOK_on(sv);
3260 *d = '\0';
3261 /* See Note on sizing above. */
3262 sv_utf8_upgrade_flags_grow(
3263 sv,
3264 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3265 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3266 d = SvPVX(sv) + SvCUR(sv);
3267 has_utf8 = TRUE;
3268 }
3269
3270 /* Add the string to the output */
3271 if (UNI_IS_INVARIANT(uv)) {
3272 *d++ = (char) uv;
3273 }
3274 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3275 }
3276 }
3277 else { /* Here is \N{NAME} but not \N{U+...}. */
3278
3279 SV *res; /* result from charnames */
3280 const char *str; /* the string in 'res' */
3281 STRLEN len; /* its length */
3282
3283 /* Get the value for NAME */
3284 res = newSVpvn(s, e - s);
3285 res = new_constant( NULL, 0, "charnames",
3286 /* includes all of: \N{...} */
3287 res, NULL, s - 3, e - s + 4 );
3288
3289 /* Most likely res will be in utf8 already since the
3290 * standard charnames uses pack U, but a custom translator
3291 * can leave it otherwise, so make sure. XXX This can be
3292 * revisited to not have charnames use utf8 for characters
3293 * that don't need it when regexes don't have to be in utf8
3294 * for Unicode semantics. If doing so, remember EBCDIC */
3295 sv_utf8_upgrade(res);
3296 str = SvPV_const(res, len);
3297
3298 /* Don't accept malformed input */
3299 if (! is_utf8_string((U8 *) str, len)) {
3300 yyerror("Malformed UTF-8 returned by \\N");
3301 }
3302 else if (PL_lex_inpat) {
3303
3304 if (! len) { /* The name resolved to an empty string */
3305 Copy("\\N{}", d, 4, char);
3306 d += 4;
3307 }
3308 else {
3309 /* In order to not lose information for the regex
3310 * compiler, pass the result in the specially made
3311 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3312 * the code points in hex of each character
3313 * returned by charnames */
3314
3315 const char *str_end = str + len;
3316 STRLEN char_length; /* cur char's byte length */
3317 STRLEN output_length; /* and the number of bytes
3318 after this is translated
3319 into hex digits */
3320 const STRLEN off = d - SvPVX_const(sv);
3321
3322 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3323 * max('U+', '.'); and 1 for NUL */
3324 char hex_string[2 * UTF8_MAXBYTES + 5];
3325
3326 /* Get the first character of the result. */
3327 U32 uv = utf8n_to_uvuni((U8 *) str,
3328 len,
3329 &char_length,
3330 UTF8_ALLOW_ANYUV);
3331
3332 /* The call to is_utf8_string() above hopefully
3333 * guarantees that there won't be an error. But
3334 * it's easy here to make sure. The function just
3335 * above warns and returns 0 if invalid utf8, but
3336 * it can also return 0 if the input is validly a
3337 * NUL. Disambiguate */
3338 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3339 uv = UNICODE_REPLACEMENT;
3340 }
3341
3342 /* Convert first code point to hex, including the
e2a7e165
KW
3343 * boiler plate before it. For all these, we
3344 * convert to native format so that downstream code
3345 * can continue to assume the input is native */
78c35590 3346 output_length =
3353de27 3347 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3348 "\\N{U+%X",
3349 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3350
3351 /* Make sure there is enough space to hold it */
3352 d = off + SvGROW(sv, off
3353 + output_length
3354 + (STRLEN)(send - e)
3355 + 2); /* '}' + NUL */
3356 /* And output it */
3357 Copy(hex_string, d, output_length, char);
3358 d += output_length;
3359
3360 /* For each subsequent character, append dot and
3361 * its ordinal in hex */
3362 while ((str += char_length) < str_end) {
3363 const STRLEN off = d - SvPVX_const(sv);
3364 U32 uv = utf8n_to_uvuni((U8 *) str,
3365 str_end - str,
3366 &char_length,
3367 UTF8_ALLOW_ANYUV);
3368 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3369 uv = UNICODE_REPLACEMENT;
3370 }
3371
78c35590 3372 output_length =
3353de27 3373 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3374 ".%X",
3375 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3376
3377 d = off + SvGROW(sv, off
3378 + output_length
3379 + (STRLEN)(send - e)
3380 + 2); /* '}' + NUL */
3381 Copy(hex_string, d, output_length, char);
3382 d += output_length;
3383 }
3384
3385 *d++ = '}'; /* Done. Add the trailing brace */
3386 }
3387 }
3388 else { /* Here, not in a pattern. Convert the name to a
3389 * string. */
3390
3391 /* If destination is not in utf8, unconditionally
3392 * recode it to be so. This is because \N{} implies
3393 * Unicode semantics, and scalars have to be in utf8
3394 * to guarantee those semantics */
3395 if (! has_utf8) {
3396 SvCUR_set(sv, d - SvPVX_const(sv));
3397 SvPOK_on(sv);
3398 *d = '\0';
3399 /* See Note on sizing above. */
3400 sv_utf8_upgrade_flags_grow(sv,
3401 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3402 len + (STRLEN)(send - s) + 1);
3403 d = SvPVX(sv) + SvCUR(sv);
3404 has_utf8 = TRUE;
3405 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3406
3407 /* See Note on sizing above. (NOTE: SvCUR() is not
3408 * set correctly here). */
3409 const STRLEN off = d - SvPVX_const(sv);
3410 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3411 }
3412 Copy(str, d, len, char);
3413 d += len;
423cee85 3414 }
423cee85 3415 SvREFCNT_dec(res);
cb233ae3
KW
3416
3417 /* Deprecate non-approved name syntax */
3418 if (ckWARN_d(WARN_DEPRECATED)) {
3419 bool problematic = FALSE;
3420 char* i = s;
3421
3422 /* For non-ut8 input, look to see that the first
3423 * character is an alpha, then loop through the rest
3424 * checking that each is a continuation */
3425 if (! this_utf8) {
3426 if (! isALPHAU(*i)) problematic = TRUE;
3427 else for (i = s + 1; i < e; i++) {
3428 if (isCHARNAME_CONT(*i)) continue;
3429 problematic = TRUE;
3430 break;
3431 }
3432 }
3433 else {
3434 /* Similarly for utf8. For invariants can check
3435 * directly. We accept anything above the latin1
3436 * range because it is immaterial to Perl if it is
3437 * correct or not, and is expensive to check. But
3438 * it is fairly easy in the latin1 range to convert
3439 * the variants into a single character and check
3440 * those */
3441 if (UTF8_IS_INVARIANT(*i)) {
3442 if (! isALPHAU(*i)) problematic = TRUE;
3443 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3444 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3445 *(i+1)))))
3446 {
3447 problematic = TRUE;
3448 }
3449 }
3450 if (! problematic) for (i = s + UTF8SKIP(s);
3451 i < e;
3452 i+= UTF8SKIP(i))
3453 {
3454 if (UTF8_IS_INVARIANT(*i)) {
3455 if (isCHARNAME_CONT(*i)) continue;
3456 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3457 continue;
3458 } else if (isCHARNAME_CONT(
3459 UNI_TO_NATIVE(
81c14aa2 3460 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3461 {
3462 continue;
3463 }
3464 problematic = TRUE;
3465 break;
3466 }
3467 }
3468 if (problematic) {
6e1bad6c
KW
3469 /* The e-i passed to the final %.*s makes sure that
3470 * should the trailing NUL be missing that this
3471 * print won't run off the end of the string */
cb233ae3 3472 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3473 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3474 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3475 }
3476 }
3477 } /* End \N{NAME} */
ff3f963a
KW
3478#ifdef EBCDIC
3479 if (!dorange)
3480 native_range = FALSE; /* \N{} is defined to be Unicode */
3481#endif
3482 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3483 continue;
3484
02aa26ce 3485 /* \c is a control character */
79072805
LW
3486 case 'c':
3487 s++;
961ce445 3488 if (s < send) {
17a3df4c 3489 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3490 }
961ce445
RGS
3491 else {
3492 yyerror("Missing control char name in \\c");
3493 }
79072805 3494 continue;
02aa26ce
NT
3495
3496 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3497 case 'b':
db42d148 3498 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3499 break;
3500 case 'n':
db42d148 3501 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3502 break;
3503 case 'r':
db42d148 3504 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3505 break;
3506 case 'f':
db42d148 3507 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3508 break;
3509 case 't':
db42d148 3510 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3511 break;
34a3fe2a 3512 case 'e':
db42d148 3513 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3514 break;
3515 case 'a':
db42d148 3516 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3517 break;
02aa26ce
NT
3518 } /* end switch */
3519
79072805
LW
3520 s++;
3521 continue;
02aa26ce 3522 } /* end if (backslash) */
4c3a8340
ST
3523#ifdef EBCDIC
3524 else
3525 literal_endpoint++;
3526#endif
02aa26ce 3527
f9a63242 3528 default_action:
77a135fe
KW
3529 /* If we started with encoded form, or already know we want it,
3530 then encode the next character */
3531 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3532 STRLEN len = 1;
77a135fe
KW
3533
3534
3535 /* One might think that it is wasted effort in the case of the
3536 * source being utf8 (this_utf8 == TRUE) to take the next character
3537 * in the source, convert it to an unsigned value, and then convert
3538 * it back again. But the source has not been validated here. The
3539 * routine that does the conversion checks for errors like
3540 * malformed utf8 */
3541
5f66b61c
AL
3542 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3543 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3544 if (!has_utf8) {
3545 SvCUR_set(sv, d - SvPVX_const(sv));
3546 SvPOK_on(sv);
3547 *d = '\0';
77a135fe 3548 /* See Note on sizing above. */
7bf79863
KW
3549 sv_utf8_upgrade_flags_grow(sv,
3550 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3551 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3552 d = SvPVX(sv) + SvCUR(sv);
3553 has_utf8 = TRUE;
3554 } else if (need > len) {
3555 /* encoded value larger than old, may need extra space (NOTE:
3556 * SvCUR() is not set correctly here). See Note on sizing
3557 * above. */
9d4ba2ae 3558 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3559 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3560 }
77a135fe
KW
3561 s += len;
3562
5f66b61c 3563 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3564#ifdef EBCDIC
3565 if (uv > 255 && !dorange)
3566 native_range = FALSE;
3567#endif
2b9d42f0
NIS
3568 }
3569 else {
3570 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3571 }
02aa26ce
NT
3572 } /* while loop to process each character */
3573
3574 /* terminate the string and set up the sv */
79072805 3575 *d = '\0';
95a20fc0 3576 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3577 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3578 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3579 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3580
79072805 3581 SvPOK_on(sv);
9f4817db 3582 if (PL_encoding && !has_utf8) {
d0063567
DK
3583 sv_recode_to_utf8(sv, PL_encoding);
3584 if (SvUTF8(sv))
3585 has_utf8 = TRUE;
9f4817db 3586 }
2b9d42f0 3587 if (has_utf8) {
7e2040f0 3588 SvUTF8_on(sv);
2b9d42f0 3589 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3590 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3591 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3592 }
3593 }
79072805 3594
02aa26ce 3595 /* shrink the sv if we allocated more than we used */
79072805 3596 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3597 SvPV_shrink_to_cur(sv);
79072805 3598 }
02aa26ce 3599
6154021b 3600 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3601 if (s > PL_bufptr) {
eb0d8d16
NC
3602 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3603 const char *const key = PL_lex_inpat ? "qr" : "q";
3604 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3605 const char *type;
3606 STRLEN typelen;
3607
3608 if (PL_lex_inwhat == OP_TRANS) {
3609 type = "tr";
3610 typelen = 2;
3611 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3612 type = "s";
3613 typelen = 1;
9da1dd8f
DM
3614 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3615 type = "q";
3616 typelen = 1;
eb0d8d16
NC
3617 } else {
3618 type = "qq";
3619 typelen = 2;
3620 }
3621
3622 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3623 type, typelen);
3624 }
6154021b 3625 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3626 } else
8990e307 3627 SvREFCNT_dec(sv);
79072805
LW
3628 return s;
3629}
3630
ffb4593c
NT
3631/* S_intuit_more
3632 * Returns TRUE if there's more to the expression (e.g., a subscript),
3633 * FALSE otherwise.
ffb4593c
NT
3634 *
3635 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3636 *
3637 * ->[ and ->{ return TRUE
3638 * { and [ outside a pattern are always subscripts, so return TRUE
3639 * if we're outside a pattern and it's not { or [, then return FALSE
3640 * if we're in a pattern and the first char is a {
3641 * {4,5} (any digits around the comma) returns FALSE
3642 * if we're in a pattern and the first char is a [
3643 * [] returns FALSE
3644 * [SOMETHING] has a funky algorithm to decide whether it's a
3645 * character class or not. It has to deal with things like
3646 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3647 * anything else returns TRUE
3648 */
3649
9cbb5ea2
GS
3650/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3651
76e3520e 3652STATIC int
cea2e8a9 3653S_intuit_more(pTHX_ register char *s)
79072805 3654{
97aff369 3655 dVAR;
7918f24d
NC
3656
3657 PERL_ARGS_ASSERT_INTUIT_MORE;
3658
3280af22 3659 if (PL_lex_brackets)
79072805
LW
3660 return TRUE;
3661 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3662 return TRUE;
3663 if (*s != '{' && *s != '[')
3664 return FALSE;
3280af22 3665 if (!PL_lex_inpat)
79072805
LW
3666 return TRUE;
3667
3668 /* In a pattern, so maybe we have {n,m}. */
3669 if (*s == '{') {
b3155d95 3670 if (regcurly(s)) {
79072805 3671 return FALSE;
b3155d95 3672 }
79072805 3673 return TRUE;
79072805
LW
3674 }
3675
3676 /* On the other hand, maybe we have a character class */
3677
3678 s++;
3679 if (*s == ']' || *s == '^')
3680 return FALSE;
3681 else {
ffb4593c 3682 /* this is terrifying, and it works */
79072805
LW
3683 int weight = 2; /* let's weigh the evidence */
3684 char seen[256];
f27ffc4a 3685 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3686 const char * const send = strchr(s,']');
3280af22 3687 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3688
3689 if (!send) /* has to be an expression */
3690 return TRUE;
3691
3692 Zero(seen,256,char);
3693 if (*s == '$')
3694 weight -= 3;
3695 else if (isDIGIT(*s)) {
3696 if (s[1] != ']') {
3697 if (isDIGIT(s[1]) && s[2] == ']')
3698 weight -= 10;
3699 }
3700 else
3701 weight -= 100;
3702 }
3703 for (; s < send; s++) {
3704 last_un_char = un_char;
3705 un_char = (unsigned char)*s;
3706 switch (*s) {
3707 case '@':
3708 case '&':
3709 case '$':
3710 weight -= seen[un_char] * 10;
7e2040f0 3711 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3712 int len;
8903cb82 3713 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3714 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3715 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3716 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3717 weight -= 100;
3718 else
3719 weight -= 10;
3720 }
3721 else if (*s == '$' && s[1] &&
93a17b20
LW
3722 strchr("[#!%*<>()-=",s[1])) {
3723 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3724 weight -= 10;
3725 else
3726 weight -= 1;
3727 }
3728 break;
3729 case '\\':
3730 un_char = 254;
3731 if (s[1]) {
93a17b20 3732 if (strchr("wds]",s[1]))
79072805 3733 weight += 100;
10edeb5d 3734 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3735 weight += 1;
93a17b20 3736 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3737 weight += 40;
3738 else if (isDIGIT(s[1])) {
3739 weight += 40;
3740 while (s[1] && isDIGIT(s[1]))
3741 s++;
3742 }
3743 }
3744 else
3745 weight += 100;
3746 break;
3747 case '-':
3748 if (s[1] == '\\')
3749 weight += 50;
93a17b20 3750 if (strchr("aA01! ",last_un_char))
79072805 3751 weight += 30;<