This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
op.c: newSTATEOP: don’t check PL_parser after using it
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805 150 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
151 *
152 * These values refer to the various states within a sublex parse,
153 * i.e. within a double quotish string
79072805
LW
154 */
155
fb73857a 156/* #define LEX_NOTPARSING 11 is done in perl.h. */
157
b6007c36
DM
158#define LEX_NORMAL 10 /* normal code (ie not within "...") */
159#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
160#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
161#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
162#define LEX_INTERPSTART 6 /* expecting the start of a $var */
163
164 /* at end of code, eg "$x" followed by: */
165#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
166#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
167
168#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
169 string or after \E, $foo, etc */
170#define LEX_INTERPCONST 2 /* NOT USED */
171#define LEX_FORMLINE 1 /* expecting a format line */
172#define LEX_KNOWNEXT 0 /* next token known; just return it */
173
79072805 174
bbf60fe6 175#ifdef DEBUGGING
27da23d5 176static const char* const lex_state_names[] = {
bbf60fe6
DM
177 "KNOWNEXT",
178 "FORMLINE",
179 "INTERPCONST",
180 "INTERPCONCAT",
181 "INTERPENDMAYBE",
182 "INTERPEND",
183 "INTERPSTART",
184 "INTERPPUSH",
185 "INTERPCASEMOD",
186 "INTERPNORMAL",
187 "NORMAL"
188};
189#endif
190
79072805
LW
191#ifdef ff_next
192#undef ff_next
d48672a2
LW
193#endif
194
79072805 195#include "keywords.h"
fe14fcc3 196
ffb4593c
NT
197/* CLINE is a macro that ensures PL_copline has a sane value */
198
ae986130
LW
199#ifdef CLINE
200#undef CLINE
201#endif
57843af0 202#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 203
5db06880 204#ifdef PERL_MAD
29595ff2
NC
205# define SKIPSPACE0(s) skipspace0(s)
206# define SKIPSPACE1(s) skipspace1(s)
207# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
208# define PEEKSPACE(s) skipspace2(s,0)
209#else
210# define SKIPSPACE0(s) skipspace(s)
211# define SKIPSPACE1(s) skipspace(s)
212# define SKIPSPACE2(s,tsv) skipspace(s)
213# define PEEKSPACE(s) skipspace(s)
214#endif
215
ffb4593c
NT
216/*
217 * Convenience functions to return different tokens and prime the
9cbb5ea2 218 * lexer for the next token. They all take an argument.
ffb4593c
NT
219 *
220 * TOKEN : generic token (used for '(', DOLSHARP, etc)
221 * OPERATOR : generic operator
222 * AOPERATOR : assignment operator
223 * PREBLOCK : beginning the block after an if, while, foreach, ...
224 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
225 * PREREF : *EXPR where EXPR is not a simple identifier
226 * TERM : expression term
227 * LOOPX : loop exiting command (goto, last, dump, etc)
228 * FTST : file test operator
229 * FUN0 : zero-argument function
7eb971ee 230 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 231 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
232 * BOop : bitwise or or xor
233 * BAop : bitwise and
234 * SHop : shift operator
235 * PWop : power operator
9cbb5ea2 236 * PMop : pattern-matching operator
ffb4593c
NT
237 * Aop : addition-level operator
238 * Mop : multiplication-level operator
239 * Eop : equality-testing operator
e5edeb50 240 * Rop : relational operator <= != gt
ffb4593c
NT
241 *
242 * Also see LOP and lop() below.
243 */
244
998054bd 245#ifdef DEBUGGING /* Serve -DT. */
704d4215 246# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 247#else
bbf60fe6 248# define REPORT(retval) (retval)
998054bd
SC
249#endif
250
bbf60fe6
DM
251#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
252#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
253#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
254#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
255#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
256#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
257#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
258#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
259#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
260#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 261#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
262#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
263#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
264#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
265#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
266#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
267#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
268#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
269#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
270#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
271#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 272
a687059c
LW
273/* This bit of chicanery makes a unary function followed by
274 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
275 * The UNIDOR macro is for unary functions that can be followed by the //
276 * operator (such as C<shift // 0>).
a687059c 277 */
d68ce4ac 278#define UNI3(f,x,have_x) { \
6154021b 279 pl_yylval.ival = f; \
d68ce4ac 280 if (have_x) PL_expect = x; \
376fcdbf
AL
281 PL_bufptr = s; \
282 PL_last_uni = PL_oldbufptr; \
283 PL_last_lop_op = f; \
284 if (*s == '(') \
285 return REPORT( (int)FUNC1 ); \
29595ff2 286 s = PEEKSPACE(s); \
376fcdbf
AL
287 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
288 }
d68ce4ac
FC
289#define UNI(f) UNI3(f,XTERM,1)
290#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
291#define UNIPROTO(f,optional) { \
292 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
293 OPERATOR(f); \
294 }
a687059c 295
d68ce4ac 296#define UNIBRACK(f) UNI3(f,0,0)
79072805 297
9f68db38 298/* grandfather return to old style */
78cdf107
Z
299#define OLDLOP(f) \
300 do { \
301 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
302 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
303 pl_yylval.ival = (f); \
304 PL_expect = XTERM; \
305 PL_bufptr = s; \
306 return (int)LSTOP; \
307 } while(0)
79072805 308
83944c01
FC
309#define COPLINE_INC_WITH_HERELINES \
310 STMT_START { \
311 CopLINE_inc(PL_curcop); \
d794b522
FC
312 if (PL_parser->lex_shared->herelines) \
313 CopLINE(PL_curcop) += PL_parser->lex_shared->herelines, \
314 PL_parser->lex_shared->herelines = 0; \
83944c01
FC
315 } STMT_END
316
317
8fa7f367
JH
318#ifdef DEBUGGING
319
6154021b 320/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
321enum token_type {
322 TOKENTYPE_NONE,
323 TOKENTYPE_IVAL,
6154021b 324 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 325 TOKENTYPE_PVAL,
aeaef349 326 TOKENTYPE_OPVAL
bbf60fe6
DM
327};
328
6d4a66ac
NC
329static struct debug_tokens {
330 const int token;
331 enum token_type type;
332 const char *name;
333} const debug_tokens[] =
9041c2e3 334{
bbf60fe6
DM
335 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
336 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
337 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
338 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
339 { ARROW, TOKENTYPE_NONE, "ARROW" },
340 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
341 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
342 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
343 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
344 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 345 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
346 { DO, TOKENTYPE_NONE, "DO" },
347 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
348 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
349 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
350 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
351 { ELSE, TOKENTYPE_NONE, "ELSE" },
352 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
353 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
354 { FOR, TOKENTYPE_IVAL, "FOR" },
355 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
356 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
357 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
358 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
359 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 360 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
361 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
362 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
363 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 364 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
365 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
366 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 367 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
368 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
369 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
370 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
371 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
372 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
373 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
374 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
375 { MY, TOKENTYPE_IVAL, "MY" },
376 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
377 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
378 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
379 { OROP, TOKENTYPE_IVAL, "OROP" },
380 { OROR, TOKENTYPE_NONE, "OROR" },
381 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 382 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
383 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
384 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
385 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
386 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
387 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
388 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
389 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
390 { PREINC, TOKENTYPE_NONE, "PREINC" },
391 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 392 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
393 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
394 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 395 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
396 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
397 { SUB, TOKENTYPE_NONE, "SUB" },
398 { THING, TOKENTYPE_OPVAL, "THING" },
399 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
400 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
401 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
402 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
403 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
404 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 405 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
406 { WHILE, TOKENTYPE_IVAL, "WHILE" },
407 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 408 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 409 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
410};
411
6154021b 412/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 413
bbf60fe6 414STATIC int
704d4215 415S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 416{
97aff369 417 dVAR;
7918f24d
NC
418
419 PERL_ARGS_ASSERT_TOKEREPORT;
420
bbf60fe6 421 if (DEBUG_T_TEST) {
bd61b366 422 const char *name = NULL;
bbf60fe6 423 enum token_type type = TOKENTYPE_NONE;
f54cb97a 424 const struct debug_tokens *p;
396482e1 425 SV* const report = newSVpvs("<== ");
bbf60fe6 426
f54cb97a 427 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
428 if (p->token == (int)rv) {
429 name = p->name;
430 type = p->type;
431 break;
432 }
433 }
434 if (name)
54667de8 435 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
436 else if ((char)rv > ' ' && (char)rv < '~')
437 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
438 else if (!rv)
396482e1 439 sv_catpvs(report, "EOF");
bbf60fe6
DM
440 else
441 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
442 switch (type) {
443 case TOKENTYPE_NONE:
bbf60fe6
DM
444 break;
445 case TOKENTYPE_IVAL:
704d4215 446 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_OPNUM:
449 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 450 PL_op_name[lvalp->ival]);
bbf60fe6
DM
451 break;
452 case TOKENTYPE_PVAL:
704d4215 453 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
454 break;
455 case TOKENTYPE_OPVAL:
704d4215 456 if (lvalp->opval) {
401441c0 457 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
458 PL_op_name[lvalp->opval->op_type]);
459 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 460 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 461 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
462 }
463
464 }
401441c0 465 else
396482e1 466 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
467 break;
468 }
b6007c36 469 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
470 };
471 return (int)rv;
998054bd
SC
472}
473
b6007c36
DM
474
475/* print the buffer with suitable escapes */
476
477STATIC void
15f169a1 478S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 479{
396482e1 480 SV* const tmp = newSVpvs("");
7918f24d
NC
481
482 PERL_ARGS_ASSERT_PRINTBUF;
483
b6007c36
DM
484 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
485 SvREFCNT_dec(tmp);
486}
487
8fa7f367
JH
488#endif
489
8290c323
NC
490static int
491S_deprecate_commaless_var_list(pTHX) {
492 PL_expect = XTERM;
493 deprecate("comma-less variable list");
494 return REPORT(','); /* grandfather non-comma-format format */
495}
496
ffb4593c
NT
497/*
498 * S_ao
499 *
c963b151
BD
500 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
501 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
502 */
503
76e3520e 504STATIC int
cea2e8a9 505S_ao(pTHX_ int toketype)
a0d0e21e 506{
97aff369 507 dVAR;
3280af22
NIS
508 if (*PL_bufptr == '=') {
509 PL_bufptr++;
a0d0e21e 510 if (toketype == ANDAND)
6154021b 511 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 512 else if (toketype == OROR)
6154021b 513 pl_yylval.ival = OP_ORASSIGN;
c963b151 514 else if (toketype == DORDOR)
6154021b 515 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
516 toketype = ASSIGNOP;
517 }
518 return toketype;
519}
520
ffb4593c
NT
521/*
522 * S_no_op
523 * When Perl expects an operator and finds something else, no_op
524 * prints the warning. It always prints "<something> found where
525 * operator expected. It prints "Missing semicolon on previous line?"
526 * if the surprise occurs at the start of the line. "do you need to
527 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
528 * where the compiler doesn't know if foo is a method call or a function.
529 * It prints "Missing operator before end of line" if there's nothing
530 * after the missing operator, or "... before <...>" if there is something
531 * after the missing operator.
532 */
533
76e3520e 534STATIC void
15f169a1 535S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 536{
97aff369 537 dVAR;
9d4ba2ae
AL
538 char * const oldbp = PL_bufptr;
539 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 540
7918f24d
NC
541 PERL_ARGS_ASSERT_NO_OP;
542
1189a94a
GS
543 if (!s)
544 s = oldbp;
07c798fb 545 else
1189a94a 546 PL_bufptr = s;
734ab321 547 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
548 if (ckWARN_d(WARN_SYNTAX)) {
549 if (is_first)
550 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551 "\t(Missing semicolon on previous line?)\n");
552 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 553 const char *t;
734ab321
BF
554 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
555 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 556 NOOP;
56da5a46
RGS
557 if (t < PL_bufptr && isSPACE(*t))
558 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
559 "\t(Do you need to predeclare %"SVf"?)\n",
560 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
561 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
562 }
563 else {
564 assert(s >= oldbp);
565 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
566 "\t(Missing operator before %"SVf"?)\n",
567 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
568 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 569 }
07c798fb 570 }
3280af22 571 PL_bufptr = oldbp;
8990e307
LW
572}
573
ffb4593c
NT
574/*
575 * S_missingterm
576 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 577 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
578 * If we're in a delimited string and the delimiter is a control
579 * character, it's reformatted into a two-char sequence like ^C.
580 * This is fatal.
581 */
582
76e3520e 583STATIC void
cea2e8a9 584S_missingterm(pTHX_ char *s)
8990e307 585{
97aff369 586 dVAR;
8990e307
LW
587 char tmpbuf[3];
588 char q;
589 if (s) {
9d4ba2ae 590 char * const nl = strrchr(s,'\n');
d2719217 591 if (nl)
8990e307
LW
592 *nl = '\0';
593 }
463559e7 594 else if (isCNTRL(PL_multi_close)) {
8990e307 595 *tmpbuf = '^';
585ec06d 596 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
597 tmpbuf[2] = '\0';
598 s = tmpbuf;
599 }
600 else {
eb160463 601 *tmpbuf = (char)PL_multi_close;
8990e307
LW
602 tmpbuf[1] = '\0';
603 s = tmpbuf;
604 }
605 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 606 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 607}
79072805 608
dd0ac2b9
FC
609#include "feature.h"
610
0d863452 611/*
0d863452
RH
612 * Check whether the named feature is enabled.
613 */
26ea9e12 614bool
3fff3427 615Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 616{
97aff369 617 dVAR;
4a731d7b 618 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
619
620 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
621
622 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 623
26ea9e12
NC
624 if (namelen > MAX_FEATURE_LEN)
625 return FALSE;
3fff3427 626 memcpy(&he_name[8], name, namelen);
7d69d4a6 627
c8ca97b0
NC
628 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
629 REFCOUNTED_HE_EXISTS));
0d863452
RH
630}
631
ffb4593c 632/*
9cbb5ea2
GS
633 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
634 * utf16-to-utf8-reversed.
ffb4593c
NT
635 */
636
c39cd008
GS
637#ifdef PERL_CR_FILTER
638static void
639strip_return(SV *sv)
640{
eb578fdb
KW
641 const char *s = SvPVX_const(sv);
642 const char * const e = s + SvCUR(sv);
7918f24d
NC
643
644 PERL_ARGS_ASSERT_STRIP_RETURN;
645
c39cd008
GS
646 /* outer loop optimized to do nothing if there are no CR-LFs */
647 while (s < e) {
648 if (*s++ == '\r' && *s == '\n') {
649 /* hit a CR-LF, need to copy the rest */
eb578fdb 650 char *d = s - 1;
c39cd008
GS
651 *d++ = *s++;
652 while (s < e) {
653 if (*s == '\r' && s[1] == '\n')
654 s++;
655 *d++ = *s++;
656 }
657 SvCUR(sv) -= s - d;
658 return;
659 }
660 }
661}
a868473f 662
76e3520e 663STATIC I32
c39cd008 664S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 665{
f54cb97a 666 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
667 if (count > 0 && !maxlen)
668 strip_return(sv);
669 return count;
a868473f
NIS
670}
671#endif
672
ffb4593c 673/*
8eaa0acf
Z
674=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
675
676Creates and initialises a new lexer/parser state object, supplying
677a context in which to lex and parse from a new source of Perl code.
678A pointer to the new state object is placed in L</PL_parser>. An entry
679is made on the save stack so that upon unwinding the new state object
680will be destroyed and the former value of L</PL_parser> will be restored.
681Nothing else need be done to clean up the parsing context.
682
683The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
684non-null, provides a string (in SV form) containing code to be parsed.
685A copy of the string is made, so subsequent modification of I<line>
686does not affect parsing. I<rsfp>, if non-null, provides an input stream
687from which code will be read to be parsed. If both are non-null, the
688code in I<line> comes first and must consist of complete lines of input,
689and I<rsfp> supplies the remainder of the source.
690
e368b3bd
FC
691The I<flags> parameter is reserved for future use. Currently it is only
692used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
693
694=cut
695*/
ffb4593c 696
27fcb6ee 697/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
698 can share filters with the current parser.
699 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
700 caller, hence isn't owned by the parser, so shouldn't be closed on parser
701 destruction. This is used to handle the case of defaulting to reading the
702 script from the standard input because no filename was given on the command
703 line (without getting confused by situation where STDIN has been closed, so
704 the script handle is opened on fd 0) */
27fcb6ee 705
a0d0e21e 706void
8eaa0acf 707Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 708{
97aff369 709 dVAR;
6ef55633 710 const char *s = NULL;
5486870f 711 yy_parser *parser, *oparser;
60d63348 712 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 713 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
714
715 /* create and initialise a parser */
716
199e78b7 717 Newxz(parser, 1, yy_parser);
5486870f 718 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
719 PL_parser = parser;
720
28ac2b49
Z
721 parser->stack = NULL;
722 parser->ps = NULL;
723 parser->stack_size = 0;
acdf0a21 724
e3abe207
DM
725 /* on scope exit, free this parser and restore any outer one */
726 SAVEPARSER(parser);
7c4baf47 727 parser->saved_curcop = PL_curcop;
e3abe207 728
acdf0a21 729 /* initialise lexer state */
8990e307 730
fb205e7a
DM
731#ifdef PERL_MAD
732 parser->curforce = -1;
733#else
734 parser->nexttoke = 0;
735#endif
ca4cfd28 736 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 737 parser->copline = NOLINE;
5afb0a62 738 parser->lex_state = LEX_NORMAL;
c2598295 739 parser->expect = XSTATE;
2f9285f8 740 parser->rsfp = rsfp;
27fcb6ee
FC
741 parser->rsfp_filters =
742 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
743 ? NULL
744 : MUTABLE_AV(SvREFCNT_inc(
745 oparser->rsfp_filters
746 ? oparser->rsfp_filters
747 : (oparser->rsfp_filters = newAV())
748 ));
2f9285f8 749
199e78b7
DM
750 Newx(parser->lex_brackstack, 120, char);
751 Newx(parser->lex_casestack, 12, char);
752 *parser->lex_casestack = '\0';
d794b522 753 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 754
10efb74f 755 if (line) {
0528fd32 756 STRLEN len;
10efb74f 757 s = SvPV_const(line, len);
0abcdfa4
FC
758 parser->linestr = flags & LEX_START_COPIED
759 ? SvREFCNT_inc_simple_NN(line)
760 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 761 if (!len || s[len-1] != ';')
bdc0bf6f 762 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
763 } else {
764 parser->linestr = newSVpvs("\n;");
8990e307 765 }
f06b5848
DM
766 parser->oldoldbufptr =
767 parser->oldbufptr =
768 parser->bufptr =
769 parser->linestart = SvPVX(parser->linestr);
770 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
771 parser->last_lop = parser->last_uni = NULL;
87606032
NC
772 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
773 |LEX_DONT_CLOSE_RSFP);
737c24fc 774
60d63348 775 parser->in_pod = parser->filtered = 0;
79072805 776}
a687059c 777
e3abe207
DM
778
779/* delete a parser object */
780
781void
782Perl_parser_free(pTHX_ const yy_parser *parser)
783{
7918f24d
NC
784 PERL_ARGS_ASSERT_PARSER_FREE;
785
7c4baf47 786 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
787 SvREFCNT_dec(parser->linestr);
788
87606032 789 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 790 PerlIO_clearerr(parser->rsfp);
799361c3
SH
791 else if (parser->rsfp && (!parser->old_parser ||
792 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 793 PerlIO_close(parser->rsfp);
5486870f 794 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 795
e3abe207
DM
796 Safefree(parser->lex_brackstack);
797 Safefree(parser->lex_casestack);
d794b522 798 Safefree(parser->lex_shared);
e3abe207
DM
799 PL_parser = parser->old_parser;
800 Safefree(parser);
801}
802
803
ffb4593c 804/*
f0e67a1d
Z
805=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
806
807Buffer scalar containing the chunk currently under consideration of the
808text currently being lexed. This is always a plain string scalar (for
809which C<SvPOK> is true). It is not intended to be used as a scalar by
810normal scalar means; instead refer to the buffer directly by the pointer
811variables described below.
812
813The lexer maintains various C<char*> pointers to things in the
814C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
815reallocated, all of these pointers must be updated. Don't attempt to
816do this manually, but rather use L</lex_grow_linestr> if you need to
817reallocate the buffer.
818
819The content of the text chunk in the buffer is commonly exactly one
820complete line of input, up to and including a newline terminator,
821but there are situations where it is otherwise. The octets of the
822buffer may be intended to be interpreted as either UTF-8 or Latin-1.
823The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
824flag on this scalar, which may disagree with it.
825
826For direct examination of the buffer, the variable
827L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
828lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
829of these pointers is usually preferable to examination of the scalar
830through normal scalar means.
831
832=for apidoc AmxU|char *|PL_parser-E<gt>bufend
833
834Direct pointer to the end of the chunk of text currently being lexed, the
835end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
836+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
837always located at the end of the buffer, and does not count as part of
838the buffer's contents.
839
840=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
841
842Points to the current position of lexing inside the lexer buffer.
843Characters around this point may be freely examined, within
844the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
845L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
846interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
847
848Lexing code (whether in the Perl core or not) moves this pointer past
849the characters that it consumes. It is also expected to perform some
850bookkeeping whenever a newline character is consumed. This movement
851can be more conveniently performed by the function L</lex_read_to>,
852which handles newlines appropriately.
853
854Interpretation of the buffer's octets can be abstracted out by
855using the slightly higher-level functions L</lex_peek_unichar> and
856L</lex_read_unichar>.
857
858=for apidoc AmxU|char *|PL_parser-E<gt>linestart
859
860Points to the start of the current line inside the lexer buffer.
861This is useful for indicating at which column an error occurred, and
862not much else. This must be updated by any lexing code that consumes
863a newline; the function L</lex_read_to> handles this detail.
864
865=cut
866*/
867
868/*
869=for apidoc Amx|bool|lex_bufutf8
870
871Indicates whether the octets in the lexer buffer
872(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
873of Unicode characters. If not, they should be interpreted as Latin-1
874characters. This is analogous to the C<SvUTF8> flag for scalars.
875
876In UTF-8 mode, it is not guaranteed that the lexer buffer actually
877contains valid UTF-8. Lexing code must be robust in the face of invalid
878encoding.
879
880The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
881is significant, but not the whole story regarding the input character
882encoding. Normally, when a file is being read, the scalar contains octets
883and its C<SvUTF8> flag is off, but the octets should be interpreted as
884UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
885however, the scalar may have the C<SvUTF8> flag on, and in this case its
886octets should be interpreted as UTF-8 unless the C<use bytes> pragma
887is in effect. This logic may change in the future; use this function
888instead of implementing the logic yourself.
889
890=cut
891*/
892
893bool
894Perl_lex_bufutf8(pTHX)
895{
896 return UTF;
897}
898
899/*
900=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
901
902Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
903at least I<len> octets (including terminating NUL). Returns a
904pointer to the reallocated buffer. This is necessary before making
905any direct modification of the buffer that would increase its length.
906L</lex_stuff_pvn> provides a more convenient way to insert text into
907the buffer.
908
909Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
910this function updates all of the lexer's variables that point directly
911into the buffer.
912
913=cut
914*/
915
916char *
917Perl_lex_grow_linestr(pTHX_ STRLEN len)
918{
919 SV *linestr;
920 char *buf;
921 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 922 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
923 linestr = PL_parser->linestr;
924 buf = SvPVX(linestr);
925 if (len <= SvLEN(linestr))
926 return buf;
927 bufend_pos = PL_parser->bufend - buf;
928 bufptr_pos = PL_parser->bufptr - buf;
929 oldbufptr_pos = PL_parser->oldbufptr - buf;
930 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
931 linestart_pos = PL_parser->linestart - buf;
932 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
933 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
c7641931
DM
934 re_eval_start_pos = PL_sublex_info.re_eval_start ?
935 PL_sublex_info.re_eval_start - buf : 0;
936
f0e67a1d 937 buf = sv_grow(linestr, len);
c7641931 938
f0e67a1d
Z
939 PL_parser->bufend = buf + bufend_pos;
940 PL_parser->bufptr = buf + bufptr_pos;
941 PL_parser->oldbufptr = buf + oldbufptr_pos;
942 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
943 PL_parser->linestart = buf + linestart_pos;
944 if (PL_parser->last_uni)
945 PL_parser->last_uni = buf + last_uni_pos;
946 if (PL_parser->last_lop)
947 PL_parser->last_lop = buf + last_lop_pos;
c7641931
DM
948 if (PL_sublex_info.re_eval_start)
949 PL_sublex_info.re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
950 return buf;
951}
952
953/*
83aa740e 954=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
955
956Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
957immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
958reallocating the buffer if necessary. This means that lexing code that
959runs later will see the characters as if they had appeared in the input.
960It is not recommended to do this as part of normal parsing, and most
961uses of this facility run the risk of the inserted characters being
962interpreted in an unintended manner.
963
964The string to be inserted is represented by I<len> octets starting
965at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
966according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
967The characters are recoded for the lexer buffer, according to how the
968buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 969to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
970function is more convenient.
971
972=cut
973*/
974
975void
83aa740e 976Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 977{
749123ff 978 dVAR;
f0e67a1d
Z
979 char *bufptr;
980 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
981 if (flags & ~(LEX_STUFF_UTF8))
982 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
983 if (UTF) {
984 if (flags & LEX_STUFF_UTF8) {
985 goto plain_copy;
986 } else {
987 STRLEN highhalf = 0;
83aa740e 988 const char *p, *e = pv+len;
f0e67a1d
Z
989 for (p = pv; p != e; p++)
990 highhalf += !!(((U8)*p) & 0x80);
991 if (!highhalf)
992 goto plain_copy;
993 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
994 bufptr = PL_parser->bufptr;
995 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
996 SvCUR_set(PL_parser->linestr,
997 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
998 PL_parser->bufend += len+highhalf;
999 for (p = pv; p != e; p++) {
1000 U8 c = (U8)*p;
1001 if (c & 0x80) {
1002 *bufptr++ = (char)(0xc0 | (c >> 6));
1003 *bufptr++ = (char)(0x80 | (c & 0x3f));
1004 } else {
1005 *bufptr++ = (char)c;
1006 }
1007 }
1008 }
1009 } else {
1010 if (flags & LEX_STUFF_UTF8) {
1011 STRLEN highhalf = 0;
83aa740e 1012 const char *p, *e = pv+len;
f0e67a1d
Z
1013 for (p = pv; p != e; p++) {
1014 U8 c = (U8)*p;
1015 if (c >= 0xc4) {
1016 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1017 "non-Latin-1 character into Latin-1 input");
1018 } else if (c >= 0xc2 && p+1 != e &&
1019 (((U8)p[1]) & 0xc0) == 0x80) {
1020 p++;
1021 highhalf++;
1022 } else if (c >= 0x80) {
1023 /* malformed UTF-8 */
1024 ENTER;
1025 SAVESPTR(PL_warnhook);
1026 PL_warnhook = PERL_WARNHOOK_FATAL;
1027 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1028 LEAVE;
1029 }
1030 }
1031 if (!highhalf)
1032 goto plain_copy;
1033 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1034 bufptr = PL_parser->bufptr;
1035 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1036 SvCUR_set(PL_parser->linestr,
1037 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1038 PL_parser->bufend += len-highhalf;
1039 for (p = pv; p != e; p++) {
1040 U8 c = (U8)*p;
1041 if (c & 0x80) {
1042 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1043 p++;
1044 } else {
1045 *bufptr++ = (char)c;
1046 }
1047 }
1048 } else {
1049 plain_copy:
1050 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1051 bufptr = PL_parser->bufptr;
1052 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1053 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1054 PL_parser->bufend += len;
1055 Copy(pv, bufptr, len, char);
1056 }
1057 }
1058}
1059
1060/*
9dcc53ea
Z
1061=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1062
1063Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1064immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1065reallocating the buffer if necessary. This means that lexing code that
1066runs later will see the characters as if they had appeared in the input.
1067It is not recommended to do this as part of normal parsing, and most
1068uses of this facility run the risk of the inserted characters being
1069interpreted in an unintended manner.
1070
1071The string to be inserted is represented by octets starting at I<pv>
1072and continuing to the first nul. These octets are interpreted as either
1073UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1074in I<flags>. The characters are recoded for the lexer buffer, according
1075to how the buffer is currently being interpreted (L</lex_bufutf8>).
1076If it is not convenient to nul-terminate a string to be inserted, the
1077L</lex_stuff_pvn> function is more appropriate.
1078
1079=cut
1080*/
1081
1082void
1083Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1084{
1085 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1086 lex_stuff_pvn(pv, strlen(pv), flags);
1087}
1088
1089/*
f0e67a1d
Z
1090=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1091
1092Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1093immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1094reallocating the buffer if necessary. This means that lexing code that
1095runs later will see the characters as if they had appeared in the input.
1096It is not recommended to do this as part of normal parsing, and most
1097uses of this facility run the risk of the inserted characters being
1098interpreted in an unintended manner.
1099
1100The string to be inserted is the string value of I<sv>. The characters
1101are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1102being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1103not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1104need to construct a scalar.
1105
1106=cut
1107*/
1108
1109void
1110Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1111{
1112 char *pv;
1113 STRLEN len;
1114 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1115 if (flags)
1116 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1117 pv = SvPV(sv, len);
1118 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1119}
1120
1121/*
1122=for apidoc Amx|void|lex_unstuff|char *ptr
1123
1124Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1125I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1126This hides the discarded text from any lexing code that runs later,
1127as if the text had never appeared.
1128
1129This is not the normal way to consume lexed text. For that, use
1130L</lex_read_to>.
1131
1132=cut
1133*/
1134
1135void
1136Perl_lex_unstuff(pTHX_ char *ptr)
1137{
1138 char *buf, *bufend;
1139 STRLEN unstuff_len;
1140 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1141 buf = PL_parser->bufptr;
1142 if (ptr < buf)
1143 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1144 if (ptr == buf)
1145 return;
1146 bufend = PL_parser->bufend;
1147 if (ptr > bufend)
1148 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1149 unstuff_len = ptr - buf;
1150 Move(ptr, buf, bufend+1-ptr, char);
1151 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1152 PL_parser->bufend = bufend - unstuff_len;
1153}
1154
1155/*
1156=for apidoc Amx|void|lex_read_to|char *ptr
1157
1158Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1159to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1160performing the correct bookkeeping whenever a newline character is passed.
1161This is the normal way to consume lexed text.
1162
1163Interpretation of the buffer's octets can be abstracted out by
1164using the slightly higher-level functions L</lex_peek_unichar> and
1165L</lex_read_unichar>.
1166
1167=cut
1168*/
1169
1170void
1171Perl_lex_read_to(pTHX_ char *ptr)
1172{
1173 char *s;
1174 PERL_ARGS_ASSERT_LEX_READ_TO;
1175 s = PL_parser->bufptr;
1176 if (ptr < s || ptr > PL_parser->bufend)
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1178 for (; s != ptr; s++)
1179 if (*s == '\n') {
83944c01 1180 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1181 PL_parser->linestart = s+1;
1182 }
1183 PL_parser->bufptr = ptr;
1184}
1185
1186/*
1187=for apidoc Amx|void|lex_discard_to|char *ptr
1188
1189Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1190up to I<ptr>. The remaining content of the buffer will be moved, and
1191all pointers into the buffer updated appropriately. I<ptr> must not
1192be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1193it is not permitted to discard text that has yet to be lexed.
1194
1195Normally it is not necessarily to do this directly, because it suffices to
1196use the implicit discarding behaviour of L</lex_next_chunk> and things
1197based on it. However, if a token stretches across multiple lines,
1f317c95 1198and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1199that purpose, then after completion of the token it would be wise to
1200explicitly discard the now-unneeded earlier lines, to avoid future
1201multi-line tokens growing the buffer without bound.
1202
1203=cut
1204*/
1205
1206void
1207Perl_lex_discard_to(pTHX_ char *ptr)
1208{
1209 char *buf;
1210 STRLEN discard_len;
1211 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1212 buf = SvPVX(PL_parser->linestr);
1213 if (ptr < buf)
1214 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1215 if (ptr == buf)
1216 return;
1217 if (ptr > PL_parser->bufptr)
1218 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1219 discard_len = ptr - buf;
1220 if (PL_parser->oldbufptr < ptr)
1221 PL_parser->oldbufptr = ptr;
1222 if (PL_parser->oldoldbufptr < ptr)
1223 PL_parser->oldoldbufptr = ptr;
1224 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1225 PL_parser->last_uni = NULL;
1226 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1227 PL_parser->last_lop = NULL;
1228 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1229 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1230 PL_parser->bufend -= discard_len;
1231 PL_parser->bufptr -= discard_len;
1232 PL_parser->oldbufptr -= discard_len;
1233 PL_parser->oldoldbufptr -= discard_len;
1234 if (PL_parser->last_uni)
1235 PL_parser->last_uni -= discard_len;
1236 if (PL_parser->last_lop)
1237 PL_parser->last_lop -= discard_len;
1238}
1239
1240/*
1241=for apidoc Amx|bool|lex_next_chunk|U32 flags
1242
1243Reads in the next chunk of text to be lexed, appending it to
1244L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1245looked to the end of the current chunk and wants to know more. It is
1246usual, but not necessary, for lexing to have consumed the entirety of
1247the current chunk at this time.
1248
1249If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1250chunk (i.e., the current chunk has been entirely consumed), normally the
1251current chunk will be discarded at the same time that the new chunk is
1252read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1253will not be discarded. If the current chunk has not been entirely
1254consumed, then it will not be discarded regardless of the flag.
1255
1256Returns true if some new text was added to the buffer, or false if the
1257buffer has reached the end of the input text.
1258
1259=cut
1260*/
1261
1262#define LEX_FAKE_EOF 0x80000000
112d1284 1263#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1264
1265bool
1266Perl_lex_next_chunk(pTHX_ U32 flags)
1267{
1268 SV *linestr;
1269 char *buf;
1270 STRLEN old_bufend_pos, new_bufend_pos;
1271 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1272 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1273 bool got_some_for_debugger = 0;
f0e67a1d 1274 bool got_some;
112d1284 1275 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1276 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1277 linestr = PL_parser->linestr;
1278 buf = SvPVX(linestr);
1279 if (!(flags & LEX_KEEP_PREVIOUS) &&
1280 PL_parser->bufptr == PL_parser->bufend) {
1281 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1282 linestart_pos = 0;
1283 if (PL_parser->last_uni != PL_parser->bufend)
1284 PL_parser->last_uni = NULL;
1285 if (PL_parser->last_lop != PL_parser->bufend)
1286 PL_parser->last_lop = NULL;
1287 last_uni_pos = last_lop_pos = 0;
1288 *buf = 0;
1289 SvCUR(linestr) = 0;
1290 } else {
1291 old_bufend_pos = PL_parser->bufend - buf;
1292 bufptr_pos = PL_parser->bufptr - buf;
1293 oldbufptr_pos = PL_parser->oldbufptr - buf;
1294 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1295 linestart_pos = PL_parser->linestart - buf;
1296 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1297 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1298 }
1299 if (flags & LEX_FAKE_EOF) {
1300 goto eof;
60d63348 1301 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1302 got_some = 0;
1303 } else if (filter_gets(linestr, old_bufend_pos)) {
1304 got_some = 1;
17cc9359 1305 got_some_for_debugger = 1;
112d1284
FC
1306 } else if (flags & LEX_NO_TERM) {
1307 got_some = 0;
f0e67a1d 1308 } else {
580561a3
Z
1309 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1310 sv_setpvs(linestr, "");
f0e67a1d
Z
1311 eof:
1312 /* End of real input. Close filehandle (unless it was STDIN),
1313 * then add implicit termination.
1314 */
87606032 1315 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1316 PerlIO_clearerr(PL_parser->rsfp);
1317 else if (PL_parser->rsfp)
1318 (void)PerlIO_close(PL_parser->rsfp);
1319 PL_parser->rsfp = NULL;
60d63348 1320 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1321#ifdef PERL_MAD
1322 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1323 PL_faketokens = 1;
1324#endif
1325 if (!PL_in_eval && PL_minus_p) {
1326 sv_catpvs(linestr,
1327 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1328 PL_minus_n = PL_minus_p = 0;
1329 } else if (!PL_in_eval && PL_minus_n) {
1330 sv_catpvs(linestr, /*{*/";}");
1331 PL_minus_n = 0;
1332 } else
1333 sv_catpvs(linestr, ";");
1334 got_some = 1;
1335 }
1336 buf = SvPVX(linestr);
1337 new_bufend_pos = SvCUR(linestr);
1338 PL_parser->bufend = buf + new_bufend_pos;
1339 PL_parser->bufptr = buf + bufptr_pos;
1340 PL_parser->oldbufptr = buf + oldbufptr_pos;
1341 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1342 PL_parser->linestart = buf + linestart_pos;
1343 if (PL_parser->last_uni)
1344 PL_parser->last_uni = buf + last_uni_pos;
1345 if (PL_parser->last_lop)
1346 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1347 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1348 PL_curstash != PL_debstash) {
1349 /* debugger active and we're not compiling the debugger code,
1350 * so store the line into the debugger's array of lines
1351 */
1352 update_debugger_info(NULL, buf+old_bufend_pos,
1353 new_bufend_pos-old_bufend_pos);
1354 }
1355 return got_some;
1356}
1357
1358/*
1359=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1360
1361Looks ahead one (Unicode) character in the text currently being lexed.
1362Returns the codepoint (unsigned integer value) of the next character,
1363or -1 if lexing has reached the end of the input text. To consume the
1364peeked character, use L</lex_read_unichar>.
1365
1366If the next character is in (or extends into) the next chunk of input
1367text, the next chunk will be read in. Normally the current chunk will be
1368discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1369then the current chunk will not be discarded.
1370
1371If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1372is encountered, an exception is generated.
1373
1374=cut
1375*/
1376
1377I32
1378Perl_lex_peek_unichar(pTHX_ U32 flags)
1379{
749123ff 1380 dVAR;
f0e67a1d
Z
1381 char *s, *bufend;
1382 if (flags & ~(LEX_KEEP_PREVIOUS))
1383 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1384 s = PL_parser->bufptr;
1385 bufend = PL_parser->bufend;
1386 if (UTF) {
1387 U8 head;
1388 I32 unichar;
1389 STRLEN len, retlen;
1390 if (s == bufend) {
1391 if (!lex_next_chunk(flags))
1392 return -1;
1393 s = PL_parser->bufptr;
1394 bufend = PL_parser->bufend;
1395 }
1396 head = (U8)*s;
1397 if (!(head & 0x80))
1398 return head;
1399 if (head & 0x40) {
1400 len = PL_utf8skip[head];
1401 while ((STRLEN)(bufend-s) < len) {
1402 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1403 break;
1404 s = PL_parser->bufptr;
1405 bufend = PL_parser->bufend;
1406 }
1407 }
1408 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1409 if (retlen == (STRLEN)-1) {
1410 /* malformed UTF-8 */
1411 ENTER;
1412 SAVESPTR(PL_warnhook);
1413 PL_warnhook = PERL_WARNHOOK_FATAL;
1414 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1415 LEAVE;
1416 }
1417 return unichar;
1418 } else {
1419 if (s == bufend) {
1420 if (!lex_next_chunk(flags))
1421 return -1;
1422 s = PL_parser->bufptr;
1423 }
1424 return (U8)*s;
1425 }
1426}
1427
1428/*
1429=for apidoc Amx|I32|lex_read_unichar|U32 flags
1430
1431Reads the next (Unicode) character in the text currently being lexed.
1432Returns the codepoint (unsigned integer value) of the character read,
1433and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1434if lexing has reached the end of the input text. To non-destructively
1435examine the next character, use L</lex_peek_unichar> instead.
1436
1437If the next character is in (or extends into) the next chunk of input
1438text, the next chunk will be read in. Normally the current chunk will be
1439discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1440then the current chunk will not be discarded.
1441
1442If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1443is encountered, an exception is generated.
1444
1445=cut
1446*/
1447
1448I32
1449Perl_lex_read_unichar(pTHX_ U32 flags)
1450{
1451 I32 c;
1452 if (flags & ~(LEX_KEEP_PREVIOUS))
1453 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1454 c = lex_peek_unichar(flags);
1455 if (c != -1) {
1456 if (c == '\n')
83944c01 1457 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1458 if (UTF)
1459 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1460 else
1461 ++(PL_parser->bufptr);
f0e67a1d
Z
1462 }
1463 return c;
1464}
1465
1466/*
1467=for apidoc Amx|void|lex_read_space|U32 flags
1468
1469Reads optional spaces, in Perl style, in the text currently being
1470lexed. The spaces may include ordinary whitespace characters and
1471Perl-style comments. C<#line> directives are processed if encountered.
1472L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1473at a non-space character (or the end of the input text).
1474
1475If spaces extend into the next chunk of input text, the next chunk will
1476be read in. Normally the current chunk will be discarded at the same
1477time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1478chunk will not be discarded.
1479
1480=cut
1481*/
1482
f0998909
Z
1483#define LEX_NO_NEXT_CHUNK 0x80000000
1484
f0e67a1d
Z
1485void
1486Perl_lex_read_space(pTHX_ U32 flags)
1487{
1488 char *s, *bufend;
1489 bool need_incline = 0;
f0998909 1490 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1491 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1492#ifdef PERL_MAD
1493 if (PL_skipwhite) {
1494 sv_free(PL_skipwhite);
1495 PL_skipwhite = NULL;
1496 }
1497 if (PL_madskills)
1498 PL_skipwhite = newSVpvs("");
1499#endif /* PERL_MAD */
1500 s = PL_parser->bufptr;
1501 bufend = PL_parser->bufend;
1502 while (1) {
1503 char c = *s;
1504 if (c == '#') {
1505 do {
1506 c = *++s;
1507 } while (!(c == '\n' || (c == 0 && s == bufend)));
1508 } else if (c == '\n') {
1509 s++;
1510 PL_parser->linestart = s;
1511 if (s == bufend)
1512 need_incline = 1;
1513 else
1514 incline(s);
1515 } else if (isSPACE(c)) {
1516 s++;
1517 } else if (c == 0 && s == bufend) {
1518 bool got_more;
1519#ifdef PERL_MAD
1520 if (PL_madskills)
1521 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1522#endif /* PERL_MAD */
f0998909
Z
1523 if (flags & LEX_NO_NEXT_CHUNK)
1524 break;
f0e67a1d 1525 PL_parser->bufptr = s;
83944c01 1526 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1527 got_more = lex_next_chunk(flags);
1528 CopLINE_dec(PL_curcop);
1529 s = PL_parser->bufptr;
1530 bufend = PL_parser->bufend;
1531 if (!got_more)
1532 break;
1533 if (need_incline && PL_parser->rsfp) {
1534 incline(s);
1535 need_incline = 0;
1536 }
1537 } else {
1538 break;
1539 }
1540 }
1541#ifdef PERL_MAD
1542 if (PL_madskills)
1543 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1544#endif /* PERL_MAD */
1545 PL_parser->bufptr = s;
1546}
1547
1548/*
ffb4593c
NT
1549 * S_incline
1550 * This subroutine has nothing to do with tilting, whether at windmills
1551 * or pinball tables. Its name is short for "increment line". It
57843af0 1552 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1553 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1554 * # line 500 "foo.pm"
1555 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1556 */
1557
76e3520e 1558STATIC void
d9095cec 1559S_incline(pTHX_ const char *s)
463ee0b2 1560{
97aff369 1561 dVAR;
d9095cec
NC
1562 const char *t;
1563 const char *n;
1564 const char *e;
8818d409 1565 line_t line_num;
463ee0b2 1566
7918f24d
NC
1567 PERL_ARGS_ASSERT_INCLINE;
1568
83944c01 1569 COPLINE_INC_WITH_HERELINES;
463ee0b2
LW
1570 if (*s++ != '#')
1571 return;
d4c19fe8
AL
1572 while (SPACE_OR_TAB(*s))
1573 s++;
73659bf1
GS
1574 if (strnEQ(s, "line", 4))
1575 s += 4;
1576 else
1577 return;
084592ab 1578 if (SPACE_OR_TAB(*s))
73659bf1 1579 s++;
4e553d73 1580 else
73659bf1 1581 return;
d4c19fe8
AL
1582 while (SPACE_OR_TAB(*s))
1583 s++;
463ee0b2
LW
1584 if (!isDIGIT(*s))
1585 return;
d4c19fe8 1586
463ee0b2
LW
1587 n = s;
1588 while (isDIGIT(*s))
1589 s++;
07714eb4 1590 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1591 return;
bf4acbe4 1592 while (SPACE_OR_TAB(*s))
463ee0b2 1593 s++;
73659bf1 1594 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1595 s++;
73659bf1
GS
1596 e = t + 1;
1597 }
463ee0b2 1598 else {
c35e046a
AL
1599 t = s;
1600 while (!isSPACE(*t))
1601 t++;
73659bf1 1602 e = t;
463ee0b2 1603 }
bf4acbe4 1604 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1605 e++;
1606 if (*e != '\n' && *e != '\0')
1607 return; /* false alarm */
1608
8818d409
FC
1609 line_num = atoi(n)-1;
1610
f4dd75d9 1611 if (t - s > 0) {
d9095cec 1612 const STRLEN len = t - s;
19bad673
NC
1613 SV *const temp_sv = CopFILESV(PL_curcop);
1614 const char *cf;
1615 STRLEN tmplen;
1616
1617 if (temp_sv) {
1618 cf = SvPVX(temp_sv);
1619 tmplen = SvCUR(temp_sv);
1620 } else {
1621 cf = NULL;
1622 tmplen = 0;
1623 }
1624
d1299d44 1625 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1626 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1627 * to *{"::_<newfilename"} */
44867030
NC
1628 /* However, the long form of evals is only turned on by the
1629 debugger - usually they're "(eval %lu)" */
1630 char smallbuf[128];
1631 char *tmpbuf;
1632 GV **gvp;
d9095cec 1633 STRLEN tmplen2 = len;
798b63bc 1634 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1635 tmpbuf = smallbuf;
1636 else
2ae0db35 1637 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1638 tmpbuf[0] = '_';
1639 tmpbuf[1] = '<';
2ae0db35 1640 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1641 tmplen += 2;
8a5ee598
RGS
1642 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1643 if (gvp) {
44867030
NC
1644 char *tmpbuf2;
1645 GV *gv2;
1646
1647 if (tmplen2 + 2 <= sizeof smallbuf)
1648 tmpbuf2 = smallbuf;
1649 else
1650 Newx(tmpbuf2, tmplen2 + 2, char);
1651
1652 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1653 /* Either they malloc'd it, or we malloc'd it,
1654 so no prefix is present in ours. */
1655 tmpbuf2[0] = '_';
1656 tmpbuf2[1] = '<';
1657 }
1658
1659 memcpy(tmpbuf2 + 2, s, tmplen2);
1660 tmplen2 += 2;
1661
8a5ee598 1662 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1663 if (!isGV(gv2)) {
8a5ee598 1664 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1665 /* adjust ${"::_<newfilename"} to store the new file name */
1666 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1667 /* The line number may differ. If that is the case,
1668 alias the saved lines that are in the array.
1669 Otherwise alias the whole array. */
1670 if (CopLINE(PL_curcop) == line_num) {
1671 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1672 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1673 }
1674 else if (GvAV(*gvp)) {
1675 AV * const av = GvAV(*gvp);
1676 const I32 start = CopLINE(PL_curcop)+1;
1677 I32 items = AvFILLp(av) - start;
1678 if (items > 0) {
1679 AV * const av2 = GvAVn(gv2);
1680 SV **svp = AvARRAY(av) + start;
1681 I32 l = (I32)line_num+1;
1682 while (items--)
1683 av_store(av2, l++, SvREFCNT_inc(*svp++));
1684 }
1685 }
e5527e4b 1686 }
44867030
NC
1687
1688 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1689 }
e66cf94c 1690 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1691 }
05ec9bb3 1692 CopFILE_free(PL_curcop);
d9095cec 1693 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1694 }
8818d409 1695 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1696}
1697
29595ff2 1698#ifdef PERL_MAD
cd81e915 1699/* skip space before PL_thistoken */
29595ff2
NC
1700
1701STATIC char *
1702S_skipspace0(pTHX_ register char *s)
1703{
7918f24d
NC
1704 PERL_ARGS_ASSERT_SKIPSPACE0;
1705
29595ff2
NC
1706 s = skipspace(s);
1707 if (!PL_madskills)
1708 return s;
cd81e915
NC
1709 if (PL_skipwhite) {
1710 if (!PL_thiswhite)
6b29d1f5 1711 PL_thiswhite = newSVpvs("");
cd81e915
NC
1712 sv_catsv(PL_thiswhite, PL_skipwhite);
1713 sv_free(PL_skipwhite);
1714 PL_skipwhite = 0;
1715 }
1716 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1717 return s;
1718}
1719
cd81e915 1720/* skip space after PL_thistoken */
29595ff2
NC
1721
1722STATIC char *
1723S_skipspace1(pTHX_ register char *s)
1724{
d4c19fe8 1725 const char *start = s;
29595ff2
NC
1726 I32 startoff = start - SvPVX(PL_linestr);
1727
7918f24d
NC
1728 PERL_ARGS_ASSERT_SKIPSPACE1;
1729
29595ff2
NC
1730 s = skipspace(s);
1731 if (!PL_madskills)
1732 return s;
1733 start = SvPVX(PL_linestr) + startoff;
cd81e915 1734 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1735 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1736 PL_thistoken = newSVpvn(tstart, start - tstart);
1737 }
1738 PL_realtokenstart = -1;
1739 if (PL_skipwhite) {
1740 if (!PL_nextwhite)
6b29d1f5 1741 PL_nextwhite = newSVpvs("");
cd81e915
NC
1742 sv_catsv(PL_nextwhite, PL_skipwhite);
1743 sv_free(PL_skipwhite);
1744 PL_skipwhite = 0;
29595ff2
NC
1745 }
1746 return s;
1747}
1748
1749STATIC char *
1750S_skipspace2(pTHX_ register char *s, SV **svp)
1751{
c35e046a
AL
1752 char *start;
1753 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1754 const I32 startoff = s - SvPVX(PL_linestr);
1755
7918f24d
NC
1756 PERL_ARGS_ASSERT_SKIPSPACE2;
1757
29595ff2
NC
1758 s = skipspace(s);
1759 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1760 if (!PL_madskills || !svp)
1761 return s;
1762 start = SvPVX(PL_linestr) + startoff;
cd81e915 1763 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1764 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1765 PL_thistoken = newSVpvn(tstart, start - tstart);
1766 PL_realtokenstart = -1;
29595ff2 1767 }
cd81e915 1768 if (PL_skipwhite) {
29595ff2 1769 if (!*svp)
6b29d1f5 1770 *svp = newSVpvs("");
cd81e915
NC
1771 sv_setsv(*svp, PL_skipwhite);
1772 sv_free(PL_skipwhite);
1773 PL_skipwhite = 0;
29595ff2
NC
1774 }
1775
1776 return s;
1777}
1778#endif
1779
80a702cd 1780STATIC void
15f169a1 1781S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1782{
1783 AV *av = CopFILEAVx(PL_curcop);
1784 if (av) {
b9f83d2f 1785 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1786 if (orig_sv)
1787 sv_setsv(sv, orig_sv);
1788 else
1789 sv_setpvn(sv, buf, len);
80a702cd
RGS
1790 (void)SvIOK_on(sv);
1791 SvIV_set(sv, 0);
1792 av_store(av, (I32)CopLINE(PL_curcop), sv);
1793 }
1794}
1795
ffb4593c
NT
1796/*
1797 * S_skipspace
1798 * Called to gobble the appropriate amount and type of whitespace.
1799 * Skips comments as well.
1800 */
1801
76e3520e 1802STATIC char *
cea2e8a9 1803S_skipspace(pTHX_ register char *s)
a687059c 1804{
5db06880 1805#ifdef PERL_MAD
f0e67a1d
Z
1806 char *start = s;
1807#endif /* PERL_MAD */
7918f24d 1808 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1809#ifdef PERL_MAD
cd81e915
NC
1810 if (PL_skipwhite) {
1811 sv_free(PL_skipwhite);
f0e67a1d 1812 PL_skipwhite = NULL;
5db06880 1813 }
f0e67a1d 1814#endif /* PERL_MAD */
3280af22 1815 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1816 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1817 s++;
f0e67a1d
Z
1818 } else {
1819 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1820 PL_bufptr = s;
f0998909
Z
1821 lex_read_space(LEX_KEEP_PREVIOUS |
1822 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1823 LEX_NO_NEXT_CHUNK : 0));
3280af22 1824 s = PL_bufptr;
f0e67a1d
Z
1825 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1826 if (PL_linestart > PL_bufptr)
1827 PL_bufptr = PL_linestart;
1828 return s;
463ee0b2 1829 }
5db06880 1830#ifdef PERL_MAD
f0e67a1d
Z
1831 if (PL_madskills)
1832 PL_skipwhite = newSVpvn(start, s-start);
1833#endif /* PERL_MAD */
5db06880 1834 return s;
a687059c 1835}
378cc40b 1836
ffb4593c
NT
1837/*
1838 * S_check_uni
1839 * Check the unary operators to ensure there's no ambiguity in how they're
1840 * used. An ambiguous piece of code would be:
1841 * rand + 5
1842 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1843 * the +5 is its argument.
1844 */
1845
76e3520e 1846STATIC void
cea2e8a9 1847S_check_uni(pTHX)
ba106d47 1848{
97aff369 1849 dVAR;
d4c19fe8
AL
1850 const char *s;
1851 const char *t;
2f3197b3 1852
3280af22 1853 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1854 return;
3280af22
NIS
1855 while (isSPACE(*PL_last_uni))
1856 PL_last_uni++;
c35e046a
AL
1857 s = PL_last_uni;
1858 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1859 s++;
3280af22 1860 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1861 return;
6136c704 1862
9b387841
NC
1863 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1864 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1865 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1866}
1867
ffb4593c
NT
1868/*
1869 * LOP : macro to build a list operator. Its behaviour has been replaced
1870 * with a subroutine, S_lop() for which LOP is just another name.
1871 */
1872
a0d0e21e
LW
1873#define LOP(f,x) return lop(f,x,s)
1874
ffb4593c
NT
1875/*
1876 * S_lop
1877 * Build a list operator (or something that might be one). The rules:
1878 * - if we have a next token, then it's a list operator [why?]
1879 * - if the next thing is an opening paren, then it's a function
1880 * - else it's a list operator
1881 */
1882
76e3520e 1883STATIC I32
a0be28da 1884S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1885{
97aff369 1886 dVAR;
7918f24d
NC
1887
1888 PERL_ARGS_ASSERT_LOP;
1889
6154021b 1890 pl_yylval.ival = f;
35c8bce7 1891 CLINE;
3280af22
NIS
1892 PL_expect = x;
1893 PL_bufptr = s;
1894 PL_last_lop = PL_oldbufptr;
eb160463 1895 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1896#ifdef PERL_MAD
1897 if (PL_lasttoke)
78cdf107 1898 goto lstop;
5db06880 1899#else
3280af22 1900 if (PL_nexttoke)
78cdf107 1901 goto lstop;
5db06880 1902#endif
79072805 1903 if (*s == '(')
bbf60fe6 1904 return REPORT(FUNC);
29595ff2 1905 s = PEEKSPACE(s);
79072805 1906 if (*s == '(')
bbf60fe6 1907 return REPORT(FUNC);
78cdf107
Z
1908 else {
1909 lstop:
1910 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1911 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1912 return REPORT(LSTOP);
78cdf107 1913 }
79072805
LW
1914}
1915
5db06880
NC
1916#ifdef PERL_MAD
1917 /*
1918 * S_start_force
1919 * Sets up for an eventual force_next(). start_force(0) basically does
1920 * an unshift, while start_force(-1) does a push. yylex removes items
1921 * on the "pop" end.
1922 */
1923
1924STATIC void
1925S_start_force(pTHX_ int where)
1926{
1927 int i;
1928
cd81e915 1929 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1930 where = PL_lasttoke;
cd81e915
NC
1931 assert(PL_curforce < 0 || PL_curforce == where);
1932 if (PL_curforce != where) {
5db06880
NC
1933 for (i = PL_lasttoke; i > where; --i) {
1934 PL_nexttoke[i] = PL_nexttoke[i-1];
1935 }
1936 PL_lasttoke++;
1937 }
cd81e915 1938 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1939 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1940 PL_curforce = where;
1941 if (PL_nextwhite) {
5db06880 1942 if (PL_madskills)
6b29d1f5 1943 curmad('^', newSVpvs(""));
cd81e915 1944 CURMAD('_', PL_nextwhite);
5db06880
NC
1945 }
1946}
1947
1948STATIC void
1949S_curmad(pTHX_ char slot, SV *sv)
1950{
1951 MADPROP **where;
1952
1953 if (!sv)
1954 return;
cd81e915
NC
1955 if (PL_curforce < 0)
1956 where = &PL_thismad;
5db06880 1957 else
cd81e915 1958 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1959
cd81e915 1960 if (PL_faketokens)
76f68e9b 1961 sv_setpvs(sv, "");
5db06880
NC
1962 else {
1963 if (!IN_BYTES) {
1964 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1965 SvUTF8_on(sv);
1966 else if (PL_encoding) {
1967 sv_recode_to_utf8(sv, PL_encoding);
1968 }
1969 }
1970 }
1971
1972 /* keep a slot open for the head of the list? */
1973 if (slot != '_' && *where && (*where)->mad_key == '^') {
1974 (*where)->mad_key = slot;
daba3364 1975 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1976 (*where)->mad_val = (void*)sv;
1977 }
1978 else
1979 addmad(newMADsv(slot, sv), where, 0);
1980}
1981#else
b3f24c00
MHM
1982# define start_force(where) NOOP
1983# define curmad(slot, sv) NOOP
5db06880
NC
1984#endif
1985
ffb4593c
NT
1986/*
1987 * S_force_next
9cbb5ea2 1988 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1989 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1990 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1991 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1992 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1993 */
1994
4e553d73 1995STATIC void
cea2e8a9 1996S_force_next(pTHX_ I32 type)
79072805 1997{
97aff369 1998 dVAR;
704d4215
GG
1999#ifdef DEBUGGING
2000 if (DEBUG_T_TEST) {
2001 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2002 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2003 }
2004#endif
6c7ae946
FC
2005 /* Don’t let opslab_force_free snatch it */
2006 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
2007 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
2008 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
2009 }
5db06880 2010#ifdef PERL_MAD
cd81e915 2011 if (PL_curforce < 0)
5db06880 2012 start_force(PL_lasttoke);
cd81e915 2013 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2014 if (PL_lex_state != LEX_KNOWNEXT)
2015 PL_lex_defer = PL_lex_state;
2016 PL_lex_state = LEX_KNOWNEXT;
2017 PL_lex_expect = PL_expect;
cd81e915 2018 PL_curforce = -1;
5db06880 2019#else
3280af22
NIS
2020 PL_nexttype[PL_nexttoke] = type;
2021 PL_nexttoke++;
2022 if (PL_lex_state != LEX_KNOWNEXT) {
2023 PL_lex_defer = PL_lex_state;
2024 PL_lex_expect = PL_expect;
2025 PL_lex_state = LEX_KNOWNEXT;
79072805 2026 }
5db06880 2027#endif
79072805
LW
2028}
2029
28ac2b49
Z
2030void
2031Perl_yyunlex(pTHX)
2032{
a7aaec61
Z
2033 int yyc = PL_parser->yychar;
2034 if (yyc != YYEMPTY) {
2035 if (yyc) {
2036 start_force(-1);
2037 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2038 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2039 PL_lex_allbrackets--;
a7aaec61 2040 PL_lex_brackets--;
78cdf107
Z
2041 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2042 } else if (yyc == '('/*)*/) {
2043 PL_lex_allbrackets--;
2044 yyc |= (2<<24);
a7aaec61
Z
2045 }
2046 force_next(yyc);
2047 }
28ac2b49
Z
2048 PL_parser->yychar = YYEMPTY;
2049 }
2050}
2051
d0a148a6 2052STATIC SV *
15f169a1 2053S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2054{
97aff369 2055 dVAR;
740cce10 2056 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2057 !IN_BYTES
2058 && UTF
2059 && !is_ascii_string((const U8*)start, len)
740cce10 2060 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2061 return sv;
2062}
2063
ffb4593c
NT
2064/*
2065 * S_force_word
2066 * When the lexer knows the next thing is a word (for instance, it has
2067 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2068 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2069 * lookahead.
ffb4593c
NT
2070 *
2071 * Arguments:
b1b65b59 2072 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2073 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2074 * int check_keyword : if true, Perl checks to make sure the word isn't
2075 * a keyword (do this if the word is a label, e.g. goto FOO)
2076 * int allow_pack : if true, : characters will also be allowed (require,
2077 * use, etc. do this)
9cbb5ea2 2078 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2079 */
2080
76e3520e 2081STATIC char *
cea2e8a9 2082S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2083{
97aff369 2084 dVAR;
eb578fdb 2085 char *s;
463ee0b2 2086 STRLEN len;
4e553d73 2087
7918f24d
NC
2088 PERL_ARGS_ASSERT_FORCE_WORD;
2089
29595ff2 2090 start = SKIPSPACE1(start);
463ee0b2 2091 s = start;
7e2040f0 2092 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2093 (allow_pack && *s == ':') ||
15f0808c 2094 (allow_initial_tick && *s == '\'') )
a0d0e21e 2095 {
3280af22 2096 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2097 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2098 return start;
cd81e915 2099 start_force(PL_curforce);
5db06880
NC
2100 if (PL_madskills)
2101 curmad('X', newSVpvn(start,s-start));
463ee0b2 2102 if (token == METHOD) {
29595ff2 2103 s = SKIPSPACE1(s);
463ee0b2 2104 if (*s == '(')
3280af22 2105 PL_expect = XTERM;
463ee0b2 2106 else {
3280af22 2107 PL_expect = XOPERATOR;
463ee0b2 2108 }
79072805 2109 }
e74e6b3d 2110 if (PL_madskills)
63575281 2111 curmad('g', newSVpvs( "forced" ));
9ded7720 2112 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2113 = (OP*)newSVOP(OP_CONST,0,
2114 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2115 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2116 force_next(token);
2117 }
2118 return s;
2119}
2120
ffb4593c
NT
2121/*
2122 * S_force_ident
9cbb5ea2 2123 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2124 * text only contains the "foo" portion. The first argument is a pointer
2125 * to the "foo", and the second argument is the type symbol to prefix.
2126 * Forces the next token to be a "WORD".
9cbb5ea2 2127 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2128 */
2129
76e3520e 2130STATIC void
bfed75c6 2131S_force_ident(pTHX_ register const char *s, int kind)
79072805 2132{
97aff369 2133 dVAR;
7918f24d
NC
2134
2135 PERL_ARGS_ASSERT_FORCE_IDENT;
2136
c35e046a 2137 if (*s) {
90e5519e 2138 const STRLEN len = strlen(s);
728847b1
BF
2139 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2140 UTF ? SVf_UTF8 : 0));
cd81e915 2141 start_force(PL_curforce);
9ded7720 2142 NEXTVAL_NEXTTOKE.opval = o;
79072805 2143 force_next(WORD);
748a9306 2144 if (kind) {
11343788 2145 o->op_private = OPpCONST_ENTERED;
55497cff 2146 /* XXX see note in pp_entereval() for why we forgo typo
2147 warnings if the symbol must be introduced in an eval.
2148 GSAR 96-10-12 */
90e5519e 2149 gv_fetchpvn_flags(s, len,
728847b1
BF
2150 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2151 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2152 kind == '$' ? SVt_PV :
2153 kind == '@' ? SVt_PVAV :
2154 kind == '%' ? SVt_PVHV :
a0d0e21e 2155 SVt_PVGV
90e5519e 2156 );
748a9306 2157 }
79072805
LW
2158 }
2159}
2160
1571675a
GS
2161NV
2162Perl_str_to_version(pTHX_ SV *sv)
2163{
2164 NV retval = 0.0;
2165 NV nshift = 1.0;
2166 STRLEN len;
cfd0369c 2167 const char *start = SvPV_const(sv,len);
9d4ba2ae 2168 const char * const end = start + len;
504618e9 2169 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2170
2171 PERL_ARGS_ASSERT_STR_TO_VERSION;
2172
1571675a 2173 while (start < end) {
ba210ebe 2174 STRLEN skip;
1571675a
GS
2175 UV n;
2176 if (utf)
9041c2e3 2177 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2178 else {
2179 n = *(U8*)start;
2180 skip = 1;
2181 }
2182 retval += ((NV)n)/nshift;
2183 start += skip;
2184 nshift *= 1000;
2185 }
2186 return retval;
2187}
2188
4e553d73 2189/*
ffb4593c
NT
2190 * S_force_version
2191 * Forces the next token to be a version number.
e759cc13
RGS
2192 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2193 * and if "guessing" is TRUE, then no new token is created (and the caller
2194 * must use an alternative parsing method).
ffb4593c
NT
2195 */
2196
76e3520e 2197STATIC char *
e759cc13 2198S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2199{
97aff369 2200 dVAR;
5f66b61c 2201 OP *version = NULL;
44dcb63b 2202 char *d;
5db06880
NC
2203#ifdef PERL_MAD
2204 I32 startoff = s - SvPVX(PL_linestr);
2205#endif
89bfa8cd 2206
7918f24d
NC
2207 PERL_ARGS_ASSERT_FORCE_VERSION;
2208
29595ff2 2209 s = SKIPSPACE1(s);
89bfa8cd 2210
44dcb63b 2211 d = s;
dd629d5b 2212 if (*d == 'v')
44dcb63b 2213 d++;
44dcb63b 2214 if (isDIGIT(*d)) {
e759cc13
RGS
2215 while (isDIGIT(*d) || *d == '_' || *d == '.')
2216 d++;
5db06880
NC
2217#ifdef PERL_MAD
2218 if (PL_madskills) {
cd81e915 2219 start_force(PL_curforce);
5db06880
NC
2220 curmad('X', newSVpvn(s,d-s));
2221 }
2222#endif
4e4da3ac 2223 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2224 SV *ver;
8d08d9ba 2225#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2226 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2227 setlocale(LC_NUMERIC, "C");
8d08d9ba 2228#endif
6154021b 2229 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2230#ifdef USE_LOCALE_NUMERIC
2231 setlocale(LC_NUMERIC, loc);
909d3787 2232 Safefree(loc);
8d08d9ba 2233#endif
6154021b 2234 version = pl_yylval.opval;
dd629d5b
GS
2235 ver = cSVOPx(version)->op_sv;
2236 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2237 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2238 SvNV_set(ver, str_to_version(ver));
1571675a 2239 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2240 }
89bfa8cd 2241 }
5db06880
NC
2242 else if (guessing) {
2243#ifdef PERL_MAD
2244 if (PL_madskills) {
cd81e915
NC
2245 sv_free(PL_nextwhite); /* let next token collect whitespace */
2246 PL_nextwhite = 0;
5db06880
NC
2247 s = SvPVX(PL_linestr) + startoff;
2248 }
2249#endif
e759cc13 2250 return s;
5db06880 2251 }
89bfa8cd 2252 }
2253
5db06880
NC
2254#ifdef PERL_MAD
2255 if (PL_madskills && !version) {
cd81e915
NC
2256 sv_free(PL_nextwhite); /* let next token collect whitespace */
2257 PL_nextwhite = 0;
5db06880
NC
2258 s = SvPVX(PL_linestr) + startoff;
2259 }
2260#endif
89bfa8cd 2261 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2262 start_force(PL_curforce);
9ded7720 2263 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2264 force_next(WORD);
89bfa8cd 2265
e759cc13 2266 return s;
89bfa8cd 2267}
2268
ffb4593c 2269/*
91152fc1
DG
2270 * S_force_strict_version
2271 * Forces the next token to be a version number using strict syntax rules.
2272 */
2273
2274STATIC char *
2275S_force_strict_version(pTHX_ char *s)
2276{
2277 dVAR;
2278 OP *version = NULL;
2279#ifdef PERL_MAD
2280 I32 startoff = s - SvPVX(PL_linestr);
2281#endif
2282 const char *errstr = NULL;
2283
2284 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2285
2286 while (isSPACE(*s)) /* leading whitespace */
2287 s++;
2288
2289 if (is_STRICT_VERSION(s,&errstr)) {
2290 SV *ver = newSV(0);
2291 s = (char *)scan_version(s, ver, 0);
2292 version = newSVOP(OP_CONST, 0, ver);
2293 }
4e4da3ac
Z
2294 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2295 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2296 {
91152fc1
DG
2297 PL_bufptr = s;
2298 if (errstr)
2299 yyerror(errstr); /* version required */
2300 return s;
2301 }
2302
2303#ifdef PERL_MAD
2304 if (PL_madskills && !version) {
2305 sv_free(PL_nextwhite); /* let next token collect whitespace */
2306 PL_nextwhite = 0;
2307 s = SvPVX(PL_linestr) + startoff;
2308 }
2309#endif
2310 /* NOTE: The parser sees the package name and the VERSION swapped */
2311 start_force(PL_curforce);
2312 NEXTVAL_NEXTTOKE.opval = version;
2313 force_next(WORD);
2314
2315 return s;
2316}
2317
2318/*
ffb4593c
NT
2319 * S_tokeq
2320 * Tokenize a quoted string passed in as an SV. It finds the next
2321 * chunk, up to end of string or a backslash. It may make a new
2322 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2323 * turns \\ into \.
2324 */
2325
76e3520e 2326STATIC SV *
cea2e8a9 2327S_tokeq(pTHX_ SV *sv)
79072805 2328{
97aff369 2329 dVAR;
eb578fdb
KW
2330 char *s;
2331 char *send;
2332 char *d;
b3ac6de7
IZ
2333 STRLEN len = 0;
2334 SV *pv = sv;
79072805 2335
7918f24d
NC
2336 PERL_ARGS_ASSERT_TOKEQ;
2337
79072805 2338 if (!SvLEN(sv))
b3ac6de7 2339 goto finish;
79072805 2340
a0d0e21e 2341 s = SvPV_force(sv, len);
21a311ee 2342 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2343 goto finish;
463ee0b2 2344 send = s + len;
dcb21ed6
NC
2345 /* This is relying on the SV being "well formed" with a trailing '\0' */
2346 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2347 s++;
2348 if (s == send)
b3ac6de7 2349 goto finish;
79072805 2350 d = s;
be4731d2 2351 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2352 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2353 }
79072805
LW
2354 while (s < send) {
2355 if (*s == '\\') {
a0d0e21e 2356 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2357 s++; /* all that, just for this */
2358 }
2359 *d++ = *s++;
2360 }
2361 *d = '\0';
95a20fc0 2362 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2363 finish:
3280af22 2364 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2365 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2366 return sv;
2367}
2368
ffb4593c
NT
2369/*
2370 * Now come three functions related to double-quote context,
2371 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2372 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2373 * interact with PL_lex_state, and create fake ( ... ) argument lists
2374 * to handle functions and concatenation.
ecd24171
DM
2375 * For example,
2376 * "foo\lbar"
2377 * is tokenised as
2378 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2379 */
2380
2381/*
2382 * S_sublex_start
6154021b 2383 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2384 *
2385 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2386 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2387 *
2388 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2389 *
2390 * Everything else becomes a FUNC.
2391 *
2392 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2393 * had an OP_CONST or OP_READLINE). This just sets us up for a
2394 * call to S_sublex_push().
2395 */
2396
76e3520e 2397STATIC I32
cea2e8a9 2398S_sublex_start(pTHX)
79072805 2399{
97aff369 2400 dVAR;
eb578fdb 2401 const I32 op_type = pl_yylval.ival;
79072805
LW
2402
2403 if (op_type == OP_NULL) {
6154021b 2404 pl_yylval.opval = PL_lex_op;
5f66b61c 2405 PL_lex_op = NULL;
79072805
LW
2406 return THING;
2407 }
2408 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2409 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2410
2411 if (SvTYPE(sv) == SVt_PVIV) {
2412 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2413 STRLEN len;
96a5add6 2414 const char * const p = SvPV_const(sv, len);
740cce10 2415 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2416 SvREFCNT_dec(sv);
2417 sv = nsv;
4e553d73 2418 }
6154021b 2419 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2420 PL_lex_stuff = NULL;
6f33ba73
RGS
2421 /* Allow <FH> // "foo" */
2422 if (op_type == OP_READLINE)
2423 PL_expect = XTERMORDORDOR;
79072805
LW
2424 return THING;
2425 }
e3f73d4e
RGS
2426 else if (op_type == OP_BACKTICK && PL_lex_op) {
2427 /* readpipe() vas overriden */
2428 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2429 pl_yylval.opval = PL_lex_op;
9b201d7d 2430 PL_lex_op = NULL;
e3f73d4e
RGS
2431 PL_lex_stuff = NULL;
2432 return THING;
2433 }
79072805 2434
3280af22 2435 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2436 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2437 PL_sublex_info.sub_op = PL_lex_op;
2438 PL_lex_state = LEX_INTERPPUSH;
55497cff 2439
3280af22
NIS
2440 PL_expect = XTERM;
2441 if (PL_lex_op) {
6154021b 2442 pl_yylval.opval = PL_lex_op;
5f66b61c 2443 PL_lex_op = NULL;
55497cff 2444 return PMFUNC;
2445 }
2446 else
2447 return FUNC;
2448}
2449
ffb4593c
NT
2450/*
2451 * S_sublex_push
2452 * Create a new scope to save the lexing state. The scope will be
2453 * ended in S_sublex_done. Returns a '(', starting the function arguments
2454 * to the uc, lc, etc. found before.
2455 * Sets PL_lex_state to LEX_INTERPCONCAT.
2456 */
2457
76e3520e 2458STATIC I32
cea2e8a9 2459S_sublex_push(pTHX)
55497cff 2460{
27da23d5 2461 dVAR;
f46d017c 2462 ENTER;
55497cff 2463
3280af22 2464 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2465 SAVEBOOL(PL_lex_dojoin);
3280af22 2466 SAVEI32(PL_lex_brackets);
78cdf107 2467 SAVEI32(PL_lex_allbrackets);
b27dce25 2468 SAVEI32(PL_lex_formbrack);
78cdf107 2469 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2470 SAVEI32(PL_lex_casemods);
2471 SAVEI32(PL_lex_starts);
651b5b28 2472 SAVEI8(PL_lex_state);
7cc34111 2473 SAVESPTR(PL_lex_repl);
9da1dd8f 2474 SAVEPPTR(PL_sublex_info.re_eval_start);
db444266 2475 SAVESPTR(PL_sublex_info.re_eval_str);
565b52df 2476 SAVEPPTR(PL_sublex_info.super_bufptr);
7766f137 2477 SAVEVPTR(PL_lex_inpat);
98246f1e 2478 SAVEI16(PL_lex_inwhat);
57843af0 2479 SAVECOPLINE(PL_curcop);
3280af22 2480 SAVEPPTR(PL_bufptr);
8452ff4b 2481 SAVEPPTR(PL_bufend);
3280af22
NIS
2482 SAVEPPTR(PL_oldbufptr);
2483 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2484 SAVEPPTR(PL_last_lop);
2485 SAVEPPTR(PL_last_uni);
3280af22
NIS
2486 SAVEPPTR(PL_linestart);
2487 SAVESPTR(PL_linestr);
8edd5f42
RGS
2488 SAVEGENERICPV(PL_lex_brackstack);
2489 SAVEGENERICPV(PL_lex_casestack);
3280af22 2490
99bd9d90
FC
2491 /* The here-doc parser needs to be able to peek into outer lexing
2492 scopes to find the body of the here-doc. We use SvIVX(PL_linestr)
2493 to store the outer PL_bufptr and SvNVX to store the outer
2494 PL_linestr. Since SvIVX already means something else, we use
2495 PL_sublex_info.super_bufptr for the innermost scope (the one we are
2496 now entering), and a localised SvIVX for outer scopes.
2497 */
2498 SvUPGRADE(PL_linestr, SVt_PVIV);
2499 /* A null super_bufptr means the outer lexing scope is not peekable,
2500 because it is a single line from an input stream. */
2501 SAVEIV(SvIVX(PL_linestr));
2502 SvIVX(PL_linestr) = PTR2IV(PL_sublex_info.super_bufptr);
2503 PL_sublex_info.super_bufptr =
2504 (SvTYPE(PL_linestr) < SVt_PVNV || !SvNVX(PL_linestr))
2505 && (PL_rsfp || PL_parser->filtered)
2506 ? NULL
2507 : PL_bufptr;
2508 SvUPGRADE(PL_lex_stuff, SVt_PVNV);
2509 SvNVX(PL_lex_stuff) = PTR2NV(PL_linestr);
2510
3280af22 2511 PL_linestr = PL_lex_stuff;
7cc34111 2512 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2513 PL_lex_stuff = NULL;
7cc34111 2514 PL_sublex_info.repl = NULL;
9da1dd8f 2515 PL_sublex_info.re_eval_start = NULL;
db444266 2516 PL_sublex_info.re_eval_str = NULL;
3280af22 2517
9cbb5ea2
GS
2518 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2519 = SvPVX(PL_linestr);
3280af22 2520 PL_bufend += SvCUR(PL_linestr);
bd61b366 2521 PL_last_lop = PL_last_uni = NULL;
3280af22 2522 SAVEFREESV(PL_linestr);
4dc843bc 2523 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2524
2525 PL_lex_dojoin = FALSE;
b27dce25 2526 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2527 PL_lex_allbrackets = 0;
2528 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2529 Newx(PL_lex_brackstack, 120, char);
2530 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2531 PL_lex_casemods = 0;
2532 *PL_lex_casestack = '\0';
2533 PL_lex_starts = 0;
2534 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2535 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2536
2537 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2538 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2539 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2540 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2541 else
5f66b61c 2542 PL_lex_inpat = NULL;
79072805 2543
55497cff 2544 return '(';
79072805
LW
2545}
2546
ffb4593c
NT
2547/*
2548 * S_sublex_done
2549 * Restores lexer state after a S_sublex_push.
2550 */
2551
76e3520e 2552STATIC I32
cea2e8a9 2553S_sublex_done(pTHX)
79072805 2554{
27da23d5 2555 dVAR;
3280af22 2556 if (!PL_lex_starts++) {
396482e1 2557 SV * const sv = newSVpvs("");
9aa983d2
JH
2558 if (SvUTF8(PL_linestr))
2559 SvUTF8_on(sv);
3280af22 2560 PL_expect = XOPERATOR;
6154021b 2561 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2562 return THING;
2563 }
2564
3280af22
NIS
2565 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2566 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2567 return yylex();
79072805
LW
2568 }
2569
ffb4593c 2570 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2571 assert(PL_lex_inwhat != OP_TRANSR);
3280af22 2572 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
99bd9d90
FC
2573 SvUPGRADE(PL_lex_repl, SVt_PVNV);
2574 SvNVX(PL_lex_repl) = SvNVX(PL_linestr);
3280af22
NIS
2575 PL_linestr = PL_lex_repl;
2576 PL_lex_inpat = 0;
2577 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2578 PL_bufend += SvCUR(PL_linestr);
bd61b366 2579 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2580 PL_lex_dojoin = FALSE;
2581 PL_lex_brackets = 0;
78cdf107
Z
2582 PL_lex_allbrackets = 0;
2583 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2584 PL_lex_casemods = 0;
2585 *PL_lex_casestack = '\0';
2586 PL_lex_starts = 0;
25da4f38 2587 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2588 PL_lex_state = LEX_INTERPNORMAL;
2589 PL_lex_starts++;
e9fa98b2
HS
2590 /* we don't clear PL_lex_repl here, so that we can check later
2591 whether this is an evalled subst; that means we rely on the
2592 logic to ensure sublex_done() is called again only via the
2593 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2594 }
e9fa98b2 2595 else {
3280af22 2596 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2597 PL_lex_repl = NULL;
e9fa98b2 2598 }
79072805 2599 return ',';
ffed7fef
LW
2600 }
2601 else {
5db06880
NC
2602#ifdef PERL_MAD
2603 if (PL_madskills) {
cd81e915
NC
2604 if (PL_thiswhite) {
2605 if (!PL_endwhite)
6b29d1f5 2606 PL_endwhite = newSVpvs("");
cd81e915
NC
2607 sv_catsv(PL_endwhite, PL_thiswhite);
2608 PL_thiswhite = 0;
2609 }
2610 if (PL_thistoken)
76f68e9b 2611 sv_setpvs(PL_thistoken,"");
5db06880 2612 else
cd81e915 2613 PL_realtokenstart = -1;
5db06880
NC
2614 }
2615#endif
f46d017c 2616 LEAVE;
3280af22
NIS
2617 PL_bufend = SvPVX(PL_linestr);
2618 PL_bufend += SvCUR(PL_linestr);
2619 PL_expect = XOPERATOR;
09bef843 2620 PL_sublex_info.sub_inwhat = 0;
79072805 2621 return ')';
ffed7fef
LW
2622 }
2623}
2624
02aa26ce
NT
2625/*
2626 scan_const
2627
9da1dd8f
DM
2628 Extracts the next constant part of a pattern, double-quoted string,
2629 or transliteration. This is terrifying code.
2630
2631 For example, in parsing the double-quoted string "ab\x63$d", it would
2632 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2633
94def140 2634 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2635 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2636 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2637
94def140
TS
2638 Returns a pointer to the character scanned up to. If this is
2639 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2640 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2641 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2642 by looking at the next characters herself.
2643
02aa26ce 2644 In patterns:
9da1dd8f
DM
2645 expand:
2646 \N{ABC} => \N{U+41.42.43}
2647
2648 pass through:
2649 all other \-char, including \N and \N{ apart from \N{ABC}
2650
2651 stops on:
2652 @ and $ where it appears to be a var, but not for $ as tail anchor
2653 \l \L \u \U \Q \E
2654 (?{ or (??{
2655
02aa26ce
NT
2656
2657 In transliterations:
2658 characters are VERY literal, except for - not at the start or end
94def140
TS
2659 of the string, which indicates a range. If the range is in bytes,
2660 scan_const expands the range to the full set of intermediate
2661 characters. If the range is in utf8, the hyphen is replaced with
2662 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2663
2664 In double-quoted strings:
2665 backslashes:
2666 double-quoted style: \r and \n
ff3f963a 2667 constants: \x31, etc.
94def140 2668 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2669 case and quoting: \U \Q \E
2670 stops on @ and $
2671
2672 scan_const does *not* construct ops to handle interpolated strings.
2673 It stops processing as soon as it finds an embedded $ or @ variable
2674 and leaves it to the caller to work out what's going on.
2675
94def140
TS
2676 embedded arrays (whether in pattern or not) could be:
2677 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2678
2679 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2680
2681 $ in pattern could be $foo or could be tail anchor. Assumption:
2682 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2683 followed by one of "()| \r\n\t"
02aa26ce 2684
9da1dd8f 2685 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2686
2687 The structure of the code is
2688 while (there's a character to process) {
94def140
TS
2689 handle transliteration ranges
2690 skip regexp comments /(?#comment)/ and codes /(?{code})/
2691 skip #-initiated comments in //x patterns
2692 check for embedded arrays
02aa26ce
NT
2693 check for embedded scalars
2694 if (backslash) {
94def140 2695 deprecate \1 in substitution replacements
02aa26ce
NT
2696 handle string-changing backslashes \l \U \Q \E, etc.
2697 switch (what was escaped) {
94def140 2698 handle \- in a transliteration (becomes a literal -)
ff3f963a 2699 if a pattern and not \N{, go treat as regular character
94def140
TS
2700 handle \132 (octal characters)
2701 handle \x15 and \x{1234} (hex characters)
ff3f963a 2702 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2703 handle \cV (control characters)
2704 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2705 } (end switch)
77a135fe 2706 continue
02aa26ce 2707 } (end if backslash)
77a135fe 2708 handle regular character
02aa26ce 2709 } (end while character to read)
4e553d73 2710
02aa26ce
NT
2711*/
2712
76e3520e 2713STATIC char *
cea2e8a9 2714S_scan_const(pTHX_ char *start)
79072805 2715{
97aff369 2716 dVAR;
eb578fdb 2717 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2718 SV *sv = newSV(send - start); /* sv for the constant. See
2719 note below on sizing. */
eb578fdb
KW
2720 char *s = start; /* start of the constant */
2721 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2722 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2723 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2724 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2725 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2726 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2727 to be UTF8? But, this can
2728 show as true when the source
2729 isn't utf8, as for example
2730 when it is entirely composed
2731 of hex constants */
2732
2733 /* Note on sizing: The scanned constant is placed into sv, which is
2734 * initialized by newSV() assuming one byte of output for every byte of
2735 * input. This routine expects newSV() to allocate an extra byte for a
2736 * trailing NUL, which this routine will append if it gets to the end of
2737 * the input. There may be more bytes of input than output (eg., \N{LATIN
2738 * CAPITAL LETTER A}), or more output than input if the constant ends up
2739 * recoded to utf8, but each time a construct is found that might increase
2740 * the needed size, SvGROW() is called. Its size parameter each time is
2741 * based on the best guess estimate at the time, namely the length used so
2742 * far, plus the length the current construct will occupy, plus room for
2743 * the trailing NUL, plus one byte for every input byte still unscanned */
2744
012bcf8d 2745 UV uv;
4c3a8340
TS
2746#ifdef EBCDIC
2747 UV literal_endpoint = 0;
e294cc5d 2748 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2749#endif
012bcf8d 2750
7918f24d
NC
2751 PERL_ARGS_ASSERT_SCAN_CONST;
2752
bb16bae8 2753 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2754 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2755 /* If we are doing a trans and we know we want UTF8 set expectation */
2756 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2757 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2758 }
2759
2760
79072805 2761 while (s < send || dorange) {
ff3f963a 2762
02aa26ce 2763 /* get transliterations out of the way (they're most literal) */
3280af22 2764 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2765 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2766 if (dorange) {
1ba5c669
JH
2767 I32 i; /* current expanded character */
2768 I32 min; /* first character in range */
2769 I32 max; /* last character in range */
02aa26ce 2770
e294cc5d
JH
2771#ifdef EBCDIC
2772 UV uvmax = 0;
2773#endif
2774
2775 if (has_utf8
2776#ifdef EBCDIC
2777 && !native_range
2778#endif
2779 ) {
9d4ba2ae 2780 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2781 char *e = d++;
2782 while (e-- > c)
2783 *(e + 1) = *e;
25716404 2784 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2785 /* mark the range as done, and continue */
2786 dorange = FALSE;
2787 didrange = TRUE;
2788 continue;
2789 }
2b9d42f0 2790
95a20fc0 2791 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2792#ifdef EBCDIC
2793 SvGROW(sv,
2794 SvLEN(sv) + (has_utf8 ?
2795 (512 - UTF_CONTINUATION_MARK +
2796 UNISKIP(0x100))
2797 : 256));
2798 /* How many two-byte within 0..255: 128 in UTF-8,
2799 * 96 in UTF-8-mod. */
2800#else
9cbb5ea2 2801 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2802#endif
9cbb5ea2 2803 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2804#ifdef EBCDIC
2805 if (has_utf8) {
2806 int j;
2807 for (j = 0; j <= 1; j++) {
2808 char * const c = (char*)utf8_hop((U8*)d, -1);
2809 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2810 if (j)
2811 min = (U8)uv;
2812 else if (uv < 256)
2813 max = (U8)uv;
2814 else {
2815 max = (U8)0xff; /* only to \xff */
2816 uvmax = uv; /* \x{100} to uvmax */
2817 }
2818 d = c; /* eat endpoint chars */
2819 }
2820 }
2821 else {
2822#endif
2823 d -= 2; /* eat the first char and the - */
2824 min = (U8)*d; /* first char in range */
2825 max = (U8)d[1]; /* last char in range */
2826#ifdef EBCDIC
2827 }
2828#endif
8ada0baa 2829
c2e66d9e 2830 if (min > max) {
4dc843bc 2831 SvREFCNT_dec(sv);
01ec43d0 2832 Perl_croak(aTHX_
d1573ac7 2833 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2834 (char)min, (char)max);
c2e66d9e
GS
2835 }
2836
c7f1f016 2837#ifdef EBCDIC
4c3a8340
TS
2838 if (literal_endpoint == 2 &&
2839 ((isLOWER(min) && isLOWER(max)) ||
2840 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2841 if (isLOWER(min)) {
2842 for (i = min; i <= max; i++)
2843 if (isLOWER(i))
db42d148 2844 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2845 } else {
2846 for (i = min; i <= max; i++)
2847 if (isUPPER(i))
db42d148 2848 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2849 }
2850 }
2851 else
2852#endif
2853 for (i = min; i <= max; i++)
e294cc5d
JH
2854#ifdef EBCDIC
2855 if (has_utf8) {
2856 const U8 ch = (U8)NATIVE_TO_UTF(i);
2857 if (UNI_IS_INVARIANT(ch))
2858 *d++ = (U8)i;
2859 else {
2860 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2861 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2862 }
2863 }
2864 else
2865#endif
2866 *d++ = (char)i;
2867
2868#ifdef EBCDIC
2869 if (uvmax) {
2870 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2871 if (uvmax > 0x101)
2872 *d++ = (char)UTF_TO_NATIVE(0xff);
2873 if (uvmax > 0x100)
2874 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2875 }
2876#endif
02aa26ce
NT
2877
2878 /* mark the range as done, and continue */
79072805 2879 dorange = FALSE;
01ec43d0 2880 didrange = TRUE;
4c3a8340
TS
2881#ifdef EBCDIC
2882 literal_endpoint = 0;
2883#endif
79072805 2884 continue;
4e553d73 2885 }
02aa26ce
NT
2886
2887 /* range begins (ignore - as first or last char) */
79072805 2888 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2889 if (didrange) {
4dc843bc 2890 SvREFCNT_dec(sv);
1fafa243 2891 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2892 }
e294cc5d
JH
2893 if (has_utf8
2894#ifdef EBCDIC
2895 && !native_range
2896#endif
2897 ) {
25716404 2898 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2899 s++;
2900 continue;
2901 }
79072805
LW
2902 dorange = TRUE;
2903 s++;
01ec43d0
GS
2904 }
2905 else {
2906 didrange = FALSE;
4c3a8340
TS
2907#ifdef EBCDIC
2908 literal_endpoint = 0;
e294cc5d 2909 native_range = TRUE;
4c3a8340 2910#endif
01ec43d0 2911 }
79072805 2912 }
02aa26ce
NT
2913
2914 /* if we get here, we're not doing a transliteration */
2915
e4a2df84
DM
2916 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2917 char *s1 = s-1;
2918 int esc = 0;
2919 while (s1 >= start && *s1-- == '\\')
2920 esc = !esc;
2921 if (!esc)
2922 in_charclass = TRUE;
2923 }
2866decb 2924
e4a2df84
DM
2925 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2926 char *s1 = s-1;
2927 int esc = 0;
2928 while (s1 >= start && *s1-- == '\\')
2929 esc = !esc;
2930 if (!esc)
2931 in_charclass = FALSE;
2932 }
2866decb 2933
9da1dd8f
DM
2934 /* skip for regexp comments /(?#comment)/, except for the last
2935 * char, which will be done separately.
2936 * Stop on (?{..}) and friends */
2937
3280af22 2938 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2939 if (s[2] == '#') {
e994fd66 2940 while (s+1 < send && *s != ')')
db42d148 2941 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2942 }
2866decb 2943 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2944 ( s[2] == '{' /* This should match regcomp.c */
2945 || (s[2] == '?' && s[3] == '{')))
155aba94 2946 {
9da1dd8f 2947 break;
cc6b7395 2948 }
748a9306 2949 }
02aa26ce
NT
2950
2951 /* likewise skip #-initiated comments in //x patterns */
3280af22 2952 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2953 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2954 while (s+1 < send && *s != '\n')
db42d148 2955 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2956 }
02aa26ce 2957
9da1dd8f
DM
2958 /* no further processing of single-quoted regex */
2959 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2960 goto default_action;
2961
5d1d4326 2962 /* check for embedded arrays
da6eedaa 2963 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2964 */
1749ea0d
TS
2965 else if (*s == '@' && s[1]) {
2966 if (isALNUM_lazy_if(s+1,UTF))
2967 break;
2968 if (strchr(":'{$", s[1]))
2969 break;
2970 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2971 break; /* in regexp, neither @+ nor @- are interpolated */
2972 }
02aa26ce
NT
2973
2974 /* check for embedded scalars. only stop if we're sure it's a
2975 variable.
2976 */
79072805 2977 else if (*s == '$') {
3280af22 2978 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2979 break;
77772344 2980 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2981 if (s[1] == '\\') {
2982 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2983 "Possible unintended interpolation of $\\ in regex");
77772344 2984 }
79072805 2985 break; /* in regexp, $ might be tail anchor */
77772344 2986 }
79072805 2987 }
02aa26ce 2988
2b9d42f0
NIS
2989 /* End of else if chain - OP_TRANS rejoin rest */
2990
02aa26ce 2991 /* backslashes */
79072805 2992 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2993 char* e; /* Can be used for ending '}', etc. */
2994
79072805 2995 s++;
02aa26ce 2996
7d0fc23c
KW
2997 /* warn on \1 - \9 in substitution replacements, but note that \11
2998 * is an octal; and \19 is \1 followed by '9' */
3280af22 2999 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3000 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3001 {
a2a5de95 3002 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3003 *--s = '$';
3004 break;
3005 }
02aa26ce
NT
3006
3007 /* string-change backslash escapes */
838f2281 3008 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3009 --s;
3010 break;
3011 }
ff3f963a
KW
3012 /* In a pattern, process \N, but skip any other backslash escapes.
3013 * This is because we don't want to translate an escape sequence
3014 * into a meta symbol and have the regex compiler use the meta
3015 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3016 * in spite of this, we do have to process \N here while the proper
3017 * charnames handler is in scope. See bugs #56444 and #62056.
3018 * There is a complication because \N in a pattern may also stand
3019 * for 'match a non-nl', and not mean a charname, in which case its
3020 * processing should be deferred to the regex compiler. To be a
3021 * charname it must be followed immediately by a '{', and not look
3022 * like \N followed by a curly quantifier, i.e., not something like
3023 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3024 * quantifier */
3025 else if (PL_lex_inpat
3026 && (*s != 'N'
3027 || s[1] != '{'
3028 || regcurly(s + 1)))
3029 {
cc74c5bd
TS
3030 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3031 goto default_action;
3032 }
02aa26ce 3033
79072805 3034 switch (*s) {
02aa26ce
NT
3035
3036 /* quoted - in transliterations */
79072805 3037 case '-':
3280af22 3038 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3039 *d++ = *s++;
3040 continue;
3041 }
3042 /* FALL THROUGH */
3043 default:
11b8faa4 3044 {
e4ca4584 3045 if ((isALNUMC(*s)))
a2a5de95
NC
3046 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3047 "Unrecognized escape \\%c passed through",
3048 *s);
11b8faa4 3049 /* default action is to copy the quoted character */
f9a63242 3050 goto default_action;
11b8faa4 3051 }
02aa26ce 3052
632403cc 3053 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3054 case '0': case '1': case '2': case '3':
3055 case '4': case '5': case '6': case '7':
ba210ebe 3056 {
53305cf1
NC
3057 I32 flags = 0;
3058 STRLEN len = 3;
77a135fe 3059 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3060 s += len;
3061 }
012bcf8d 3062 goto NUM_ESCAPE_INSERT;
02aa26ce 3063
f0a2b745
KW
3064 /* eg. \o{24} indicates the octal constant \024 */
3065 case 'o':
3066 {
3067 STRLEN len;
454155d9 3068 const char* error;
f0a2b745 3069
454155d9 3070 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3071 s += len;
454155d9 3072 if (! valid) {
f0a2b745
KW
3073 yyerror(error);
3074 continue;
3075 }
3076 goto NUM_ESCAPE_INSERT;
3077 }
3078
77a135fe 3079 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3080 case 'x':
a0481293 3081 {
53305cf1 3082 STRLEN len;
a0481293 3083 const char* error;
355860ce 3084
a0481293
KW
3085 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3086 s += len;
3087 if (! valid) {
3088 yyerror(error);
355860ce 3089 continue;
ba210ebe 3090 }
012bcf8d
GS
3091 }
3092
3093 NUM_ESCAPE_INSERT:
ff3f963a
KW
3094 /* Insert oct or hex escaped character. There will always be
3095 * enough room in sv since such escapes will be longer than any
3096 * UTF-8 sequence they can end up as, except if they force us
3097 * to recode the rest of the string into utf8 */
ba7cea30 3098
77a135fe 3099 /* Here uv is the ordinal of the next character being added in
ff3f963a 3100 * unicode (converted from native). */
77a135fe 3101 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3102 if (!has_utf8 && uv > 255) {
77a135fe
KW
3103 /* Might need to recode whatever we have accumulated so
3104 * far if it contains any chars variant in utf8 or
3105 * utf-ebcdic. */
3106
3107 SvCUR_set(sv, d - SvPVX_const(sv));
3108 SvPOK_on(sv);
3109 *d = '\0';
77a135fe 3110 /* See Note on sizing above. */
7bf79863
KW
3111 sv_utf8_upgrade_flags_grow(sv,
3112 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3113 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3114 d = SvPVX(sv) + SvCUR(sv);
3115 has_utf8 = TRUE;
012bcf8d
GS
3116 }
3117
77a135fe
KW
3118 if (has_utf8) {
3119 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3120 if (PL_lex_inwhat == OP_TRANS &&
3121 PL_sublex_info.sub_op) {
3122 PL_sublex_info.sub_op->op_private |=
3123 (PL_lex_repl ? OPpTRANS_FROM_UTF
3124 : OPpTRANS_TO_UTF);
f9a63242 3125 }
e294cc5d
JH
3126#ifdef EBCDIC
3127 if (uv > 255 && !dorange)
3128 native_range = FALSE;
3129#endif
012bcf8d 3130 }
a0ed51b3 3131 else {
012bcf8d 3132 *d++ = (char)uv;
a0ed51b3 3133 }
012bcf8d
GS
3134 }
3135 else {
c4d5f83a 3136 *d++ = (char) uv;
a0ed51b3 3137 }
79072805 3138 continue;
02aa26ce 3139
4a2d328f 3140 case 'N':
ff3f963a
KW
3141 /* In a non-pattern \N must be a named character, like \N{LATIN
3142 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3143 * mean to match a non-newline. For non-patterns, named
3144 * characters are converted to their string equivalents. In
3145 * patterns, named characters are not converted to their
3146 * ultimate forms for the same reasons that other escapes
3147 * aren't. Instead, they are converted to the \N{U+...} form
3148 * to get the value from the charnames that is in effect right
3149 * now, while preserving the fact that it was a named character
3150 * so that the regex compiler knows this */
3151
3152 /* This section of code doesn't generally use the
3153 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3154 * a close examination of this macro and determined it is a
3155 * no-op except on utfebcdic variant characters. Every
3156 * character generated by this that would normally need to be
3157 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3158 * needed, and would complicate use of copy(). XXX There are
3159 * other parts of this file where the macro is used
3160 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3161
3162 /* The structure of this section of code (besides checking for
3163 * errors and upgrading to utf8) is:
3164 * Further disambiguate between the two meanings of \N, and if
3165 * not a charname, go process it elsewhere
0a96133f
KW
3166 * If of form \N{U+...}, pass it through if a pattern;
3167 * otherwise convert to utf8
3168 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3169 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3170
3171 /* Here, s points to the 'N'; the test below is guaranteed to
3172 * succeed if we are being called on a pattern as we already
3173 * know from a test above that the next character is a '{'.
3174 * On a non-pattern \N must mean 'named sequence, which
3175 * requires braces */
3176 s++;
3177 if (*s != '{') {
3178 yyerror("Missing braces on \\N{}");
3179 continue;
3180 }
3181 s++;
3182
0a96133f 3183 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3184 if (! (e = strchr(s, '}'))) {
3185 if (! PL_lex_inpat) {
5777a3f7 3186 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3187 } else {
3188 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3189 }
0a96133f 3190 continue;
ff3f963a 3191 }
cddc7ef4 3192
ff3f963a 3193 /* Here it looks like a named character */
cddc7ef4 3194
ff3f963a
KW
3195 if (PL_lex_inpat) {
3196
3197 /* XXX This block is temporary code. \N{} implies that the
3198 * pattern is to have Unicode semantics, and therefore
3199 * currently has to be encoded in utf8. By putting it in
3200 * utf8 now, we save a whole pass in the regular expression
3201 * compiler. Once that code is changed so Unicode
3202 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3203 * block should be removed. However, the code that parses
3204 * the output of this would have to be changed to not
3205 * necessarily expect utf8 */
ff3f963a 3206 if (!has_utf8) {
77a135fe 3207 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3208 SvPOK_on(sv);
e4f3eed8 3209 *d = '\0';
77a135fe 3210 /* See Note on sizing above. */
7bf79863 3211 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3212 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3213 /* 5 = '\N{' + cur char + NUL */
3214 (STRLEN)(send - s) + 5);
f08d6ad9 3215 d = SvPVX(sv) + SvCUR(sv);
89491803 3216 has_utf8 = TRUE;
ff3f963a
KW
3217 }
3218 }
423cee85 3219
ff3f963a
KW
3220 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3221 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3222 | PERL_SCAN_DISALLOW_PREFIX;
3223 STRLEN len;
3224
3225 /* For \N{U+...}, the '...' is a unicode value even on
3226 * EBCDIC machines */
3227 s += 2; /* Skip to next char after the 'U+' */
3228 len = e - s;
3229 uv = grok_hex(s, &len, &flags, NULL);
3230 if (len == 0 || len != (STRLEN)(e - s)) {
3231 yyerror("Invalid hexadecimal number in \\N{U+...}");
3232 s = e + 1;
3233 continue;
3234 }
3235
3236 if (PL_lex_inpat) {
3237
e2a7e165
KW
3238 /* On non-EBCDIC platforms, pass through to the regex
3239 * compiler unchanged. The reason we evaluated the
3240 * number above is to make sure there wasn't a syntax
3241 * error. But on EBCDIC we convert to native so
3242 * downstream code can continue to assume it's native
3243 */
ff3f963a 3244 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3245#ifdef EBCDIC
3246 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3247 and the \0 */
3248 "\\N{U+%X}",
3249 (unsigned int) UNI_TO_NATIVE(uv));
3250#else
ff3f963a
KW
3251 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3252 d += e - s + 1;
e2a7e165 3253#endif
ff3f963a
KW
3254 }
3255 else { /* Not a pattern: convert the hex to string */
3256
3257 /* If destination is not in utf8, unconditionally
3258 * recode it to be so. This is because \N{} implies
3259 * Unicode semantics, and scalars have to be in utf8
3260 * to guarantee those semantics */
3261 if (! has_utf8) {
3262 SvCUR_set(sv, d - SvPVX_const(sv));
3263 SvPOK_on(sv);
3264 *d = '\0';
3265 /* See Note on sizing above. */
3266 sv_utf8_upgrade_flags_grow(
3267 sv,
3268 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3269 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3270 d = SvPVX(sv) + SvCUR(sv);
3271 has_utf8 = TRUE;
3272 }
3273
3274 /* Add the string to the output */
3275 if (UNI_IS_INVARIANT(uv)) {
3276 *d++ = (char) uv;
3277 }
3278 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3279 }
3280 }
3281 else { /* Here is \N{NAME} but not \N{U+...}. */
3282
3283 SV *res; /* result from charnames */
3284 const char *str; /* the string in 'res' */
3285 STRLEN len; /* its length */
3286
3287 /* Get the value for NAME */
3288 res = newSVpvn(s, e - s);
3289 res = new_constant( NULL, 0, "charnames",
3290 /* includes all of: \N{...} */
3291 res, NULL, s - 3, e - s + 4 );
3292
3293 /* Most likely res will be in utf8 already since the
3294 * standard charnames uses pack U, but a custom translator
3295 * can leave it otherwise, so make sure. XXX This can be
3296 * revisited to not have charnames use utf8 for characters
3297 * that don't need it when regexes don't have to be in utf8
3298 * for Unicode semantics. If doing so, remember EBCDIC */
3299 sv_utf8_upgrade(res);
3300 str = SvPV_const(res, len);
3301
3302 /* Don't accept malformed input */
3303 if (! is_utf8_string((U8 *) str, len)) {
3304 yyerror("Malformed UTF-8 returned by \\N");
3305 }
3306 else if (PL_lex_inpat) {
3307
3308 if (! len) { /* The name resolved to an empty string */
3309 Copy("\\N{}", d, 4, char);
3310 d += 4;
3311 }
3312 else {
3313 /* In order to not lose information for the regex
3314 * compiler, pass the result in the specially made
3315 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3316 * the code points in hex of each character
3317 * returned by charnames */
3318
3319 const char *str_end = str + len;
3320 STRLEN char_length; /* cur char's byte length */
3321 STRLEN output_length; /* and the number of bytes
3322 after this is translated
3323 into hex digits */
3324 const STRLEN off = d - SvPVX_const(sv);
3325
3326 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3327 * max('U+', '.'); and 1 for NUL */
3328 char hex_string[2 * UTF8_MAXBYTES + 5];
3329
3330 /* Get the first character of the result. */
3331 U32 uv = utf8n_to_uvuni((U8 *) str,
3332 len,
3333 &char_length,
3334 UTF8_ALLOW_ANYUV);
3335
3336 /* The call to is_utf8_string() above hopefully
3337 * guarantees that there won't be an error. But
3338 * it's easy here to make sure. The function just
3339 * above warns and returns 0 if invalid utf8, but
3340 * it can also return 0 if the input is validly a
3341 * NUL. Disambiguate */
3342 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3343 uv = UNICODE_REPLACEMENT;
3344 }
3345
3346 /* Convert first code point to hex, including the
e2a7e165
KW
3347 * boiler plate before it. For all these, we
3348 * convert to native format so that downstream code
3349 * can continue to assume the input is native */
78c35590 3350 output_length =
3353de27 3351 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3352 "\\N{U+%X",
3353 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3354
3355 /* Make sure there is enough space to hold it */
3356 d = off + SvGROW(sv, off
3357 + output_length
3358 + (STRLEN)(send - e)
3359 + 2); /* '}' + NUL */
3360 /* And output it */
3361 Copy(hex_string, d, output_length, char);
3362 d += output_length;
3363
3364 /* For each subsequent character, append dot and
3365 * its ordinal in hex */
3366 while ((str += char_length) < str_end) {
3367 const STRLEN off = d - SvPVX_const(sv);
3368 U32 uv = utf8n_to_uvuni((U8 *) str,
3369 str_end - str,
3370 &char_length,
3371 UTF8_ALLOW_ANYUV);
3372 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3373 uv = UNICODE_REPLACEMENT;
3374 }
3375
78c35590 3376 output_length =
3353de27 3377 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3378 ".%X",
3379 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3380
3381 d = off + SvGROW(sv, off
3382 + output_length
3383 + (STRLEN)(send - e)
3384 + 2); /* '}' + NUL */
3385 Copy(hex_string, d, output_length, char);
3386 d += output_length;
3387 }
3388
3389 *d++ = '}'; /* Done. Add the trailing brace */
3390 }
3391 }
3392 else { /* Here, not in a pattern. Convert the name to a
3393 * string. */
3394
3395 /* If destination is not in utf8, unconditionally
3396 * recode it to be so. This is because \N{} implies
3397 * Unicode semantics, and scalars have to be in utf8
3398 * to guarantee those semantics */
3399 if (! has_utf8) {
3400 SvCUR_set(sv, d - SvPVX_const(sv));
3401 SvPOK_on(sv);
3402 *d = '\0';
3403 /* See Note on sizing above. */
3404 sv_utf8_upgrade_flags_grow(sv,
3405 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3406 len + (STRLEN)(send - s) + 1);
3407 d = SvPVX(sv) + SvCUR(sv);
3408 has_utf8 = TRUE;
3409 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3410
3411 /* See Note on sizing above. (NOTE: SvCUR() is not
3412 * set correctly here). */
3413 const STRLEN off = d - SvPVX_const(sv);
3414 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3415 }
3416 Copy(str, d, len, char);
3417 d += len;
423cee85 3418 }
423cee85 3419 SvREFCNT_dec(res);
cb233ae3
KW
3420
3421 /* Deprecate non-approved name syntax */
3422 if (ckWARN_d(WARN_DEPRECATED)) {
3423 bool problematic = FALSE;
3424 char* i = s;
3425
3426 /* For non-ut8 input, look to see that the first
3427 * character is an alpha, then loop through the rest
3428 * checking that each is a continuation */
3429 if (! this_utf8) {
3430 if (! isALPHAU(*i)) problematic = TRUE;
3431 else for (i = s + 1; i < e; i++) {
3432 if (isCHARNAME_CONT(*i)) continue;
3433 problematic = TRUE;
3434 break;
3435 }
3436 }
3437 else {
3438 /* Similarly for utf8. For invariants can check
3439 * directly. We accept anything above the latin1
3440 * range because it is immaterial to Perl if it is
3441 * correct or not, and is expensive to check. But
3442 * it is fairly easy in the latin1 range to convert
3443 * the variants into a single character and check
3444 * those */
3445 if (UTF8_IS_INVARIANT(*i)) {
3446 if (! isALPHAU(*i)) problematic = TRUE;
3447 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3448 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3449 *(i+1)))))
3450 {
3451 problematic = TRUE;
3452 }
3453 }
3454 if (! problematic) for (i = s + UTF8SKIP(s);
3455 i < e;
3456 i+= UTF8SKIP(i))
3457 {
3458 if (UTF8_IS_INVARIANT(*i)) {
3459 if (isCHARNAME_CONT(*i)) continue;
3460 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3461 continue;
3462 } else if (isCHARNAME_CONT(
3463 UNI_TO_NATIVE(
81c14aa2 3464 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3465 {
3466 continue;
3467 }
3468 problematic = TRUE;
3469 break;
3470 }
3471 }
3472 if (problematic) {
6e1bad6c
KW
3473 /* The e-i passed to the final %.*s makes sure that
3474 * should the trailing NUL be missing that this
3475 * print won't run off the end of the string */
cb233ae3 3476 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3477 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3478 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3479 }
3480 }
3481 } /* End \N{NAME} */
ff3f963a
KW
3482#ifdef EBCDIC
3483 if (!dorange)
3484 native_range = FALSE; /* \N{} is defined to be Unicode */
3485#endif
3486 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3487 continue;
3488
02aa26ce 3489 /* \c is a control character */
79072805
LW
3490 case 'c':
3491 s++;
961ce445 3492 if (s < send) {
17a3df4c 3493 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3494 }
961ce445
RGS
3495 else {
3496 yyerror("Missing control char name in \\c");
3497 }
79072805 3498 continue;
02aa26ce
NT
3499
3500 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3501 case 'b':
db42d148 3502 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3503 break;
3504 case 'n':
db42d148 3505 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3506 break;
3507 case 'r':
db42d148 3508 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3509 break;
3510 case 'f':
db42d148 3511 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3512 break;
3513 case 't':
db42d148 3514 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3515 break;
34a3fe2a 3516 case 'e':
db42d148 3517 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3518 break;
3519 case 'a':
db42d148 3520 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3521 break;
02aa26ce
NT
3522 } /* end switch */
3523
79072805
LW
3524 s++;
3525 continue;
02aa26ce 3526 } /* end if (backslash) */
4c3a8340
TS
3527#ifdef EBCDIC
3528 else
3529 literal_endpoint++;
3530#endif
02aa26ce 3531
f9a63242 3532 default_action:
77a135fe
KW
3533 /* If we started with encoded form, or already know we want it,
3534 then encode the next character */
3535 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3536 STRLEN len = 1;
77a135fe
KW
3537
3538
3539 /* One might think that it is wasted effort in the case of the
3540 * source being utf8 (this_utf8 == TRUE) to take the next character
3541 * in the source, convert it to an unsigned value, and then convert
3542 * it back again. But the source has not been validated here. The
3543 * routine that does the conversion checks for errors like
3544 * malformed utf8 */
3545
5f66b61c
AL
3546 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3547 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3548 if (!has_utf8) {
3549 SvCUR_set(sv, d - SvPVX_const(sv));
3550 SvPOK_on(sv);
3551 *d = '\0';
77a135fe 3552 /* See Note on sizing above. */
7bf79863
KW
3553 sv_utf8_upgrade_flags_grow(sv,
3554 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3555 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3556 d = SvPVX(sv) + SvCUR(sv);
3557 has_utf8 = TRUE;
3558 } else if (need > len) {
3559 /* encoded value larger than old, may need extra space (NOTE:
3560 * SvCUR() is not set correctly here). See Note on sizing
3561 * above. */
9d4ba2ae 3562 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3563 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3564 }
77a135fe
KW
3565 s += len;
3566
5f66b61c 3567 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3568#ifdef EBCDIC
3569 if (uv > 255 && !dorange)
3570 native_range = FALSE;
3571#endif
2b9d42f0
NIS
3572 }
3573 else {
3574 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3575 }
02aa26ce
NT
3576 } /* while loop to process each character */
3577
3578 /* terminate the string and set up the sv */
79072805 3579 *d = '\0';
95a20fc0 3580 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3581 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3582 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3583 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3584
79072805 3585 SvPOK_on(sv);
9f4817db 3586 if (PL_encoding && !has_utf8) {
d0063567
DK
3587 sv_recode_to_utf8(sv, PL_encoding);
3588 if (SvUTF8(sv))
3589 has_utf8 = TRUE;
9f4817db 3590 }
2b9d42f0 3591 if (has_utf8) {
7e2040f0 3592 SvUTF8_on(sv);
2b9d42f0 3593 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3594 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3595 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3596 }
3597 }
79072805 3598
02aa26ce 3599 /* shrink the sv if we allocated more than we used */
79072805 3600 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3601 SvPV_shrink_to_cur(sv);
79072805 3602 }
02aa26ce 3603
6154021b 3604 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3605 if (s > PL_bufptr) {
eb0d8d16
NC
3606 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3607 const char *const key = PL_lex_inpat ? "qr" : "q";
3608 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3609 const char *type;
3610 STRLEN typelen;
3611
3612 if (PL_lex_inwhat == OP_TRANS) {
3613 type = "tr";
3614 typelen = 2;
3615 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3616 type = "s";
3617 typelen = 1;
9da1dd8f
DM
3618 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3619 type = "q";
3620 typelen = 1;
eb0d8d16
NC
3621 } else {
3622 type = "qq";
3623 typelen = 2;
3624 }
3625
3626 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3627 type, typelen);
3628 }
6154021b 3629 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3630 } else
8990e307 3631 SvREFCNT_dec(sv);
79072805
LW
3632 return s;
3633}
3634
ffb4593c
NT
3635/* S_intuit_more
3636 * Returns TRUE if there's more to the expression (e.g., a subscript),
3637 * FALSE otherwise.
ffb4593c
NT
3638 *
3639 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3640 *
3641 * ->[ and ->{ return TRUE
3642 * { and [ outside a pattern are always subscripts, so return TRUE
3643 * if we're outside a pattern and it's not { or [, then return FALSE
3644 * if we're in a pattern and the first char is a {
3645 * {4,5} (any digits around the comma) returns FALSE
3646 * if we're in a pattern and the first char is a [
3647 * [] returns FALSE
3648 * [SOMETHING] has a funky algorithm to decide whether it's a
3649 * character class or not. It has to deal with things like
3650 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3651 * anything else returns TRUE
3652 */
3653
9cbb5ea2
GS
3654/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3655
76e3520e 3656STATIC int
cea2e8a9 3657S_intuit_more(pTHX_ register char *s)
79072805 3658{
97aff369 3659 dVAR;
7918f24d
NC
3660
3661 PERL_ARGS_ASSERT_INTUIT_MORE;
3662
3280af22 3663 if (PL_lex_brackets)
79072805
LW
3664 return TRUE;
3665 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3666 return TRUE;
3667 if (*s != '{' && *s != '[')
3668 return FALSE;
3280af22 3669 if (!PL_lex_inpat)
79072805
LW
3670 return TRUE;
3671
3672 /* In a pattern, so maybe we have {n,m}. */
3673 if (*s == '{') {
b3155d95 3674 if (regcurly(s)) {
79072805 3675 return FALSE;
b3155d95 3676 }
79072805 3677 return TRUE;
79072805
LW
3678 }
3679
3680 /* On the other hand, maybe we have a character class */
3681
3682 s++;
3683 if (*s == ']' || *s == '^')
3684 return FALSE;
3685 else {
ffb4593c 3686 /* this is terrifying, and it works */
79072805
LW
3687 int weight = 2; /* let's weigh the evidence */
3688 char seen[256];
f27ffc4a 3689 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3690 const char * const send = strchr(s,']');
3280af22 3691 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3692
3693 if (!send) /* has to be an expression */
3694 return TRUE;
3695
3696 Zero(seen,256,char);
3697 if (*s == '$')
3698 weight -= 3;
3699 else if (isDIGIT(*s)) {
3700 if (s[1] != ']') {
3701 if (isDIGIT(s[1]) && s[2] == ']')
3702 weight -= 10;
3703 }
3704 else
3705 weight -= 100;
3706 }
3707 for (; s < send; s++) {
3708 last_un_char = un_char;
3709 un_char = (unsigned char)*s;
3710 switch (*s) {
3711 case '@':
3712 case '&':
3713 case '$':
3714 weight -= seen[un_char] * 10;
7e2040f0 3715 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3716 int len;
8903cb82 3717 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3718 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3719 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3720 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3721 weight -= 100;
3722 else
3723 weight -= 10;
3724 }
3725 else if (*s == '$' && s[1] &&
93a17b20
LW
3726 strchr("[#!%*<>()-=",s[1])) {
3727 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3728 weight -= 10;
3729 else
3730 weight -= 1;
3731 }
3732 break;
3733 case '\\':
3734 un_char = 254;
3735 if (s[1]) {
93a17b20 3736 if (strchr("wds]",s[1]))
79072805 3737 weight += 100;
10edeb5d 3738 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3739 weight += 1;
93a17b20 3740 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3741 weight += 40;
3742 else if (isDIGIT(s[1])) {
3743 weight += 40;
3744 while (s[1] && isDIGIT(s[1]))
3745