This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c:scan_heredoc: Merge two adjacent #ifdefs
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805 150 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
151 *
152 * These values refer to the various states within a sublex parse,
153 * i.e. within a double quotish string
79072805
LW
154 */
155
fb73857a 156/* #define LEX_NOTPARSING 11 is done in perl.h. */
157
b6007c36
DM
158#define LEX_NORMAL 10 /* normal code (ie not within "...") */
159#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
160#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
161#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
162#define LEX_INTERPSTART 6 /* expecting the start of a $var */
163
164 /* at end of code, eg "$x" followed by: */
165#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
166#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
167
168#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
169 string or after \E, $foo, etc */
170#define LEX_INTERPCONST 2 /* NOT USED */
171#define LEX_FORMLINE 1 /* expecting a format line */
172#define LEX_KNOWNEXT 0 /* next token known; just return it */
173
79072805 174
bbf60fe6 175#ifdef DEBUGGING
27da23d5 176static const char* const lex_state_names[] = {
bbf60fe6
DM
177 "KNOWNEXT",
178 "FORMLINE",
179 "INTERPCONST",
180 "INTERPCONCAT",
181 "INTERPENDMAYBE",
182 "INTERPEND",
183 "INTERPSTART",
184 "INTERPPUSH",
185 "INTERPCASEMOD",
186 "INTERPNORMAL",
187 "NORMAL"
188};
189#endif
190
79072805
LW
191#ifdef ff_next
192#undef ff_next
d48672a2
LW
193#endif
194
79072805 195#include "keywords.h"
fe14fcc3 196
ffb4593c
NT
197/* CLINE is a macro that ensures PL_copline has a sane value */
198
ae986130
LW
199#ifdef CLINE
200#undef CLINE
201#endif
57843af0 202#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 203
5db06880 204#ifdef PERL_MAD
29595ff2
NC
205# define SKIPSPACE0(s) skipspace0(s)
206# define SKIPSPACE1(s) skipspace1(s)
207# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
208# define PEEKSPACE(s) skipspace2(s,0)
209#else
210# define SKIPSPACE0(s) skipspace(s)
211# define SKIPSPACE1(s) skipspace(s)
212# define SKIPSPACE2(s,tsv) skipspace(s)
213# define PEEKSPACE(s) skipspace(s)
214#endif
215
ffb4593c
NT
216/*
217 * Convenience functions to return different tokens and prime the
9cbb5ea2 218 * lexer for the next token. They all take an argument.
ffb4593c
NT
219 *
220 * TOKEN : generic token (used for '(', DOLSHARP, etc)
221 * OPERATOR : generic operator
222 * AOPERATOR : assignment operator
223 * PREBLOCK : beginning the block after an if, while, foreach, ...
224 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
225 * PREREF : *EXPR where EXPR is not a simple identifier
226 * TERM : expression term
227 * LOOPX : loop exiting command (goto, last, dump, etc)
228 * FTST : file test operator
229 * FUN0 : zero-argument function
7eb971ee 230 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 231 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
232 * BOop : bitwise or or xor
233 * BAop : bitwise and
234 * SHop : shift operator
235 * PWop : power operator
9cbb5ea2 236 * PMop : pattern-matching operator
ffb4593c
NT
237 * Aop : addition-level operator
238 * Mop : multiplication-level operator
239 * Eop : equality-testing operator
e5edeb50 240 * Rop : relational operator <= != gt
ffb4593c
NT
241 *
242 * Also see LOP and lop() below.
243 */
244
998054bd 245#ifdef DEBUGGING /* Serve -DT. */
704d4215 246# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 247#else
bbf60fe6 248# define REPORT(retval) (retval)
998054bd
SC
249#endif
250
bbf60fe6
DM
251#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
252#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
253#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
254#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
255#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
256#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
257#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
258#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
259#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
260#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 261#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
262#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
263#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
264#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
265#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
266#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
267#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
268#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
269#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
270#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
271#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 272
a687059c
LW
273/* This bit of chicanery makes a unary function followed by
274 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
275 * The UNIDOR macro is for unary functions that can be followed by the //
276 * operator (such as C<shift // 0>).
a687059c 277 */
d68ce4ac 278#define UNI3(f,x,have_x) { \
6154021b 279 pl_yylval.ival = f; \
d68ce4ac 280 if (have_x) PL_expect = x; \
376fcdbf
AL
281 PL_bufptr = s; \
282 PL_last_uni = PL_oldbufptr; \
283 PL_last_lop_op = f; \
284 if (*s == '(') \
285 return REPORT( (int)FUNC1 ); \
29595ff2 286 s = PEEKSPACE(s); \
376fcdbf
AL
287 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
288 }
d68ce4ac
FC
289#define UNI(f) UNI3(f,XTERM,1)
290#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
291#define UNIPROTO(f,optional) { \
292 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
293 OPERATOR(f); \
294 }
a687059c 295
d68ce4ac 296#define UNIBRACK(f) UNI3(f,0,0)
79072805 297
9f68db38 298/* grandfather return to old style */
78cdf107
Z
299#define OLDLOP(f) \
300 do { \
301 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
302 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
303 pl_yylval.ival = (f); \
304 PL_expect = XTERM; \
305 PL_bufptr = s; \
306 return (int)LSTOP; \
307 } while(0)
79072805 308
83944c01
FC
309#define COPLINE_INC_WITH_HERELINES \
310 STMT_START { \
311 CopLINE_inc(PL_curcop); \
d794b522
FC
312 if (PL_parser->lex_shared->herelines) \
313 CopLINE(PL_curcop) += PL_parser->lex_shared->herelines, \
314 PL_parser->lex_shared->herelines = 0; \
83944c01
FC
315 } STMT_END
316
317
8fa7f367
JH
318#ifdef DEBUGGING
319
6154021b 320/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
321enum token_type {
322 TOKENTYPE_NONE,
323 TOKENTYPE_IVAL,
6154021b 324 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 325 TOKENTYPE_PVAL,
aeaef349 326 TOKENTYPE_OPVAL
bbf60fe6
DM
327};
328
6d4a66ac
NC
329static struct debug_tokens {
330 const int token;
331 enum token_type type;
332 const char *name;
333} const debug_tokens[] =
9041c2e3 334{
bbf60fe6
DM
335 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
336 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
337 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
338 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
339 { ARROW, TOKENTYPE_NONE, "ARROW" },
340 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
341 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
342 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
343 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
344 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 345 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
346 { DO, TOKENTYPE_NONE, "DO" },
347 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
348 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
349 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
350 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
351 { ELSE, TOKENTYPE_NONE, "ELSE" },
352 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
353 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
354 { FOR, TOKENTYPE_IVAL, "FOR" },
355 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
356 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
357 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
358 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
359 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 360 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
361 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
362 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
363 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 364 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
365 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
366 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 367 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
368 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
369 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
370 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
371 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
372 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
373 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
374 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
375 { MY, TOKENTYPE_IVAL, "MY" },
376 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
377 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
378 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
379 { OROP, TOKENTYPE_IVAL, "OROP" },
380 { OROR, TOKENTYPE_NONE, "OROR" },
381 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 382 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
383 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
384 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
385 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
386 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
387 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
388 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
389 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
390 { PREINC, TOKENTYPE_NONE, "PREINC" },
391 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 392 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
393 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
394 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 395 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
396 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
397 { SUB, TOKENTYPE_NONE, "SUB" },
398 { THING, TOKENTYPE_OPVAL, "THING" },
399 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
400 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
401 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
402 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
403 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
404 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 405 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
406 { WHILE, TOKENTYPE_IVAL, "WHILE" },
407 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 408 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 409 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
410};
411
6154021b 412/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 413
bbf60fe6 414STATIC int
704d4215 415S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 416{
97aff369 417 dVAR;
7918f24d
NC
418
419 PERL_ARGS_ASSERT_TOKEREPORT;
420
bbf60fe6 421 if (DEBUG_T_TEST) {
bd61b366 422 const char *name = NULL;
bbf60fe6 423 enum token_type type = TOKENTYPE_NONE;
f54cb97a 424 const struct debug_tokens *p;
396482e1 425 SV* const report = newSVpvs("<== ");
bbf60fe6 426
f54cb97a 427 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
428 if (p->token == (int)rv) {
429 name = p->name;
430 type = p->type;
431 break;
432 }
433 }
434 if (name)
54667de8 435 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
436 else if ((char)rv > ' ' && (char)rv < '~')
437 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
438 else if (!rv)
396482e1 439 sv_catpvs(report, "EOF");
bbf60fe6
DM
440 else
441 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
442 switch (type) {
443 case TOKENTYPE_NONE:
bbf60fe6
DM
444 break;
445 case TOKENTYPE_IVAL:
704d4215 446 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_OPNUM:
449 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 450 PL_op_name[lvalp->ival]);
bbf60fe6
DM
451 break;
452 case TOKENTYPE_PVAL:
704d4215 453 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
454 break;
455 case TOKENTYPE_OPVAL:
704d4215 456 if (lvalp->opval) {
401441c0 457 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
458 PL_op_name[lvalp->opval->op_type]);
459 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 460 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 461 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
462 }
463
464 }
401441c0 465 else
396482e1 466 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
467 break;
468 }
b6007c36 469 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
470 };
471 return (int)rv;
998054bd
SC
472}
473
b6007c36
DM
474
475/* print the buffer with suitable escapes */
476
477STATIC void
15f169a1 478S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 479{
396482e1 480 SV* const tmp = newSVpvs("");
7918f24d
NC
481
482 PERL_ARGS_ASSERT_PRINTBUF;
483
b6007c36
DM
484 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
485 SvREFCNT_dec(tmp);
486}
487
8fa7f367
JH
488#endif
489
8290c323
NC
490static int
491S_deprecate_commaless_var_list(pTHX) {
492 PL_expect = XTERM;
493 deprecate("comma-less variable list");
494 return REPORT(','); /* grandfather non-comma-format format */
495}
496
ffb4593c
NT
497/*
498 * S_ao
499 *
c963b151
BD
500 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
501 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
502 */
503
76e3520e 504STATIC int
cea2e8a9 505S_ao(pTHX_ int toketype)
a0d0e21e 506{
97aff369 507 dVAR;
3280af22
NIS
508 if (*PL_bufptr == '=') {
509 PL_bufptr++;
a0d0e21e 510 if (toketype == ANDAND)
6154021b 511 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 512 else if (toketype == OROR)
6154021b 513 pl_yylval.ival = OP_ORASSIGN;
c963b151 514 else if (toketype == DORDOR)
6154021b 515 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
516 toketype = ASSIGNOP;
517 }
518 return toketype;
519}
520
ffb4593c
NT
521/*
522 * S_no_op
523 * When Perl expects an operator and finds something else, no_op
524 * prints the warning. It always prints "<something> found where
525 * operator expected. It prints "Missing semicolon on previous line?"
526 * if the surprise occurs at the start of the line. "do you need to
527 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
528 * where the compiler doesn't know if foo is a method call or a function.
529 * It prints "Missing operator before end of line" if there's nothing
530 * after the missing operator, or "... before <...>" if there is something
531 * after the missing operator.
532 */
533
76e3520e 534STATIC void
15f169a1 535S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 536{
97aff369 537 dVAR;
9d4ba2ae
AL
538 char * const oldbp = PL_bufptr;
539 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 540
7918f24d
NC
541 PERL_ARGS_ASSERT_NO_OP;
542
1189a94a
GS
543 if (!s)
544 s = oldbp;
07c798fb 545 else
1189a94a 546 PL_bufptr = s;
734ab321 547 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
548 if (ckWARN_d(WARN_SYNTAX)) {
549 if (is_first)
550 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551 "\t(Missing semicolon on previous line?)\n");
552 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 553 const char *t;
734ab321
BF
554 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
555 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 556 NOOP;
56da5a46
RGS
557 if (t < PL_bufptr && isSPACE(*t))
558 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
559 "\t(Do you need to predeclare %"SVf"?)\n",
560 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
561 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
562 }
563 else {
564 assert(s >= oldbp);
565 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
566 "\t(Missing operator before %"SVf"?)\n",
567 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
568 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 569 }
07c798fb 570 }
3280af22 571 PL_bufptr = oldbp;
8990e307
LW
572}
573
ffb4593c
NT
574/*
575 * S_missingterm
576 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 577 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
578 * If we're in a delimited string and the delimiter is a control
579 * character, it's reformatted into a two-char sequence like ^C.
580 * This is fatal.
581 */
582
76e3520e 583STATIC void
cea2e8a9 584S_missingterm(pTHX_ char *s)
8990e307 585{
97aff369 586 dVAR;
8990e307
LW
587 char tmpbuf[3];
588 char q;
589 if (s) {
9d4ba2ae 590 char * const nl = strrchr(s,'\n');
d2719217 591 if (nl)
8990e307
LW
592 *nl = '\0';
593 }
463559e7 594 else if (isCNTRL(PL_multi_close)) {
8990e307 595 *tmpbuf = '^';
585ec06d 596 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
597 tmpbuf[2] = '\0';
598 s = tmpbuf;
599 }
600 else {
eb160463 601 *tmpbuf = (char)PL_multi_close;
8990e307
LW
602 tmpbuf[1] = '\0';
603 s = tmpbuf;
604 }
605 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 606 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 607}
79072805 608
dd0ac2b9
FC
609#include "feature.h"
610
0d863452 611/*
0d863452
RH
612 * Check whether the named feature is enabled.
613 */
26ea9e12 614bool
3fff3427 615Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 616{
97aff369 617 dVAR;
4a731d7b 618 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
619
620 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
621
622 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 623
26ea9e12
NC
624 if (namelen > MAX_FEATURE_LEN)
625 return FALSE;
3fff3427 626 memcpy(&he_name[8], name, namelen);
7d69d4a6 627
c8ca97b0
NC
628 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
629 REFCOUNTED_HE_EXISTS));
0d863452
RH
630}
631
ffb4593c 632/*
9cbb5ea2
GS
633 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
634 * utf16-to-utf8-reversed.
ffb4593c
NT
635 */
636
c39cd008
GS
637#ifdef PERL_CR_FILTER
638static void
639strip_return(SV *sv)
640{
eb578fdb
KW
641 const char *s = SvPVX_const(sv);
642 const char * const e = s + SvCUR(sv);
7918f24d
NC
643
644 PERL_ARGS_ASSERT_STRIP_RETURN;
645
c39cd008
GS
646 /* outer loop optimized to do nothing if there are no CR-LFs */
647 while (s < e) {
648 if (*s++ == '\r' && *s == '\n') {
649 /* hit a CR-LF, need to copy the rest */
eb578fdb 650 char *d = s - 1;
c39cd008
GS
651 *d++ = *s++;
652 while (s < e) {
653 if (*s == '\r' && s[1] == '\n')
654 s++;
655 *d++ = *s++;
656 }
657 SvCUR(sv) -= s - d;
658 return;
659 }
660 }
661}
a868473f 662
76e3520e 663STATIC I32
c39cd008 664S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 665{
f54cb97a 666 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
667 if (count > 0 && !maxlen)
668 strip_return(sv);
669 return count;
a868473f
NIS
670}
671#endif
672
ffb4593c 673/*
8eaa0acf
Z
674=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
675
676Creates and initialises a new lexer/parser state object, supplying
677a context in which to lex and parse from a new source of Perl code.
678A pointer to the new state object is placed in L</PL_parser>. An entry
679is made on the save stack so that upon unwinding the new state object
680will be destroyed and the former value of L</PL_parser> will be restored.
681Nothing else need be done to clean up the parsing context.
682
683The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
684non-null, provides a string (in SV form) containing code to be parsed.
685A copy of the string is made, so subsequent modification of I<line>
686does not affect parsing. I<rsfp>, if non-null, provides an input stream
687from which code will be read to be parsed. If both are non-null, the
688code in I<line> comes first and must consist of complete lines of input,
689and I<rsfp> supplies the remainder of the source.
690
e368b3bd
FC
691The I<flags> parameter is reserved for future use. Currently it is only
692used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
693
694=cut
695*/
ffb4593c 696
27fcb6ee 697/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
698 can share filters with the current parser.
699 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
700 caller, hence isn't owned by the parser, so shouldn't be closed on parser
701 destruction. This is used to handle the case of defaulting to reading the
702 script from the standard input because no filename was given on the command
703 line (without getting confused by situation where STDIN has been closed, so
704 the script handle is opened on fd 0) */
27fcb6ee 705
a0d0e21e 706void
8eaa0acf 707Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 708{
97aff369 709 dVAR;
6ef55633 710 const char *s = NULL;
5486870f 711 yy_parser *parser, *oparser;
60d63348 712 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 713 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
714
715 /* create and initialise a parser */
716
199e78b7 717 Newxz(parser, 1, yy_parser);
5486870f 718 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
719 PL_parser = parser;
720
28ac2b49
Z
721 parser->stack = NULL;
722 parser->ps = NULL;
723 parser->stack_size = 0;
acdf0a21 724
e3abe207
DM
725 /* on scope exit, free this parser and restore any outer one */
726 SAVEPARSER(parser);
7c4baf47 727 parser->saved_curcop = PL_curcop;
e3abe207 728
acdf0a21 729 /* initialise lexer state */
8990e307 730
fb205e7a
DM
731#ifdef PERL_MAD
732 parser->curforce = -1;
733#else
734 parser->nexttoke = 0;
735#endif
ca4cfd28 736 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 737 parser->copline = NOLINE;
5afb0a62 738 parser->lex_state = LEX_NORMAL;
c2598295 739 parser->expect = XSTATE;
2f9285f8 740 parser->rsfp = rsfp;
27fcb6ee
FC
741 parser->rsfp_filters =
742 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
743 ? NULL
744 : MUTABLE_AV(SvREFCNT_inc(
745 oparser->rsfp_filters
746 ? oparser->rsfp_filters
747 : (oparser->rsfp_filters = newAV())
748 ));
2f9285f8 749
199e78b7
DM
750 Newx(parser->lex_brackstack, 120, char);
751 Newx(parser->lex_casestack, 12, char);
752 *parser->lex_casestack = '\0';
d794b522 753 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 754
10efb74f 755 if (line) {
0528fd32 756 STRLEN len;
10efb74f 757 s = SvPV_const(line, len);
0abcdfa4
FC
758 parser->linestr = flags & LEX_START_COPIED
759 ? SvREFCNT_inc_simple_NN(line)
760 : newSVpvn_flags(s, len, SvUTF8(line));
11076590 761 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
762 } else {
763 parser->linestr = newSVpvs("\n;");
8990e307 764 }
f06b5848
DM
765 parser->oldoldbufptr =
766 parser->oldbufptr =
767 parser->bufptr =
768 parser->linestart = SvPVX(parser->linestr);
769 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
770 parser->last_lop = parser->last_uni = NULL;
87606032
NC
771 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
772 |LEX_DONT_CLOSE_RSFP);
737c24fc 773
60d63348 774 parser->in_pod = parser->filtered = 0;
79072805 775}
a687059c 776
e3abe207
DM
777
778/* delete a parser object */
779
780void
781Perl_parser_free(pTHX_ const yy_parser *parser)
782{
7918f24d
NC
783 PERL_ARGS_ASSERT_PARSER_FREE;
784
7c4baf47 785 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
786 SvREFCNT_dec(parser->linestr);
787
87606032 788 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 789 PerlIO_clearerr(parser->rsfp);
799361c3
SH
790 else if (parser->rsfp && (!parser->old_parser ||
791 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 792 PerlIO_close(parser->rsfp);
5486870f 793 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 794
e3abe207
DM
795 Safefree(parser->lex_brackstack);
796 Safefree(parser->lex_casestack);
d794b522 797 Safefree(parser->lex_shared);
e3abe207
DM
798 PL_parser = parser->old_parser;
799 Safefree(parser);
800}
801
802
ffb4593c 803/*
f0e67a1d
Z
804=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
805
806Buffer scalar containing the chunk currently under consideration of the
807text currently being lexed. This is always a plain string scalar (for
808which C<SvPOK> is true). It is not intended to be used as a scalar by
809normal scalar means; instead refer to the buffer directly by the pointer
810variables described below.
811
812The lexer maintains various C<char*> pointers to things in the
813C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
814reallocated, all of these pointers must be updated. Don't attempt to
815do this manually, but rather use L</lex_grow_linestr> if you need to
816reallocate the buffer.
817
818The content of the text chunk in the buffer is commonly exactly one
819complete line of input, up to and including a newline terminator,
820but there are situations where it is otherwise. The octets of the
821buffer may be intended to be interpreted as either UTF-8 or Latin-1.
822The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
823flag on this scalar, which may disagree with it.
824
825For direct examination of the buffer, the variable
826L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
827lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
828of these pointers is usually preferable to examination of the scalar
829through normal scalar means.
830
831=for apidoc AmxU|char *|PL_parser-E<gt>bufend
832
833Direct pointer to the end of the chunk of text currently being lexed, the
834end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
835+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
836always located at the end of the buffer, and does not count as part of
837the buffer's contents.
838
839=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
840
841Points to the current position of lexing inside the lexer buffer.
842Characters around this point may be freely examined, within
843the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
844L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
845interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
846
847Lexing code (whether in the Perl core or not) moves this pointer past
848the characters that it consumes. It is also expected to perform some
849bookkeeping whenever a newline character is consumed. This movement
850can be more conveniently performed by the function L</lex_read_to>,
851which handles newlines appropriately.
852
853Interpretation of the buffer's octets can be abstracted out by
854using the slightly higher-level functions L</lex_peek_unichar> and
855L</lex_read_unichar>.
856
857=for apidoc AmxU|char *|PL_parser-E<gt>linestart
858
859Points to the start of the current line inside the lexer buffer.
860This is useful for indicating at which column an error occurred, and
861not much else. This must be updated by any lexing code that consumes
862a newline; the function L</lex_read_to> handles this detail.
863
864=cut
865*/
866
867/*
868=for apidoc Amx|bool|lex_bufutf8
869
870Indicates whether the octets in the lexer buffer
871(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
872of Unicode characters. If not, they should be interpreted as Latin-1
873characters. This is analogous to the C<SvUTF8> flag for scalars.
874
875In UTF-8 mode, it is not guaranteed that the lexer buffer actually
876contains valid UTF-8. Lexing code must be robust in the face of invalid
877encoding.
878
879The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
880is significant, but not the whole story regarding the input character
881encoding. Normally, when a file is being read, the scalar contains octets
882and its C<SvUTF8> flag is off, but the octets should be interpreted as
883UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
884however, the scalar may have the C<SvUTF8> flag on, and in this case its
885octets should be interpreted as UTF-8 unless the C<use bytes> pragma
886is in effect. This logic may change in the future; use this function
887instead of implementing the logic yourself.
888
889=cut
890*/
891
892bool
893Perl_lex_bufutf8(pTHX)
894{
895 return UTF;
896}
897
898/*
899=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
900
901Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
902at least I<len> octets (including terminating NUL). Returns a
903pointer to the reallocated buffer. This is necessary before making
904any direct modification of the buffer that would increase its length.
905L</lex_stuff_pvn> provides a more convenient way to insert text into
906the buffer.
907
908Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
909this function updates all of the lexer's variables that point directly
910into the buffer.
911
912=cut
913*/
914
915char *
916Perl_lex_grow_linestr(pTHX_ STRLEN len)
917{
918 SV *linestr;
919 char *buf;
920 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 921 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
922 linestr = PL_parser->linestr;
923 buf = SvPVX(linestr);
924 if (len <= SvLEN(linestr))
925 return buf;
926 bufend_pos = PL_parser->bufend - buf;
927 bufptr_pos = PL_parser->bufptr - buf;
928 oldbufptr_pos = PL_parser->oldbufptr - buf;
929 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
930 linestart_pos = PL_parser->linestart - buf;
931 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
932 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
933 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
934 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 935
f0e67a1d 936 buf = sv_grow(linestr, len);
c7641931 937
f0e67a1d
Z
938 PL_parser->bufend = buf + bufend_pos;
939 PL_parser->bufptr = buf + bufptr_pos;
940 PL_parser->oldbufptr = buf + oldbufptr_pos;
941 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
942 PL_parser->linestart = buf + linestart_pos;
943 if (PL_parser->last_uni)
944 PL_parser->last_uni = buf + last_uni_pos;
945 if (PL_parser->last_lop)
946 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
947 if (PL_parser->lex_shared->re_eval_start)
948 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
949 return buf;
950}
951
952/*
83aa740e 953=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
954
955Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
956immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
957reallocating the buffer if necessary. This means that lexing code that
958runs later will see the characters as if they had appeared in the input.
959It is not recommended to do this as part of normal parsing, and most
960uses of this facility run the risk of the inserted characters being
961interpreted in an unintended manner.
962
963The string to be inserted is represented by I<len> octets starting
964at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
965according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
966The characters are recoded for the lexer buffer, according to how the
967buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 968to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
969function is more convenient.
970
971=cut
972*/
973
974void
83aa740e 975Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 976{
749123ff 977 dVAR;
f0e67a1d
Z
978 char *bufptr;
979 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
980 if (flags & ~(LEX_STUFF_UTF8))
981 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
982 if (UTF) {
983 if (flags & LEX_STUFF_UTF8) {
984 goto plain_copy;
985 } else {
986 STRLEN highhalf = 0;
83aa740e 987 const char *p, *e = pv+len;
f0e67a1d
Z
988 for (p = pv; p != e; p++)
989 highhalf += !!(((U8)*p) & 0x80);
990 if (!highhalf)
991 goto plain_copy;
992 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
993 bufptr = PL_parser->bufptr;
994 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
995 SvCUR_set(PL_parser->linestr,
996 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
997 PL_parser->bufend += len+highhalf;
998 for (p = pv; p != e; p++) {
999 U8 c = (U8)*p;
1000 if (c & 0x80) {
1001 *bufptr++ = (char)(0xc0 | (c >> 6));
1002 *bufptr++ = (char)(0x80 | (c & 0x3f));
1003 } else {
1004 *bufptr++ = (char)c;
1005 }
1006 }
1007 }
1008 } else {
1009 if (flags & LEX_STUFF_UTF8) {
1010 STRLEN highhalf = 0;
83aa740e 1011 const char *p, *e = pv+len;
f0e67a1d
Z
1012 for (p = pv; p != e; p++) {
1013 U8 c = (U8)*p;
1014 if (c >= 0xc4) {
1015 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1016 "non-Latin-1 character into Latin-1 input");
1017 } else if (c >= 0xc2 && p+1 != e &&
1018 (((U8)p[1]) & 0xc0) == 0x80) {
1019 p++;
1020 highhalf++;
1021 } else if (c >= 0x80) {
1022 /* malformed UTF-8 */
1023 ENTER;
1024 SAVESPTR(PL_warnhook);
1025 PL_warnhook = PERL_WARNHOOK_FATAL;
1026 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1027 LEAVE;
1028 }
1029 }
1030 if (!highhalf)
1031 goto plain_copy;
1032 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1033 bufptr = PL_parser->bufptr;
1034 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1035 SvCUR_set(PL_parser->linestr,
1036 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1037 PL_parser->bufend += len-highhalf;
1038 for (p = pv; p != e; p++) {
1039 U8 c = (U8)*p;
1040 if (c & 0x80) {
1041 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1042 p++;
1043 } else {
1044 *bufptr++ = (char)c;
1045 }
1046 }
1047 } else {
1048 plain_copy:
1049 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1050 bufptr = PL_parser->bufptr;
1051 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1052 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1053 PL_parser->bufend += len;
1054 Copy(pv, bufptr, len, char);
1055 }
1056 }
1057}
1058
1059/*
9dcc53ea
Z
1060=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1061
1062Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1063immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1064reallocating the buffer if necessary. This means that lexing code that
1065runs later will see the characters as if they had appeared in the input.
1066It is not recommended to do this as part of normal parsing, and most
1067uses of this facility run the risk of the inserted characters being
1068interpreted in an unintended manner.
1069
1070The string to be inserted is represented by octets starting at I<pv>
1071and continuing to the first nul. These octets are interpreted as either
1072UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1073in I<flags>. The characters are recoded for the lexer buffer, according
1074to how the buffer is currently being interpreted (L</lex_bufutf8>).
1075If it is not convenient to nul-terminate a string to be inserted, the
1076L</lex_stuff_pvn> function is more appropriate.
1077
1078=cut
1079*/
1080
1081void
1082Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1083{
1084 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1085 lex_stuff_pvn(pv, strlen(pv), flags);
1086}
1087
1088/*
f0e67a1d
Z
1089=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1090
1091Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1092immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1093reallocating the buffer if necessary. This means that lexing code that
1094runs later will see the characters as if they had appeared in the input.
1095It is not recommended to do this as part of normal parsing, and most
1096uses of this facility run the risk of the inserted characters being
1097interpreted in an unintended manner.
1098
1099The string to be inserted is the string value of I<sv>. The characters
1100are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1101being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1102not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1103need to construct a scalar.
1104
1105=cut
1106*/
1107
1108void
1109Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1110{
1111 char *pv;
1112 STRLEN len;
1113 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1114 if (flags)
1115 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1116 pv = SvPV(sv, len);
1117 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1118}
1119
1120/*
1121=for apidoc Amx|void|lex_unstuff|char *ptr
1122
1123Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1124I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1125This hides the discarded text from any lexing code that runs later,
1126as if the text had never appeared.
1127
1128This is not the normal way to consume lexed text. For that, use
1129L</lex_read_to>.
1130
1131=cut
1132*/
1133
1134void
1135Perl_lex_unstuff(pTHX_ char *ptr)
1136{
1137 char *buf, *bufend;
1138 STRLEN unstuff_len;
1139 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1140 buf = PL_parser->bufptr;
1141 if (ptr < buf)
1142 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1143 if (ptr == buf)
1144 return;
1145 bufend = PL_parser->bufend;
1146 if (ptr > bufend)
1147 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1148 unstuff_len = ptr - buf;
1149 Move(ptr, buf, bufend+1-ptr, char);
1150 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1151 PL_parser->bufend = bufend - unstuff_len;
1152}
1153
1154/*
1155=for apidoc Amx|void|lex_read_to|char *ptr
1156
1157Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1158to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1159performing the correct bookkeeping whenever a newline character is passed.
1160This is the normal way to consume lexed text.
1161
1162Interpretation of the buffer's octets can be abstracted out by
1163using the slightly higher-level functions L</lex_peek_unichar> and
1164L</lex_read_unichar>.
1165
1166=cut
1167*/
1168
1169void
1170Perl_lex_read_to(pTHX_ char *ptr)
1171{
1172 char *s;
1173 PERL_ARGS_ASSERT_LEX_READ_TO;
1174 s = PL_parser->bufptr;
1175 if (ptr < s || ptr > PL_parser->bufend)
1176 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1177 for (; s != ptr; s++)
1178 if (*s == '\n') {
83944c01 1179 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1180 PL_parser->linestart = s+1;
1181 }
1182 PL_parser->bufptr = ptr;
1183}
1184
1185/*
1186=for apidoc Amx|void|lex_discard_to|char *ptr
1187
1188Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1189up to I<ptr>. The remaining content of the buffer will be moved, and
1190all pointers into the buffer updated appropriately. I<ptr> must not
1191be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1192it is not permitted to discard text that has yet to be lexed.
1193
1194Normally it is not necessarily to do this directly, because it suffices to
1195use the implicit discarding behaviour of L</lex_next_chunk> and things
1196based on it. However, if a token stretches across multiple lines,
1f317c95 1197and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1198that purpose, then after completion of the token it would be wise to
1199explicitly discard the now-unneeded earlier lines, to avoid future
1200multi-line tokens growing the buffer without bound.
1201
1202=cut
1203*/
1204
1205void
1206Perl_lex_discard_to(pTHX_ char *ptr)
1207{
1208 char *buf;
1209 STRLEN discard_len;
1210 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1211 buf = SvPVX(PL_parser->linestr);
1212 if (ptr < buf)
1213 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1214 if (ptr == buf)
1215 return;
1216 if (ptr > PL_parser->bufptr)
1217 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1218 discard_len = ptr - buf;
1219 if (PL_parser->oldbufptr < ptr)
1220 PL_parser->oldbufptr = ptr;
1221 if (PL_parser->oldoldbufptr < ptr)
1222 PL_parser->oldoldbufptr = ptr;
1223 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1224 PL_parser->last_uni = NULL;
1225 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1226 PL_parser->last_lop = NULL;
1227 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1228 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1229 PL_parser->bufend -= discard_len;
1230 PL_parser->bufptr -= discard_len;
1231 PL_parser->oldbufptr -= discard_len;
1232 PL_parser->oldoldbufptr -= discard_len;
1233 if (PL_parser->last_uni)
1234 PL_parser->last_uni -= discard_len;
1235 if (PL_parser->last_lop)
1236 PL_parser->last_lop -= discard_len;
1237}
1238
1239/*
1240=for apidoc Amx|bool|lex_next_chunk|U32 flags
1241
1242Reads in the next chunk of text to be lexed, appending it to
1243L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1244looked to the end of the current chunk and wants to know more. It is
1245usual, but not necessary, for lexing to have consumed the entirety of
1246the current chunk at this time.
1247
1248If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1249chunk (i.e., the current chunk has been entirely consumed), normally the
1250current chunk will be discarded at the same time that the new chunk is
1251read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1252will not be discarded. If the current chunk has not been entirely
1253consumed, then it will not be discarded regardless of the flag.
1254
1255Returns true if some new text was added to the buffer, or false if the
1256buffer has reached the end of the input text.
1257
1258=cut
1259*/
1260
1261#define LEX_FAKE_EOF 0x80000000
112d1284 1262#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1263
1264bool
1265Perl_lex_next_chunk(pTHX_ U32 flags)
1266{
1267 SV *linestr;
1268 char *buf;
1269 STRLEN old_bufend_pos, new_bufend_pos;
1270 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1271 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1272 bool got_some_for_debugger = 0;
f0e67a1d 1273 bool got_some;
112d1284 1274 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1275 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1276 linestr = PL_parser->linestr;
1277 buf = SvPVX(linestr);
1278 if (!(flags & LEX_KEEP_PREVIOUS) &&
1279 PL_parser->bufptr == PL_parser->bufend) {
1280 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1281 linestart_pos = 0;
1282 if (PL_parser->last_uni != PL_parser->bufend)
1283 PL_parser->last_uni = NULL;
1284 if (PL_parser->last_lop != PL_parser->bufend)
1285 PL_parser->last_lop = NULL;
1286 last_uni_pos = last_lop_pos = 0;
1287 *buf = 0;
1288 SvCUR(linestr) = 0;
1289 } else {
1290 old_bufend_pos = PL_parser->bufend - buf;
1291 bufptr_pos = PL_parser->bufptr - buf;
1292 oldbufptr_pos = PL_parser->oldbufptr - buf;
1293 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1294 linestart_pos = PL_parser->linestart - buf;
1295 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1296 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1297 }
1298 if (flags & LEX_FAKE_EOF) {
1299 goto eof;
60d63348 1300 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1301 got_some = 0;
1302 } else if (filter_gets(linestr, old_bufend_pos)) {
1303 got_some = 1;
17cc9359 1304 got_some_for_debugger = 1;
112d1284
FC
1305 } else if (flags & LEX_NO_TERM) {
1306 got_some = 0;
f0e67a1d 1307 } else {
580561a3
Z
1308 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1309 sv_setpvs(linestr, "");
f0e67a1d
Z
1310 eof:
1311 /* End of real input. Close filehandle (unless it was STDIN),
1312 * then add implicit termination.
1313 */
87606032 1314 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1315 PerlIO_clearerr(PL_parser->rsfp);
1316 else if (PL_parser->rsfp)
1317 (void)PerlIO_close(PL_parser->rsfp);
1318 PL_parser->rsfp = NULL;
60d63348 1319 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1320#ifdef PERL_MAD
1321 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1322 PL_faketokens = 1;
1323#endif
1324 if (!PL_in_eval && PL_minus_p) {
1325 sv_catpvs(linestr,
1326 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1327 PL_minus_n = PL_minus_p = 0;
1328 } else if (!PL_in_eval && PL_minus_n) {
1329 sv_catpvs(linestr, /*{*/";}");
1330 PL_minus_n = 0;
1331 } else
1332 sv_catpvs(linestr, ";");
1333 got_some = 1;
1334 }
1335 buf = SvPVX(linestr);
1336 new_bufend_pos = SvCUR(linestr);
1337 PL_parser->bufend = buf + new_bufend_pos;
1338 PL_parser->bufptr = buf + bufptr_pos;
1339 PL_parser->oldbufptr = buf + oldbufptr_pos;
1340 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1341 PL_parser->linestart = buf + linestart_pos;
1342 if (PL_parser->last_uni)
1343 PL_parser->last_uni = buf + last_uni_pos;
1344 if (PL_parser->last_lop)
1345 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1346 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1347 PL_curstash != PL_debstash) {
1348 /* debugger active and we're not compiling the debugger code,
1349 * so store the line into the debugger's array of lines
1350 */
1351 update_debugger_info(NULL, buf+old_bufend_pos,
1352 new_bufend_pos-old_bufend_pos);
1353 }
1354 return got_some;
1355}
1356
1357/*
1358=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1359
1360Looks ahead one (Unicode) character in the text currently being lexed.
1361Returns the codepoint (unsigned integer value) of the next character,
1362or -1 if lexing has reached the end of the input text. To consume the
1363peeked character, use L</lex_read_unichar>.
1364
1365If the next character is in (or extends into) the next chunk of input
1366text, the next chunk will be read in. Normally the current chunk will be
1367discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1368then the current chunk will not be discarded.
1369
1370If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1371is encountered, an exception is generated.
1372
1373=cut
1374*/
1375
1376I32
1377Perl_lex_peek_unichar(pTHX_ U32 flags)
1378{
749123ff 1379 dVAR;
f0e67a1d
Z
1380 char *s, *bufend;
1381 if (flags & ~(LEX_KEEP_PREVIOUS))
1382 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1383 s = PL_parser->bufptr;
1384 bufend = PL_parser->bufend;
1385 if (UTF) {
1386 U8 head;
1387 I32 unichar;
1388 STRLEN len, retlen;
1389 if (s == bufend) {
1390 if (!lex_next_chunk(flags))
1391 return -1;
1392 s = PL_parser->bufptr;
1393 bufend = PL_parser->bufend;
1394 }
1395 head = (U8)*s;
1396 if (!(head & 0x80))
1397 return head;
1398 if (head & 0x40) {
1399 len = PL_utf8skip[head];
1400 while ((STRLEN)(bufend-s) < len) {
1401 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1402 break;
1403 s = PL_parser->bufptr;
1404 bufend = PL_parser->bufend;
1405 }
1406 }
1407 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1408 if (retlen == (STRLEN)-1) {
1409 /* malformed UTF-8 */
1410 ENTER;
1411 SAVESPTR(PL_warnhook);
1412 PL_warnhook = PERL_WARNHOOK_FATAL;
1413 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1414 LEAVE;
1415 }
1416 return unichar;
1417 } else {
1418 if (s == bufend) {
1419 if (!lex_next_chunk(flags))
1420 return -1;
1421 s = PL_parser->bufptr;
1422 }
1423 return (U8)*s;
1424 }
1425}
1426
1427/*
1428=for apidoc Amx|I32|lex_read_unichar|U32 flags
1429
1430Reads the next (Unicode) character in the text currently being lexed.
1431Returns the codepoint (unsigned integer value) of the character read,
1432and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1433if lexing has reached the end of the input text. To non-destructively
1434examine the next character, use L</lex_peek_unichar> instead.
1435
1436If the next character is in (or extends into) the next chunk of input
1437text, the next chunk will be read in. Normally the current chunk will be
1438discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1439then the current chunk will not be discarded.
1440
1441If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1442is encountered, an exception is generated.
1443
1444=cut
1445*/
1446
1447I32
1448Perl_lex_read_unichar(pTHX_ U32 flags)
1449{
1450 I32 c;
1451 if (flags & ~(LEX_KEEP_PREVIOUS))
1452 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1453 c = lex_peek_unichar(flags);
1454 if (c != -1) {
1455 if (c == '\n')
83944c01 1456 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1457 if (UTF)
1458 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1459 else
1460 ++(PL_parser->bufptr);
f0e67a1d
Z
1461 }
1462 return c;
1463}
1464
1465/*
1466=for apidoc Amx|void|lex_read_space|U32 flags
1467
1468Reads optional spaces, in Perl style, in the text currently being
1469lexed. The spaces may include ordinary whitespace characters and
1470Perl-style comments. C<#line> directives are processed if encountered.
1471L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1472at a non-space character (or the end of the input text).
1473
1474If spaces extend into the next chunk of input text, the next chunk will
1475be read in. Normally the current chunk will be discarded at the same
1476time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1477chunk will not be discarded.
1478
1479=cut
1480*/
1481
f0998909
Z
1482#define LEX_NO_NEXT_CHUNK 0x80000000
1483
f0e67a1d
Z
1484void
1485Perl_lex_read_space(pTHX_ U32 flags)
1486{
1487 char *s, *bufend;
1488 bool need_incline = 0;
f0998909 1489 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1490 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1491#ifdef PERL_MAD
1492 if (PL_skipwhite) {
1493 sv_free(PL_skipwhite);
1494 PL_skipwhite = NULL;
1495 }
1496 if (PL_madskills)
1497 PL_skipwhite = newSVpvs("");
1498#endif /* PERL_MAD */
1499 s = PL_parser->bufptr;
1500 bufend = PL_parser->bufend;
1501 while (1) {
1502 char c = *s;
1503 if (c == '#') {
1504 do {
1505 c = *++s;
1506 } while (!(c == '\n' || (c == 0 && s == bufend)));
1507 } else if (c == '\n') {
1508 s++;
1509 PL_parser->linestart = s;
1510 if (s == bufend)
1511 need_incline = 1;
1512 else
1513 incline(s);
1514 } else if (isSPACE(c)) {
1515 s++;
1516 } else if (c == 0 && s == bufend) {
1517 bool got_more;
1518#ifdef PERL_MAD
1519 if (PL_madskills)
1520 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1521#endif /* PERL_MAD */
f0998909
Z
1522 if (flags & LEX_NO_NEXT_CHUNK)
1523 break;
f0e67a1d 1524 PL_parser->bufptr = s;
83944c01 1525 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1526 got_more = lex_next_chunk(flags);
1527 CopLINE_dec(PL_curcop);
1528 s = PL_parser->bufptr;
1529 bufend = PL_parser->bufend;
1530 if (!got_more)
1531 break;
1532 if (need_incline && PL_parser->rsfp) {
1533 incline(s);
1534 need_incline = 0;
1535 }
1536 } else {
1537 break;
1538 }
1539 }
1540#ifdef PERL_MAD
1541 if (PL_madskills)
1542 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1543#endif /* PERL_MAD */
1544 PL_parser->bufptr = s;
1545}
1546
1547/*
ffb4593c
NT
1548 * S_incline
1549 * This subroutine has nothing to do with tilting, whether at windmills
1550 * or pinball tables. Its name is short for "increment line". It
57843af0 1551 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1552 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1553 * # line 500 "foo.pm"
1554 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1555 */
1556
76e3520e 1557STATIC void
d9095cec 1558S_incline(pTHX_ const char *s)
463ee0b2 1559{
97aff369 1560 dVAR;
d9095cec
NC
1561 const char *t;
1562 const char *n;
1563 const char *e;
8818d409 1564 line_t line_num;
463ee0b2 1565
7918f24d
NC
1566 PERL_ARGS_ASSERT_INCLINE;
1567
83944c01 1568 COPLINE_INC_WITH_HERELINES;
463ee0b2
LW
1569 if (*s++ != '#')
1570 return;
d4c19fe8
AL
1571 while (SPACE_OR_TAB(*s))
1572 s++;
73659bf1
GS
1573 if (strnEQ(s, "line", 4))
1574 s += 4;
1575 else
1576 return;
084592ab 1577 if (SPACE_OR_TAB(*s))
73659bf1 1578 s++;
4e553d73 1579 else
73659bf1 1580 return;
d4c19fe8
AL
1581 while (SPACE_OR_TAB(*s))
1582 s++;
463ee0b2
LW
1583 if (!isDIGIT(*s))
1584 return;
d4c19fe8 1585
463ee0b2
LW
1586 n = s;
1587 while (isDIGIT(*s))
1588 s++;
07714eb4 1589 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1590 return;
bf4acbe4 1591 while (SPACE_OR_TAB(*s))
463ee0b2 1592 s++;
73659bf1 1593 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1594 s++;
73659bf1
GS
1595 e = t + 1;
1596 }
463ee0b2 1597 else {
c35e046a
AL
1598 t = s;
1599 while (!isSPACE(*t))
1600 t++;
73659bf1 1601 e = t;
463ee0b2 1602 }
bf4acbe4 1603 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1604 e++;
1605 if (*e != '\n' && *e != '\0')
1606 return; /* false alarm */
1607
8818d409
FC
1608 line_num = atoi(n)-1;
1609
f4dd75d9 1610 if (t - s > 0) {
d9095cec 1611 const STRLEN len = t - s;
19bad673
NC
1612 SV *const temp_sv = CopFILESV(PL_curcop);
1613 const char *cf;
1614 STRLEN tmplen;
1615
1616 if (temp_sv) {
1617 cf = SvPVX(temp_sv);
1618 tmplen = SvCUR(temp_sv);
1619 } else {
1620 cf = NULL;
1621 tmplen = 0;
1622 }
1623
d1299d44 1624 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1625 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1626 * to *{"::_<newfilename"} */
44867030
NC
1627 /* However, the long form of evals is only turned on by the
1628 debugger - usually they're "(eval %lu)" */
1629 char smallbuf[128];
1630 char *tmpbuf;
1631 GV **gvp;
d9095cec 1632 STRLEN tmplen2 = len;
798b63bc 1633 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1634 tmpbuf = smallbuf;
1635 else
2ae0db35 1636 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1637 tmpbuf[0] = '_';
1638 tmpbuf[1] = '<';
2ae0db35 1639 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1640 tmplen += 2;
8a5ee598
RGS
1641 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1642 if (gvp) {
44867030
NC
1643 char *tmpbuf2;
1644 GV *gv2;
1645
1646 if (tmplen2 + 2 <= sizeof smallbuf)
1647 tmpbuf2 = smallbuf;
1648 else
1649 Newx(tmpbuf2, tmplen2 + 2, char);
1650
1651 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1652 /* Either they malloc'd it, or we malloc'd it,
1653 so no prefix is present in ours. */
1654 tmpbuf2[0] = '_';
1655 tmpbuf2[1] = '<';
1656 }
1657
1658 memcpy(tmpbuf2 + 2, s, tmplen2);
1659 tmplen2 += 2;
1660
8a5ee598 1661 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1662 if (!isGV(gv2)) {
8a5ee598 1663 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1664 /* adjust ${"::_<newfilename"} to store the new file name */
1665 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1666 /* The line number may differ. If that is the case,
1667 alias the saved lines that are in the array.
1668 Otherwise alias the whole array. */
1669 if (CopLINE(PL_curcop) == line_num) {
1670 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1671 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1672 }
1673 else if (GvAV(*gvp)) {
1674 AV * const av = GvAV(*gvp);
1675 const I32 start = CopLINE(PL_curcop)+1;
1676 I32 items = AvFILLp(av) - start;
1677 if (items > 0) {
1678 AV * const av2 = GvAVn(gv2);
1679 SV **svp = AvARRAY(av) + start;
1680 I32 l = (I32)line_num+1;
1681 while (items--)
1682 av_store(av2, l++, SvREFCNT_inc(*svp++));
1683 }
1684 }
e5527e4b 1685 }
44867030
NC
1686
1687 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1688 }
e66cf94c 1689 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1690 }
05ec9bb3 1691 CopFILE_free(PL_curcop);
d9095cec 1692 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1693 }
8818d409 1694 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1695}
1696
29595ff2 1697#ifdef PERL_MAD
cd81e915 1698/* skip space before PL_thistoken */
29595ff2
NC
1699
1700STATIC char *
1701S_skipspace0(pTHX_ register char *s)
1702{
7918f24d
NC
1703 PERL_ARGS_ASSERT_SKIPSPACE0;
1704
29595ff2
NC
1705 s = skipspace(s);
1706 if (!PL_madskills)
1707 return s;
cd81e915
NC
1708 if (PL_skipwhite) {
1709 if (!PL_thiswhite)
6b29d1f5 1710 PL_thiswhite = newSVpvs("");
cd81e915
NC
1711 sv_catsv(PL_thiswhite, PL_skipwhite);
1712 sv_free(PL_skipwhite);
1713 PL_skipwhite = 0;
1714 }
1715 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1716 return s;
1717}
1718
cd81e915 1719/* skip space after PL_thistoken */
29595ff2
NC
1720
1721STATIC char *
1722S_skipspace1(pTHX_ register char *s)
1723{
d4c19fe8 1724 const char *start = s;
29595ff2
NC
1725 I32 startoff = start - SvPVX(PL_linestr);
1726
7918f24d
NC
1727 PERL_ARGS_ASSERT_SKIPSPACE1;
1728
29595ff2
NC
1729 s = skipspace(s);
1730 if (!PL_madskills)
1731 return s;
1732 start = SvPVX(PL_linestr) + startoff;
cd81e915 1733 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1734 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1735 PL_thistoken = newSVpvn(tstart, start - tstart);
1736 }
1737 PL_realtokenstart = -1;
1738 if (PL_skipwhite) {
1739 if (!PL_nextwhite)
6b29d1f5 1740 PL_nextwhite = newSVpvs("");
cd81e915
NC
1741 sv_catsv(PL_nextwhite, PL_skipwhite);
1742 sv_free(PL_skipwhite);
1743 PL_skipwhite = 0;
29595ff2
NC
1744 }
1745 return s;
1746}
1747
1748STATIC char *
1749S_skipspace2(pTHX_ register char *s, SV **svp)
1750{
c35e046a
AL
1751 char *start;
1752 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1753 const I32 startoff = s - SvPVX(PL_linestr);
1754
7918f24d
NC
1755 PERL_ARGS_ASSERT_SKIPSPACE2;
1756
29595ff2
NC
1757 s = skipspace(s);
1758 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1759 if (!PL_madskills || !svp)
1760 return s;
1761 start = SvPVX(PL_linestr) + startoff;
cd81e915 1762 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1763 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1764 PL_thistoken = newSVpvn(tstart, start - tstart);
1765 PL_realtokenstart = -1;
29595ff2 1766 }
cd81e915 1767 if (PL_skipwhite) {
29595ff2 1768 if (!*svp)
6b29d1f5 1769 *svp = newSVpvs("");
cd81e915
NC
1770 sv_setsv(*svp, PL_skipwhite);
1771 sv_free(PL_skipwhite);
1772 PL_skipwhite = 0;
29595ff2
NC
1773 }
1774
1775 return s;
1776}
1777#endif
1778
80a702cd 1779STATIC void
15f169a1 1780S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1781{
1782 AV *av = CopFILEAVx(PL_curcop);
1783 if (av) {
b9f83d2f 1784 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1785 if (orig_sv)
1786 sv_setsv(sv, orig_sv);
1787 else
1788 sv_setpvn(sv, buf, len);
80a702cd
RGS
1789 (void)SvIOK_on(sv);
1790 SvIV_set(sv, 0);
1791 av_store(av, (I32)CopLINE(PL_curcop), sv);
1792 }
1793}
1794
ffb4593c
NT
1795/*
1796 * S_skipspace
1797 * Called to gobble the appropriate amount and type of whitespace.
1798 * Skips comments as well.
1799 */
1800
76e3520e 1801STATIC char *
cea2e8a9 1802S_skipspace(pTHX_ register char *s)
a687059c 1803{
5db06880 1804#ifdef PERL_MAD
f0e67a1d
Z
1805 char *start = s;
1806#endif /* PERL_MAD */
7918f24d 1807 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1808#ifdef PERL_MAD
cd81e915
NC
1809 if (PL_skipwhite) {
1810 sv_free(PL_skipwhite);
f0e67a1d 1811 PL_skipwhite = NULL;
5db06880 1812 }
f0e67a1d 1813#endif /* PERL_MAD */
3280af22 1814 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1815 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1816 s++;
f0e67a1d
Z
1817 } else {
1818 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1819 PL_bufptr = s;
f0998909
Z
1820 lex_read_space(LEX_KEEP_PREVIOUS |
1821 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1822 LEX_NO_NEXT_CHUNK : 0));
3280af22 1823 s = PL_bufptr;
f0e67a1d
Z
1824 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1825 if (PL_linestart > PL_bufptr)
1826 PL_bufptr = PL_linestart;
1827 return s;
463ee0b2 1828 }
5db06880 1829#ifdef PERL_MAD
f0e67a1d
Z
1830 if (PL_madskills)
1831 PL_skipwhite = newSVpvn(start, s-start);
1832#endif /* PERL_MAD */
5db06880 1833 return s;
a687059c 1834}
378cc40b 1835
ffb4593c
NT
1836/*
1837 * S_check_uni
1838 * Check the unary operators to ensure there's no ambiguity in how they're
1839 * used. An ambiguous piece of code would be:
1840 * rand + 5
1841 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1842 * the +5 is its argument.
1843 */
1844
76e3520e 1845STATIC void
cea2e8a9 1846S_check_uni(pTHX)
ba106d47 1847{
97aff369 1848 dVAR;
d4c19fe8
AL
1849 const char *s;
1850 const char *t;
2f3197b3 1851
3280af22 1852 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1853 return;
3280af22
NIS
1854 while (isSPACE(*PL_last_uni))
1855 PL_last_uni++;
c35e046a
AL
1856 s = PL_last_uni;
1857 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1858 s++;
3280af22 1859 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1860 return;
6136c704 1861
9b387841
NC
1862 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1863 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1864 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1865}
1866
ffb4593c
NT
1867/*
1868 * LOP : macro to build a list operator. Its behaviour has been replaced
1869 * with a subroutine, S_lop() for which LOP is just another name.
1870 */
1871
a0d0e21e
LW
1872#define LOP(f,x) return lop(f,x,s)
1873
ffb4593c
NT
1874/*
1875 * S_lop
1876 * Build a list operator (or something that might be one). The rules:
1877 * - if we have a next token, then it's a list operator [why?]
1878 * - if the next thing is an opening paren, then it's a function
1879 * - else it's a list operator
1880 */
1881
76e3520e 1882STATIC I32
a0be28da 1883S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1884{
97aff369 1885 dVAR;
7918f24d
NC
1886
1887 PERL_ARGS_ASSERT_LOP;
1888
6154021b 1889 pl_yylval.ival = f;
35c8bce7 1890 CLINE;
3280af22
NIS
1891 PL_expect = x;
1892 PL_bufptr = s;
1893 PL_last_lop = PL_oldbufptr;
eb160463 1894 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1895#ifdef PERL_MAD
1896 if (PL_lasttoke)
78cdf107 1897 goto lstop;
5db06880 1898#else
3280af22 1899 if (PL_nexttoke)
78cdf107 1900 goto lstop;
5db06880 1901#endif
79072805 1902 if (*s == '(')
bbf60fe6 1903 return REPORT(FUNC);
29595ff2 1904 s = PEEKSPACE(s);
79072805 1905 if (*s == '(')
bbf60fe6 1906 return REPORT(FUNC);
78cdf107
Z
1907 else {
1908 lstop:
1909 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1910 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1911 return REPORT(LSTOP);
78cdf107 1912 }
79072805
LW
1913}
1914
5db06880
NC
1915#ifdef PERL_MAD
1916 /*
1917 * S_start_force
1918 * Sets up for an eventual force_next(). start_force(0) basically does
1919 * an unshift, while start_force(-1) does a push. yylex removes items
1920 * on the "pop" end.
1921 */
1922
1923STATIC void
1924S_start_force(pTHX_ int where)
1925{
1926 int i;
1927
cd81e915 1928 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1929 where = PL_lasttoke;
cd81e915
NC
1930 assert(PL_curforce < 0 || PL_curforce == where);
1931 if (PL_curforce != where) {
5db06880
NC
1932 for (i = PL_lasttoke; i > where; --i) {
1933 PL_nexttoke[i] = PL_nexttoke[i-1];
1934 }
1935 PL_lasttoke++;
1936 }
cd81e915 1937 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1938 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1939 PL_curforce = where;
1940 if (PL_nextwhite) {
5db06880 1941 if (PL_madskills)
6b29d1f5 1942 curmad('^', newSVpvs(""));
cd81e915 1943 CURMAD('_', PL_nextwhite);
5db06880
NC
1944 }
1945}
1946
1947STATIC void
1948S_curmad(pTHX_ char slot, SV *sv)
1949{
1950 MADPROP **where;
1951
1952 if (!sv)
1953 return;
cd81e915
NC
1954 if (PL_curforce < 0)
1955 where = &PL_thismad;
5db06880 1956 else
cd81e915 1957 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1958
cd81e915 1959 if (PL_faketokens)
76f68e9b 1960 sv_setpvs(sv, "");
5db06880
NC
1961 else {
1962 if (!IN_BYTES) {
1963 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1964 SvUTF8_on(sv);
1965 else if (PL_encoding) {
1966 sv_recode_to_utf8(sv, PL_encoding);
1967 }
1968 }
1969 }
1970
1971 /* keep a slot open for the head of the list? */
1972 if (slot != '_' && *where && (*where)->mad_key == '^') {
1973 (*where)->mad_key = slot;
daba3364 1974 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1975 (*where)->mad_val = (void*)sv;
1976 }
1977 else
1978 addmad(newMADsv(slot, sv), where, 0);
1979}
1980#else
b3f24c00
MHM
1981# define start_force(where) NOOP
1982# define curmad(slot, sv) NOOP
5db06880
NC
1983#endif
1984
ffb4593c
NT
1985/*
1986 * S_force_next
9cbb5ea2 1987 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1988 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1989 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1990 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1991 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1992 */
1993
4e553d73 1994STATIC void
cea2e8a9 1995S_force_next(pTHX_ I32 type)
79072805 1996{
97aff369 1997 dVAR;
704d4215
GG
1998#ifdef DEBUGGING
1999 if (DEBUG_T_TEST) {
2000 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2001 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2002 }
2003#endif
6c7ae946
FC
2004 /* Don’t let opslab_force_free snatch it */
2005 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
2006 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
2007 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
2008 }
5db06880 2009#ifdef PERL_MAD
cd81e915 2010 if (PL_curforce < 0)
5db06880 2011 start_force(PL_lasttoke);
cd81e915 2012 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2013 if (PL_lex_state != LEX_KNOWNEXT)
2014 PL_lex_defer = PL_lex_state;
2015 PL_lex_state = LEX_KNOWNEXT;
2016 PL_lex_expect = PL_expect;
cd81e915 2017 PL_curforce = -1;
5db06880 2018#else
3280af22
NIS
2019 PL_nexttype[PL_nexttoke] = type;
2020 PL_nexttoke++;
2021 if (PL_lex_state != LEX_KNOWNEXT) {
2022 PL_lex_defer = PL_lex_state;
2023 PL_lex_expect = PL_expect;
2024 PL_lex_state = LEX_KNOWNEXT;
79072805 2025 }
5db06880 2026#endif
79072805
LW
2027}
2028
28ac2b49
Z
2029void
2030Perl_yyunlex(pTHX)
2031{
a7aaec61
Z
2032 int yyc = PL_parser->yychar;
2033 if (yyc != YYEMPTY) {
2034 if (yyc) {
2035 start_force(-1);
2036 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2037 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2038 PL_lex_allbrackets--;
a7aaec61 2039 PL_lex_brackets--;
78cdf107
Z
2040 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2041 } else if (yyc == '('/*)*/) {
2042 PL_lex_allbrackets--;
2043 yyc |= (2<<24);
a7aaec61
Z
2044 }
2045 force_next(yyc);
2046 }
28ac2b49
Z
2047 PL_parser->yychar = YYEMPTY;
2048 }
2049}
2050
d0a148a6 2051STATIC SV *
15f169a1 2052S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2053{
97aff369 2054 dVAR;
740cce10 2055 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2056 !IN_BYTES
2057 && UTF
2058 && !is_ascii_string((const U8*)start, len)
740cce10 2059 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2060 return sv;
2061}
2062
ffb4593c
NT
2063/*
2064 * S_force_word
2065 * When the lexer knows the next thing is a word (for instance, it has
2066 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2067 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2068 * lookahead.
ffb4593c
NT
2069 *
2070 * Arguments:
b1b65b59 2071 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2072 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2073 * int check_keyword : if true, Perl checks to make sure the word isn't
2074 * a keyword (do this if the word is a label, e.g. goto FOO)
2075 * int allow_pack : if true, : characters will also be allowed (require,
2076 * use, etc. do this)
9cbb5ea2 2077 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2078 */
2079
76e3520e 2080STATIC char *
cea2e8a9 2081S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2082{
97aff369 2083 dVAR;
eb578fdb 2084 char *s;
463ee0b2 2085 STRLEN len;
4e553d73 2086
7918f24d
NC
2087 PERL_ARGS_ASSERT_FORCE_WORD;
2088
29595ff2 2089 start = SKIPSPACE1(start);
463ee0b2 2090 s = start;
7e2040f0 2091 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2092 (allow_pack && *s == ':') ||
15f0808c 2093 (allow_initial_tick && *s == '\'') )
a0d0e21e 2094 {
3280af22 2095 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2096 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2097 return start;
cd81e915 2098 start_force(PL_curforce);
5db06880
NC
2099 if (PL_madskills)
2100 curmad('X', newSVpvn(start,s-start));
463ee0b2 2101 if (token == METHOD) {
29595ff2 2102 s = SKIPSPACE1(s);
463ee0b2 2103 if (*s == '(')
3280af22 2104 PL_expect = XTERM;
463ee0b2 2105 else {
3280af22 2106 PL_expect = XOPERATOR;
463ee0b2 2107 }
79072805 2108 }
e74e6b3d 2109 if (PL_madskills)
63575281 2110 curmad('g', newSVpvs( "forced" ));
9ded7720 2111 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2112 = (OP*)newSVOP(OP_CONST,0,
2113 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2114 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2115 force_next(token);
2116 }
2117 return s;
2118}
2119
ffb4593c
NT
2120/*
2121 * S_force_ident
9cbb5ea2 2122 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2123 * text only contains the "foo" portion. The first argument is a pointer
2124 * to the "foo", and the second argument is the type symbol to prefix.
2125 * Forces the next token to be a "WORD".
9cbb5ea2 2126 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2127 */
2128
76e3520e 2129STATIC void
bfed75c6 2130S_force_ident(pTHX_ register const char *s, int kind)
79072805 2131{
97aff369 2132 dVAR;
7918f24d
NC
2133
2134 PERL_ARGS_ASSERT_FORCE_IDENT;
2135
c35e046a 2136 if (*s) {
90e5519e 2137 const STRLEN len = strlen(s);
728847b1
BF
2138 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2139 UTF ? SVf_UTF8 : 0));
cd81e915 2140 start_force(PL_curforce);
9ded7720 2141 NEXTVAL_NEXTTOKE.opval = o;
79072805 2142 force_next(WORD);
748a9306 2143 if (kind) {
11343788 2144 o->op_private = OPpCONST_ENTERED;
55497cff 2145 /* XXX see note in pp_entereval() for why we forgo typo
2146 warnings if the symbol must be introduced in an eval.
2147 GSAR 96-10-12 */
90e5519e 2148 gv_fetchpvn_flags(s, len,
728847b1
BF
2149 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2150 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2151 kind == '$' ? SVt_PV :
2152 kind == '@' ? SVt_PVAV :
2153 kind == '%' ? SVt_PVHV :
a0d0e21e 2154 SVt_PVGV
90e5519e 2155 );
748a9306 2156 }
79072805
LW
2157 }
2158}
2159
1571675a
GS
2160NV
2161Perl_str_to_version(pTHX_ SV *sv)
2162{
2163 NV retval = 0.0;
2164 NV nshift = 1.0;
2165 STRLEN len;
cfd0369c 2166 const char *start = SvPV_const(sv,len);
9d4ba2ae 2167 const char * const end = start + len;
504618e9 2168 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2169
2170 PERL_ARGS_ASSERT_STR_TO_VERSION;
2171
1571675a 2172 while (start < end) {
ba210ebe 2173 STRLEN skip;
1571675a
GS
2174 UV n;
2175 if (utf)
9041c2e3 2176 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2177 else {
2178 n = *(U8*)start;
2179 skip = 1;
2180 }
2181 retval += ((NV)n)/nshift;
2182 start += skip;
2183 nshift *= 1000;
2184 }
2185 return retval;
2186}
2187
4e553d73 2188/*
ffb4593c
NT
2189 * S_force_version
2190 * Forces the next token to be a version number.
e759cc13
RGS
2191 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2192 * and if "guessing" is TRUE, then no new token is created (and the caller
2193 * must use an alternative parsing method).
ffb4593c
NT
2194 */
2195
76e3520e 2196STATIC char *
e759cc13 2197S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2198{
97aff369 2199 dVAR;
5f66b61c 2200 OP *version = NULL;
44dcb63b 2201 char *d;
5db06880
NC
2202#ifdef PERL_MAD
2203 I32 startoff = s - SvPVX(PL_linestr);
2204#endif
89bfa8cd 2205
7918f24d
NC
2206 PERL_ARGS_ASSERT_FORCE_VERSION;
2207
29595ff2 2208 s = SKIPSPACE1(s);
89bfa8cd 2209
44dcb63b 2210 d = s;
dd629d5b 2211 if (*d == 'v')
44dcb63b 2212 d++;
44dcb63b 2213 if (isDIGIT(*d)) {
e759cc13
RGS
2214 while (isDIGIT(*d) || *d == '_' || *d == '.')
2215 d++;
5db06880
NC
2216#ifdef PERL_MAD
2217 if (PL_madskills) {
cd81e915 2218 start_force(PL_curforce);
5db06880
NC
2219 curmad('X', newSVpvn(s,d-s));
2220 }
2221#endif
4e4da3ac 2222 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2223 SV *ver;
8d08d9ba 2224#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2225 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2226 setlocale(LC_NUMERIC, "C");
8d08d9ba 2227#endif
6154021b 2228 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2229#ifdef USE_LOCALE_NUMERIC
2230 setlocale(LC_NUMERIC, loc);
909d3787 2231 Safefree(loc);
8d08d9ba 2232#endif
6154021b 2233 version = pl_yylval.opval;
dd629d5b
GS
2234 ver = cSVOPx(version)->op_sv;
2235 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2236 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2237 SvNV_set(ver, str_to_version(ver));
1571675a 2238 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2239 }
89bfa8cd 2240 }
5db06880
NC
2241 else if (guessing) {
2242#ifdef PERL_MAD
2243 if (PL_madskills) {
cd81e915
NC
2244 sv_free(PL_nextwhite); /* let next token collect whitespace */
2245 PL_nextwhite = 0;
5db06880
NC
2246 s = SvPVX(PL_linestr) + startoff;
2247 }
2248#endif
e759cc13 2249 return s;
5db06880 2250 }
89bfa8cd 2251 }
2252
5db06880
NC
2253#ifdef PERL_MAD
2254 if (PL_madskills && !version) {
cd81e915
NC
2255 sv_free(PL_nextwhite); /* let next token collect whitespace */
2256 PL_nextwhite = 0;
5db06880
NC
2257 s = SvPVX(PL_linestr) + startoff;
2258 }
2259#endif
89bfa8cd 2260 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2261 start_force(PL_curforce);
9ded7720 2262 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2263 force_next(WORD);
89bfa8cd 2264
e759cc13 2265 return s;
89bfa8cd 2266}
2267
ffb4593c 2268/*
91152fc1
DG
2269 * S_force_strict_version
2270 * Forces the next token to be a version number using strict syntax rules.
2271 */
2272
2273STATIC char *
2274S_force_strict_version(pTHX_ char *s)
2275{
2276 dVAR;
2277 OP *version = NULL;
2278#ifdef PERL_MAD
2279 I32 startoff = s - SvPVX(PL_linestr);
2280#endif
2281 const char *errstr = NULL;
2282
2283 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2284
2285 while (isSPACE(*s)) /* leading whitespace */
2286 s++;
2287
2288 if (is_STRICT_VERSION(s,&errstr)) {
2289 SV *ver = newSV(0);
2290 s = (char *)scan_version(s, ver, 0);
2291 version = newSVOP(OP_CONST, 0, ver);
2292 }
4e4da3ac
Z
2293 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2294 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2295 {
91152fc1
DG
2296 PL_bufptr = s;
2297 if (errstr)
2298 yyerror(errstr); /* version required */
2299 return s;
2300 }
2301
2302#ifdef PERL_MAD
2303 if (PL_madskills && !version) {
2304 sv_free(PL_nextwhite); /* let next token collect whitespace */
2305 PL_nextwhite = 0;
2306 s = SvPVX(PL_linestr) + startoff;
2307 }
2308#endif
2309 /* NOTE: The parser sees the package name and the VERSION swapped */
2310 start_force(PL_curforce);
2311 NEXTVAL_NEXTTOKE.opval = version;
2312 force_next(WORD);
2313
2314 return s;
2315}
2316
2317/*
ffb4593c
NT
2318 * S_tokeq
2319 * Tokenize a quoted string passed in as an SV. It finds the next
2320 * chunk, up to end of string or a backslash. It may make a new
2321 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2322 * turns \\ into \.
2323 */
2324
76e3520e 2325STATIC SV *
cea2e8a9 2326S_tokeq(pTHX_ SV *sv)
79072805 2327{
97aff369 2328 dVAR;
eb578fdb
KW
2329 char *s;
2330 char *send;
2331 char *d;
b3ac6de7
IZ
2332 STRLEN len = 0;
2333 SV *pv = sv;
79072805 2334
7918f24d
NC
2335 PERL_ARGS_ASSERT_TOKEQ;
2336
79072805 2337 if (!SvLEN(sv))
b3ac6de7 2338 goto finish;
79072805 2339
a0d0e21e 2340 s = SvPV_force(sv, len);
21a311ee 2341 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2342 goto finish;
463ee0b2 2343 send = s + len;
dcb21ed6
NC
2344 /* This is relying on the SV being "well formed" with a trailing '\0' */
2345 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2346 s++;
2347 if (s == send)
b3ac6de7 2348 goto finish;
79072805 2349 d = s;
be4731d2 2350 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2351 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2352 }
79072805
LW
2353 while (s < send) {
2354 if (*s == '\\') {
a0d0e21e 2355 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2356 s++; /* all that, just for this */
2357 }
2358 *d++ = *s++;
2359 }
2360 *d = '\0';
95a20fc0 2361 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2362 finish:
3280af22 2363 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2364 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2365 return sv;
2366}
2367
ffb4593c
NT
2368/*
2369 * Now come three functions related to double-quote context,
2370 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2371 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2372 * interact with PL_lex_state, and create fake ( ... ) argument lists
2373 * to handle functions and concatenation.
ecd24171
DM
2374 * For example,
2375 * "foo\lbar"
2376 * is tokenised as
2377 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2378 */
2379
2380/*
2381 * S_sublex_start
6154021b 2382 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2383 *
2384 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2385 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2386 *
2387 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2388 *
2389 * Everything else becomes a FUNC.
2390 *
2391 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2392 * had an OP_CONST or OP_READLINE). This just sets us up for a
2393 * call to S_sublex_push().
2394 */
2395
76e3520e 2396STATIC I32
cea2e8a9 2397S_sublex_start(pTHX)
79072805 2398{
97aff369 2399 dVAR;
eb578fdb 2400 const I32 op_type = pl_yylval.ival;
79072805
LW
2401
2402 if (op_type == OP_NULL) {
6154021b 2403 pl_yylval.opval = PL_lex_op;
5f66b61c 2404 PL_lex_op = NULL;
79072805
LW
2405 return THING;
2406 }
2407 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2408 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2409
2410 if (SvTYPE(sv) == SVt_PVIV) {
2411 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2412 STRLEN len;
96a5add6 2413 const char * const p = SvPV_const(sv, len);
740cce10 2414 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2415 SvREFCNT_dec(sv);
2416 sv = nsv;
4e553d73 2417 }
6154021b 2418 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2419 PL_lex_stuff = NULL;
6f33ba73
RGS
2420 /* Allow <FH> // "foo" */
2421 if (op_type == OP_READLINE)
2422 PL_expect = XTERMORDORDOR;
79072805
LW
2423 return THING;
2424 }
e3f73d4e
RGS
2425 else if (op_type == OP_BACKTICK && PL_lex_op) {
2426 /* readpipe() vas overriden */
2427 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2428 pl_yylval.opval = PL_lex_op;
9b201d7d 2429 PL_lex_op = NULL;
e3f73d4e
RGS
2430 PL_lex_stuff = NULL;
2431 return THING;
2432 }
79072805 2433
3280af22 2434 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2435 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2436 PL_sublex_info.sub_op = PL_lex_op;
2437 PL_lex_state = LEX_INTERPPUSH;
55497cff 2438
3280af22
NIS
2439 PL_expect = XTERM;
2440 if (PL_lex_op) {
6154021b 2441 pl_yylval.opval = PL_lex_op;
5f66b61c 2442 PL_lex_op = NULL;
55497cff 2443 return PMFUNC;
2444 }
2445 else
2446 return FUNC;
2447}
2448
ffb4593c
NT
2449/*
2450 * S_sublex_push
2451 * Create a new scope to save the lexing state. The scope will be
2452 * ended in S_sublex_done. Returns a '(', starting the function arguments
2453 * to the uc, lc, etc. found before.
2454 * Sets PL_lex_state to LEX_INTERPCONCAT.
2455 */
2456
76e3520e 2457STATIC I32
cea2e8a9 2458S_sublex_push(pTHX)
55497cff 2459{
27da23d5 2460 dVAR;
78a635de 2461 LEXSHARED *shared;
f46d017c 2462 ENTER;
55497cff 2463
3280af22 2464 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2465 SAVEBOOL(PL_lex_dojoin);
3280af22 2466 SAVEI32(PL_lex_brackets);
78cdf107 2467 SAVEI32(PL_lex_allbrackets);
b27dce25 2468 SAVEI32(PL_lex_formbrack);
78cdf107 2469 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2470 SAVEI32(PL_lex_casemods);
2471 SAVEI32(PL_lex_starts);
651b5b28 2472 SAVEI8(PL_lex_state);
7cc34111 2473 SAVESPTR(PL_lex_repl);
7766f137 2474 SAVEVPTR(PL_lex_inpat);
98246f1e 2475 SAVEI16(PL_lex_inwhat);
57843af0 2476 SAVECOPLINE(PL_curcop);
3280af22 2477 SAVEPPTR(PL_bufptr);
8452ff4b 2478 SAVEPPTR(PL_bufend);
3280af22
NIS
2479 SAVEPPTR(PL_oldbufptr);
2480 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2481 SAVEPPTR(PL_last_lop);
2482 SAVEPPTR(PL_last_uni);
3280af22
NIS
2483 SAVEPPTR(PL_linestart);
2484 SAVESPTR(PL_linestr);
8edd5f42
RGS
2485 SAVEGENERICPV(PL_lex_brackstack);
2486 SAVEGENERICPV(PL_lex_casestack);
78a635de 2487 SAVEGENERICPV(PL_parser->lex_shared);
3280af22 2488
99bd9d90 2489 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2490 scopes to find the body of the here-doc. So we put PL_linestr and
2491 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2492 */
60f40a38
FC
2493 PL_parser->lex_shared->ls_linestr = PL_linestr;
2494 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2495
3280af22 2496 PL_linestr = PL_lex_stuff;
7cc34111 2497 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2498 PL_lex_stuff = NULL;
7cc34111 2499 PL_sublex_info.repl = NULL;
3280af22 2500
9cbb5ea2
GS
2501 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2502 = SvPVX(PL_linestr);
3280af22 2503 PL_bufend += SvCUR(PL_linestr);
bd61b366 2504 PL_last_lop = PL_last_uni = NULL;
3280af22 2505 SAVEFREESV(PL_linestr);
4dc843bc 2506 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2507
2508 PL_lex_dojoin = FALSE;
b27dce25 2509 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2510 PL_lex_allbrackets = 0;
2511 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2512 Newx(PL_lex_brackstack, 120, char);
2513 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2514 PL_lex_casemods = 0;
2515 *PL_lex_casestack = '\0';
2516 PL_lex_starts = 0;
2517 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2518 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
78a635de
FC
2519
2520 Newxz(shared, 1, LEXSHARED);
2521 shared->ls_prev = PL_parser->lex_shared;
2522 PL_parser->lex_shared = shared;
3280af22
NIS
2523
2524 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2525 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2526 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2527 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2528 else
5f66b61c 2529 PL_lex_inpat = NULL;
79072805 2530
55497cff 2531 return '(';
79072805
LW
2532}
2533
ffb4593c
NT
2534/*
2535 * S_sublex_done
2536 * Restores lexer state after a S_sublex_push.
2537 */
2538
76e3520e 2539STATIC I32
cea2e8a9 2540S_sublex_done(pTHX)
79072805 2541{
27da23d5 2542 dVAR;
3280af22 2543 if (!PL_lex_starts++) {
396482e1 2544 SV * const sv = newSVpvs("");
9aa983d2
JH
2545 if (SvUTF8(PL_linestr))
2546 SvUTF8_on(sv);
3280af22 2547 PL_expect = XOPERATOR;
6154021b 2548 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2549 return THING;
2550 }
2551
3280af22
NIS
2552 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2553 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2554 return yylex();
79072805
LW
2555 }
2556
ffb4593c 2557 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2558 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2559 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2560 PL_linestr = PL_lex_repl;
2561 PL_lex_inpat = 0;
2562 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2563 PL_bufend += SvCUR(PL_linestr);
bd61b366 2564 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2565 PL_lex_dojoin = FALSE;
2566 PL_lex_brackets = 0;
78cdf107
Z
2567 PL_lex_allbrackets = 0;
2568 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2569 PL_lex_casemods = 0;
2570 *PL_lex_casestack = '\0';
2571 PL_lex_starts = 0;
25da4f38 2572 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2573 PL_lex_state = LEX_INTERPNORMAL;
2574 PL_lex_starts++;
e9fa98b2
HS
2575 /* we don't clear PL_lex_repl here, so that we can check later
2576 whether this is an evalled subst; that means we rely on the
2577 logic to ensure sublex_done() is called again only via the
2578 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2579 }
e9fa98b2 2580 else {
3280af22 2581 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2582 PL_lex_repl = NULL;
e9fa98b2 2583 }
79072805 2584 return ',';
ffed7fef
LW
2585 }
2586 else {
5db06880
NC
2587#ifdef PERL_MAD
2588 if (PL_madskills) {
cd81e915
NC
2589 if (PL_thiswhite) {
2590 if (!PL_endwhite)
6b29d1f5 2591 PL_endwhite = newSVpvs("");
cd81e915
NC
2592 sv_catsv(PL_endwhite, PL_thiswhite);
2593 PL_thiswhite = 0;
2594 }
2595 if (PL_thistoken)
76f68e9b 2596 sv_setpvs(PL_thistoken,"");
5db06880 2597 else
cd81e915 2598 PL_realtokenstart = -1;
5db06880
NC
2599 }
2600#endif
f46d017c 2601 LEAVE;
3280af22
NIS
2602 PL_bufend = SvPVX(PL_linestr);
2603 PL_bufend += SvCUR(PL_linestr);
2604 PL_expect = XOPERATOR;
09bef843 2605 PL_sublex_info.sub_inwhat = 0;
79072805 2606 return ')';
ffed7fef
LW
2607 }
2608}
2609
02aa26ce
NT
2610/*
2611 scan_const
2612
9da1dd8f
DM
2613 Extracts the next constant part of a pattern, double-quoted string,
2614 or transliteration. This is terrifying code.
2615
2616 For example, in parsing the double-quoted string "ab\x63$d", it would
2617 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2618
94def140 2619 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2620 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2621 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2622
94def140
TS
2623 Returns a pointer to the character scanned up to. If this is
2624 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2625 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2626 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2627 by looking at the next characters herself.
2628
02aa26ce 2629 In patterns:
9da1dd8f
DM
2630 expand:
2631 \N{ABC} => \N{U+41.42.43}
2632
2633 pass through:
2634 all other \-char, including \N and \N{ apart from \N{ABC}
2635
2636 stops on:
2637 @ and $ where it appears to be a var, but not for $ as tail anchor
2638 \l \L \u \U \Q \E
2639 (?{ or (??{
2640
02aa26ce
NT
2641
2642 In transliterations:
2643 characters are VERY literal, except for - not at the start or end
94def140
TS
2644 of the string, which indicates a range. If the range is in bytes,
2645 scan_const expands the range to the full set of intermediate
2646 characters. If the range is in utf8, the hyphen is replaced with
2647 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2648
2649 In double-quoted strings:
2650 backslashes:
2651 double-quoted style: \r and \n
ff3f963a 2652 constants: \x31, etc.
94def140 2653 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2654 case and quoting: \U \Q \E
2655 stops on @ and $
2656
2657 scan_const does *not* construct ops to handle interpolated strings.
2658 It stops processing as soon as it finds an embedded $ or @ variable
2659 and leaves it to the caller to work out what's going on.
2660
94def140
TS
2661 embedded arrays (whether in pattern or not) could be:
2662 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2663
2664 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2665
2666 $ in pattern could be $foo or could be tail anchor. Assumption:
2667 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2668 followed by one of "()| \r\n\t"
02aa26ce 2669
9da1dd8f 2670 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2671
2672 The structure of the code is
2673 while (there's a character to process) {
94def140
TS
2674 handle transliteration ranges
2675 skip regexp comments /(?#comment)/ and codes /(?{code})/
2676 skip #-initiated comments in //x patterns
2677 check for embedded arrays
02aa26ce
NT
2678 check for embedded scalars
2679 if (backslash) {
94def140 2680 deprecate \1 in substitution replacements
02aa26ce
NT
2681 handle string-changing backslashes \l \U \Q \E, etc.
2682 switch (what was escaped) {
94def140 2683 handle \- in a transliteration (becomes a literal -)
ff3f963a 2684 if a pattern and not \N{, go treat as regular character
94def140
TS
2685 handle \132 (octal characters)
2686 handle \x15 and \x{1234} (hex characters)
ff3f963a 2687 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2688 handle \cV (control characters)
2689 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2690 } (end switch)
77a135fe 2691 continue
02aa26ce 2692 } (end if backslash)
77a135fe 2693 handle regular character
02aa26ce 2694 } (end while character to read)
4e553d73 2695
02aa26ce
NT
2696*/
2697
76e3520e 2698STATIC char *
cea2e8a9 2699S_scan_const(pTHX_ char *start)
79072805 2700{
97aff369 2701 dVAR;
eb578fdb 2702 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2703 SV *sv = newSV(send - start); /* sv for the constant. See
2704 note below on sizing. */
eb578fdb
KW
2705 char *s = start; /* start of the constant */
2706 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2707 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2708 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2709 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2710 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2711 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2712 to be UTF8? But, this can
2713 show as true when the source
2714 isn't utf8, as for example
2715 when it is entirely composed
2716 of hex constants */
2717
2718 /* Note on sizing: The scanned constant is placed into sv, which is
2719 * initialized by newSV() assuming one byte of output for every byte of
2720 * input. This routine expects newSV() to allocate an extra byte for a
2721 * trailing NUL, which this routine will append if it gets to the end of
2722 * the input. There may be more bytes of input than output (eg., \N{LATIN
2723 * CAPITAL LETTER A}), or more output than input if the constant ends up
2724 * recoded to utf8, but each time a construct is found that might increase
2725 * the needed size, SvGROW() is called. Its size parameter each time is
2726 * based on the best guess estimate at the time, namely the length used so
2727 * far, plus the length the current construct will occupy, plus room for
2728 * the trailing NUL, plus one byte for every input byte still unscanned */
2729
012bcf8d 2730 UV uv;
4c3a8340
TS
2731#ifdef EBCDIC
2732 UV literal_endpoint = 0;
e294cc5d 2733 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2734#endif
012bcf8d 2735
7918f24d
NC
2736 PERL_ARGS_ASSERT_SCAN_CONST;
2737
bb16bae8 2738 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2739 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2740 /* If we are doing a trans and we know we want UTF8 set expectation */
2741 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2742 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2743 }
2744
2745
79072805 2746 while (s < send || dorange) {
ff3f963a 2747
02aa26ce 2748 /* get transliterations out of the way (they're most literal) */
3280af22 2749 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2750 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2751 if (dorange) {
1ba5c669
JH
2752 I32 i; /* current expanded character */
2753 I32 min; /* first character in range */
2754 I32 max; /* last character in range */
02aa26ce 2755
e294cc5d
JH
2756#ifdef EBCDIC
2757 UV uvmax = 0;
2758#endif
2759
2760 if (has_utf8
2761#ifdef EBCDIC
2762 && !native_range
2763#endif
2764 ) {
9d4ba2ae 2765 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2766 char *e = d++;
2767 while (e-- > c)
2768 *(e + 1) = *e;
25716404 2769 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2770 /* mark the range as done, and continue */
2771 dorange = FALSE;
2772 didrange = TRUE;
2773 continue;
2774 }
2b9d42f0 2775
95a20fc0 2776 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2777#ifdef EBCDIC
2778 SvGROW(sv,
2779 SvLEN(sv) + (has_utf8 ?
2780 (512 - UTF_CONTINUATION_MARK +
2781 UNISKIP(0x100))
2782 : 256));
2783 /* How many two-byte within 0..255: 128 in UTF-8,
2784 * 96 in UTF-8-mod. */
2785#else
9cbb5ea2 2786 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2787#endif
9cbb5ea2 2788 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2789#ifdef EBCDIC
2790 if (has_utf8) {
2791 int j;
2792 for (j = 0; j <= 1; j++) {
2793 char * const c = (char*)utf8_hop((U8*)d, -1);
2794 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2795 if (j)
2796 min = (U8)uv;
2797 else if (uv < 256)
2798 max = (U8)uv;
2799 else {
2800 max = (U8)0xff; /* only to \xff */
2801 uvmax = uv; /* \x{100} to uvmax */
2802 }
2803 d = c; /* eat endpoint chars */
2804 }
2805 }
2806 else {
2807#endif
2808 d -= 2; /* eat the first char and the - */
2809 min = (U8)*d; /* first char in range */
2810 max = (U8)d[1]; /* last char in range */
2811#ifdef EBCDIC
2812 }
2813#endif
8ada0baa 2814
c2e66d9e 2815 if (min > max) {
4dc843bc 2816 SvREFCNT_dec(sv);
01ec43d0 2817 Perl_croak(aTHX_
d1573ac7 2818 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2819 (char)min, (char)max);
c2e66d9e
GS
2820 }
2821
c7f1f016 2822#ifdef EBCDIC
4c3a8340
TS
2823 if (literal_endpoint == 2 &&
2824 ((isLOWER(min) && isLOWER(max)) ||
2825 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2826 if (isLOWER(min)) {
2827 for (i = min; i <= max; i++)
2828 if (isLOWER(i))
db42d148 2829 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2830 } else {
2831 for (i = min; i <= max; i++)
2832 if (isUPPER(i))
db42d148 2833 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2834 }
2835 }
2836 else
2837#endif
2838 for (i = min; i <= max; i++)
e294cc5d
JH
2839#ifdef EBCDIC
2840 if (has_utf8) {
2841 const U8 ch = (U8)NATIVE_TO_UTF(i);
2842 if (UNI_IS_INVARIANT(ch))
2843 *d++ = (U8)i;
2844 else {
2845 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2846 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2847 }
2848 }
2849 else
2850#endif
2851 *d++ = (char)i;
2852
2853#ifdef EBCDIC
2854 if (uvmax) {
2855 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2856 if (uvmax > 0x101)
2857 *d++ = (char)UTF_TO_NATIVE(0xff);
2858 if (uvmax > 0x100)
2859 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2860 }
2861#endif
02aa26ce
NT
2862
2863 /* mark the range as done, and continue */
79072805 2864 dorange = FALSE;
01ec43d0 2865 didrange = TRUE;
4c3a8340
TS
2866#ifdef EBCDIC
2867 literal_endpoint = 0;
2868#endif
79072805 2869 continue;
4e553d73 2870 }
02aa26ce
NT
2871
2872 /* range begins (ignore - as first or last char) */
79072805 2873 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2874 if (didrange) {
4dc843bc 2875 SvREFCNT_dec(sv);
1fafa243 2876 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2877 }
e294cc5d
JH
2878 if (has_utf8
2879#ifdef EBCDIC
2880 && !native_range
2881#endif
2882 ) {
25716404 2883 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2884 s++;
2885 continue;
2886 }
79072805
LW
2887 dorange = TRUE;
2888 s++;
01ec43d0
GS
2889 }
2890 else {
2891 didrange = FALSE;
4c3a8340
TS
2892#ifdef EBCDIC
2893 literal_endpoint = 0;
e294cc5d 2894 native_range = TRUE;
4c3a8340 2895#endif
01ec43d0 2896 }
79072805 2897 }
02aa26ce
NT
2898
2899 /* if we get here, we're not doing a transliteration */
2900
e4a2df84
DM
2901 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2902 char *s1 = s-1;
2903 int esc = 0;
2904 while (s1 >= start && *s1-- == '\\')
2905 esc = !esc;
2906 if (!esc)
2907 in_charclass = TRUE;
2908 }
2866decb 2909
e4a2df84
DM
2910 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2911 char *s1 = s-1;
2912 int esc = 0;
2913 while (s1 >= start && *s1-- == '\\')
2914 esc = !esc;
2915 if (!esc)
2916 in_charclass = FALSE;
2917 }
2866decb 2918
9da1dd8f
DM
2919 /* skip for regexp comments /(?#comment)/, except for the last
2920 * char, which will be done separately.
2921 * Stop on (?{..}) and friends */
2922
3280af22 2923 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2924 if (s[2] == '#') {
e994fd66 2925 while (s+1 < send && *s != ')')
db42d148 2926 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2927 }
2866decb 2928 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2929 ( s[2] == '{' /* This should match regcomp.c */
2930 || (s[2] == '?' && s[3] == '{')))
155aba94 2931 {
9da1dd8f 2932 break;
cc6b7395 2933 }
748a9306 2934 }
02aa26ce
NT
2935
2936 /* likewise skip #-initiated comments in //x patterns */
3280af22 2937 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2938 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2939 while (s+1 < send && *s != '\n')
db42d148 2940 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2941 }
02aa26ce 2942
9da1dd8f
DM
2943 /* no further processing of single-quoted regex */
2944 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2945 goto default_action;
2946
5d1d4326 2947 /* check for embedded arrays
da6eedaa 2948 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2949 */
1749ea0d
TS
2950 else if (*s == '@' && s[1]) {
2951 if (isALNUM_lazy_if(s+1,UTF))
2952 break;
2953 if (strchr(":'{$", s[1]))
2954 break;
2955 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2956 break; /* in regexp, neither @+ nor @- are interpolated */
2957 }
02aa26ce
NT
2958
2959 /* check for embedded scalars. only stop if we're sure it's a
2960 variable.
2961 */
79072805 2962 else if (*s == '$') {
3280af22 2963 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2964 break;
77772344 2965 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2966 if (s[1] == '\\') {
2967 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2968 "Possible unintended interpolation of $\\ in regex");
77772344 2969 }
79072805 2970 break; /* in regexp, $ might be tail anchor */
77772344 2971 }
79072805 2972 }
02aa26ce 2973
2b9d42f0
NIS
2974 /* End of else if chain - OP_TRANS rejoin rest */
2975
02aa26ce 2976 /* backslashes */
79072805 2977 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2978 char* e; /* Can be used for ending '}', etc. */
2979
79072805 2980 s++;
02aa26ce 2981
7d0fc23c
KW
2982 /* warn on \1 - \9 in substitution replacements, but note that \11
2983 * is an octal; and \19 is \1 followed by '9' */
3280af22 2984 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2985 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2986 {
a2a5de95 2987 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2988 *--s = '$';
2989 break;
2990 }
02aa26ce
NT
2991
2992 /* string-change backslash escapes */
838f2281 2993 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
2994 --s;
2995 break;
2996 }
ff3f963a
KW
2997 /* In a pattern, process \N, but skip any other backslash escapes.
2998 * This is because we don't want to translate an escape sequence
2999 * into a meta symbol and have the regex compiler use the meta
3000 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3001 * in spite of this, we do have to process \N here while the proper
3002 * charnames handler is in scope. See bugs #56444 and #62056.
3003 * There is a complication because \N in a pattern may also stand
3004 * for 'match a non-nl', and not mean a charname, in which case its
3005 * processing should be deferred to the regex compiler. To be a
3006 * charname it must be followed immediately by a '{', and not look
3007 * like \N followed by a curly quantifier, i.e., not something like
3008 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3009 * quantifier */
3010 else if (PL_lex_inpat
3011 && (*s != 'N'
3012 || s[1] != '{'
3013 || regcurly(s + 1)))
3014 {
cc74c5bd
TS
3015 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3016 goto default_action;
3017 }
02aa26ce 3018
79072805 3019 switch (*s) {
02aa26ce
NT
3020
3021 /* quoted - in transliterations */
79072805 3022 case '-':
3280af22 3023 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3024 *d++ = *s++;
3025 continue;
3026 }
3027 /* FALL THROUGH */
3028 default:
11b8faa4 3029 {
e4ca4584 3030 if ((isALNUMC(*s)))
a2a5de95
NC
3031 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3032 "Unrecognized escape \\%c passed through",
3033 *s);
11b8faa4 3034 /* default action is to copy the quoted character */
f9a63242 3035 goto default_action;
11b8faa4 3036 }
02aa26ce 3037
632403cc 3038 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3039 case '0': case '1': case '2': case '3':
3040 case '4': case '5': case '6': case '7':
ba210ebe 3041 {
53305cf1
NC
3042 I32 flags = 0;
3043 STRLEN len = 3;
77a135fe 3044 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3045 s += len;
3046 }
012bcf8d 3047 goto NUM_ESCAPE_INSERT;
02aa26ce 3048
f0a2b745
KW
3049 /* eg. \o{24} indicates the octal constant \024 */
3050 case 'o':
3051 {
3052 STRLEN len;
454155d9 3053 const char* error;
f0a2b745 3054
454155d9 3055 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3056 s += len;
454155d9 3057 if (! valid) {
f0a2b745
KW
3058 yyerror(error);
3059 continue;
3060 }
3061 goto NUM_ESCAPE_INSERT;
3062 }
3063
77a135fe 3064 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3065 case 'x':
a0481293 3066 {
53305cf1 3067 STRLEN len;
a0481293 3068 const char* error;
355860ce 3069
a0481293
KW
3070 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3071 s += len;
3072 if (! valid) {
3073 yyerror(error);
355860ce 3074 continue;
ba210ebe 3075 }
012bcf8d
GS
3076 }
3077
3078 NUM_ESCAPE_INSERT:
ff3f963a
KW
3079 /* Insert oct or hex escaped character. There will always be
3080 * enough room in sv since such escapes will be longer than any
3081 * UTF-8 sequence they can end up as, except if they force us
3082 * to recode the rest of the string into utf8 */
ba7cea30 3083
77a135fe 3084 /* Here uv is the ordinal of the next character being added in
ff3f963a 3085 * unicode (converted from native). */
77a135fe 3086 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3087 if (!has_utf8 && uv > 255) {
77a135fe
KW
3088 /* Might need to recode whatever we have accumulated so
3089 * far if it contains any chars variant in utf8 or
3090 * utf-ebcdic. */
3091
3092 SvCUR_set(sv, d - SvPVX_const(sv));
3093 SvPOK_on(sv);
3094 *d = '\0';
77a135fe 3095 /* See Note on sizing above. */
7bf79863
KW
3096 sv_utf8_upgrade_flags_grow(sv,
3097 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3098 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3099 d = SvPVX(sv) + SvCUR(sv);
3100 has_utf8 = TRUE;
012bcf8d
GS
3101 }
3102
77a135fe
KW
3103 if (has_utf8) {
3104 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3105 if (PL_lex_inwhat == OP_TRANS &&
3106 PL_sublex_info.sub_op) {
3107 PL_sublex_info.sub_op->op_private |=
3108 (PL_lex_repl ? OPpTRANS_FROM_UTF
3109 : OPpTRANS_TO_UTF);
f9a63242 3110 }
e294cc5d
JH
3111#ifdef EBCDIC
3112 if (uv > 255 && !dorange)
3113 native_range = FALSE;
3114#endif
012bcf8d 3115 }
a0ed51b3 3116 else {
012bcf8d 3117 *d++ = (char)uv;
a0ed51b3 3118 }
012bcf8d
GS
3119 }
3120 else {
c4d5f83a 3121 *d++ = (char) uv;
a0ed51b3 3122 }
79072805 3123 continue;
02aa26ce 3124
4a2d328f 3125 case 'N':
ff3f963a
KW
3126 /* In a non-pattern \N must be a named character, like \N{LATIN
3127 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3128 * mean to match a non-newline. For non-patterns, named
3129 * characters are converted to their string equivalents. In
3130 * patterns, named characters are not converted to their
3131 * ultimate forms for the same reasons that other escapes
3132 * aren't. Instead, they are converted to the \N{U+...} form
3133 * to get the value from the charnames that is in effect right
3134 * now, while preserving the fact that it was a named character
3135 * so that the regex compiler knows this */
3136
3137 /* This section of code doesn't generally use the
3138 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3139 * a close examination of this macro and determined it is a
3140 * no-op except on utfebcdic variant characters. Every
3141 * character generated by this that would normally need to be
3142 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3143 * needed, and would complicate use of copy(). XXX There are
3144 * other parts of this file where the macro is used
3145 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3146
3147 /* The structure of this section of code (besides checking for
3148 * errors and upgrading to utf8) is:
3149 * Further disambiguate between the two meanings of \N, and if
3150 * not a charname, go process it elsewhere
0a96133f
KW
3151 * If of form \N{U+...}, pass it through if a pattern;
3152 * otherwise convert to utf8
3153 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3154 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3155
3156 /* Here, s points to the 'N'; the test below is guaranteed to
3157 * succeed if we are being called on a pattern as we already
3158 * know from a test above that the next character is a '{'.
3159 * On a non-pattern \N must mean 'named sequence, which
3160 * requires braces */
3161 s++;
3162 if (*s != '{') {
3163 yyerror("Missing braces on \\N{}");
3164 continue;
3165 }
3166 s++;
3167
0a96133f 3168 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3169 if (! (e = strchr(s, '}'))) {
3170 if (! PL_lex_inpat) {
5777a3f7 3171 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3172 } else {
3173 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3174 }
0a96133f 3175 continue;
ff3f963a 3176 }
cddc7ef4 3177
ff3f963a 3178 /* Here it looks like a named character */
cddc7ef4 3179
ff3f963a
KW
3180 if (PL_lex_inpat) {
3181
3182 /* XXX This block is temporary code. \N{} implies that the
3183 * pattern is to have Unicode semantics, and therefore
3184 * currently has to be encoded in utf8. By putting it in
3185 * utf8 now, we save a whole pass in the regular expression
3186 * compiler. Once that code is changed so Unicode
3187 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3188 * block should be removed. However, the code that parses
3189 * the output of this would have to be changed to not
3190 * necessarily expect utf8 */
ff3f963a 3191 if (!has_utf8) {
77a135fe 3192 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3193 SvPOK_on(sv);
e4f3eed8 3194 *d = '\0';
77a135fe 3195 /* See Note on sizing above. */
7bf79863 3196 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3197 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3198 /* 5 = '\N{' + cur char + NUL */
3199 (STRLEN)(send - s) + 5);
f08d6ad9 3200 d = SvPVX(sv) + SvCUR(sv);
89491803 3201 has_utf8 = TRUE;
ff3f963a
KW
3202 }
3203 }
423cee85 3204
ff3f963a
KW
3205 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3206 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3207 | PERL_SCAN_DISALLOW_PREFIX;
3208 STRLEN len;
3209
3210 /* For \N{U+...}, the '...' is a unicode value even on
3211 * EBCDIC machines */
3212 s += 2; /* Skip to next char after the 'U+' */
3213 len = e - s;
3214 uv = grok_hex(s, &len, &flags, NULL);
3215 if (len == 0 || len != (STRLEN)(e - s)) {
3216 yyerror("Invalid hexadecimal number in \\N{U+...}");
3217 s = e + 1;
3218 continue;
3219 }
3220
3221 if (PL_lex_inpat) {
3222
e2a7e165
KW
3223 /* On non-EBCDIC platforms, pass through to the regex
3224 * compiler unchanged. The reason we evaluated the
3225 * number above is to make sure there wasn't a syntax
3226 * error. But on EBCDIC we convert to native so
3227 * downstream code can continue to assume it's native
3228 */
ff3f963a 3229 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3230#ifdef EBCDIC
3231 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3232 and the \0 */
3233 "\\N{U+%X}",
3234 (unsigned int) UNI_TO_NATIVE(uv));
3235#else
ff3f963a
KW
3236 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3237 d += e - s + 1;
e2a7e165 3238#endif
ff3f963a
KW
3239 }
3240 else { /* Not a pattern: convert the hex to string */
3241
3242 /* If destination is not in utf8, unconditionally
3243 * recode it to be so. This is because \N{} implies
3244 * Unicode semantics, and scalars have to be in utf8
3245 * to guarantee those semantics */
3246 if (! has_utf8) {
3247 SvCUR_set(sv, d - SvPVX_const(sv));
3248 SvPOK_on(sv);
3249 *d = '\0';
3250 /* See Note on sizing above. */
3251 sv_utf8_upgrade_flags_grow(
3252 sv,
3253 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3254 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3255 d = SvPVX(sv) + SvCUR(sv);
3256 has_utf8 = TRUE;
3257 }
3258
3259 /* Add the string to the output */
3260 if (UNI_IS_INVARIANT(uv)) {
3261 *d++ = (char) uv;
3262 }
3263 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3264 }
3265 }
3266 else { /* Here is \N{NAME} but not \N{U+...}. */
3267
3268 SV *res; /* result from charnames */
3269 const char *str; /* the string in 'res' */
3270 STRLEN len; /* its length */
3271
3272 /* Get the value for NAME */
3273 res = newSVpvn(s, e - s);
3274 res = new_constant( NULL, 0, "charnames",
3275 /* includes all of: \N{...} */
3276 res, NULL, s - 3, e - s + 4 );
3277
3278 /* Most likely res will be in utf8 already since the
3279 * standard charnames uses pack U, but a custom translator
3280 * can leave it otherwise, so make sure. XXX This can be
3281 * revisited to not have charnames use utf8 for characters
3282 * that don't need it when regexes don't have to be in utf8
3283 * for Unicode semantics. If doing so, remember EBCDIC */
3284 sv_utf8_upgrade(res);
3285 str = SvPV_const(res, len);
3286
3287 /* Don't accept malformed input */
3288 if (! is_utf8_string((U8 *) str, len)) {
3289 yyerror("Malformed UTF-8 returned by \\N");
3290 }
3291 else if (PL_lex_inpat) {
3292
3293 if (! len) { /* The name resolved to an empty string */
3294 Copy("\\N{}", d, 4, char);
3295 d += 4;
3296 }
3297 else {
3298 /* In order to not lose information for the regex
3299 * compiler, pass the result in the specially made
3300 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3301 * the code points in hex of each character
3302 * returned by charnames */
3303
3304 const char *str_end = str + len;
3305 STRLEN char_length; /* cur char's byte length */
3306 STRLEN output_length; /* and the number of bytes
3307 after this is translated
3308 into hex digits */
3309 const STRLEN off = d - SvPVX_const(sv);
3310
3311 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3312 * max('U+', '.'); and 1 for NUL */
3313 char hex_string[2 * UTF8_MAXBYTES + 5];
3314
3315 /* Get the first character of the result. */
3316 U32 uv = utf8n_to_uvuni((U8 *) str,
3317 len,
3318 &char_length,
3319 UTF8_ALLOW_ANYUV);
3320
3321 /* The call to is_utf8_string() above hopefully
3322 * guarantees that there won't be an error. But
3323 * it's easy here to make sure. The function just
3324 * above warns and returns 0 if invalid utf8, but
3325 * it can also return 0 if the input is validly a
3326 * NUL. Disambiguate */
3327 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3328 uv = UNICODE_REPLACEMENT;
3329 }
3330
3331 /* Convert first code point to hex, including the
e2a7e165
KW
3332 * boiler plate before it. For all these, we
3333 * convert to native format so that downstream code
3334 * can continue to assume the input is native */
78c35590 3335 output_length =
3353de27 3336 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3337 "\\N{U+%X",
3338 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3339
3340 /* Make sure there is enough space to hold it */
3341 d = off + SvGROW(sv, off
3342 + output_length
3343 + (STRLEN)(send - e)
3344 + 2); /* '}' + NUL */
3345 /* And output it */
3346 Copy(hex_string, d, output_length, char);
3347 d += output_length;
3348
3349 /* For each subsequent character, append dot and
3350 * its ordinal in hex */
3351 while ((str += char_length) < str_end) {
3352 const STRLEN off = d - SvPVX_const(sv);
3353 U32 uv = utf8n_to_uvuni((U8 *) str,
3354 str_end - str,
3355 &char_length,
3356 UTF8_ALLOW_ANYUV);
3357 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3358 uv = UNICODE_REPLACEMENT;
3359 }
3360
78c35590 3361 output_length =
3353de27 3362 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3363 ".%X",
3364 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3365
3366 d = off + SvGROW(sv, off
3367 + output_length
3368 + (STRLEN)(send - e)
3369 + 2); /* '}' + NUL */
3370 Copy(hex_string, d, output_length, char);
3371 d += output_length;
3372 }
3373
3374 *d++ = '}'; /* Done. Add the trailing brace */
3375 }
3376 }
3377 else { /* Here, not in a pattern. Convert the name to a
3378 * string. */
3379
3380 /* If destination is not in utf8, unconditionally
3381 * recode it to be so. This is because \N{} implies
3382 * Unicode semantics, and scalars have to be in utf8
3383 * to guarantee those semantics */
3384 if (! has_utf8) {
3385 SvCUR_set(sv, d - SvPVX_const(sv));
3386 SvPOK_on(sv);
3387 *d = '\0';
3388 /* See Note on sizing above. */
3389 sv_utf8_upgrade_flags_grow(sv,
3390 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3391 len + (STRLEN)(send - s) + 1);
3392 d = SvPVX(sv) + SvCUR(sv);
3393 has_utf8 = TRUE;
3394 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3395
3396 /* See Note on sizing above. (NOTE: SvCUR() is not
3397 * set correctly here). */
3398 const STRLEN off = d - SvPVX_const(sv);
3399 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3400 }
3401 Copy(str, d, len, char);
3402 d += len;
423cee85 3403 }
423cee85 3404 SvREFCNT_dec(res);
cb233ae3
KW
3405
3406 /* Deprecate non-approved name syntax */
3407 if (ckWARN_d(WARN_DEPRECATED)) {
3408 bool problematic = FALSE;
3409 char* i = s;
3410
3411 /* For non-ut8 input, look to see that the first
3412 * character is an alpha, then loop through the rest
3413 * checking that each is a continuation */
3414 if (! this_utf8) {
3415 if (! isALPHAU(*i)) problematic = TRUE;
3416 else for (i = s + 1; i < e; i++) {
3417 if (isCHARNAME_CONT(*i)) continue;
3418 problematic = TRUE;
3419 break;
3420 }
3421 }
3422 else {
3423 /* Similarly for utf8. For invariants can check
3424 * directly. We accept anything above the latin1
3425 * range because it is immaterial to Perl if it is
3426 * correct or not, and is expensive to check. But
3427 * it is fairly easy in the latin1 range to convert
3428 * the variants into a single character and check
3429 * those */
3430 if (UTF8_IS_INVARIANT(*i)) {
3431 if (! isALPHAU(*i)) problematic = TRUE;
3432 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3433 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3434 *(i+1)))))
3435 {
3436 problematic = TRUE;
3437 }
3438 }
3439 if (! problematic) for (i = s + UTF8SKIP(s);
3440 i < e;
3441 i+= UTF8SKIP(i))
3442 {
3443 if (UTF8_IS_INVARIANT(*i)) {
3444 if (isCHARNAME_CONT(*i)) continue;
3445 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3446 continue;
3447 } else if (isCHARNAME_CONT(
3448 UNI_TO_NATIVE(
81c14aa2 3449 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3450 {
3451 continue;
3452 }
3453 problematic = TRUE;
3454 break;
3455 }
3456 }
3457 if (problematic) {
6e1bad6c
KW
3458 /* The e-i passed to the final %.*s makes sure that
3459 * should the trailing NUL be missing that this
3460 * print won't run off the end of the string */
cb233ae3 3461 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3462 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3463 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3464 }
3465 }
3466 } /* End \N{NAME} */
ff3f963a
KW
3467#ifdef EBCDIC
3468 if (!dorange)
3469 native_range = FALSE; /* \N{} is defined to be Unicode */
3470#endif
3471 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3472 continue;
3473
02aa26ce 3474 /* \c is a control character */
79072805
LW
3475 case 'c':
3476 s++;
961ce445 3477 if (s < send) {
17a3df4c 3478 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3479 }
961ce445
RGS
3480 else {
3481 yyerror("Missing control char name in \\c");
3482 }
79072805 3483 continue;
02aa26ce
NT
3484
3485 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3486 case 'b':
db42d148 3487 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3488 break;
3489 case 'n':
db42d148 3490 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3491 break;
3492 case 'r':
db42d148 3493 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3494 break;
3495 case 'f':
db42d148 3496 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3497 break;
3498 case 't':
db42d148 3499 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3500 break;
34a3fe2a 3501 case 'e':
db42d148 3502 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3503 break;
3504 case 'a':
db42d148 3505 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3506 break;
02aa26ce
NT
3507 } /* end switch */
3508
79072805
LW
3509 s++;
3510 continue;
02aa26ce 3511 } /* end if (backslash) */
4c3a8340
TS
3512#ifdef EBCDIC
3513 else
3514 literal_endpoint++;
3515#endif
02aa26ce 3516
f9a63242 3517 default_action:
77a135fe
KW
3518 /* If we started with encoded form, or already know we want it,
3519 then encode the next character */
3520 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3521 STRLEN len = 1;
77a135fe
KW
3522
3523
3524 /* One might think that it is wasted effort in the case of the
3525 * source being utf8 (this_utf8 == TRUE) to take the next character
3526 * in the source, convert it to an unsigned value, and then convert
3527 * it back again. But the source has not been validated here. The
3528 * routine that does the conversion checks for errors like
3529 * malformed utf8 */
3530
5f66b61c
AL
3531 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3532 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3533 if (!has_utf8) {
3534 SvCUR_set(sv, d - SvPVX_const(sv));
3535 SvPOK_on(sv);
3536 *d = '\0';
77a135fe 3537 /* See Note on sizing above. */
7bf79863
KW
3538 sv_utf8_upgrade_flags_grow(sv,
3539 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3540 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3541 d = SvPVX(sv) + SvCUR(sv);
3542 has_utf8 = TRUE;
3543 } else if (need > len) {
3544 /* encoded value larger than old, may need extra space (NOTE:
3545 * SvCUR() is not set correctly here). See Note on sizing
3546 * above. */
9d4ba2ae 3547 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3548 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3549 }
77a135fe
KW
3550 s += len;
3551
5f66b61c 3552 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3553#ifdef EBCDIC
3554 if (uv > 255 && !dorange)
3555 native_range = FALSE;
3556#endif
2b9d42f0
NIS
3557 }
3558 else {
3559 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3560 }
02aa26ce
NT
3561 } /* while loop to process each character */
3562
3563 /* terminate the string and set up the sv */
79072805 3564 *d = '\0';
95a20fc0 3565 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3566 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3567 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3568 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3569
79072805 3570 SvPOK_on(sv);
9f4817db 3571 if (PL_encoding && !has_utf8) {
d0063567
DK
3572 sv_recode_to_utf8(sv, PL_encoding);
3573 if (SvUTF8(sv))
3574 has_utf8 = TRUE;
9f4817db 3575 }
2b9d42f0 3576 if (has_utf8) {
7e2040f0 3577 SvUTF8_on(sv);
2b9d42f0 3578 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3579 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3580 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3581 }
3582 }
79072805 3583
02aa26ce 3584 /* shrink the sv if we allocated more than we used */
79072805 3585 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3586 SvPV_shrink_to_cur(sv);
79072805 3587 }
02aa26ce 3588
6154021b 3589 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3590 if (s > PL_bufptr) {
eb0d8d16
NC
3591 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3592 const char *const key = PL_lex_inpat ? "qr" : "q";
3593 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3594 const char *type;
3595 STRLEN typelen;
3596
3597 if (PL_lex_inwhat == OP_TRANS) {
3598 type = "tr";
3599 typelen = 2;
3600 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3601 type = "s";
3602 typelen = 1;
9da1dd8f
DM
3603 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3604 type = "q";
3605 typelen = 1;
eb0d8d16
NC
3606 } else {
3607 type = "qq";
3608 typelen = 2;
3609 }
3610
3611 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3612 type, typelen);
3613 }
6154021b 3614 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3615 } else
8990e307 3616 SvREFCNT_dec(sv);
79072805
LW
3617 return s;
3618}
3619
ffb4593c
NT
3620/* S_intuit_more
3621 * Returns TRUE if there's more to the expression (e.g., a subscript),
3622 * FALSE otherwise.
ffb4593c
NT
3623 *
3624 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3625 *
3626 * ->[ and ->{ return TRUE
3627 * { and [ outside a pattern are always subscripts, so return TRUE
3628 * if we're outside a pattern and it's not { or [, then return FALSE
3629 * if we're in a pattern and the first char is a {
3630 * {4,5} (any digits around the comma) returns FALSE
3631 * if we're in a pattern and the first char is a [
3632 * [] returns FALSE
3633 * [SOMETHING] has a funky algorithm to decide whether it's a
3634 * character class or not. It has to deal with things like
3635 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3636 * anything else returns TRUE
3637 */
3638
9cbb5ea2
GS
3639/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3640
76e3520e 3641STATIC int
cea2e8a9 3642S_intuit_more(pTHX_ register char *s)
79072805 3643{
97aff369 3644 dVAR;
7918f24d
NC
3645
3646 PERL_ARGS_ASSERT_INTUIT_MORE;
3647
3280af22 3648 if (PL_lex_brackets)
79072805
LW
3649 return TRUE;
3650 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3651 return TRUE;
3652 if (*s != '{' && *s != '[')
3653 return FALSE;
3280af22 3654 if (!PL_lex_inpat)
79072805
LW
3655 return TRUE;
3656
3657 /* In a pattern, so maybe we have {n,m}. */
3658 if (*s == '{') {
b3155d95 3659 if (regcurly(s)) {
79072805 3660 return FALSE;
b3155d95 3661 }
79072805 3662 return TRUE;
79072805
LW
3663 }
3664
3665 /* On the other hand, maybe we have a character class */
3666
3667 s++;
3668 if (*s == ']' || *s == '^')
3669 return FALSE;
3670 else {
ffb4593c 3671 /* this is terrifying, and it works */
79072805
LW
3672 int weight = 2; /* let's weigh the evidence */
3673 char seen[256];
f27ffc4a 3674 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3675 const char * const send = strchr(s,']');
3280af22 3676 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3677
3678 if (!send) /* has to be an expression */
3679 return TRUE;
3680
3681 Zero(seen,256,char);
3682 if (*s == '$')
3683 weight -= 3;
3684 else if (isDIGIT(*s)) {
3685 if (s[1] != ']') {
3686 if (isDIGIT(s[1]) && s[2] == ']')
3687 weight -= 10;
3688 }
3689 else
3690 weight -= 100;
3691 }
3692 for (; s < send; s++) {
3693 last_un_char = un_char;
3694 un_char = (unsigned char)*s;
3695 switch (*s) {
3696 case '@':
3697 case '&':
3698 case '$':
3699 weight -= seen[un_char] * 10;
7e2040f0 3700 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3701 int len;
8903cb82 3702 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3703 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3704 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3705 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3706 weight -= 100;
3707 else
3708 weight -= 10;
3709 }
3710 else if (*s == '$' && s[1] &&
93a17b20
LW
3711 strchr("[#!%*<>()-=",s[1])) {
3712 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3713 weight -= 10;
3714 else
3715 weight -= 1;
3716 }
3717 break;
3718 case '\\':
3719 un_char = 254;
3720 if (s[1]) {
93a17b20 3721 if (strchr("wds]",s[1]))
79072805 3722 weight += 100;
10edeb5d 3723 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3724 weight += 1;
93a17b20 3725 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3726 weight += 40;
3727 else if (isDIGIT(s[1])) {
3728 weight += 40;
3729 while (s[1] && isDIGIT(s[1]))
3730 s++;
3731 }
3732 }
3733 else
3734 weight += 100;
3735 break;
3736 case '-':
3737 if (s[1] == '\\')
3738 weight += 50;
93a17b20 3739 if (strchr("aA01! ",last_un_char))
79072805 3740 weight += 30;
93a17b20 3741 if (strchr("zZ79~",s[1]))
79072805 3742 weight += 30;