This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Rewrite bogus yylex comment
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
69#define PL_preambled (PL_parser->preambled)
70#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 71#define PL_linestr (PL_parser->linestr)
c2598295
DM
72#define PL_expect (PL_parser->expect)
73#define PL_copline (PL_parser->copline)
f06b5848
DM
74#define PL_bufptr (PL_parser->bufptr)
75#define PL_oldbufptr (PL_parser->oldbufptr)
76#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
77#define PL_linestart (PL_parser->linestart)
78#define PL_bufend (PL_parser->bufend)
79#define PL_last_uni (PL_parser->last_uni)
80#define PL_last_lop (PL_parser->last_lop)
81#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 82#define PL_lex_state (PL_parser->lex_state)
2f9285f8 83#define PL_rsfp (PL_parser->rsfp)
5486870f 84#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
85#define PL_in_my (PL_parser->in_my)
86#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 87#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 88#define PL_multi_end (PL_parser->multi_end)
13765c85 89#define PL_error_count (PL_parser->error_count)
199e78b7
DM
90
91#ifdef PERL_MAD
92# define PL_endwhite (PL_parser->endwhite)
93# define PL_faketokens (PL_parser->faketokens)
94# define PL_lasttoke (PL_parser->lasttoke)
95# define PL_nextwhite (PL_parser->nextwhite)
96# define PL_realtokenstart (PL_parser->realtokenstart)
97# define PL_skipwhite (PL_parser->skipwhite)
98# define PL_thisclose (PL_parser->thisclose)
99# define PL_thismad (PL_parser->thismad)
100# define PL_thisopen (PL_parser->thisopen)
101# define PL_thisstuff (PL_parser->thisstuff)
102# define PL_thistoken (PL_parser->thistoken)
103# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
104# define PL_thiswhite (PL_parser->thiswhite)
105# define PL_nexttoke (PL_parser->nexttoke)
106# define PL_curforce (PL_parser->curforce)
107#else
108# define PL_nexttoke (PL_parser->nexttoke)
109# define PL_nexttype (PL_parser->nexttype)
110# define PL_nextval (PL_parser->nextval)
199e78b7
DM
111#endif
112
0bd48802 113static const char ident_too_long[] = "Identifier too long";
8903cb82 114
29595ff2 115#ifdef PERL_MAD
29595ff2 116# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 117# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 118#else
5db06880 119# define CURMAD(slot,sv)
9ded7720 120# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
121#endif
122
a7aaec61
Z
123#define XENUMMASK 0x3f
124#define XFAKEEOF 0x40
125#define XFAKEBRACK 0x80
9059aa12 126
39e02b42
JH
127#ifdef USE_UTF8_SCRIPTS
128# define UTF (!IN_BYTES)
2b9d42f0 129#else
802a15e9 130# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 131#endif
a0ed51b3 132
b1fc3636
CJ
133/* The maximum number of characters preceding the unrecognized one to display */
134#define UNRECOGNIZED_PRECEDE_COUNT 10
135
61f0cdd9 136/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
137 * 1999-02-27 mjd-perl-patch@plover.com */
138#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
139
bf4acbe4 140#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 141
ffb4593c
NT
142/* LEX_* are values for PL_lex_state, the state of the lexer.
143 * They are arranged oddly so that the guard on the switch statement
79072805 144 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
145 *
146 * These values refer to the various states within a sublex parse,
147 * i.e. within a double quotish string
79072805
LW
148 */
149
fb73857a
PP
150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805
LW
185#ifdef ff_next
186#undef ff_next
d48672a2
LW
187#endif
188
79072805 189#include "keywords.h"
fe14fcc3 190
ffb4593c
NT
191/* CLINE is a macro that ensures PL_copline has a sane value */
192
ae986130
LW
193#ifdef CLINE
194#undef CLINE
195#endif
57843af0 196#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 197
5db06880 198#ifdef PERL_MAD
29595ff2
NC
199# define SKIPSPACE0(s) skipspace0(s)
200# define SKIPSPACE1(s) skipspace1(s)
201# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
202# define PEEKSPACE(s) skipspace2(s,0)
203#else
204# define SKIPSPACE0(s) skipspace(s)
205# define SKIPSPACE1(s) skipspace(s)
206# define SKIPSPACE2(s,tsv) skipspace(s)
207# define PEEKSPACE(s) skipspace(s)
208#endif
209
ffb4593c
NT
210/*
211 * Convenience functions to return different tokens and prime the
9cbb5ea2 212 * lexer for the next token. They all take an argument.
ffb4593c
NT
213 *
214 * TOKEN : generic token (used for '(', DOLSHARP, etc)
215 * OPERATOR : generic operator
216 * AOPERATOR : assignment operator
217 * PREBLOCK : beginning the block after an if, while, foreach, ...
218 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
219 * PREREF : *EXPR where EXPR is not a simple identifier
220 * TERM : expression term
221 * LOOPX : loop exiting command (goto, last, dump, etc)
222 * FTST : file test operator
223 * FUN0 : zero-argument function
7eb971ee 224 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 225 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
226 * BOop : bitwise or or xor
227 * BAop : bitwise and
228 * SHop : shift operator
229 * PWop : power operator
9cbb5ea2 230 * PMop : pattern-matching operator
ffb4593c
NT
231 * Aop : addition-level operator
232 * Mop : multiplication-level operator
233 * Eop : equality-testing operator
e5edeb50 234 * Rop : relational operator <= != gt
ffb4593c
NT
235 *
236 * Also see LOP and lop() below.
237 */
238
998054bd 239#ifdef DEBUGGING /* Serve -DT. */
704d4215 240# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 241#else
bbf60fe6 242# define REPORT(retval) (retval)
998054bd
SC
243#endif
244
bbf60fe6
DM
245#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
246#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
247#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
248#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
249#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
250#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
251#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
252#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
253#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
254#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 255#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
256#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
257#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
258#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
259#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
260#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
261#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
262#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
263#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
264#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
265#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 266
a687059c
LW
267/* This bit of chicanery makes a unary function followed by
268 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
269 * The UNIDOR macro is for unary functions that can be followed by the //
270 * operator (such as C<shift // 0>).
a687059c 271 */
d68ce4ac 272#define UNI3(f,x,have_x) { \
6154021b 273 pl_yylval.ival = f; \
d68ce4ac 274 if (have_x) PL_expect = x; \
376fcdbf
AL
275 PL_bufptr = s; \
276 PL_last_uni = PL_oldbufptr; \
277 PL_last_lop_op = f; \
278 if (*s == '(') \
279 return REPORT( (int)FUNC1 ); \
29595ff2 280 s = PEEKSPACE(s); \
376fcdbf
AL
281 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
282 }
d68ce4ac
FC
283#define UNI(f) UNI3(f,XTERM,1)
284#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
285#define UNIPROTO(f,optional) { \
286 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
287 OPERATOR(f); \
288 }
a687059c 289
d68ce4ac 290#define UNIBRACK(f) UNI3(f,0,0)
79072805 291
9f68db38 292/* grandfather return to old style */
78cdf107
Z
293#define OLDLOP(f) \
294 do { \
295 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
296 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
297 pl_yylval.ival = (f); \
298 PL_expect = XTERM; \
299 PL_bufptr = s; \
300 return (int)LSTOP; \
301 } while(0)
79072805 302
83944c01
FC
303#define COPLINE_INC_WITH_HERELINES \
304 STMT_START { \
305 CopLINE_inc(PL_curcop); \
d794b522
FC
306 if (PL_parser->lex_shared->herelines) \
307 CopLINE(PL_curcop) += PL_parser->lex_shared->herelines, \
308 PL_parser->lex_shared->herelines = 0; \
83944c01
FC
309 } STMT_END
310
311
8fa7f367
JH
312#ifdef DEBUGGING
313
6154021b 314/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
315enum token_type {
316 TOKENTYPE_NONE,
317 TOKENTYPE_IVAL,
6154021b 318 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 319 TOKENTYPE_PVAL,
aeaef349 320 TOKENTYPE_OPVAL
bbf60fe6
DM
321};
322
6d4a66ac
NC
323static struct debug_tokens {
324 const int token;
325 enum token_type type;
326 const char *name;
327} const debug_tokens[] =
9041c2e3 328{
bbf60fe6
DM
329 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
330 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
331 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
332 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
333 { ARROW, TOKENTYPE_NONE, "ARROW" },
334 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
335 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
336 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
337 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
338 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 339 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
340 { DO, TOKENTYPE_NONE, "DO" },
341 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
342 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
343 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
344 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
345 { ELSE, TOKENTYPE_NONE, "ELSE" },
346 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
347 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
348 { FOR, TOKENTYPE_IVAL, "FOR" },
349 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
350 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
351 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
352 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
353 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 354 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
355 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
356 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
357 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 358 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
359 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
360 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 361 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
362 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
363 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
364 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
365 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
366 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
367 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
368 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
369 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
370 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
371 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
372 { OROP, TOKENTYPE_IVAL, "OROP" },
373 { OROR, TOKENTYPE_NONE, "OROR" },
374 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 375 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
376 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
377 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
378 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
379 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
380 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
381 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
382 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
383 { PREINC, TOKENTYPE_NONE, "PREINC" },
384 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 385 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
386 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
387 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 388 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
389 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
390 { SUB, TOKENTYPE_NONE, "SUB" },
391 { THING, TOKENTYPE_OPVAL, "THING" },
392 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
393 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
394 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
395 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
396 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
397 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 398 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
399 { WHILE, TOKENTYPE_IVAL, "WHILE" },
400 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 401 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 402 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
403};
404
6154021b 405/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 406
bbf60fe6 407STATIC int
704d4215 408S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 409{
97aff369 410 dVAR;
7918f24d
NC
411
412 PERL_ARGS_ASSERT_TOKEREPORT;
413
bbf60fe6 414 if (DEBUG_T_TEST) {
bd61b366 415 const char *name = NULL;
bbf60fe6 416 enum token_type type = TOKENTYPE_NONE;
f54cb97a 417 const struct debug_tokens *p;
396482e1 418 SV* const report = newSVpvs("<== ");
bbf60fe6 419
f54cb97a 420 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
421 if (p->token == (int)rv) {
422 name = p->name;
423 type = p->type;
424 break;
425 }
426 }
427 if (name)
54667de8 428 Perl_sv_catpv(aTHX_ report, name);
74736ae6 429 else if ((char)rv > ' ' && (char)rv <= '~')
bbf60fe6
DM
430 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
431 else if (!rv)
396482e1 432 sv_catpvs(report, "EOF");
bbf60fe6
DM
433 else
434 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
435 switch (type) {
436 case TOKENTYPE_NONE:
bbf60fe6
DM
437 break;
438 case TOKENTYPE_IVAL:
704d4215 439 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_OPNUM:
442 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 443 PL_op_name[lvalp->ival]);
bbf60fe6
DM
444 break;
445 case TOKENTYPE_PVAL:
704d4215 446 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_OPVAL:
704d4215 449 if (lvalp->opval) {
401441c0 450 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
451 PL_op_name[lvalp->opval->op_type]);
452 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 453 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 454 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
455 }
456
457 }
401441c0 458 else
396482e1 459 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
460 break;
461 }
b6007c36 462 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
463 };
464 return (int)rv;
998054bd
SC
465}
466
b6007c36
DM
467
468/* print the buffer with suitable escapes */
469
470STATIC void
15f169a1 471S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 472{
396482e1 473 SV* const tmp = newSVpvs("");
7918f24d
NC
474
475 PERL_ARGS_ASSERT_PRINTBUF;
476
b6007c36
DM
477 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
478 SvREFCNT_dec(tmp);
479}
480
8fa7f367
JH
481#endif
482
8290c323
NC
483static int
484S_deprecate_commaless_var_list(pTHX) {
485 PL_expect = XTERM;
486 deprecate("comma-less variable list");
487 return REPORT(','); /* grandfather non-comma-format format */
488}
489
ffb4593c
NT
490/*
491 * S_ao
492 *
c963b151
BD
493 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
494 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
495 */
496
76e3520e 497STATIC int
cea2e8a9 498S_ao(pTHX_ int toketype)
a0d0e21e 499{
97aff369 500 dVAR;
3280af22
NIS
501 if (*PL_bufptr == '=') {
502 PL_bufptr++;
a0d0e21e 503 if (toketype == ANDAND)
6154021b 504 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 505 else if (toketype == OROR)
6154021b 506 pl_yylval.ival = OP_ORASSIGN;
c963b151 507 else if (toketype == DORDOR)
6154021b 508 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
509 toketype = ASSIGNOP;
510 }
511 return toketype;
512}
513
ffb4593c
NT
514/*
515 * S_no_op
516 * When Perl expects an operator and finds something else, no_op
517 * prints the warning. It always prints "<something> found where
518 * operator expected. It prints "Missing semicolon on previous line?"
519 * if the surprise occurs at the start of the line. "do you need to
520 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
521 * where the compiler doesn't know if foo is a method call or a function.
522 * It prints "Missing operator before end of line" if there's nothing
523 * after the missing operator, or "... before <...>" if there is something
524 * after the missing operator.
525 */
526
76e3520e 527STATIC void
15f169a1 528S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 529{
97aff369 530 dVAR;
9d4ba2ae
AL
531 char * const oldbp = PL_bufptr;
532 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 533
7918f24d
NC
534 PERL_ARGS_ASSERT_NO_OP;
535
1189a94a
GS
536 if (!s)
537 s = oldbp;
07c798fb 538 else
1189a94a 539 PL_bufptr = s;
734ab321 540 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
541 if (ckWARN_d(WARN_SYNTAX)) {
542 if (is_first)
543 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
544 "\t(Missing semicolon on previous line?)\n");
545 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 546 const char *t;
734ab321
BF
547 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
548 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 549 NOOP;
56da5a46
RGS
550 if (t < PL_bufptr && isSPACE(*t))
551 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
552 "\t(Do you need to predeclare %"SVf"?)\n",
553 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
554 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
555 }
556 else {
557 assert(s >= oldbp);
558 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
559 "\t(Missing operator before %"SVf"?)\n",
560 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
561 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 562 }
07c798fb 563 }
3280af22 564 PL_bufptr = oldbp;
8990e307
LW
565}
566
ffb4593c
NT
567/*
568 * S_missingterm
569 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 570 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
571 * If we're in a delimited string and the delimiter is a control
572 * character, it's reformatted into a two-char sequence like ^C.
573 * This is fatal.
574 */
575
76e3520e 576STATIC void
cea2e8a9 577S_missingterm(pTHX_ char *s)
8990e307 578{
97aff369 579 dVAR;
8990e307
LW
580 char tmpbuf[3];
581 char q;
582 if (s) {
9d4ba2ae 583 char * const nl = strrchr(s,'\n');
d2719217 584 if (nl)
8990e307
LW
585 *nl = '\0';
586 }
463559e7 587 else if (isCNTRL(PL_multi_close)) {
8990e307 588 *tmpbuf = '^';
585ec06d 589 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
590 tmpbuf[2] = '\0';
591 s = tmpbuf;
592 }
593 else {
eb160463 594 *tmpbuf = (char)PL_multi_close;
8990e307
LW
595 tmpbuf[1] = '\0';
596 s = tmpbuf;
597 }
598 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 599 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 600}
79072805 601
dd0ac2b9
FC
602#include "feature.h"
603
0d863452 604/*
0d863452
RH
605 * Check whether the named feature is enabled.
606 */
26ea9e12 607bool
3fff3427 608Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 609{
97aff369 610 dVAR;
4a731d7b 611 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
612
613 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
614
615 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 616
26ea9e12
NC
617 if (namelen > MAX_FEATURE_LEN)
618 return FALSE;
3fff3427 619 memcpy(&he_name[8], name, namelen);
7d69d4a6 620
c8ca97b0
NC
621 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
622 REFCOUNTED_HE_EXISTS));
0d863452
RH
623}
624
ffb4593c 625/*
9cbb5ea2
GS
626 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
627 * utf16-to-utf8-reversed.
ffb4593c
NT
628 */
629
c39cd008
GS
630#ifdef PERL_CR_FILTER
631static void
632strip_return(SV *sv)
633{
eb578fdb
KW
634 const char *s = SvPVX_const(sv);
635 const char * const e = s + SvCUR(sv);
7918f24d
NC
636
637 PERL_ARGS_ASSERT_STRIP_RETURN;
638
c39cd008
GS
639 /* outer loop optimized to do nothing if there are no CR-LFs */
640 while (s < e) {
641 if (*s++ == '\r' && *s == '\n') {
642 /* hit a CR-LF, need to copy the rest */
eb578fdb 643 char *d = s - 1;
c39cd008
GS
644 *d++ = *s++;
645 while (s < e) {
646 if (*s == '\r' && s[1] == '\n')
647 s++;
648 *d++ = *s++;
649 }
650 SvCUR(sv) -= s - d;
651 return;
652 }
653 }
654}
a868473f 655
76e3520e 656STATIC I32
c39cd008 657S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 658{
f54cb97a 659 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
660 if (count > 0 && !maxlen)
661 strip_return(sv);
662 return count;
a868473f
NIS
663}
664#endif
665
ffb4593c 666/*
8eaa0acf
Z
667=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
668
669Creates and initialises a new lexer/parser state object, supplying
670a context in which to lex and parse from a new source of Perl code.
671A pointer to the new state object is placed in L</PL_parser>. An entry
672is made on the save stack so that upon unwinding the new state object
673will be destroyed and the former value of L</PL_parser> will be restored.
674Nothing else need be done to clean up the parsing context.
675
676The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
677non-null, provides a string (in SV form) containing code to be parsed.
678A copy of the string is made, so subsequent modification of I<line>
679does not affect parsing. I<rsfp>, if non-null, provides an input stream
680from which code will be read to be parsed. If both are non-null, the
681code in I<line> comes first and must consist of complete lines of input,
682and I<rsfp> supplies the remainder of the source.
683
e368b3bd
FC
684The I<flags> parameter is reserved for future use. Currently it is only
685used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
686
687=cut
688*/
ffb4593c 689
27fcb6ee 690/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
691 can share filters with the current parser.
692 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
693 caller, hence isn't owned by the parser, so shouldn't be closed on parser
694 destruction. This is used to handle the case of defaulting to reading the
695 script from the standard input because no filename was given on the command
696 line (without getting confused by situation where STDIN has been closed, so
697 the script handle is opened on fd 0) */
27fcb6ee 698
a0d0e21e 699void
8eaa0acf 700Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 701{
97aff369 702 dVAR;
6ef55633 703 const char *s = NULL;
5486870f 704 yy_parser *parser, *oparser;
60d63348 705 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 706 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
707
708 /* create and initialise a parser */
709
199e78b7 710 Newxz(parser, 1, yy_parser);
5486870f 711 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
712 PL_parser = parser;
713
28ac2b49
Z
714 parser->stack = NULL;
715 parser->ps = NULL;
716 parser->stack_size = 0;
acdf0a21 717
e3abe207
DM
718 /* on scope exit, free this parser and restore any outer one */
719 SAVEPARSER(parser);
7c4baf47 720 parser->saved_curcop = PL_curcop;
e3abe207 721
acdf0a21 722 /* initialise lexer state */
8990e307 723
fb205e7a
DM
724#ifdef PERL_MAD
725 parser->curforce = -1;
726#else
727 parser->nexttoke = 0;
728#endif
ca4cfd28 729 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 730 parser->copline = NOLINE;
5afb0a62 731 parser->lex_state = LEX_NORMAL;
c2598295 732 parser->expect = XSTATE;
2f9285f8 733 parser->rsfp = rsfp;
27fcb6ee
FC
734 parser->rsfp_filters =
735 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
736 ? NULL
737 : MUTABLE_AV(SvREFCNT_inc(
738 oparser->rsfp_filters
739 ? oparser->rsfp_filters
740 : (oparser->rsfp_filters = newAV())
741 ));
2f9285f8 742
199e78b7
DM
743 Newx(parser->lex_brackstack, 120, char);
744 Newx(parser->lex_casestack, 12, char);
745 *parser->lex_casestack = '\0';
d794b522 746 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 747
10efb74f 748 if (line) {
0528fd32 749 STRLEN len;
10efb74f 750 s = SvPV_const(line, len);
0abcdfa4
FC
751 parser->linestr = flags & LEX_START_COPIED
752 ? SvREFCNT_inc_simple_NN(line)
753 : newSVpvn_flags(s, len, SvUTF8(line));
11076590 754 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
755 } else {
756 parser->linestr = newSVpvs("\n;");
8990e307 757 }
f06b5848
DM
758 parser->oldoldbufptr =
759 parser->oldbufptr =
760 parser->bufptr =
761 parser->linestart = SvPVX(parser->linestr);
762 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
763 parser->last_lop = parser->last_uni = NULL;
87606032
NC
764 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
765 |LEX_DONT_CLOSE_RSFP);
737c24fc 766
60d63348 767 parser->in_pod = parser->filtered = 0;
79072805 768}
a687059c 769
e3abe207
DM
770
771/* delete a parser object */
772
773void
774Perl_parser_free(pTHX_ const yy_parser *parser)
775{
7918f24d
NC
776 PERL_ARGS_ASSERT_PARSER_FREE;
777
7c4baf47 778 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
779 SvREFCNT_dec(parser->linestr);
780
87606032 781 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 782 PerlIO_clearerr(parser->rsfp);
799361c3
SH
783 else if (parser->rsfp && (!parser->old_parser ||
784 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 785 PerlIO_close(parser->rsfp);
5486870f 786 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 787
e3abe207
DM
788 Safefree(parser->lex_brackstack);
789 Safefree(parser->lex_casestack);
d794b522 790 Safefree(parser->lex_shared);
e3abe207
DM
791 PL_parser = parser->old_parser;
792 Safefree(parser);
793}
794
795
ffb4593c 796/*
f0e67a1d
Z
797=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
798
799Buffer scalar containing the chunk currently under consideration of the
800text currently being lexed. This is always a plain string scalar (for
801which C<SvPOK> is true). It is not intended to be used as a scalar by
802normal scalar means; instead refer to the buffer directly by the pointer
803variables described below.
804
805The lexer maintains various C<char*> pointers to things in the
806C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
807reallocated, all of these pointers must be updated. Don't attempt to
808do this manually, but rather use L</lex_grow_linestr> if you need to
809reallocate the buffer.
810
811The content of the text chunk in the buffer is commonly exactly one
812complete line of input, up to and including a newline terminator,
813but there are situations where it is otherwise. The octets of the
814buffer may be intended to be interpreted as either UTF-8 or Latin-1.
815The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
816flag on this scalar, which may disagree with it.
817
818For direct examination of the buffer, the variable
819L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
820lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
821of these pointers is usually preferable to examination of the scalar
822through normal scalar means.
823
824=for apidoc AmxU|char *|PL_parser-E<gt>bufend
825
826Direct pointer to the end of the chunk of text currently being lexed, the
827end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
828+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
829always located at the end of the buffer, and does not count as part of
830the buffer's contents.
831
832=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
833
834Points to the current position of lexing inside the lexer buffer.
835Characters around this point may be freely examined, within
836the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
837L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
838interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
839
840Lexing code (whether in the Perl core or not) moves this pointer past
841the characters that it consumes. It is also expected to perform some
842bookkeeping whenever a newline character is consumed. This movement
843can be more conveniently performed by the function L</lex_read_to>,
844which handles newlines appropriately.
845
846Interpretation of the buffer's octets can be abstracted out by
847using the slightly higher-level functions L</lex_peek_unichar> and
848L</lex_read_unichar>.
849
850=for apidoc AmxU|char *|PL_parser-E<gt>linestart
851
852Points to the start of the current line inside the lexer buffer.
853This is useful for indicating at which column an error occurred, and
854not much else. This must be updated by any lexing code that consumes
855a newline; the function L</lex_read_to> handles this detail.
856
857=cut
858*/
859
860/*
861=for apidoc Amx|bool|lex_bufutf8
862
863Indicates whether the octets in the lexer buffer
864(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
865of Unicode characters. If not, they should be interpreted as Latin-1
866characters. This is analogous to the C<SvUTF8> flag for scalars.
867
868In UTF-8 mode, it is not guaranteed that the lexer buffer actually
869contains valid UTF-8. Lexing code must be robust in the face of invalid
870encoding.
871
872The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
873is significant, but not the whole story regarding the input character
874encoding. Normally, when a file is being read, the scalar contains octets
875and its C<SvUTF8> flag is off, but the octets should be interpreted as
876UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
877however, the scalar may have the C<SvUTF8> flag on, and in this case its
878octets should be interpreted as UTF-8 unless the C<use bytes> pragma
879is in effect. This logic may change in the future; use this function
880instead of implementing the logic yourself.
881
882=cut
883*/
884
885bool
886Perl_lex_bufutf8(pTHX)
887{
888 return UTF;
889}
890
891/*
892=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
893
894Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
895at least I<len> octets (including terminating NUL). Returns a
896pointer to the reallocated buffer. This is necessary before making
897any direct modification of the buffer that would increase its length.
898L</lex_stuff_pvn> provides a more convenient way to insert text into
899the buffer.
900
901Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
902this function updates all of the lexer's variables that point directly
903into the buffer.
904
905=cut
906*/
907
908char *
909Perl_lex_grow_linestr(pTHX_ STRLEN len)
910{
911 SV *linestr;
912 char *buf;
913 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 914 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
915 linestr = PL_parser->linestr;
916 buf = SvPVX(linestr);
917 if (len <= SvLEN(linestr))
918 return buf;
919 bufend_pos = PL_parser->bufend - buf;
920 bufptr_pos = PL_parser->bufptr - buf;
921 oldbufptr_pos = PL_parser->oldbufptr - buf;
922 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
923 linestart_pos = PL_parser->linestart - buf;
924 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
925 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
926 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
927 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 928
f0e67a1d 929 buf = sv_grow(linestr, len);
c7641931 930
f0e67a1d
Z
931 PL_parser->bufend = buf + bufend_pos;
932 PL_parser->bufptr = buf + bufptr_pos;
933 PL_parser->oldbufptr = buf + oldbufptr_pos;
934 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
935 PL_parser->linestart = buf + linestart_pos;
936 if (PL_parser->last_uni)
937 PL_parser->last_uni = buf + last_uni_pos;
938 if (PL_parser->last_lop)
939 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
940 if (PL_parser->lex_shared->re_eval_start)
941 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
942 return buf;
943}
944
945/*
83aa740e 946=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
947
948Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
949immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
950reallocating the buffer if necessary. This means that lexing code that
951runs later will see the characters as if they had appeared in the input.
952It is not recommended to do this as part of normal parsing, and most
953uses of this facility run the risk of the inserted characters being
954interpreted in an unintended manner.
955
956The string to be inserted is represented by I<len> octets starting
957at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
958according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
959The characters are recoded for the lexer buffer, according to how the
960buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 961to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
962function is more convenient.
963
964=cut
965*/
966
967void
83aa740e 968Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 969{
749123ff 970 dVAR;
f0e67a1d
Z
971 char *bufptr;
972 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
973 if (flags & ~(LEX_STUFF_UTF8))
974 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
975 if (UTF) {
976 if (flags & LEX_STUFF_UTF8) {
977 goto plain_copy;
978 } else {
979 STRLEN highhalf = 0;
83aa740e 980 const char *p, *e = pv+len;
f0e67a1d
Z
981 for (p = pv; p != e; p++)
982 highhalf += !!(((U8)*p) & 0x80);
983 if (!highhalf)
984 goto plain_copy;
985 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
986 bufptr = PL_parser->bufptr;
987 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
988 SvCUR_set(PL_parser->linestr,
989 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
990 PL_parser->bufend += len+highhalf;
991 for (p = pv; p != e; p++) {
992 U8 c = (U8)*p;
993 if (c & 0x80) {
994 *bufptr++ = (char)(0xc0 | (c >> 6));
995 *bufptr++ = (char)(0x80 | (c & 0x3f));
996 } else {
997 *bufptr++ = (char)c;
998 }
999 }
1000 }
1001 } else {
1002 if (flags & LEX_STUFF_UTF8) {
1003 STRLEN highhalf = 0;
83aa740e 1004 const char *p, *e = pv+len;
f0e67a1d
Z
1005 for (p = pv; p != e; p++) {
1006 U8 c = (U8)*p;
1007 if (c >= 0xc4) {
1008 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1009 "non-Latin-1 character into Latin-1 input");
1010 } else if (c >= 0xc2 && p+1 != e &&
1011 (((U8)p[1]) & 0xc0) == 0x80) {
1012 p++;
1013 highhalf++;
1014 } else if (c >= 0x80) {
1015 /* malformed UTF-8 */
1016 ENTER;
1017 SAVESPTR(PL_warnhook);
1018 PL_warnhook = PERL_WARNHOOK_FATAL;
1019 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1020 LEAVE;
1021 }
1022 }
1023 if (!highhalf)
1024 goto plain_copy;
1025 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1026 bufptr = PL_parser->bufptr;
1027 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1028 SvCUR_set(PL_parser->linestr,
1029 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1030 PL_parser->bufend += len-highhalf;
1031 for (p = pv; p != e; p++) {
1032 U8 c = (U8)*p;
1033 if (c & 0x80) {
1034 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1035 p++;
1036 } else {
1037 *bufptr++ = (char)c;
1038 }
1039 }
1040 } else {
1041 plain_copy:
1042 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1043 bufptr = PL_parser->bufptr;
1044 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1045 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1046 PL_parser->bufend += len;
1047 Copy(pv, bufptr, len, char);
1048 }
1049 }
1050}
1051
1052/*
9dcc53ea
Z
1053=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1054
1055Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1056immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1057reallocating the buffer if necessary. This means that lexing code that
1058runs later will see the characters as if they had appeared in the input.
1059It is not recommended to do this as part of normal parsing, and most
1060uses of this facility run the risk of the inserted characters being
1061interpreted in an unintended manner.
1062
1063The string to be inserted is represented by octets starting at I<pv>
1064and continuing to the first nul. These octets are interpreted as either
1065UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1066in I<flags>. The characters are recoded for the lexer buffer, according
1067to how the buffer is currently being interpreted (L</lex_bufutf8>).
1068If it is not convenient to nul-terminate a string to be inserted, the
1069L</lex_stuff_pvn> function is more appropriate.
1070
1071=cut
1072*/
1073
1074void
1075Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1076{
1077 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1078 lex_stuff_pvn(pv, strlen(pv), flags);
1079}
1080
1081/*
f0e67a1d
Z
1082=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1083
1084Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1085immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1086reallocating the buffer if necessary. This means that lexing code that
1087runs later will see the characters as if they had appeared in the input.
1088It is not recommended to do this as part of normal parsing, and most
1089uses of this facility run the risk of the inserted characters being
1090interpreted in an unintended manner.
1091
1092The string to be inserted is the string value of I<sv>. The characters
1093are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1094being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1095not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1096need to construct a scalar.
1097
1098=cut
1099*/
1100
1101void
1102Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1103{
1104 char *pv;
1105 STRLEN len;
1106 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1107 if (flags)
1108 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1109 pv = SvPV(sv, len);
1110 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1111}
1112
1113/*
1114=for apidoc Amx|void|lex_unstuff|char *ptr
1115
1116Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1117I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1118This hides the discarded text from any lexing code that runs later,
1119as if the text had never appeared.
1120
1121This is not the normal way to consume lexed text. For that, use
1122L</lex_read_to>.
1123
1124=cut
1125*/
1126
1127void
1128Perl_lex_unstuff(pTHX_ char *ptr)
1129{
1130 char *buf, *bufend;
1131 STRLEN unstuff_len;
1132 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1133 buf = PL_parser->bufptr;
1134 if (ptr < buf)
1135 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1136 if (ptr == buf)
1137 return;
1138 bufend = PL_parser->bufend;
1139 if (ptr > bufend)
1140 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1141 unstuff_len = ptr - buf;
1142 Move(ptr, buf, bufend+1-ptr, char);
1143 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1144 PL_parser->bufend = bufend - unstuff_len;
1145}
1146
1147/*
1148=for apidoc Amx|void|lex_read_to|char *ptr
1149
1150Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1151to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1152performing the correct bookkeeping whenever a newline character is passed.
1153This is the normal way to consume lexed text.
1154
1155Interpretation of the buffer's octets can be abstracted out by
1156using the slightly higher-level functions L</lex_peek_unichar> and
1157L</lex_read_unichar>.
1158
1159=cut
1160*/
1161
1162void
1163Perl_lex_read_to(pTHX_ char *ptr)
1164{
1165 char *s;
1166 PERL_ARGS_ASSERT_LEX_READ_TO;
1167 s = PL_parser->bufptr;
1168 if (ptr < s || ptr > PL_parser->bufend)
1169 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1170 for (; s != ptr; s++)
1171 if (*s == '\n') {
83944c01 1172 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1173 PL_parser->linestart = s+1;
1174 }
1175 PL_parser->bufptr = ptr;
1176}
1177
1178/*
1179=for apidoc Amx|void|lex_discard_to|char *ptr
1180
1181Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1182up to I<ptr>. The remaining content of the buffer will be moved, and
1183all pointers into the buffer updated appropriately. I<ptr> must not
1184be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1185it is not permitted to discard text that has yet to be lexed.
1186
1187Normally it is not necessarily to do this directly, because it suffices to
1188use the implicit discarding behaviour of L</lex_next_chunk> and things
1189based on it. However, if a token stretches across multiple lines,
1f317c95 1190and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1191that purpose, then after completion of the token it would be wise to
1192explicitly discard the now-unneeded earlier lines, to avoid future
1193multi-line tokens growing the buffer without bound.
1194
1195=cut
1196*/
1197
1198void
1199Perl_lex_discard_to(pTHX_ char *ptr)
1200{
1201 char *buf;
1202 STRLEN discard_len;
1203 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1204 buf = SvPVX(PL_parser->linestr);
1205 if (ptr < buf)
1206 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1207 if (ptr == buf)
1208 return;
1209 if (ptr > PL_parser->bufptr)
1210 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1211 discard_len = ptr - buf;
1212 if (PL_parser->oldbufptr < ptr)
1213 PL_parser->oldbufptr = ptr;
1214 if (PL_parser->oldoldbufptr < ptr)
1215 PL_parser->oldoldbufptr = ptr;
1216 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1217 PL_parser->last_uni = NULL;
1218 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1219 PL_parser->last_lop = NULL;
1220 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1221 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1222 PL_parser->bufend -= discard_len;
1223 PL_parser->bufptr -= discard_len;
1224 PL_parser->oldbufptr -= discard_len;
1225 PL_parser->oldoldbufptr -= discard_len;
1226 if (PL_parser->last_uni)
1227 PL_parser->last_uni -= discard_len;
1228 if (PL_parser->last_lop)
1229 PL_parser->last_lop -= discard_len;
1230}
1231
1232/*
1233=for apidoc Amx|bool|lex_next_chunk|U32 flags
1234
1235Reads in the next chunk of text to be lexed, appending it to
1236L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1237looked to the end of the current chunk and wants to know more. It is
1238usual, but not necessary, for lexing to have consumed the entirety of
1239the current chunk at this time.
1240
1241If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1242chunk (i.e., the current chunk has been entirely consumed), normally the
1243current chunk will be discarded at the same time that the new chunk is
1244read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1245will not be discarded. If the current chunk has not been entirely
1246consumed, then it will not be discarded regardless of the flag.
1247
1248Returns true if some new text was added to the buffer, or false if the
1249buffer has reached the end of the input text.
1250
1251=cut
1252*/
1253
1254#define LEX_FAKE_EOF 0x80000000
112d1284 1255#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1256
1257bool
1258Perl_lex_next_chunk(pTHX_ U32 flags)
1259{
1260 SV *linestr;
1261 char *buf;
1262 STRLEN old_bufend_pos, new_bufend_pos;
1263 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1264 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1265 bool got_some_for_debugger = 0;
f0e67a1d 1266 bool got_some;
112d1284 1267 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1268 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1269 linestr = PL_parser->linestr;
1270 buf = SvPVX(linestr);
1271 if (!(flags & LEX_KEEP_PREVIOUS) &&
1272 PL_parser->bufptr == PL_parser->bufend) {
1273 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1274 linestart_pos = 0;
1275 if (PL_parser->last_uni != PL_parser->bufend)
1276 PL_parser->last_uni = NULL;
1277 if (PL_parser->last_lop != PL_parser->bufend)
1278 PL_parser->last_lop = NULL;
1279 last_uni_pos = last_lop_pos = 0;
1280 *buf = 0;
1281 SvCUR(linestr) = 0;
1282 } else {
1283 old_bufend_pos = PL_parser->bufend - buf;
1284 bufptr_pos = PL_parser->bufptr - buf;
1285 oldbufptr_pos = PL_parser->oldbufptr - buf;
1286 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1287 linestart_pos = PL_parser->linestart - buf;
1288 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1289 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1290 }
1291 if (flags & LEX_FAKE_EOF) {
1292 goto eof;
60d63348 1293 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1294 got_some = 0;
1295 } else if (filter_gets(linestr, old_bufend_pos)) {
1296 got_some = 1;
17cc9359 1297 got_some_for_debugger = 1;
112d1284
FC
1298 } else if (flags & LEX_NO_TERM) {
1299 got_some = 0;
f0e67a1d 1300 } else {
580561a3
Z
1301 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1302 sv_setpvs(linestr, "");
f0e67a1d
Z
1303 eof:
1304 /* End of real input. Close filehandle (unless it was STDIN),
1305 * then add implicit termination.
1306 */
87606032 1307 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1308 PerlIO_clearerr(PL_parser->rsfp);
1309 else if (PL_parser->rsfp)
1310 (void)PerlIO_close(PL_parser->rsfp);
1311 PL_parser->rsfp = NULL;
60d63348 1312 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1313#ifdef PERL_MAD
1314 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1315 PL_faketokens = 1;
1316#endif
1317 if (!PL_in_eval && PL_minus_p) {
1318 sv_catpvs(linestr,
1319 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1320 PL_minus_n = PL_minus_p = 0;
1321 } else if (!PL_in_eval && PL_minus_n) {
1322 sv_catpvs(linestr, /*{*/";}");
1323 PL_minus_n = 0;
1324 } else
1325 sv_catpvs(linestr, ";");
1326 got_some = 1;
1327 }
1328 buf = SvPVX(linestr);
1329 new_bufend_pos = SvCUR(linestr);
1330 PL_parser->bufend = buf + new_bufend_pos;
1331 PL_parser->bufptr = buf + bufptr_pos;
1332 PL_parser->oldbufptr = buf + oldbufptr_pos;
1333 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1334 PL_parser->linestart = buf + linestart_pos;
1335 if (PL_parser->last_uni)
1336 PL_parser->last_uni = buf + last_uni_pos;
1337 if (PL_parser->last_lop)
1338 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1339 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1340 PL_curstash != PL_debstash) {
1341 /* debugger active and we're not compiling the debugger code,
1342 * so store the line into the debugger's array of lines
1343 */
1344 update_debugger_info(NULL, buf+old_bufend_pos,
1345 new_bufend_pos-old_bufend_pos);
1346 }
1347 return got_some;
1348}
1349
1350/*
1351=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1352
1353Looks ahead one (Unicode) character in the text currently being lexed.
1354Returns the codepoint (unsigned integer value) of the next character,
1355or -1 if lexing has reached the end of the input text. To consume the
1356peeked character, use L</lex_read_unichar>.
1357
1358If the next character is in (or extends into) the next chunk of input
1359text, the next chunk will be read in. Normally the current chunk will be
1360discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1361then the current chunk will not be discarded.
1362
1363If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1364is encountered, an exception is generated.
1365
1366=cut
1367*/
1368
1369I32
1370Perl_lex_peek_unichar(pTHX_ U32 flags)
1371{
749123ff 1372 dVAR;
f0e67a1d
Z
1373 char *s, *bufend;
1374 if (flags & ~(LEX_KEEP_PREVIOUS))
1375 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1376 s = PL_parser->bufptr;
1377 bufend = PL_parser->bufend;
1378 if (UTF) {
1379 U8 head;
1380 I32 unichar;
1381 STRLEN len, retlen;
1382 if (s == bufend) {
1383 if (!lex_next_chunk(flags))
1384 return -1;
1385 s = PL_parser->bufptr;
1386 bufend = PL_parser->bufend;
1387 }
1388 head = (U8)*s;
1389 if (!(head & 0x80))
1390 return head;
1391 if (head & 0x40) {
1392 len = PL_utf8skip[head];
1393 while ((STRLEN)(bufend-s) < len) {
1394 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1395 break;
1396 s = PL_parser->bufptr;
1397 bufend = PL_parser->bufend;
1398 }
1399 }
1400 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1401 if (retlen == (STRLEN)-1) {
1402 /* malformed UTF-8 */
1403 ENTER;
1404 SAVESPTR(PL_warnhook);
1405 PL_warnhook = PERL_WARNHOOK_FATAL;
1406 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1407 LEAVE;
1408 }
1409 return unichar;
1410 } else {
1411 if (s == bufend) {
1412 if (!lex_next_chunk(flags))
1413 return -1;
1414 s = PL_parser->bufptr;
1415 }
1416 return (U8)*s;
1417 }
1418}
1419
1420/*
1421=for apidoc Amx|I32|lex_read_unichar|U32 flags
1422
1423Reads the next (Unicode) character in the text currently being lexed.
1424Returns the codepoint (unsigned integer value) of the character read,
1425and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1426if lexing has reached the end of the input text. To non-destructively
1427examine the next character, use L</lex_peek_unichar> instead.
1428
1429If the next character is in (or extends into) the next chunk of input
1430text, the next chunk will be read in. Normally the current chunk will be
1431discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1432then the current chunk will not be discarded.
1433
1434If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1435is encountered, an exception is generated.
1436
1437=cut
1438*/
1439
1440I32
1441Perl_lex_read_unichar(pTHX_ U32 flags)
1442{
1443 I32 c;
1444 if (flags & ~(LEX_KEEP_PREVIOUS))
1445 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1446 c = lex_peek_unichar(flags);
1447 if (c != -1) {
1448 if (c == '\n')
83944c01 1449 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1450 if (UTF)
1451 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1452 else
1453 ++(PL_parser->bufptr);
f0e67a1d
Z
1454 }
1455 return c;
1456}
1457
1458/*
1459=for apidoc Amx|void|lex_read_space|U32 flags
1460
1461Reads optional spaces, in Perl style, in the text currently being
1462lexed. The spaces may include ordinary whitespace characters and
1463Perl-style comments. C<#line> directives are processed if encountered.
1464L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1465at a non-space character (or the end of the input text).
1466
1467If spaces extend into the next chunk of input text, the next chunk will
1468be read in. Normally the current chunk will be discarded at the same
1469time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1470chunk will not be discarded.
1471
1472=cut
1473*/
1474
f0998909
Z
1475#define LEX_NO_NEXT_CHUNK 0x80000000
1476
f0e67a1d
Z
1477void
1478Perl_lex_read_space(pTHX_ U32 flags)
1479{
1480 char *s, *bufend;
1481 bool need_incline = 0;
f0998909 1482 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1483 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1484#ifdef PERL_MAD
1485 if (PL_skipwhite) {
1486 sv_free(PL_skipwhite);
1487 PL_skipwhite = NULL;
1488 }
1489 if (PL_madskills)
1490 PL_skipwhite = newSVpvs("");
1491#endif /* PERL_MAD */
1492 s = PL_parser->bufptr;
1493 bufend = PL_parser->bufend;
1494 while (1) {
1495 char c = *s;
1496 if (c == '#') {
1497 do {
1498 c = *++s;
1499 } while (!(c == '\n' || (c == 0 && s == bufend)));
1500 } else if (c == '\n') {
1501 s++;
1502 PL_parser->linestart = s;
1503 if (s == bufend)
1504 need_incline = 1;
1505 else
1506 incline(s);
1507 } else if (isSPACE(c)) {
1508 s++;
1509 } else if (c == 0 && s == bufend) {
1510 bool got_more;
1511#ifdef PERL_MAD
1512 if (PL_madskills)
1513 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1514#endif /* PERL_MAD */
f0998909
Z
1515 if (flags & LEX_NO_NEXT_CHUNK)
1516 break;
f0e67a1d 1517 PL_parser->bufptr = s;
83944c01 1518 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1519 got_more = lex_next_chunk(flags);
1520 CopLINE_dec(PL_curcop);
1521 s = PL_parser->bufptr;
1522 bufend = PL_parser->bufend;
1523 if (!got_more)
1524 break;
1525 if (need_incline && PL_parser->rsfp) {
1526 incline(s);
1527 need_incline = 0;
1528 }
1529 } else {
1530 break;
1531 }
1532 }
1533#ifdef PERL_MAD
1534 if (PL_madskills)
1535 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1536#endif /* PERL_MAD */
1537 PL_parser->bufptr = s;
1538}
1539
1540/*
ffb4593c
NT
1541 * S_incline
1542 * This subroutine has nothing to do with tilting, whether at windmills
1543 * or pinball tables. Its name is short for "increment line". It
57843af0 1544 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1545 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1546 * # line 500 "foo.pm"
1547 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1548 */
1549
76e3520e 1550STATIC void
d9095cec 1551S_incline(pTHX_ const char *s)
463ee0b2 1552{
97aff369 1553 dVAR;
d9095cec
NC
1554 const char *t;
1555 const char *n;
1556 const char *e;
8818d409 1557 line_t line_num;
463ee0b2 1558
7918f24d
NC
1559 PERL_ARGS_ASSERT_INCLINE;
1560
83944c01 1561 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1562 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1563 && s+1 == PL_bufend && *s == ';') {
1564 /* fake newline in string eval */
1565 CopLINE_dec(PL_curcop);
1566 return;
1567 }
463ee0b2
LW
1568 if (*s++ != '#')
1569 return;
d4c19fe8
AL
1570 while (SPACE_OR_TAB(*s))
1571 s++;
73659bf1
GS
1572 if (strnEQ(s, "line", 4))
1573 s += 4;
1574 else
1575 return;
084592ab 1576 if (SPACE_OR_TAB(*s))
73659bf1 1577 s++;
4e553d73 1578 else
73659bf1 1579 return;
d4c19fe8
AL
1580 while (SPACE_OR_TAB(*s))
1581 s++;
463ee0b2
LW
1582 if (!isDIGIT(*s))
1583 return;
d4c19fe8 1584
463ee0b2
LW
1585 n = s;
1586 while (isDIGIT(*s))
1587 s++;
07714eb4 1588 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1589 return;
bf4acbe4 1590 while (SPACE_OR_TAB(*s))
463ee0b2 1591 s++;
73659bf1 1592 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1593 s++;
73659bf1
GS
1594 e = t + 1;
1595 }
463ee0b2 1596 else {
c35e046a
AL
1597 t = s;
1598 while (!isSPACE(*t))
1599 t++;
73659bf1 1600 e = t;
463ee0b2 1601 }
bf4acbe4 1602 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1603 e++;
1604 if (*e != '\n' && *e != '\0')
1605 return; /* false alarm */
1606
8818d409
FC
1607 line_num = atoi(n)-1;
1608
f4dd75d9 1609 if (t - s > 0) {
d9095cec 1610 const STRLEN len = t - s;
19bad673
NC
1611 SV *const temp_sv = CopFILESV(PL_curcop);
1612 const char *cf;
1613 STRLEN tmplen;
1614
1615 if (temp_sv) {
1616 cf = SvPVX(temp_sv);
1617 tmplen = SvCUR(temp_sv);
1618 } else {
1619 cf = NULL;
1620 tmplen = 0;
1621 }
1622
d1299d44 1623 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1624 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1625 * to *{"::_<newfilename"} */
44867030
NC
1626 /* However, the long form of evals is only turned on by the
1627 debugger - usually they're "(eval %lu)" */
1628 char smallbuf[128];
1629 char *tmpbuf;
1630 GV **gvp;
d9095cec 1631 STRLEN tmplen2 = len;
798b63bc 1632 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1633 tmpbuf = smallbuf;
1634 else
2ae0db35 1635 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1636 tmpbuf[0] = '_';
1637 tmpbuf[1] = '<';
2ae0db35 1638 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1639 tmplen += 2;
8a5ee598
RGS
1640 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1641 if (gvp) {
44867030
NC
1642 char *tmpbuf2;
1643 GV *gv2;
1644
1645 if (tmplen2 + 2 <= sizeof smallbuf)
1646 tmpbuf2 = smallbuf;
1647 else
1648 Newx(tmpbuf2, tmplen2 + 2, char);
1649
1650 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1651 /* Either they malloc'd it, or we malloc'd it,
1652 so no prefix is present in ours. */
1653 tmpbuf2[0] = '_';
1654 tmpbuf2[1] = '<';
1655 }
1656
1657 memcpy(tmpbuf2 + 2, s, tmplen2);
1658 tmplen2 += 2;
1659
8a5ee598 1660 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1661 if (!isGV(gv2)) {
8a5ee598 1662 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1663 /* adjust ${"::_<newfilename"} to store the new file name */
1664 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1665 /* The line number may differ. If that is the case,
1666 alias the saved lines that are in the array.
1667 Otherwise alias the whole array. */
1668 if (CopLINE(PL_curcop) == line_num) {
1669 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1670 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1671 }
1672 else if (GvAV(*gvp)) {
1673 AV * const av = GvAV(*gvp);
1674 const I32 start = CopLINE(PL_curcop)+1;
1675 I32 items = AvFILLp(av) - start;
1676 if (items > 0) {
1677 AV * const av2 = GvAVn(gv2);
1678 SV **svp = AvARRAY(av) + start;
1679 I32 l = (I32)line_num+1;
1680 while (items--)
1681 av_store(av2, l++, SvREFCNT_inc(*svp++));
1682 }
1683 }
e5527e4b 1684 }
44867030
NC
1685
1686 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1687 }
e66cf94c 1688 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1689 }
05ec9bb3 1690 CopFILE_free(PL_curcop);
d9095cec 1691 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1692 }
8818d409 1693 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1694}
1695
29595ff2 1696#ifdef PERL_MAD
cd81e915 1697/* skip space before PL_thistoken */
29595ff2
NC
1698
1699STATIC char *
1700S_skipspace0(pTHX_ register char *s)
1701{
7918f24d
NC
1702 PERL_ARGS_ASSERT_SKIPSPACE0;
1703
29595ff2
NC
1704 s = skipspace(s);
1705 if (!PL_madskills)
1706 return s;
cd81e915
NC
1707 if (PL_skipwhite) {
1708 if (!PL_thiswhite)
6b29d1f5 1709 PL_thiswhite = newSVpvs("");
cd81e915
NC
1710 sv_catsv(PL_thiswhite, PL_skipwhite);
1711 sv_free(PL_skipwhite);
1712 PL_skipwhite = 0;
1713 }
1714 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1715 return s;
1716}
1717
cd81e915 1718/* skip space after PL_thistoken */
29595ff2
NC
1719
1720STATIC char *
1721S_skipspace1(pTHX_ register char *s)
1722{
d4c19fe8 1723 const char *start = s;
29595ff2
NC
1724 I32 startoff = start - SvPVX(PL_linestr);
1725
7918f24d
NC
1726 PERL_ARGS_ASSERT_SKIPSPACE1;
1727
29595ff2
NC
1728 s = skipspace(s);
1729 if (!PL_madskills)
1730 return s;
1731 start = SvPVX(PL_linestr) + startoff;
cd81e915 1732 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1733 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1734 PL_thistoken = newSVpvn(tstart, start - tstart);
1735 }
1736 PL_realtokenstart = -1;
1737 if (PL_skipwhite) {
1738 if (!PL_nextwhite)
6b29d1f5 1739 PL_nextwhite = newSVpvs("");
cd81e915
NC
1740 sv_catsv(PL_nextwhite, PL_skipwhite);
1741 sv_free(PL_skipwhite);
1742 PL_skipwhite = 0;
29595ff2
NC
1743 }
1744 return s;
1745}
1746
1747STATIC char *
1748S_skipspace2(pTHX_ register char *s, SV **svp)
1749{
c35e046a
AL
1750 char *start;
1751 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1752 const I32 startoff = s - SvPVX(PL_linestr);
1753
7918f24d
NC
1754 PERL_ARGS_ASSERT_SKIPSPACE2;
1755
29595ff2
NC
1756 s = skipspace(s);
1757 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1758 if (!PL_madskills || !svp)
1759 return s;
1760 start = SvPVX(PL_linestr) + startoff;
cd81e915 1761 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1762 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1763 PL_thistoken = newSVpvn(tstart, start - tstart);
1764 PL_realtokenstart = -1;
29595ff2 1765 }
cd81e915 1766 if (PL_skipwhite) {
29595ff2 1767 if (!*svp)
6b29d1f5 1768 *svp = newSVpvs("");
cd81e915
NC
1769 sv_setsv(*svp, PL_skipwhite);
1770 sv_free(PL_skipwhite);
1771 PL_skipwhite = 0;
29595ff2
NC
1772 }
1773
1774 return s;
1775}
1776#endif
1777
80a702cd 1778STATIC void
15f169a1 1779S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1780{
1781 AV *av = CopFILEAVx(PL_curcop);
1782 if (av) {
b9f83d2f 1783 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1784 if (orig_sv)
1785 sv_setsv(sv, orig_sv);
1786 else
1787 sv_setpvn(sv, buf, len);
80a702cd
RGS
1788 (void)SvIOK_on(sv);
1789 SvIV_set(sv, 0);
1790 av_store(av, (I32)CopLINE(PL_curcop), sv);
1791 }
1792}
1793
ffb4593c
NT
1794/*
1795 * S_skipspace
1796 * Called to gobble the appropriate amount and type of whitespace.
1797 * Skips comments as well.
1798 */
1799
76e3520e 1800STATIC char *
cea2e8a9 1801S_skipspace(pTHX_ register char *s)
a687059c 1802{
5db06880 1803#ifdef PERL_MAD
f0e67a1d
Z
1804 char *start = s;
1805#endif /* PERL_MAD */
7918f24d 1806 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1807#ifdef PERL_MAD
cd81e915
NC
1808 if (PL_skipwhite) {
1809 sv_free(PL_skipwhite);
f0e67a1d 1810 PL_skipwhite = NULL;
5db06880 1811 }
f0e67a1d 1812#endif /* PERL_MAD */
3280af22 1813 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1814 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1815 s++;
f0e67a1d
Z
1816 } else {
1817 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1818 PL_bufptr = s;
f0998909
Z
1819 lex_read_space(LEX_KEEP_PREVIOUS |
1820 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1821 LEX_NO_NEXT_CHUNK : 0));
3280af22 1822 s = PL_bufptr;
f0e67a1d
Z
1823 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1824 if (PL_linestart > PL_bufptr)
1825 PL_bufptr = PL_linestart;
1826 return s;
463ee0b2 1827 }
5db06880 1828#ifdef PERL_MAD
f0e67a1d
Z
1829 if (PL_madskills)
1830 PL_skipwhite = newSVpvn(start, s-start);
1831#endif /* PERL_MAD */
5db06880 1832 return s;
a687059c 1833}
378cc40b 1834
ffb4593c
NT
1835/*
1836 * S_check_uni
1837 * Check the unary operators to ensure there's no ambiguity in how they're
1838 * used. An ambiguous piece of code would be:
1839 * rand + 5
1840 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1841 * the +5 is its argument.
1842 */
1843
76e3520e 1844STATIC void
cea2e8a9 1845S_check_uni(pTHX)
ba106d47 1846{
97aff369 1847 dVAR;
d4c19fe8
AL
1848 const char *s;
1849 const char *t;
2f3197b3 1850
3280af22 1851 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1852 return;
3280af22
NIS
1853 while (isSPACE(*PL_last_uni))
1854 PL_last_uni++;
c35e046a
AL
1855 s = PL_last_uni;
1856 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1857 s++;
3280af22 1858 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1859 return;
6136c704 1860
9b387841
NC
1861 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1862 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1863 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1864}
1865
ffb4593c
NT
1866/*
1867 * LOP : macro to build a list operator. Its behaviour has been replaced
1868 * with a subroutine, S_lop() for which LOP is just another name.
1869 */
1870
a0d0e21e
LW
1871#define LOP(f,x) return lop(f,x,s)
1872
ffb4593c
NT
1873/*
1874 * S_lop
1875 * Build a list operator (or something that might be one). The rules:
1876 * - if we have a next token, then it's a list operator [why?]
1877 * - if the next thing is an opening paren, then it's a function
1878 * - else it's a list operator
1879 */
1880
76e3520e 1881STATIC I32
a0be28da 1882S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1883{
97aff369 1884 dVAR;
7918f24d
NC
1885
1886 PERL_ARGS_ASSERT_LOP;
1887
6154021b 1888 pl_yylval.ival = f;
35c8bce7 1889 CLINE;
3280af22
NIS
1890 PL_expect = x;
1891 PL_bufptr = s;
1892 PL_last_lop = PL_oldbufptr;
eb160463 1893 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1894#ifdef PERL_MAD
1895 if (PL_lasttoke)
78cdf107 1896 goto lstop;
5db06880 1897#else
3280af22 1898 if (PL_nexttoke)
78cdf107 1899 goto lstop;
5db06880 1900#endif
79072805 1901 if (*s == '(')
bbf60fe6 1902 return REPORT(FUNC);
29595ff2 1903 s = PEEKSPACE(s);
79072805 1904 if (*s == '(')
bbf60fe6 1905 return REPORT(FUNC);
78cdf107
Z
1906 else {
1907 lstop:
1908 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1909 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1910 return REPORT(LSTOP);
78cdf107 1911 }
79072805
LW
1912}
1913
5db06880
NC
1914#ifdef PERL_MAD
1915 /*
1916 * S_start_force
1917 * Sets up for an eventual force_next(). start_force(0) basically does
1918 * an unshift, while start_force(-1) does a push. yylex removes items
1919 * on the "pop" end.
1920 */
1921
1922STATIC void
1923S_start_force(pTHX_ int where)
1924{
1925 int i;
1926
cd81e915 1927 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1928 where = PL_lasttoke;
cd81e915
NC
1929 assert(PL_curforce < 0 || PL_curforce == where);
1930 if (PL_curforce != where) {
5db06880
NC
1931 for (i = PL_lasttoke; i > where; --i) {
1932 PL_nexttoke[i] = PL_nexttoke[i-1];
1933 }
1934 PL_lasttoke++;
1935 }
cd81e915 1936 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1937 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1938 PL_curforce = where;
1939 if (PL_nextwhite) {
5db06880 1940 if (PL_madskills)
6b29d1f5 1941 curmad('^', newSVpvs(""));
cd81e915 1942 CURMAD('_', PL_nextwhite);
5db06880
NC
1943 }
1944}
1945
1946STATIC void
1947S_curmad(pTHX_ char slot, SV *sv)
1948{
1949 MADPROP **where;
1950
1951 if (!sv)
1952 return;
cd81e915
NC
1953 if (PL_curforce < 0)
1954 where = &PL_thismad;
5db06880 1955 else
cd81e915 1956 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1957
cd81e915 1958 if (PL_faketokens)
76f68e9b 1959 sv_setpvs(sv, "");
5db06880
NC
1960 else {
1961 if (!IN_BYTES) {
1962 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1963 SvUTF8_on(sv);
1964 else if (PL_encoding) {
1965 sv_recode_to_utf8(sv, PL_encoding);
1966 }
1967 }
1968 }
1969
1970 /* keep a slot open for the head of the list? */
1971 if (slot != '_' && *where && (*where)->mad_key == '^') {
1972 (*where)->mad_key = slot;
daba3364 1973 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1974 (*where)->mad_val = (void*)sv;
1975 }
1976 else
1977 addmad(newMADsv(slot, sv), where, 0);
1978}
1979#else
b3f24c00
MHM
1980# define start_force(where) NOOP
1981# define curmad(slot, sv) NOOP
5db06880
NC
1982#endif
1983
ffb4593c
NT
1984/*
1985 * S_force_next
9cbb5ea2 1986 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1987 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1988 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1989 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1990 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1991 */
1992
4e553d73 1993STATIC void
cea2e8a9 1994S_force_next(pTHX_ I32 type)
79072805 1995{
97aff369 1996 dVAR;
704d4215
GG
1997#ifdef DEBUGGING
1998 if (DEBUG_T_TEST) {
1999 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2000 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2001 }
2002#endif
6c7ae946
FC
2003 /* Don’t let opslab_force_free snatch it */
2004 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
2005 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
2006 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
2007 }
5db06880 2008#ifdef PERL_MAD
cd81e915 2009 if (PL_curforce < 0)
5db06880 2010 start_force(PL_lasttoke);
cd81e915 2011 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2012 if (PL_lex_state != LEX_KNOWNEXT)
2013 PL_lex_defer = PL_lex_state;
2014 PL_lex_state = LEX_KNOWNEXT;
2015 PL_lex_expect = PL_expect;
cd81e915 2016 PL_curforce = -1;
5db06880 2017#else
3280af22
NIS
2018 PL_nexttype[PL_nexttoke] = type;
2019 PL_nexttoke++;
2020 if (PL_lex_state != LEX_KNOWNEXT) {
2021 PL_lex_defer = PL_lex_state;
2022 PL_lex_expect = PL_expect;
2023 PL_lex_state = LEX_KNOWNEXT;
79072805 2024 }
5db06880 2025#endif
79072805
LW
2026}
2027
28ac2b49
Z
2028void
2029Perl_yyunlex(pTHX)
2030{
a7aaec61
Z
2031 int yyc = PL_parser->yychar;
2032 if (yyc != YYEMPTY) {
2033 if (yyc) {
2034 start_force(-1);
2035 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2036 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2037 PL_lex_allbrackets--;
a7aaec61 2038 PL_lex_brackets--;
78cdf107
Z
2039 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2040 } else if (yyc == '('/*)*/) {
2041 PL_lex_allbrackets--;
2042 yyc |= (2<<24);
a7aaec61
Z
2043 }
2044 force_next(yyc);
2045 }
28ac2b49
Z
2046 PL_parser->yychar = YYEMPTY;
2047 }
2048}
2049
d0a148a6 2050STATIC SV *
15f169a1 2051S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2052{
97aff369 2053 dVAR;
740cce10 2054 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2055 !IN_BYTES
2056 && UTF
2057 && !is_ascii_string((const U8*)start, len)
740cce10 2058 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2059 return sv;
2060}
2061
ffb4593c
NT
2062/*
2063 * S_force_word
2064 * When the lexer knows the next thing is a word (for instance, it has
2065 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2066 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2067 * lookahead.
ffb4593c
NT
2068 *
2069 * Arguments:
b1b65b59 2070 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2071 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2072 * int check_keyword : if true, Perl checks to make sure the word isn't
2073 * a keyword (do this if the word is a label, e.g. goto FOO)
2074 * int allow_pack : if true, : characters will also be allowed (require,
2075 * use, etc. do this)
9cbb5ea2 2076 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2077 */
2078
76e3520e 2079STATIC char *
cea2e8a9 2080S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2081{
97aff369 2082 dVAR;
eb578fdb 2083 char *s;
463ee0b2 2084 STRLEN len;
4e553d73 2085
7918f24d
NC
2086 PERL_ARGS_ASSERT_FORCE_WORD;
2087
29595ff2 2088 start = SKIPSPACE1(start);
463ee0b2 2089 s = start;
7e2040f0 2090 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2091 (allow_pack && *s == ':') ||
15f0808c 2092 (allow_initial_tick && *s == '\'') )
a0d0e21e 2093 {
3280af22 2094 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2095 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2096 return start;
cd81e915 2097 start_force(PL_curforce);
5db06880
NC
2098 if (PL_madskills)
2099 curmad('X', newSVpvn(start,s-start));
463ee0b2 2100 if (token == METHOD) {
29595ff2 2101 s = SKIPSPACE1(s);
463ee0b2 2102 if (*s == '(')
3280af22 2103 PL_expect = XTERM;
463ee0b2 2104 else {
3280af22 2105 PL_expect = XOPERATOR;
463ee0b2 2106 }
79072805 2107 }
e74e6b3d 2108 if (PL_madskills)
63575281 2109 curmad('g', newSVpvs( "forced" ));
9ded7720 2110 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2111 = (OP*)newSVOP(OP_CONST,0,
2112 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2113 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2114 force_next(token);
2115 }
2116 return s;
2117}
2118
ffb4593c
NT
2119/*
2120 * S_force_ident
9cbb5ea2 2121 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2122 * text only contains the "foo" portion. The first argument is a pointer
2123 * to the "foo", and the second argument is the type symbol to prefix.
2124 * Forces the next token to be a "WORD".
9cbb5ea2 2125 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2126 */
2127
76e3520e 2128STATIC void
bfed75c6 2129S_force_ident(pTHX_ register const char *s, int kind)
79072805 2130{
97aff369 2131 dVAR;
7918f24d
NC
2132
2133 PERL_ARGS_ASSERT_FORCE_IDENT;
2134
c35e046a 2135 if (*s) {
90e5519e 2136 const STRLEN len = strlen(s);
728847b1
BF
2137 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2138 UTF ? SVf_UTF8 : 0));
cd81e915 2139 start_force(PL_curforce);
9ded7720 2140 NEXTVAL_NEXTTOKE.opval = o;
79072805 2141 force_next(WORD);
748a9306 2142 if (kind) {
11343788 2143 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2144 /* XXX see note in pp_entereval() for why we forgo typo
2145 warnings if the symbol must be introduced in an eval.
2146 GSAR 96-10-12 */
90e5519e 2147 gv_fetchpvn_flags(s, len,
728847b1
BF
2148 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2149 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2150 kind == '$' ? SVt_PV :
2151 kind == '@' ? SVt_PVAV :
2152 kind == '%' ? SVt_PVHV :
a0d0e21e 2153 SVt_PVGV
90e5519e 2154 );
748a9306 2155 }
79072805
LW
2156 }
2157}
2158
3f33d153
FC
2159static void
2160S_force_ident_maybe_lex(pTHX_ char pit)
2161{
2162 start_force(PL_curforce);
2163 NEXTVAL_NEXTTOKE.ival = pit;
2164 force_next('p');
2165}
2166
1571675a
GS
2167NV
2168Perl_str_to_version(pTHX_ SV *sv)
2169{
2170 NV retval = 0.0;
2171 NV nshift = 1.0;
2172 STRLEN len;
cfd0369c 2173 const char *start = SvPV_const(sv,len);
9d4ba2ae 2174 const char * const end = start + len;
504618e9 2175 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2176
2177 PERL_ARGS_ASSERT_STR_TO_VERSION;
2178
1571675a 2179 while (start < end) {
ba210ebe 2180 STRLEN skip;
1571675a
GS
2181 UV n;
2182 if (utf)
9041c2e3 2183 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2184 else {
2185 n = *(U8*)start;
2186 skip = 1;
2187 }
2188 retval += ((NV)n)/nshift;
2189 start += skip;
2190 nshift *= 1000;
2191 }
2192 return retval;
2193}
2194
4e553d73 2195/*
ffb4593c
NT
2196 * S_force_version
2197 * Forces the next token to be a version number.
e759cc13
RGS
2198 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2199 * and if "guessing" is TRUE, then no new token is created (and the caller
2200 * must use an alternative parsing method).
ffb4593c
NT
2201 */
2202
76e3520e 2203STATIC char *
e759cc13 2204S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2205{
97aff369 2206 dVAR;
5f66b61c 2207 OP *version = NULL;
44dcb63b 2208 char *d;
5db06880
NC
2209#ifdef PERL_MAD
2210 I32 startoff = s - SvPVX(PL_linestr);
2211#endif
89bfa8cd 2212
7918f24d
NC
2213 PERL_ARGS_ASSERT_FORCE_VERSION;
2214
29595ff2 2215 s = SKIPSPACE1(s);
89bfa8cd 2216
44dcb63b 2217 d = s;
dd629d5b 2218 if (*d == 'v')
44dcb63b 2219 d++;
44dcb63b 2220 if (isDIGIT(*d)) {
e759cc13
RGS
2221 while (isDIGIT(*d) || *d == '_' || *d == '.')
2222 d++;
5db06880
NC
2223#ifdef PERL_MAD
2224 if (PL_madskills) {
cd81e915 2225 start_force(PL_curforce);
5db06880
NC
2226 curmad('X', newSVpvn(s,d-s));
2227 }
2228#endif
4e4da3ac 2229 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2230 SV *ver;
8d08d9ba 2231#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2232 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2233 setlocale(LC_NUMERIC, "C");
8d08d9ba 2234#endif
6154021b 2235 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2236#ifdef USE_LOCALE_NUMERIC
2237 setlocale(LC_NUMERIC, loc);
909d3787 2238 Safefree(loc);
8d08d9ba 2239#endif
6154021b 2240 version = pl_yylval.opval;
dd629d5b
GS
2241 ver = cSVOPx(version)->op_sv;
2242 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2243 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2244 SvNV_set(ver, str_to_version(ver));
1571675a 2245 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2246 }
89bfa8cd 2247 }
5db06880
NC
2248 else if (guessing) {
2249#ifdef PERL_MAD
2250 if (PL_madskills) {
cd81e915
NC
2251 sv_free(PL_nextwhite); /* let next token collect whitespace */
2252 PL_nextwhite = 0;
5db06880
NC
2253 s = SvPVX(PL_linestr) + startoff;
2254 }
2255#endif
e759cc13 2256 return s;
5db06880 2257 }
89bfa8cd
PP
2258 }
2259
5db06880
NC
2260#ifdef PERL_MAD
2261 if (PL_madskills && !version) {
cd81e915
NC
2262 sv_free(PL_nextwhite); /* let next token collect whitespace */
2263 PL_nextwhite = 0;
5db06880
NC
2264 s = SvPVX(PL_linestr) + startoff;
2265 }
2266#endif
89bfa8cd 2267 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2268 start_force(PL_curforce);
9ded7720 2269 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2270 force_next(WORD);
89bfa8cd 2271
e759cc13 2272 return s;
89bfa8cd
PP
2273}
2274
ffb4593c 2275/*
91152fc1
DG
2276 * S_force_strict_version
2277 * Forces the next token to be a version number using strict syntax rules.
2278 */
2279
2280STATIC char *
2281S_force_strict_version(pTHX_ char *s)
2282{
2283 dVAR;
2284 OP *version = NULL;
2285#ifdef PERL_MAD
2286 I32 startoff = s - SvPVX(PL_linestr);
2287#endif
2288 const char *errstr = NULL;
2289
2290 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2291
2292 while (isSPACE(*s)) /* leading whitespace */
2293 s++;
2294
2295 if (is_STRICT_VERSION(s,&errstr)) {
2296 SV *ver = newSV(0);
2297 s = (char *)scan_version(s, ver, 0);
2298 version = newSVOP(OP_CONST, 0, ver);
2299 }
4e4da3ac
Z
2300 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2301 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2302 {
91152fc1
DG
2303 PL_bufptr = s;
2304 if (errstr)
2305 yyerror(errstr); /* version required */
2306 return s;
2307 }
2308
2309#ifdef PERL_MAD
2310 if (PL_madskills && !version) {
2311 sv_free(PL_nextwhite); /* let next token collect whitespace */
2312 PL_nextwhite = 0;
2313 s = SvPVX(PL_linestr) + startoff;
2314 }
2315#endif
2316 /* NOTE: The parser sees the package name and the VERSION swapped */
2317 start_force(PL_curforce);
2318 NEXTVAL_NEXTTOKE.opval = version;
2319 force_next(WORD);
2320
2321 return s;
2322}
2323
2324/*
ffb4593c
NT
2325 * S_tokeq
2326 * Tokenize a quoted string passed in as an SV. It finds the next
2327 * chunk, up to end of string or a backslash. It may make a new
2328 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2329 * turns \\ into \.
2330 */
2331
76e3520e 2332STATIC SV *
cea2e8a9 2333S_tokeq(pTHX_ SV *sv)
79072805 2334{
97aff369 2335 dVAR;
eb578fdb
KW
2336 char *s;
2337 char *send;
2338 char *d;
b3ac6de7
IZ
2339 STRLEN len = 0;
2340 SV *pv = sv;
79072805 2341
7918f24d
NC
2342 PERL_ARGS_ASSERT_TOKEQ;
2343
79072805 2344 if (!SvLEN(sv))
b3ac6de7 2345 goto finish;
79072805 2346
a0d0e21e 2347 s = SvPV_force(sv, len);
21a311ee 2348 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2349 goto finish;
463ee0b2 2350 send = s + len;
dcb21ed6
NC
2351 /* This is relying on the SV being "well formed" with a trailing '\0' */
2352 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2353 s++;
2354 if (s == send)
b3ac6de7 2355 goto finish;
79072805 2356 d = s;
be4731d2 2357 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2358 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2359 }
79072805
LW
2360 while (s < send) {
2361 if (*s == '\\') {
a0d0e21e 2362 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2363 s++; /* all that, just for this */
2364 }
2365 *d++ = *s++;
2366 }
2367 *d = '\0';
95a20fc0 2368 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2369 finish:
3280af22 2370 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2371 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2372 return sv;
2373}
2374
ffb4593c
NT
2375/*
2376 * Now come three functions related to double-quote context,
2377 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2378 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2379 * interact with PL_lex_state, and create fake ( ... ) argument lists
2380 * to handle functions and concatenation.
ecd24171
DM
2381 * For example,
2382 * "foo\lbar"
2383 * is tokenised as
2384 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2385 */
2386
2387/*
2388 * S_sublex_start
6154021b 2389 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2390 *
2391 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2392 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2393 *
2394 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2395 *
2396 * Everything else becomes a FUNC.
2397 *
2398 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2399 * had an OP_CONST or OP_READLINE). This just sets us up for a
2400 * call to S_sublex_push().
2401 */
2402
76e3520e 2403STATIC I32
cea2e8a9 2404S_sublex_start(pTHX)
79072805 2405{
97aff369 2406 dVAR;
eb578fdb 2407 const I32 op_type = pl_yylval.ival;
79072805
LW
2408
2409 if (op_type == OP_NULL) {
6154021b 2410 pl_yylval.opval = PL_lex_op;
5f66b61c 2411 PL_lex_op = NULL;
79072805
LW
2412 return THING;
2413 }
2414 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2415 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2416
2417 if (SvTYPE(sv) == SVt_PVIV) {
2418 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2419 STRLEN len;
96a5add6 2420 const char * const p = SvPV_const(sv, len);
740cce10 2421 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2422 SvREFCNT_dec(sv);
2423 sv = nsv;
4e553d73 2424 }
6154021b 2425 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2426 PL_lex_stuff = NULL;
6f33ba73
RGS
2427 /* Allow <FH> // "foo" */
2428 if (op_type == OP_READLINE)
2429 PL_expect = XTERMORDORDOR;
79072805
LW
2430 return THING;
2431 }
e3f73d4e
RGS
2432 else if (op_type == OP_BACKTICK && PL_lex_op) {
2433 /* readpipe() vas overriden */
2434 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2435 pl_yylval.opval = PL_lex_op;
9b201d7d 2436 PL_lex_op = NULL;
e3f73d4e
RGS
2437 PL_lex_stuff = NULL;
2438 return THING;
2439 }
79072805 2440
3280af22 2441 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2442 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2443 PL_sublex_info.sub_op = PL_lex_op;
2444 PL_lex_state = LEX_INTERPPUSH;
55497cff 2445
3280af22
NIS
2446 PL_expect = XTERM;
2447 if (PL_lex_op) {
6154021b 2448 pl_yylval.opval = PL_lex_op;
5f66b61c 2449 PL_lex_op = NULL;
55497cff
PP
2450 return PMFUNC;
2451 }
2452 else
2453 return FUNC;
2454}
2455
ffb4593c
NT
2456/*
2457 * S_sublex_push
2458 * Create a new scope to save the lexing state. The scope will be
2459 * ended in S_sublex_done. Returns a '(', starting the function arguments
2460 * to the uc, lc, etc. found before.
2461 * Sets PL_lex_state to LEX_INTERPCONCAT.
2462 */
2463
76e3520e 2464STATIC I32
cea2e8a9 2465S_sublex_push(pTHX)
55497cff 2466{
27da23d5 2467 dVAR;
78a635de 2468 LEXSHARED *shared;
f46d017c 2469 ENTER;
55497cff 2470
3280af22 2471 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2472 SAVEBOOL(PL_lex_dojoin);
3280af22 2473 SAVEI32(PL_lex_brackets);
78cdf107 2474 SAVEI32(PL_lex_allbrackets);
b27dce25 2475 SAVEI32(PL_lex_formbrack);
78cdf107 2476 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2477 SAVEI32(PL_lex_casemods);
2478 SAVEI32(PL_lex_starts);
651b5b28 2479 SAVEI8(PL_lex_state);
7cc34111 2480 SAVESPTR(PL_lex_repl);
7766f137 2481 SAVEVPTR(PL_lex_inpat);
98246f1e 2482 SAVEI16(PL_lex_inwhat);
57843af0 2483 SAVECOPLINE(PL_curcop);
3280af22 2484 SAVEPPTR(PL_bufptr);
8452ff4b 2485 SAVEPPTR(PL_bufend);
3280af22
NIS
2486 SAVEPPTR(PL_oldbufptr);
2487 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2488 SAVEPPTR(PL_last_lop);
2489 SAVEPPTR(PL_last_uni);
3280af22
NIS
2490 SAVEPPTR(PL_linestart);
2491 SAVESPTR(PL_linestr);
8edd5f42
RGS
2492 SAVEGENERICPV(PL_lex_brackstack);
2493 SAVEGENERICPV(PL_lex_casestack);
78a635de 2494 SAVEGENERICPV(PL_parser->lex_shared);
3280af22 2495
99bd9d90 2496 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2497 scopes to find the body of the here-doc. So we put PL_linestr and
2498 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2499 */
60f40a38
FC
2500 PL_parser->lex_shared->ls_linestr = PL_linestr;
2501 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2502
3280af22 2503 PL_linestr = PL_lex_stuff;
7cc34111 2504 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2505 PL_lex_stuff = NULL;
7cc34111 2506 PL_sublex_info.repl = NULL;
3280af22 2507
9cbb5ea2
GS
2508 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2509 = SvPVX(PL_linestr);
3280af22 2510 PL_bufend += SvCUR(PL_linestr);
bd61b366 2511 PL_last_lop = PL_last_uni = NULL;
3280af22 2512 SAVEFREESV(PL_linestr);
4dc843bc 2513 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2514
2515 PL_lex_dojoin = FALSE;
b27dce25 2516 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2517 PL_lex_allbrackets = 0;
2518 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2519 Newx(PL_lex_brackstack, 120, char);
2520 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2521 PL_lex_casemods = 0;
2522 *PL_lex_casestack = '\0';
2523 PL_lex_starts = 0;
2524 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2525 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
78a635de
FC
2526
2527 Newxz(shared, 1, LEXSHARED);
2528 shared->ls_prev = PL_parser->lex_shared;
2529 PL_parser->lex_shared = shared;
3280af22
NIS
2530
2531 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2532 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2533 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2534 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2535 else
5f66b61c 2536 PL_lex_inpat = NULL;
79072805 2537
55497cff 2538 return '(';
79072805
LW
2539}
2540
ffb4593c
NT
2541/*
2542 * S_sublex_done
2543 * Restores lexer state after a S_sublex_push.
2544 */
2545
76e3520e 2546STATIC I32
cea2e8a9 2547S_sublex_done(pTHX)
79072805 2548{
27da23d5 2549 dVAR;
3280af22 2550 if (!PL_lex_starts++) {
396482e1 2551 SV * const sv = newSVpvs("");
9aa983d2
JH
2552 if (SvUTF8(PL_linestr))
2553 SvUTF8_on(sv);
3280af22 2554 PL_expect = XOPERATOR;
6154021b 2555 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2556 return THING;
2557 }
2558
3280af22
NIS
2559 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2560 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2561 return yylex();
79072805
LW
2562 }
2563
ffb4593c 2564 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2565 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2566 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2567 PL_linestr = PL_lex_repl;
2568 PL_lex_inpat = 0;
2569 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2570 PL_bufend += SvCUR(PL_linestr);
bd61b366 2571 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2572 PL_lex_dojoin = FALSE;
2573 PL_lex_brackets = 0;
78cdf107
Z
2574 PL_lex_allbrackets = 0;
2575 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2576 PL_lex_casemods = 0;
2577 *PL_lex_casestack = '\0';
2578 PL_lex_starts = 0;
25da4f38 2579 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2580 PL_lex_state = LEX_INTERPNORMAL;
2581 PL_lex_starts++;
e9fa98b2
HS
2582 /* we don't clear PL_lex_repl here, so that we can check later
2583 whether this is an evalled subst; that means we rely on the
2584 logic to ensure sublex_done() is called again only via the
2585 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2586 }
e9fa98b2 2587 else {
3280af22 2588 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2589 PL_lex_repl = NULL;
e9fa98b2 2590 }
79072805 2591 return ',';
ffed7fef
LW
2592 }
2593 else {
5db06880
NC
2594#ifdef PERL_MAD
2595 if (PL_madskills) {
cd81e915
NC
2596 if (PL_thiswhite) {
2597 if (!PL_endwhite)
6b29d1f5 2598 PL_endwhite = newSVpvs("");
cd81e915
NC
2599 sv_catsv(PL_endwhite, PL_thiswhite);
2600 PL_thiswhite = 0;
2601 }
2602 if (PL_thistoken)
76f68e9b 2603 sv_setpvs(PL_thistoken,"");
5db06880 2604 else
cd81e915 2605 PL_realtokenstart = -1;
5db06880
NC
2606 }
2607#endif
f46d017c 2608 LEAVE;
3280af22
NIS
2609 PL_bufend = SvPVX(PL_linestr);
2610 PL_bufend += SvCUR(PL_linestr);
2611 PL_expect = XOPERATOR;
09bef843 2612 PL_sublex_info.sub_inwhat = 0;
79072805 2613 return ')';
ffed7fef
LW
2614 }
2615}
2616
02aa26ce
NT
2617/*
2618 scan_const
2619
9da1dd8f
DM
2620 Extracts the next constant part of a pattern, double-quoted string,
2621 or transliteration. This is terrifying code.
2622
2623 For example, in parsing the double-quoted string "ab\x63$d", it would
2624 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2625
94def140 2626 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2627 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2628 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2629
94def140
ST
2630 Returns a pointer to the character scanned up to. If this is
2631 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2632 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2633 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2634 by looking at the next characters herself.
2635
02aa26ce 2636 In patterns:
9da1dd8f
DM
2637 expand:
2638 \N{ABC} => \N{U+41.42.43}
2639
2640 pass through:
2641 all other \-char, including \N and \N{ apart from \N{ABC}
2642
2643 stops on:
2644 @ and $ where it appears to be a var, but not for $ as tail anchor
2645 \l \L \u \U \Q \E
2646 (?{ or (??{
2647
02aa26ce
NT
2648
2649 In transliterations:
2650 characters are VERY literal, except for - not at the start or end
94def140
ST
2651 of the string, which indicates a range. If the range is in bytes,
2652 scan_const expands the range to the full set of intermediate
2653 characters. If the range is in utf8, the hyphen is replaced with
2654 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2655
2656 In double-quoted strings:
2657 backslashes:
2658 double-quoted style: \r and \n
ff3f963a 2659 constants: \x31, etc.
94def140 2660 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2661 case and quoting: \U \Q \E
2662 stops on @ and $
2663
2664 scan_const does *not* construct ops to handle interpolated strings.
2665 It stops processing as soon as it finds an embedded $ or @ variable
2666 and leaves it to the caller to work out what's going on.
2667
94def140
ST
2668 embedded arrays (whether in pattern or not) could be:
2669 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2670
2671 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2672
2673 $ in pattern could be $foo or could be tail anchor. Assumption:
2674 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2675 followed by one of "()| \r\n\t"
02aa26ce 2676
9da1dd8f 2677 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2678
2679 The structure of the code is
2680 while (there's a character to process) {
94def140
ST
2681 handle transliteration ranges
2682 skip regexp comments /(?#comment)/ and codes /(?{code})/
2683 skip #-initiated comments in //x patterns
2684 check for embedded arrays
02aa26ce
NT
2685 check for embedded scalars
2686 if (backslash) {
94def140 2687 deprecate \1 in substitution replacements
02aa26ce
NT
2688 handle string-changing backslashes \l \U \Q \E, etc.
2689 switch (what was escaped) {
94def140 2690 handle \- in a transliteration (becomes a literal -)
ff3f963a 2691 if a pattern and not \N{, go treat as regular character
94def140
ST
2692 handle \132 (octal characters)
2693 handle \x15 and \x{1234} (hex characters)
ff3f963a 2694 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2695 handle \cV (control characters)
2696 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2697 } (end switch)
77a135fe 2698 continue
02aa26ce 2699 } (end if backslash)
77a135fe 2700 handle regular character
02aa26ce 2701 } (end while character to read)
4e553d73 2702
02aa26ce
NT
2703*/
2704
76e3520e 2705STATIC char *
cea2e8a9 2706S_scan_const(pTHX_ char *start)
79072805 2707{
97aff369 2708 dVAR;
eb578fdb 2709 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2710 SV *sv = newSV(send - start); /* sv for the constant. See
2711 note below on sizing. */
eb578fdb
KW
2712 char *s = start; /* start of the constant */
2713 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2714 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2715 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2716 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2717 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2718 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2719 to be UTF8? But, this can
2720 show as true when the source
2721 isn't utf8, as for example
2722 when it is entirely composed
2723 of hex constants */
2724
2725 /* Note on sizing: The scanned constant is placed into sv, which is
2726 * initialized by newSV() assuming one byte of output for every byte of
2727 * input. This routine expects newSV() to allocate an extra byte for a
2728 * trailing NUL, which this routine will append if it gets to the end of
2729 * the input. There may be more bytes of input than output (eg., \N{LATIN
2730 * CAPITAL LETTER A}), or more output than input if the constant ends up
2731 * recoded to utf8, but each time a construct is found that might increase
2732 * the needed size, SvGROW() is called. Its size parameter each time is
2733 * based on the best guess estimate at the time, namely the length used so
2734 * far, plus the length the current construct will occupy, plus room for
2735 * the trailing NUL, plus one byte for every input byte still unscanned */
2736
012bcf8d 2737 UV uv;
4c3a8340
ST
2738#ifdef EBCDIC
2739 UV literal_endpoint = 0;
e294cc5d 2740 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2741#endif
012bcf8d 2742
7918f24d
NC
2743 PERL_ARGS_ASSERT_SCAN_CONST;
2744
bb16bae8 2745 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2746 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2747 /* If we are doing a trans and we know we want UTF8 set expectation */
2748 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2749 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2750 }
2751
2752
79072805 2753 while (s < send || dorange) {
ff3f963a 2754
02aa26ce 2755 /* get transliterations out of the way (they're most literal) */
3280af22 2756 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2757 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2758 if (dorange) {
1ba5c669
JH
2759 I32 i; /* current expanded character */
2760 I32 min; /* first character in range */
2761 I32 max; /* last character in range */
02aa26ce 2762
e294cc5d
JH
2763#ifdef EBCDIC
2764 UV uvmax = 0;
2765#endif
2766
2767 if (has_utf8
2768#ifdef EBCDIC
2769 && !native_range
2770#endif
2771 ) {
9d4ba2ae 2772 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2773 char *e = d++;
2774 while (e-- > c)
2775 *(e + 1) = *e;
25716404 2776 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2777 /* mark the range as done, and continue */
2778 dorange = FALSE;
2779 didrange = TRUE;
2780 continue;
2781 }
2b9d42f0 2782
95a20fc0 2783 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2784#ifdef EBCDIC
2785 SvGROW(sv,
2786 SvLEN(sv) + (has_utf8 ?
2787 (512 - UTF_CONTINUATION_MARK +
2788 UNISKIP(0x100))
2789 : 256));
2790 /* How many two-byte within 0..255: 128 in UTF-8,
2791 * 96 in UTF-8-mod. */
2792#else
9cbb5ea2 2793 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2794#endif
9cbb5ea2 2795 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2796#ifdef EBCDIC
2797 if (has_utf8) {
2798 int j;
2799 for (j = 0; j <= 1; j++) {
2800 char * const c = (char*)utf8_hop((U8*)d, -1);
2801 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2802 if (j)
2803 min = (U8)uv;
2804 else if (uv < 256)
2805 max = (U8)uv;
2806 else {
2807 max = (U8)0xff; /* only to \xff */
2808 uvmax = uv; /* \x{100} to uvmax */
2809 }
2810 d = c; /* eat endpoint chars */
2811 }
2812 }
2813 else {
2814#endif
2815 d -= 2; /* eat the first char and the - */
2816 min = (U8)*d; /* first char in range */
2817 max = (U8)d[1]; /* last char in range */
2818#ifdef EBCDIC
2819 }
2820#endif
8ada0baa 2821
c2e66d9e 2822 if (min > max) {
4dc843bc 2823 SvREFCNT_dec(sv);
01ec43d0 2824 Perl_croak(aTHX_
d1573ac7 2825 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2826 (char)min, (char)max);
c2e66d9e
GS
2827 }
2828
c7f1f016 2829#ifdef EBCDIC
4c3a8340
ST
2830 if (literal_endpoint == 2 &&
2831 ((isLOWER(min) && isLOWER(max)) ||
2832 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2833 if (isLOWER(min)) {
2834 for (i = min; i <= max; i++)
2835 if (isLOWER(i))
db42d148 2836 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2837 } else {
2838 for (i = min; i <= max; i++)
2839 if (isUPPER(i))
db42d148 2840 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2841 }
2842 }
2843 else
2844#endif
2845 for (i = min; i <= max; i++)
e294cc5d
JH
2846#ifdef EBCDIC
2847 if (has_utf8) {
2848 const U8 ch = (U8)NATIVE_TO_UTF(i);
2849 if (UNI_IS_INVARIANT(ch))
2850 *d++ = (U8)i;
2851 else {
2852 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2853 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2854 }
2855 }
2856 else
2857#endif
2858 *d++ = (char)i;
2859
2860#ifdef EBCDIC
2861 if (uvmax) {
2862 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2863 if (uvmax > 0x101)
2864 *d++ = (char)UTF_TO_NATIVE(0xff);
2865 if (uvmax > 0x100)
2866 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2867 }
2868#endif
02aa26ce
NT
2869
2870 /* mark the range as done, and continue */
79072805 2871 dorange = FALSE;
01ec43d0 2872 didrange = TRUE;
4c3a8340
ST
2873#ifdef EBCDIC
2874 literal_endpoint = 0;
2875#endif
79072805 2876 continue;
4e553d73 2877 }
02aa26ce
NT
2878
2879 /* range begins (ignore - as first or last char) */
79072805 2880 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2881 if (didrange) {
4dc843bc 2882 SvREFCNT_dec(sv);
1fafa243 2883 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2884 }
e294cc5d
JH
2885 if (has_utf8
2886#ifdef EBCDIC
2887 && !native_range
2888#endif
2889 ) {
25716404 2890 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2891 s++;
2892 continue;
2893 }
79072805
LW
2894 dorange = TRUE;
2895 s++;
01ec43d0
GS
2896 }
2897 else {
2898 didrange = FALSE;
4c3a8340
ST
2899#ifdef EBCDIC
2900 literal_endpoint = 0;
e294cc5d 2901 native_range = TRUE;
4c3a8340 2902#endif
01ec43d0 2903 }
79072805 2904 }
02aa26ce
NT
2905
2906 /* if we get here, we're not doing a transliteration */
2907
e4a2df84
DM
2908 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2909 char *s1 = s-1;
2910 int esc = 0;
2911 while (s1 >= start && *s1-- == '\\')
2912 esc = !esc;
2913 if (!esc)
2914 in_charclass = TRUE;
2915 }
2866decb 2916
e4a2df84
DM
2917 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2918 char *s1 = s-1;
2919 int esc = 0;
2920 while (s1 >= start && *s1-- == '\\')
2921 esc = !esc;
2922 if (!esc)
2923 in_charclass = FALSE;
2924 }
2866decb 2925
9da1dd8f
DM
2926 /* skip for regexp comments /(?#comment)/, except for the last
2927 * char, which will be done separately.
2928 * Stop on (?{..}) and friends */
2929
3280af22 2930 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2931 if (s[2] == '#') {
e994fd66 2932 while (s+1 < send && *s != ')')
db42d148 2933 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2934 }
2866decb 2935 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2936 ( s[2] == '{' /* This should match regcomp.c */
2937 || (s[2] == '?' && s[3] == '{')))
155aba94 2938 {
9da1dd8f 2939 break;
cc6b7395 2940 }
748a9306 2941 }
02aa26ce
NT
2942
2943 /* likewise skip #-initiated comments in //x patterns */
3280af22 2944 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2945 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2946 while (s+1 < send && *s != '\n')
db42d148 2947 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2948 }
02aa26ce 2949
9da1dd8f
DM
2950 /* no further processing of single-quoted regex */
2951 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2952 goto default_action;
2953
5d1d4326 2954 /* check for embedded arrays
da6eedaa 2955 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2956 */
1749ea0d
ST
2957 else if (*s == '@' && s[1]) {
2958 if (isALNUM_lazy_if(s+1,UTF))
2959 break;
2960 if (strchr(":'{$", s[1]))
2961 break;
2962 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2963 break; /* in regexp, neither @+ nor @- are interpolated */
2964 }
02aa26ce
NT
2965
2966 /* check for embedded scalars. only stop if we're sure it's a
2967 variable.
2968 */
79072805 2969 else if (*s == '$') {
3280af22 2970 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2971 break;
77772344 2972 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2973 if (s[1] == '\\') {
2974 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2975 "Possible unintended interpolation of $\\ in regex");
77772344 2976 }
79072805 2977 break; /* in regexp, $ might be tail anchor */
77772344 2978 }
79072805 2979 }
02aa26ce 2980
2b9d42f0
NIS
2981 /* End of else if chain - OP_TRANS rejoin rest */
2982
02aa26ce 2983 /* backslashes */
79072805 2984 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2985 char* e; /* Can be used for ending '}', etc. */
2986
79072805 2987 s++;
02aa26ce 2988
7d0fc23c
KW
2989 /* warn on \1 - \9 in substitution replacements, but note that \11
2990 * is an octal; and \19 is \1 followed by '9' */
3280af22 2991 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2992 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2993 {
a2a5de95 2994 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2995 *--s = '$';
2996 break;
2997 }
02aa26ce
NT
2998
2999 /* string-change backslash escapes */
838f2281 3000 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3001 --s;
3002 break;
3003 }
ff3f963a
KW
3004 /* In a pattern, process \N, but skip any other backslash escapes.
3005 * This is because we don't want to translate an escape sequence
3006 * into a meta symbol and have the regex compiler use the meta
3007 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3008 * in spite of this, we do have to process \N here while the proper
3009 * charnames handler is in scope. See bugs #56444 and #62056.
3010 * There is a complication because \N in a pattern may also stand
3011 * for 'match a non-nl', and not mean a charname, in which case its
3012 * processing should be deferred to the regex compiler. To be a
3013 * charname it must be followed immediately by a '{', and not look
3014 * like \N followed by a curly quantifier, i.e., not something like
3015 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3016 * quantifier */
3017 else if (PL_lex_inpat
3018 && (*s != 'N'
3019 || s[1] != '{'
3020 || regcurly(s + 1)))
3021 {
cc74c5bd
ST
3022 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3023 goto default_action;
3024 }
02aa26ce 3025
79072805 3026 switch (*s) {
02aa26ce
NT
3027
3028 /* quoted - in transliterations */
79072805 3029 case '-':
3280af22 3030 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3031 *d++ = *s++;
3032 continue;
3033 }
3034 /* FALL THROUGH */
3035 default:
11b8faa4 3036 {
e4ca4584 3037 if ((isALNUMC(*s)))
a2a5de95
NC
3038 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3039 "Unrecognized escape \\%c passed through",
3040 *s);
11b8faa4 3041 /* default action is to copy the quoted character */
f9a63242 3042 goto default_action;
11b8faa4 3043 }
02aa26ce 3044
632403cc 3045 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3046 case '0': case '1': case '2': case '3':
3047 case '4': case '5': case '6': case '7':
ba210ebe 3048 {
53305cf1
NC
3049 I32 flags = 0;
3050 STRLEN len = 3;
77a135fe 3051 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3052 s += len;
3053 }
012bcf8d 3054 goto NUM_ESCAPE_INSERT;
02aa26ce 3055
f0a2b745
KW
3056 /* eg. \o{24} indicates the octal constant \024 */
3057 case 'o':
3058 {
3059 STRLEN len;
454155d9 3060 const char* error;
f0a2b745 3061
454155d9 3062 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3063 s += len;
454155d9 3064 if (! valid) {
f0a2b745
KW
3065 yyerror(error);
3066 continue;
3067 }
3068 goto NUM_ESCAPE_INSERT;
3069 }
3070
77a135fe 3071 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3072 case 'x':
a0481293 3073 {
53305cf1 3074 STRLEN len;
a0481293 3075 const char* error;
355860ce 3076
a0481293
KW
3077 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3078 s += len;
3079 if (! valid) {
3080 yyerror(error);
355860ce 3081 continue;
ba210ebe 3082 }
012bcf8d
GS
3083 }
3084
3085 NUM_ESCAPE_INSERT:
ff3f963a
KW
3086 /* Insert oct or hex escaped character. There will always be
3087 * enough room in sv since such escapes will be longer than any
3088 * UTF-8 sequence they can end up as, except if they force us
3089 * to recode the rest of the string into utf8 */
ba7cea30 3090
77a135fe 3091 /* Here uv is the ordinal of the next character being added in
ff3f963a 3092 * unicode (converted from native). */
77a135fe 3093 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3094 if (!has_utf8 && uv > 255) {
77a135fe
KW
3095 /* Might need to recode whatever we have accumulated so
3096 * far if it contains any chars variant in utf8 or
3097 * utf-ebcdic. */
3098
3099 SvCUR_set(sv, d - SvPVX_const(sv));
3100 SvPOK_on(sv);
3101 *d = '\0';
77a135fe 3102 /* See Note on sizing above. */
7bf79863
KW
3103 sv_utf8_upgrade_flags_grow(sv,
3104 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3105 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3106 d = SvPVX(sv) + SvCUR(sv);
3107 has_utf8 = TRUE;
012bcf8d
GS
3108 }
3109
77a135fe
KW
3110 if (has_utf8) {
3111 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3112 if (PL_lex_inwhat == OP_TRANS &&
3113 PL_sublex_info.sub_op) {
3114 PL_sublex_info.sub_op->op_private |=
3115 (PL_lex_repl ? OPpTRANS_FROM_UTF
3116 : OPpTRANS_TO_UTF);
f9a63242 3117 }
e294cc5d
JH
3118#ifdef EBCDIC
3119 if (uv > 255 && !dorange)
3120 native_range = FALSE;
3121#endif
012bcf8d 3122 }
a0ed51b3 3123 else {
012bcf8d 3124 *d++ = (char)uv;
a0ed51b3 3125 }
012bcf8d
GS
3126 }
3127 else {
c4d5f83a 3128 *d++ = (char) uv;
a0ed51b3 3129 }
79072805 3130 continue;
02aa26ce 3131
4a2d328f 3132 case 'N':
ff3f963a
KW
3133 /* In a non-pattern \N must be a named character, like \N{LATIN
3134 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3135 * mean to match a non-newline. For non-patterns, named
3136 * characters are converted to their string equivalents. In
3137 * patterns, named characters are not converted to their
3138 * ultimate forms for the same reasons that other escapes
3139 * aren't. Instead, they are converted to the \N{U+...} form
3140 * to get the value from the charnames that is in effect right
3141 * now, while preserving the fact that it was a named character
3142 * so that the regex compiler knows this */
3143
3144 /* This section of code doesn't generally use the
3145 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3146 * a close examination of this macro and determined it is a
3147 * no-op except on utfebcdic variant characters. Every
3148 * character generated by this that would normally need to be
3149 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3150 * needed, and would complicate use of copy(). XXX There are
3151 * other parts of this file where the macro is used
3152 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3153
3154 /* The structure of this section of code (besides checking for
3155 * errors and upgrading to utf8) is:
3156 * Further disambiguate between the two meanings of \N, and if
3157 * not a charname, go process it elsewhere
0a96133f
KW
3158 * If of form \N{U+...}, pass it through if a pattern;
3159 * otherwise convert to utf8
3160 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3161 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3162
3163 /* Here, s points to the 'N'; the test below is guaranteed to
3164 * succeed if we are being called on a pattern as we already
3165 * know from a test above that the next character is a '{'.
3166 * On a non-pattern \N must mean 'named sequence, which
3167 * requires braces */
3168 s++;
3169 if (*s != '{') {
3170 yyerror("Missing braces on \\N{}");
3171 continue;
3172 }
3173 s++;
3174
0a96133f 3175 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3176 if (! (e = strchr(s, '}'))) {
3177 if (! PL_lex_inpat) {
5777a3f7 3178 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3179 } else {
3180 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3181 }
0a96133f 3182 continue;
ff3f963a 3183 }
cddc7ef4 3184
ff3f963a 3185 /* Here it looks like a named character */
cddc7ef4 3186
ff3f963a
KW
3187 if (PL_lex_inpat) {
3188
3189 /* XXX This block is temporary code. \N{} implies that the
3190 * pattern is to have Unicode semantics, and therefore
3191 * currently has to be encoded in utf8. By putting it in
3192 * utf8 now, we save a whole pass in the regular expression
3193 * compiler. Once that code is changed so Unicode
3194 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3195 * block should be removed. However, the code that parses
3196 * the output of this would have to be changed to not
3197 * necessarily expect utf8 */
ff3f963a 3198 if (!has_utf8) {
77a135fe 3199 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3200 SvPOK_on(sv);
e4f3eed8 3201 *d = '\0';
77a135fe 3202 /* See Note on sizing above. */
7bf79863 3203 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3204 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3205 /* 5 = '\N{' + cur char + NUL */
3206 (STRLEN)(send - s) + 5);
f08d6ad9 3207 d = SvPVX(sv) + SvCUR(sv);
89491803 3208 has_utf8 = TRUE;
ff3f963a
KW
3209 }
3210 }
423cee85 3211
ff3f963a
KW
3212 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3213 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3214 | PERL_SCAN_DISALLOW_PREFIX;
3215 STRLEN len;
3216
3217 /* For \N{U+...}, the '...' is a unicode value even on
3218 * EBCDIC machines */
3219 s += 2; /* Skip to next char after the 'U+' */
3220 len = e - s;
3221 uv = grok_hex(s, &len, &flags, NULL);
3222 if (len == 0 || len != (STRLEN)(e - s)) {
3223 yyerror("Invalid hexadecimal number in \\N{U+...}");
3224 s = e + 1;
3225 continue;
3226 }
3227
3228 if (PL_lex_inpat) {
3229
e2a7e165
KW
3230 /* On non-EBCDIC platforms, pass through to the regex
3231 * compiler unchanged. The reason we evaluated the
3232 * number above is to make sure there wasn't a syntax
3233 * error. But on EBCDIC we convert to native so
3234 * downstream code can continue to assume it's native
3235 */
ff3f963a 3236 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3237#ifdef EBCDIC
3238 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3239 and the \0 */
3240 "\\N{U+%X}",
3241 (unsigned int) UNI_TO_NATIVE(uv));
3242#else
ff3f963a
KW
3243 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3244 d += e - s + 1;
e2a7e165 3245#endif
ff3f963a
KW
3246 }
3247 else { /* Not a pattern: convert the hex to string */
3248
3249 /* If destination is not in utf8, unconditionally
3250 * recode it to be so. This is because \N{} implies
3251 * Unicode semantics, and scalars have to be in utf8
3252 * to guarantee those semantics */
3253 if (! has_utf8) {
3254 SvCUR_set(sv, d - SvPVX_const(sv));
3255 SvPOK_on(sv);
3256 *d = '\0';
3257 /* See Note on sizing above. */
3258 sv_utf8_upgrade_flags_grow(
3259 sv,
3260 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3261 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3262 d = SvPVX(sv) + SvCUR(sv);
3263 has_utf8 = TRUE;
3264 }
3265
3266 /* Add the string to the output */
3267 if (UNI_IS_INVARIANT(uv)) {
3268 *d++ = (char) uv;
3269 }
3270 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3271 }
3272 }
3273 else { /* Here is \N{NAME} but not \N{U+...}. */
3274
3275 SV *res; /* result from charnames */
3276 const char *str; /* the string in 'res' */
3277 STRLEN len; /* its length */
3278
3279 /* Get the value for NAME */
3280 res = newSVpvn(s, e - s);
3281 res = new_constant( NULL, 0, "charnames",
3282 /* includes all of: \N{...} */
3283 res, NULL, s - 3, e - s + 4 );
3284
3285 /* Most likely res will be in utf8 already since the
3286 * standard charnames uses pack U, but a custom translator
3287 * can leave it otherwise, so make sure. XXX This can be
3288 * revisited to not have charnames use utf8 for characters
3289 * that don't need it when regexes don't have to be in utf8
3290 * for Unicode semantics. If doing so, remember EBCDIC */
3291 sv_utf8_upgrade(res);
3292 str = SvPV_const(res, len);
3293
3294 /* Don't accept malformed input */
3295 if (! is_utf8_string((U8 *) str, len)) {
3296 yyerror("Malformed UTF-8 returned by \\N");
3297 }
3298 else if (PL_lex_inpat) {
3299
3300 if (! len) { /* The name resolved to an empty string */
3301 Copy("\\N{}", d, 4, char);
3302 d += 4;
3303 }
3304 else {
3305 /* In order to not lose information for the regex
3306 * compiler, pass the result in the specially made
3307 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3308 * the code points in hex of each character
3309 * returned by charnames */
3310
3311 const char *str_end = str + len;
3312 STRLEN char_length; /* cur char's byte length */
3313 STRLEN output_length; /* and the number of bytes
3314 after this is translated
3315 into hex digits */
3316 const STRLEN off = d - SvPVX_const(sv);
3317
3318 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3319 * max('U+', '.'); and 1 for NUL */
3320 char hex_string[2 * UTF8_MAXBYTES + 5];
3321
3322 /* Get the first character of the result. */
3323 U32 uv = utf8n_to_uvuni((U8 *) str,
3324 len,
3325 &char_length,
3326 UTF8_ALLOW_ANYUV);
3327
3328 /* The call to is_utf8_string() above hopefully
3329 * guarantees that there won't be an error. But
3330 * it's easy here to make sure. The function just
3331 * above warns and returns 0 if invalid utf8, but
3332 * it can also return 0 if the input is validly a
3333 * NUL. Disambiguate */
3334 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3335 uv = UNICODE_REPLACEMENT;
3336 }
3337
3338 /* Convert first code point to hex, including the
e2a7e165
KW
3339 * boiler plate before it. For all these, we
3340 * convert to native format so that downstream code
3341 * can continue to assume the input is native */
78c35590 3342 output_length =
3353de27 3343 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3344 "\\N{U+%X",
3345 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3346
3347 /* Make sure there is enough space to hold it */
3348 d = off + SvGROW(sv, off
3349 + output_length
3350 + (STRLEN)(send - e)
3351 + 2); /* '}' + NUL */
3352 /* And output it */
3353 Copy(hex_string, d, output_length, char);
3354 d += output_length;
3355
3356 /* For each subsequent character, append dot and
3357 * its ordinal in hex */
3358 while ((str += char_length) < str_end) {
3359 const STRLEN off = d - SvPVX_const(sv);
3360 U32 uv = utf8n_to_uvuni((U8 *) str,
3361 str_end - str,
3362 &char_length,
3363 UTF8_ALLOW_ANYUV);
3364 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3365 uv = UNICODE_REPLACEMENT;
3366 }
3367
78c35590 3368 output_length =
3353de27 3369 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3370 ".%X",
3371 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3372
3373 d = off + SvGROW(sv, off
3374 + output_length
3375 + (STRLEN)(send - e)
3376 + 2); /* '}' + NUL */
3377 Copy(hex_string, d, output_length, char);
3378 d += output_length;
3379 }
3380
3381 *d++ = '}'; /* Done. Add the trailing brace */
3382 }
3383 }
3384 else { /* Here, not in a pattern. Convert the name to a
3385 * string. */
3386
3387 /* If destination is not in utf8, unconditionally
3388 * recode it to be so. This is because \N{} implies
3389 * Unicode semantics, and scalars have to be in utf8
3390 * to guarantee those semantics */
3391 if (! has_utf8) {
3392 SvCUR_set(sv, d - SvPVX_const(sv));
3393 SvPOK_on(sv);
3394 *d = '\0';
3395 /* See Note on sizing above. */
3396 sv_utf8_upgrade_flags_grow(sv,
3397 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3398 len + (STRLEN)(send - s) + 1);
3399 d = SvPVX(sv) + SvCUR(sv);
3400 has_utf8 = TRUE;
3401 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3402
3403 /* See Note on sizing above. (NOTE: SvCUR() is not
3404 * set correctly here). */
3405 const STRLEN off = d - SvPVX_const(sv);
3406 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3407 }
3408 Copy(str, d, len, char);
3409 d += len;
423cee85 3410 }
423cee85 3411 SvREFCNT_dec(res);
cb233ae3
KW
3412
3413 /* Deprecate non-approved name syntax */
3414 if (ckWARN_d(WARN_DEPRECATED)) {
3415 bool problematic = FALSE;
3416 char* i = s;
3417
3418 /* For non-ut8 input, look to see that the first
3419 * character is an alpha, then loop through the rest
3420 * checking that each is a continuation */
3421 if (! this_utf8) {
3422 if (! isALPHAU(*i)) problematic = TRUE;
3423 else for (i = s + 1; i < e; i++) {
3424 if (isCHARNAME_CONT(*i)) continue;
3425 problematic = TRUE;
3426 break;
3427 }
3428 }
3429 else {
3430 /* Similarly for utf8. For invariants can check
3431 * directly. We accept anything above the latin1
3432 * range because it is immaterial to Perl if it is
3433 * correct or not, and is expensive to check. But
3434 * it is fairly easy in the latin1 range to convert
3435 * the variants into a single character and check
3436 * those */
3437 if (UTF8_IS_INVARIANT(*i)) {
3438 if (! isALPHAU(*i)) problematic = TRUE;
3439 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3440 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3441 *(i+1)))))
3442 {
3443 problematic = TRUE;
3444 }
3445 }
3446 if (! problematic) for (i = s + UTF8SKIP(s);
3447 i < e;
3448 i+= UTF8SKIP(i))
3449 {
3450 if (UTF8_IS_INVARIANT(*i)) {
3451 if (isCHARNAME_CONT(*i)) continue;
3452 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3453 continue;
3454 } else if (isCHARNAME_CONT(
3455 UNI_TO_NATIVE(
81c14aa2 3456 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3457 {
3458 continue;
3459 }
3460 problematic = TRUE;
3461 break;
3462 }
3463 }
3464 if (problematic) {
6e1bad6c
KW
3465 /* The e-i passed to the final %.*s makes sure that
3466 * should the trailing NUL be missing that this
3467 * print won't run off the end of the string */
cb233ae3 3468 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3469 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3470 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3471 }
3472 }
3473 } /* End \N{NAME} */
ff3f963a
KW
3474#ifdef EBCDIC
3475 if (!dorange)
3476 native_range = FALSE; /* \N{} is defined to be Unicode */
3477#endif
3478 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3479 continue;
3480
02aa26ce 3481 /* \c is a control character */
79072805
LW
3482 case 'c':
3483 s++;
961ce445 3484 if (s < send) {
17a3df4c 3485 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3486 }
961ce445
RGS
3487 else {
3488 yyerror("Missing control char name in \\c");
3489 }
79072805 3490 continue;
02aa26ce
NT
3491
3492 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3493 case 'b':
db42d148 3494 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3495 break;
3496 case 'n':
db42d148 3497 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3498 break;
3499 case 'r':
db42d148 3500 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3501 break;
3502 case 'f':
db42d148 3503 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3504 break;
3505 case 't':
db42d148 3506 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3507 break;
34a3fe2a 3508 case 'e':
db42d148 3509 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3510 break;
3511 case 'a':
db42d148 3512 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3513 break;
02aa26ce
NT
3514 } /* end switch */
3515
79072805
LW
3516 s++;
3517 continue;
02aa26ce 3518 } /* end if (backslash) */
4c3a8340
ST
3519#ifdef EBCDIC
3520 else
3521 literal_endpoint++;
3522#endif
02aa26ce 3523
f9a63242 3524 default_action:
77a135fe
KW
3525 /* If we started with encoded form, or already know we want it,
3526 then encode the next character */
3527 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3528 STRLEN len = 1;
77a135fe
KW
3529
3530
3531 /* One might think that it is wasted effort in the case of the
3532 * source being utf8 (this_utf8 == TRUE) to take the next character
3533 * in the source, convert it to an unsigned value, and then convert
3534 * it back again. But the source has not been validated here. The
3535 * routine that does the conversion checks for errors like
3536 * malformed utf8 */
3537
5f66b61c
AL
3538 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3539 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3540 if (!has_utf8) {
3541 SvCUR_set(sv, d - SvPVX_const(sv));
3542 SvPOK_on(sv);
3543 *d = '\0';
77a135fe 3544 /* See Note on sizing above. */
7bf79863
KW
3545 sv_utf8_upgrade_flags_grow(sv,
3546 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3547 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3548 d = SvPVX(sv) + SvCUR(sv);
3549 has_utf8 = TRUE;
3550 } else if (need > len) {
3551 /* encoded value larger than old, may need extra space (NOTE:
3552 * SvCUR() is not set correctly here). See Note on sizing
3553 * above. */
9d4ba2ae 3554 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3555 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3556 }
77a135fe
KW
3557 s += len;
3558
5f66b61c 3559 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3560#ifdef EBCDIC
3561 if (uv > 255 && !dorange)
3562 native_range = FALSE;
3563#endif
2b9d42f0
NIS
3564 }
3565 else {
3566 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3567 }
02aa26ce
NT
3568 } /* while loop to process each character */
3569
3570 /* terminate the string and set up the sv */
79072805 3571 *d = '\0';
95a20fc0 3572 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3573 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3574 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3575 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3576
79072805 3577 SvPOK_on(sv);
9f4817db 3578 if (PL_encoding && !has_utf8) {
d0063567
DK
3579 sv_recode_to_utf8(sv, PL_encoding);
3580 if (SvUTF8(sv))
3581 has_utf8 = TRUE;
9f4817db 3582 }
2b9d42f0 3583 if (has_utf8) {
7e2040f0 3584 SvUTF8_on(sv);
2b9d42f0 3585 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3586 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3587 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3588 }
3589 }
79072805 3590
02aa26ce 3591 /* shrink the sv if we allocated more than we used */
79072805 3592 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3593 SvPV_shrink_to_cur(sv);
79072805 3594 }
02aa26ce 3595
6154021b 3596 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3597 if (s > PL_bufptr) {
eb0d8d16
NC
3598 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3599 const char *const key = PL_lex_inpat ? "qr" : "q";
3600 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3601 const char *type;
3602 STRLEN typelen;
3603
3604 if (PL_lex_inwhat == OP_TRANS) {
3605 type = "tr";
3606 typelen = 2;
3607 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3608 type = "s";
3609 typelen = 1;
9da1dd8f
DM
3610 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3611 type = "q";
3612 typelen = 1;
eb0d8d16
NC
3613 } else {
3614 type = "qq";
3615 typelen = 2;
3616 }
3617
3618 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3619 type, typelen);
3620 }
6154021b 3621 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3622 } else
8990e307 3623 SvREFCNT_dec(sv);
79072805
LW
3624 return s;
3625}
3626
ffb4593c
NT
3627/* S_intuit_more
3628 * Returns TRUE if there's more to the expression (e.g., a subscript),
3629 * FALSE otherwise.
ffb4593c
NT
3630 *
3631 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3632 *
3633 * ->[ and ->{ return TRUE
3634 * { and [ outside a pattern are always subscripts, so return TRUE
3635 * if we're outside a pattern and it's not { or [, then return FALSE
3636 * if we're in a pattern and the first char is a {
3637 * {4,5} (any digits around the comma) returns FALSE
3638 * if we're in a pattern and the first char is a [
3639 * [] returns FALSE
3640 * [SOMETHING] has a funky algorithm to decide whether it's a
3641 * character class or not. It has to deal with things like
3642 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3643 * anything else returns TRUE
3644 */
3645
9cbb5ea2
GS
3646/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3647
76e3520e 3648STATIC int
cea2e8a9 3649S_intuit_more(pTHX_ register char *s)
79072805 3650{
97aff369 3651 dVAR;
7918f24d
NC
3652
3653 PERL_ARGS_ASSERT_INTUIT_MORE;
3654
3280af22 3655 if (PL_lex_brackets)
79072805
LW
3656 return TRUE;
3657 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3658 return TRUE;
3659 if (*s != '{' && *s != '[')
3660 return FALSE;
3280af22 3661 if (!PL_lex_inpat)
79072805
LW
3662 return TRUE;
3663
3664 /* In a pattern, so maybe we have {n,m}. */
3665 if (*s == '{') {
b3155d95 3666 if (regcurly(s)) {
79072805 3667 return FALSE;
b3155d95 3668 }
79072805 3669 return TRUE;
79072805
LW
3670 }
3671
3672 /* On the other hand, maybe we have a character class */
3673
3674 s++;
3675 if (*s == ']' || *s == '^')
3676 return FALSE;
3677 else {
ffb4593c 3678 /* this is terrifying, and it works */
79072805
LW
3679 int weight = 2; /* let's weigh the evidence */
3680 char seen[256];
f27ffc4a 3681 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3682 const char * const send = strchr(s,']');
3280af22 3683 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3684
3685 if (!send) /* has to be an expression */
3686 return TRUE;
3687
3688 Zero(seen,256,char);
3689 if (*s == '$')
3690 weight -= 3;
3691 else if (isDIGIT(*s)) {
3692 if (s[1] != ']') {
3693 if (isDIGIT(s[1]) && s[2] == ']')
3694 weight -= 10;
3695 }
3696 else
3697 weight -= 100;
3698 }
3699 for (; s < send; s++) {
3700 last_un_char = un_char;
3701 un_char = (unsigned char)*s;
3702 switch (*s) {
3703 case '@':
3704 case '&':
3705 case '$':
3706 weight -= seen[un_char] * 10;
7e2040f0 3707 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3708 int len;
8903cb82 3709 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3710 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3711 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3712 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3713 weight -= 100;
3714 else
3715 weight -= 10;
3716 }
3717 else if (*s == '$' && s[1] &&
93a17b20
LW
3718 strchr("[#!%*<>()-=",s[1])) {
3719 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3720 weight -= 10;
3721 else
3722 weight -= 1;
3723 }
3724 break;
3725 case '\\':
3726 un_char = 254;
3727 if (s[1]) {
93a17b20 3728 if (strchr("wds]",s[1]))
79072805 3729 weight += 100;
10edeb5d 3730 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3731 weight += 1;
93a17b20 3732 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3733 weight += 40;
3734 else if (isDIGIT(s[1])) {
3735 weight += 40;
3736 while (s[1] && isDIGIT(s[1]))
3737 s++;
3738 }
3739 }
3740 else
3741 weight += 100;
3742 break;
3743 case '-':