This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add utility and .h for character's UTF-8
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805 150 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
151 *
152 * These values refer to the various states within a sublex parse,
153 * i.e. within a double quotish string
79072805
LW
154 */
155
fb73857a 156/* #define LEX_NOTPARSING 11 is done in perl.h. */
157
b6007c36
DM
158#define LEX_NORMAL 10 /* normal code (ie not within "...") */
159#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
160#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
161#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
162#define LEX_INTERPSTART 6 /* expecting the start of a $var */
163
164 /* at end of code, eg "$x" followed by: */
165#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
166#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
167
168#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
169 string or after \E, $foo, etc */
170#define LEX_INTERPCONST 2 /* NOT USED */
171#define LEX_FORMLINE 1 /* expecting a format line */
172#define LEX_KNOWNEXT 0 /* next token known; just return it */
173
79072805 174
bbf60fe6 175#ifdef DEBUGGING
27da23d5 176static const char* const lex_state_names[] = {
bbf60fe6
DM
177 "KNOWNEXT",
178 "FORMLINE",
179 "INTERPCONST",
180 "INTERPCONCAT",
181 "INTERPENDMAYBE",
182 "INTERPEND",
183 "INTERPSTART",
184 "INTERPPUSH",
185 "INTERPCASEMOD",
186 "INTERPNORMAL",
187 "NORMAL"
188};
189#endif
190
79072805
LW
191#ifdef ff_next
192#undef ff_next
d48672a2
LW
193#endif
194
79072805 195#include "keywords.h"
fe14fcc3 196
ffb4593c
NT
197/* CLINE is a macro that ensures PL_copline has a sane value */
198
ae986130
LW
199#ifdef CLINE
200#undef CLINE
201#endif
57843af0 202#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 203
5db06880 204#ifdef PERL_MAD
29595ff2
NC
205# define SKIPSPACE0(s) skipspace0(s)
206# define SKIPSPACE1(s) skipspace1(s)
207# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
208# define PEEKSPACE(s) skipspace2(s,0)
209#else
210# define SKIPSPACE0(s) skipspace(s)
211# define SKIPSPACE1(s) skipspace(s)
212# define SKIPSPACE2(s,tsv) skipspace(s)
213# define PEEKSPACE(s) skipspace(s)
214#endif
215
ffb4593c
NT
216/*
217 * Convenience functions to return different tokens and prime the
9cbb5ea2 218 * lexer for the next token. They all take an argument.
ffb4593c
NT
219 *
220 * TOKEN : generic token (used for '(', DOLSHARP, etc)
221 * OPERATOR : generic operator
222 * AOPERATOR : assignment operator
223 * PREBLOCK : beginning the block after an if, while, foreach, ...
224 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
225 * PREREF : *EXPR where EXPR is not a simple identifier
226 * TERM : expression term
227 * LOOPX : loop exiting command (goto, last, dump, etc)
228 * FTST : file test operator
229 * FUN0 : zero-argument function
7eb971ee 230 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 231 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
232 * BOop : bitwise or or xor
233 * BAop : bitwise and
234 * SHop : shift operator
235 * PWop : power operator
9cbb5ea2 236 * PMop : pattern-matching operator
ffb4593c
NT
237 * Aop : addition-level operator
238 * Mop : multiplication-level operator
239 * Eop : equality-testing operator
e5edeb50 240 * Rop : relational operator <= != gt
ffb4593c
NT
241 *
242 * Also see LOP and lop() below.
243 */
244
998054bd 245#ifdef DEBUGGING /* Serve -DT. */
704d4215 246# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 247#else
bbf60fe6 248# define REPORT(retval) (retval)
998054bd
SC
249#endif
250
bbf60fe6
DM
251#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
252#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
253#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
254#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
255#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
256#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
257#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
258#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
259#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
260#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 261#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
262#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
263#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
264#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
265#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
266#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
267#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
268#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
269#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
270#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
271#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 272
a687059c
LW
273/* This bit of chicanery makes a unary function followed by
274 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
275 * The UNIDOR macro is for unary functions that can be followed by the //
276 * operator (such as C<shift // 0>).
a687059c 277 */
d68ce4ac 278#define UNI3(f,x,have_x) { \
6154021b 279 pl_yylval.ival = f; \
d68ce4ac 280 if (have_x) PL_expect = x; \
376fcdbf
AL
281 PL_bufptr = s; \
282 PL_last_uni = PL_oldbufptr; \
283 PL_last_lop_op = f; \
284 if (*s == '(') \
285 return REPORT( (int)FUNC1 ); \
29595ff2 286 s = PEEKSPACE(s); \
376fcdbf
AL
287 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
288 }
d68ce4ac
FC
289#define UNI(f) UNI3(f,XTERM,1)
290#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
291#define UNIPROTO(f,optional) { \
292 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
293 OPERATOR(f); \
294 }
a687059c 295
d68ce4ac 296#define UNIBRACK(f) UNI3(f,0,0)
79072805 297
9f68db38 298/* grandfather return to old style */
78cdf107
Z
299#define OLDLOP(f) \
300 do { \
301 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
302 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
303 pl_yylval.ival = (f); \
304 PL_expect = XTERM; \
305 PL_bufptr = s; \
306 return (int)LSTOP; \
307 } while(0)
79072805 308
8fa7f367
JH
309#ifdef DEBUGGING
310
6154021b 311/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
312enum token_type {
313 TOKENTYPE_NONE,
314 TOKENTYPE_IVAL,
6154021b 315 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 316 TOKENTYPE_PVAL,
aeaef349 317 TOKENTYPE_OPVAL
bbf60fe6
DM
318};
319
6d4a66ac
NC
320static struct debug_tokens {
321 const int token;
322 enum token_type type;
323 const char *name;
324} const debug_tokens[] =
9041c2e3 325{
bbf60fe6
DM
326 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
327 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
328 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
329 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
330 { ARROW, TOKENTYPE_NONE, "ARROW" },
331 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
332 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
333 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
334 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
335 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 336 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
337 { DO, TOKENTYPE_NONE, "DO" },
338 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
339 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
340 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
341 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
342 { ELSE, TOKENTYPE_NONE, "ELSE" },
343 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
344 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
345 { FOR, TOKENTYPE_IVAL, "FOR" },
346 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
347 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
348 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
349 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
350 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 351 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
352 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
353 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
354 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 355 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
356 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
357 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 358 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
359 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
360 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
361 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
362 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
363 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
364 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
365 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
366 { MY, TOKENTYPE_IVAL, "MY" },
367 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
368 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
369 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
370 { OROP, TOKENTYPE_IVAL, "OROP" },
371 { OROR, TOKENTYPE_NONE, "OROR" },
372 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 373 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
374 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
375 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
376 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
377 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
378 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
379 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
380 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
381 { PREINC, TOKENTYPE_NONE, "PREINC" },
382 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 383 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
384 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
385 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 386 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
387 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
388 { SUB, TOKENTYPE_NONE, "SUB" },
389 { THING, TOKENTYPE_OPVAL, "THING" },
390 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
391 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
392 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
393 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
394 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
395 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 396 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
397 { WHILE, TOKENTYPE_IVAL, "WHILE" },
398 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 399 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 400 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
401};
402
6154021b 403/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 404
bbf60fe6 405STATIC int
704d4215 406S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 407{
97aff369 408 dVAR;
7918f24d
NC
409
410 PERL_ARGS_ASSERT_TOKEREPORT;
411
bbf60fe6 412 if (DEBUG_T_TEST) {
bd61b366 413 const char *name = NULL;
bbf60fe6 414 enum token_type type = TOKENTYPE_NONE;
f54cb97a 415 const struct debug_tokens *p;
396482e1 416 SV* const report = newSVpvs("<== ");
bbf60fe6 417
f54cb97a 418 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
419 if (p->token == (int)rv) {
420 name = p->name;
421 type = p->type;
422 break;
423 }
424 }
425 if (name)
54667de8 426 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
427 else if ((char)rv > ' ' && (char)rv < '~')
428 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
429 else if (!rv)
396482e1 430 sv_catpvs(report, "EOF");
bbf60fe6
DM
431 else
432 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
433 switch (type) {
434 case TOKENTYPE_NONE:
bbf60fe6
DM
435 break;
436 case TOKENTYPE_IVAL:
704d4215 437 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
438 break;
439 case TOKENTYPE_OPNUM:
440 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 441 PL_op_name[lvalp->ival]);
bbf60fe6
DM
442 break;
443 case TOKENTYPE_PVAL:
704d4215 444 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
445 break;
446 case TOKENTYPE_OPVAL:
704d4215 447 if (lvalp->opval) {
401441c0 448 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
449 PL_op_name[lvalp->opval->op_type]);
450 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 451 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 452 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
453 }
454
455 }
401441c0 456 else
396482e1 457 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
458 break;
459 }
b6007c36 460 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
461 };
462 return (int)rv;
998054bd
SC
463}
464
b6007c36
DM
465
466/* print the buffer with suitable escapes */
467
468STATIC void
15f169a1 469S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 470{
396482e1 471 SV* const tmp = newSVpvs("");
7918f24d
NC
472
473 PERL_ARGS_ASSERT_PRINTBUF;
474
b6007c36
DM
475 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
476 SvREFCNT_dec(tmp);
477}
478
8fa7f367
JH
479#endif
480
8290c323
NC
481static int
482S_deprecate_commaless_var_list(pTHX) {
483 PL_expect = XTERM;
484 deprecate("comma-less variable list");
485 return REPORT(','); /* grandfather non-comma-format format */
486}
487
ffb4593c
NT
488/*
489 * S_ao
490 *
c963b151
BD
491 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
492 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
493 */
494
76e3520e 495STATIC int
cea2e8a9 496S_ao(pTHX_ int toketype)
a0d0e21e 497{
97aff369 498 dVAR;
3280af22
NIS
499 if (*PL_bufptr == '=') {
500 PL_bufptr++;
a0d0e21e 501 if (toketype == ANDAND)
6154021b 502 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 503 else if (toketype == OROR)
6154021b 504 pl_yylval.ival = OP_ORASSIGN;
c963b151 505 else if (toketype == DORDOR)
6154021b 506 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
507 toketype = ASSIGNOP;
508 }
509 return toketype;
510}
511
ffb4593c
NT
512/*
513 * S_no_op
514 * When Perl expects an operator and finds something else, no_op
515 * prints the warning. It always prints "<something> found where
516 * operator expected. It prints "Missing semicolon on previous line?"
517 * if the surprise occurs at the start of the line. "do you need to
518 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
519 * where the compiler doesn't know if foo is a method call or a function.
520 * It prints "Missing operator before end of line" if there's nothing
521 * after the missing operator, or "... before <...>" if there is something
522 * after the missing operator.
523 */
524
76e3520e 525STATIC void
15f169a1 526S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 527{
97aff369 528 dVAR;
9d4ba2ae
AL
529 char * const oldbp = PL_bufptr;
530 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 531
7918f24d
NC
532 PERL_ARGS_ASSERT_NO_OP;
533
1189a94a
GS
534 if (!s)
535 s = oldbp;
07c798fb 536 else
1189a94a 537 PL_bufptr = s;
734ab321 538 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
539 if (ckWARN_d(WARN_SYNTAX)) {
540 if (is_first)
541 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
542 "\t(Missing semicolon on previous line?)\n");
543 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 544 const char *t;
734ab321
BF
545 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
546 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 547 NOOP;
56da5a46
RGS
548 if (t < PL_bufptr && isSPACE(*t))
549 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
550 "\t(Do you need to predeclare %"SVf"?)\n",
551 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
552 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
553 }
554 else {
555 assert(s >= oldbp);
556 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
557 "\t(Missing operator before %"SVf"?)\n",
558 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
559 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 560 }
07c798fb 561 }
3280af22 562 PL_bufptr = oldbp;
8990e307
LW
563}
564
ffb4593c
NT
565/*
566 * S_missingterm
567 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 568 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
569 * If we're in a delimited string and the delimiter is a control
570 * character, it's reformatted into a two-char sequence like ^C.
571 * This is fatal.
572 */
573
76e3520e 574STATIC void
cea2e8a9 575S_missingterm(pTHX_ char *s)
8990e307 576{
97aff369 577 dVAR;
8990e307
LW
578 char tmpbuf[3];
579 char q;
580 if (s) {
9d4ba2ae 581 char * const nl = strrchr(s,'\n');
d2719217 582 if (nl)
8990e307
LW
583 *nl = '\0';
584 }
463559e7 585 else if (isCNTRL(PL_multi_close)) {
8990e307 586 *tmpbuf = '^';
585ec06d 587 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
588 tmpbuf[2] = '\0';
589 s = tmpbuf;
590 }
591 else {
eb160463 592 *tmpbuf = (char)PL_multi_close;
8990e307
LW
593 tmpbuf[1] = '\0';
594 s = tmpbuf;
595 }
596 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 597 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 598}
79072805 599
dd0ac2b9
FC
600#include "feature.h"
601
0d863452 602/*
0d863452
RH
603 * Check whether the named feature is enabled.
604 */
26ea9e12 605bool
3fff3427 606Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 607{
97aff369 608 dVAR;
4a731d7b 609 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
610
611 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
612
613 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 614
26ea9e12
NC
615 if (namelen > MAX_FEATURE_LEN)
616 return FALSE;
3fff3427 617 memcpy(&he_name[8], name, namelen);
7d69d4a6 618
c8ca97b0
NC
619 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
620 REFCOUNTED_HE_EXISTS));
0d863452
RH
621}
622
ffb4593c 623/*
9cbb5ea2
GS
624 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
625 * utf16-to-utf8-reversed.
ffb4593c
NT
626 */
627
c39cd008
GS
628#ifdef PERL_CR_FILTER
629static void
630strip_return(SV *sv)
631{
eb578fdb
KW
632 const char *s = SvPVX_const(sv);
633 const char * const e = s + SvCUR(sv);
7918f24d
NC
634
635 PERL_ARGS_ASSERT_STRIP_RETURN;
636
c39cd008
GS
637 /* outer loop optimized to do nothing if there are no CR-LFs */
638 while (s < e) {
639 if (*s++ == '\r' && *s == '\n') {
640 /* hit a CR-LF, need to copy the rest */
eb578fdb 641 char *d = s - 1;
c39cd008
GS
642 *d++ = *s++;
643 while (s < e) {
644 if (*s == '\r' && s[1] == '\n')
645 s++;
646 *d++ = *s++;
647 }
648 SvCUR(sv) -= s - d;
649 return;
650 }
651 }
652}
a868473f 653
76e3520e 654STATIC I32
c39cd008 655S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 656{
f54cb97a 657 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
658 if (count > 0 && !maxlen)
659 strip_return(sv);
660 return count;
a868473f
NIS
661}
662#endif
663
ffb4593c 664/*
8eaa0acf
Z
665=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
666
667Creates and initialises a new lexer/parser state object, supplying
668a context in which to lex and parse from a new source of Perl code.
669A pointer to the new state object is placed in L</PL_parser>. An entry
670is made on the save stack so that upon unwinding the new state object
671will be destroyed and the former value of L</PL_parser> will be restored.
672Nothing else need be done to clean up the parsing context.
673
674The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
675non-null, provides a string (in SV form) containing code to be parsed.
676A copy of the string is made, so subsequent modification of I<line>
677does not affect parsing. I<rsfp>, if non-null, provides an input stream
678from which code will be read to be parsed. If both are non-null, the
679code in I<line> comes first and must consist of complete lines of input,
680and I<rsfp> supplies the remainder of the source.
681
e368b3bd
FC
682The I<flags> parameter is reserved for future use. Currently it is only
683used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
684
685=cut
686*/
ffb4593c 687
27fcb6ee 688/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
689 can share filters with the current parser.
690 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
691 caller, hence isn't owned by the parser, so shouldn't be closed on parser
692 destruction. This is used to handle the case of defaulting to reading the
693 script from the standard input because no filename was given on the command
694 line (without getting confused by situation where STDIN has been closed, so
695 the script handle is opened on fd 0) */
27fcb6ee 696
a0d0e21e 697void
8eaa0acf 698Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 699{
97aff369 700 dVAR;
6ef55633 701 const char *s = NULL;
5486870f 702 yy_parser *parser, *oparser;
60d63348 703 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 704 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
705
706 /* create and initialise a parser */
707
199e78b7 708 Newxz(parser, 1, yy_parser);
5486870f 709 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
710 PL_parser = parser;
711
28ac2b49
Z
712 parser->stack = NULL;
713 parser->ps = NULL;
714 parser->stack_size = 0;
acdf0a21 715
e3abe207
DM
716 /* on scope exit, free this parser and restore any outer one */
717 SAVEPARSER(parser);
7c4baf47 718 parser->saved_curcop = PL_curcop;
e3abe207 719
acdf0a21 720 /* initialise lexer state */
8990e307 721
fb205e7a
DM
722#ifdef PERL_MAD
723 parser->curforce = -1;
724#else
725 parser->nexttoke = 0;
726#endif
ca4cfd28 727 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 728 parser->copline = NOLINE;
5afb0a62 729 parser->lex_state = LEX_NORMAL;
c2598295 730 parser->expect = XSTATE;
2f9285f8 731 parser->rsfp = rsfp;
27fcb6ee
FC
732 parser->rsfp_filters =
733 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
734 ? NULL
735 : MUTABLE_AV(SvREFCNT_inc(
736 oparser->rsfp_filters
737 ? oparser->rsfp_filters
738 : (oparser->rsfp_filters = newAV())
739 ));
2f9285f8 740
199e78b7
DM
741 Newx(parser->lex_brackstack, 120, char);
742 Newx(parser->lex_casestack, 12, char);
743 *parser->lex_casestack = '\0';
02b34bbe 744
10efb74f 745 if (line) {
0528fd32 746 STRLEN len;
10efb74f 747 s = SvPV_const(line, len);
0abcdfa4
FC
748 parser->linestr = flags & LEX_START_COPIED
749 ? SvREFCNT_inc_simple_NN(line)
750 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 751 if (!len || s[len-1] != ';')
bdc0bf6f 752 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
753 } else {
754 parser->linestr = newSVpvs("\n;");
8990e307 755 }
f06b5848
DM
756 parser->oldoldbufptr =
757 parser->oldbufptr =
758 parser->bufptr =
759 parser->linestart = SvPVX(parser->linestr);
760 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
761 parser->last_lop = parser->last_uni = NULL;
87606032
NC
762 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
763 |LEX_DONT_CLOSE_RSFP);
737c24fc 764
60d63348 765 parser->in_pod = parser->filtered = 0;
79072805 766}
a687059c 767
e3abe207
DM
768
769/* delete a parser object */
770
771void
772Perl_parser_free(pTHX_ const yy_parser *parser)
773{
7918f24d
NC
774 PERL_ARGS_ASSERT_PARSER_FREE;
775
7c4baf47 776 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
777 SvREFCNT_dec(parser->linestr);
778
87606032 779 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 780 PerlIO_clearerr(parser->rsfp);
799361c3
SH
781 else if (parser->rsfp && (!parser->old_parser ||
782 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 783 PerlIO_close(parser->rsfp);
5486870f 784 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 785
e3abe207
DM
786 Safefree(parser->lex_brackstack);
787 Safefree(parser->lex_casestack);
788 PL_parser = parser->old_parser;
789 Safefree(parser);
790}
791
792
ffb4593c 793/*
f0e67a1d
Z
794=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
795
796Buffer scalar containing the chunk currently under consideration of the
797text currently being lexed. This is always a plain string scalar (for
798which C<SvPOK> is true). It is not intended to be used as a scalar by
799normal scalar means; instead refer to the buffer directly by the pointer
800variables described below.
801
802The lexer maintains various C<char*> pointers to things in the
803C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
804reallocated, all of these pointers must be updated. Don't attempt to
805do this manually, but rather use L</lex_grow_linestr> if you need to
806reallocate the buffer.
807
808The content of the text chunk in the buffer is commonly exactly one
809complete line of input, up to and including a newline terminator,
810but there are situations where it is otherwise. The octets of the
811buffer may be intended to be interpreted as either UTF-8 or Latin-1.
812The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
813flag on this scalar, which may disagree with it.
814
815For direct examination of the buffer, the variable
816L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
817lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
818of these pointers is usually preferable to examination of the scalar
819through normal scalar means.
820
821=for apidoc AmxU|char *|PL_parser-E<gt>bufend
822
823Direct pointer to the end of the chunk of text currently being lexed, the
824end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
825+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
826always located at the end of the buffer, and does not count as part of
827the buffer's contents.
828
829=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
830
831Points to the current position of lexing inside the lexer buffer.
832Characters around this point may be freely examined, within
833the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
834L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
835interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
836
837Lexing code (whether in the Perl core or not) moves this pointer past
838the characters that it consumes. It is also expected to perform some
839bookkeeping whenever a newline character is consumed. This movement
840can be more conveniently performed by the function L</lex_read_to>,
841which handles newlines appropriately.
842
843Interpretation of the buffer's octets can be abstracted out by
844using the slightly higher-level functions L</lex_peek_unichar> and
845L</lex_read_unichar>.
846
847=for apidoc AmxU|char *|PL_parser-E<gt>linestart
848
849Points to the start of the current line inside the lexer buffer.
850This is useful for indicating at which column an error occurred, and
851not much else. This must be updated by any lexing code that consumes
852a newline; the function L</lex_read_to> handles this detail.
853
854=cut
855*/
856
857/*
858=for apidoc Amx|bool|lex_bufutf8
859
860Indicates whether the octets in the lexer buffer
861(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
862of Unicode characters. If not, they should be interpreted as Latin-1
863characters. This is analogous to the C<SvUTF8> flag for scalars.
864
865In UTF-8 mode, it is not guaranteed that the lexer buffer actually
866contains valid UTF-8. Lexing code must be robust in the face of invalid
867encoding.
868
869The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
870is significant, but not the whole story regarding the input character
871encoding. Normally, when a file is being read, the scalar contains octets
872and its C<SvUTF8> flag is off, but the octets should be interpreted as
873UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
874however, the scalar may have the C<SvUTF8> flag on, and in this case its
875octets should be interpreted as UTF-8 unless the C<use bytes> pragma
876is in effect. This logic may change in the future; use this function
877instead of implementing the logic yourself.
878
879=cut
880*/
881
882bool
883Perl_lex_bufutf8(pTHX)
884{
885 return UTF;
886}
887
888/*
889=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
890
891Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
892at least I<len> octets (including terminating NUL). Returns a
893pointer to the reallocated buffer. This is necessary before making
894any direct modification of the buffer that would increase its length.
895L</lex_stuff_pvn> provides a more convenient way to insert text into
896the buffer.
897
898Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
899this function updates all of the lexer's variables that point directly
900into the buffer.
901
902=cut
903*/
904
905char *
906Perl_lex_grow_linestr(pTHX_ STRLEN len)
907{
908 SV *linestr;
909 char *buf;
910 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 911 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
912 linestr = PL_parser->linestr;
913 buf = SvPVX(linestr);
914 if (len <= SvLEN(linestr))
915 return buf;
916 bufend_pos = PL_parser->bufend - buf;
917 bufptr_pos = PL_parser->bufptr - buf;
918 oldbufptr_pos = PL_parser->oldbufptr - buf;
919 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
920 linestart_pos = PL_parser->linestart - buf;
921 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
922 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
c7641931
DM
923 re_eval_start_pos = PL_sublex_info.re_eval_start ?
924 PL_sublex_info.re_eval_start - buf : 0;
925
f0e67a1d 926 buf = sv_grow(linestr, len);
c7641931 927
f0e67a1d
Z
928 PL_parser->bufend = buf + bufend_pos;
929 PL_parser->bufptr = buf + bufptr_pos;
930 PL_parser->oldbufptr = buf + oldbufptr_pos;
931 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
932 PL_parser->linestart = buf + linestart_pos;
933 if (PL_parser->last_uni)
934 PL_parser->last_uni = buf + last_uni_pos;
935 if (PL_parser->last_lop)
936 PL_parser->last_lop = buf + last_lop_pos;
c7641931
DM
937 if (PL_sublex_info.re_eval_start)
938 PL_sublex_info.re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
939 return buf;
940}
941
942/*
83aa740e 943=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
944
945Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
946immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
947reallocating the buffer if necessary. This means that lexing code that
948runs later will see the characters as if they had appeared in the input.
949It is not recommended to do this as part of normal parsing, and most
950uses of this facility run the risk of the inserted characters being
951interpreted in an unintended manner.
952
953The string to be inserted is represented by I<len> octets starting
954at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
955according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
956The characters are recoded for the lexer buffer, according to how the
957buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 958to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
959function is more convenient.
960
961=cut
962*/
963
964void
83aa740e 965Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 966{
749123ff 967 dVAR;
f0e67a1d
Z
968 char *bufptr;
969 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
970 if (flags & ~(LEX_STUFF_UTF8))
971 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
972 if (UTF) {
973 if (flags & LEX_STUFF_UTF8) {
974 goto plain_copy;
975 } else {
976 STRLEN highhalf = 0;
83aa740e 977 const char *p, *e = pv+len;
f0e67a1d
Z
978 for (p = pv; p != e; p++)
979 highhalf += !!(((U8)*p) & 0x80);
980 if (!highhalf)
981 goto plain_copy;
982 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
983 bufptr = PL_parser->bufptr;
984 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
985 SvCUR_set(PL_parser->linestr,
986 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
987 PL_parser->bufend += len+highhalf;
988 for (p = pv; p != e; p++) {
989 U8 c = (U8)*p;
990 if (c & 0x80) {
991 *bufptr++ = (char)(0xc0 | (c >> 6));
992 *bufptr++ = (char)(0x80 | (c & 0x3f));
993 } else {
994 *bufptr++ = (char)c;
995 }
996 }
997 }
998 } else {
999 if (flags & LEX_STUFF_UTF8) {
1000 STRLEN highhalf = 0;
83aa740e 1001 const char *p, *e = pv+len;
f0e67a1d
Z
1002 for (p = pv; p != e; p++) {
1003 U8 c = (U8)*p;
1004 if (c >= 0xc4) {
1005 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1006 "non-Latin-1 character into Latin-1 input");
1007 } else if (c >= 0xc2 && p+1 != e &&
1008 (((U8)p[1]) & 0xc0) == 0x80) {
1009 p++;
1010 highhalf++;
1011 } else if (c >= 0x80) {
1012 /* malformed UTF-8 */
1013 ENTER;
1014 SAVESPTR(PL_warnhook);
1015 PL_warnhook = PERL_WARNHOOK_FATAL;
1016 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1017 LEAVE;
1018 }
1019 }
1020 if (!highhalf)
1021 goto plain_copy;
1022 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1023 bufptr = PL_parser->bufptr;
1024 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1025 SvCUR_set(PL_parser->linestr,
1026 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1027 PL_parser->bufend += len-highhalf;
1028 for (p = pv; p != e; p++) {
1029 U8 c = (U8)*p;
1030 if (c & 0x80) {
1031 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1032 p++;
1033 } else {
1034 *bufptr++ = (char)c;
1035 }
1036 }
1037 } else {
1038 plain_copy:
1039 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1040 bufptr = PL_parser->bufptr;
1041 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1042 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1043 PL_parser->bufend += len;
1044 Copy(pv, bufptr, len, char);
1045 }
1046 }
1047}
1048
1049/*
9dcc53ea
Z
1050=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1051
1052Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1053immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1054reallocating the buffer if necessary. This means that lexing code that
1055runs later will see the characters as if they had appeared in the input.
1056It is not recommended to do this as part of normal parsing, and most
1057uses of this facility run the risk of the inserted characters being
1058interpreted in an unintended manner.
1059
1060The string to be inserted is represented by octets starting at I<pv>
1061and continuing to the first nul. These octets are interpreted as either
1062UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1063in I<flags>. The characters are recoded for the lexer buffer, according
1064to how the buffer is currently being interpreted (L</lex_bufutf8>).
1065If it is not convenient to nul-terminate a string to be inserted, the
1066L</lex_stuff_pvn> function is more appropriate.
1067
1068=cut
1069*/
1070
1071void
1072Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1073{
1074 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1075 lex_stuff_pvn(pv, strlen(pv), flags);
1076}
1077
1078/*
f0e67a1d
Z
1079=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1080
1081Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1082immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1083reallocating the buffer if necessary. This means that lexing code that
1084runs later will see the characters as if they had appeared in the input.
1085It is not recommended to do this as part of normal parsing, and most
1086uses of this facility run the risk of the inserted characters being
1087interpreted in an unintended manner.
1088
1089The string to be inserted is the string value of I<sv>. The characters
1090are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1091being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1092not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1093need to construct a scalar.
1094
1095=cut
1096*/
1097
1098void
1099Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1100{
1101 char *pv;
1102 STRLEN len;
1103 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1104 if (flags)
1105 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1106 pv = SvPV(sv, len);
1107 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1108}
1109
1110/*
1111=for apidoc Amx|void|lex_unstuff|char *ptr
1112
1113Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1114I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1115This hides the discarded text from any lexing code that runs later,
1116as if the text had never appeared.
1117
1118This is not the normal way to consume lexed text. For that, use
1119L</lex_read_to>.
1120
1121=cut
1122*/
1123
1124void
1125Perl_lex_unstuff(pTHX_ char *ptr)
1126{
1127 char *buf, *bufend;
1128 STRLEN unstuff_len;
1129 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1130 buf = PL_parser->bufptr;
1131 if (ptr < buf)
1132 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1133 if (ptr == buf)
1134 return;
1135 bufend = PL_parser->bufend;
1136 if (ptr > bufend)
1137 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1138 unstuff_len = ptr - buf;
1139 Move(ptr, buf, bufend+1-ptr, char);
1140 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1141 PL_parser->bufend = bufend - unstuff_len;
1142}
1143
1144/*
1145=for apidoc Amx|void|lex_read_to|char *ptr
1146
1147Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1148to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1149performing the correct bookkeeping whenever a newline character is passed.
1150This is the normal way to consume lexed text.
1151
1152Interpretation of the buffer's octets can be abstracted out by
1153using the slightly higher-level functions L</lex_peek_unichar> and
1154L</lex_read_unichar>.
1155
1156=cut
1157*/
1158
1159void
1160Perl_lex_read_to(pTHX_ char *ptr)
1161{
1162 char *s;
1163 PERL_ARGS_ASSERT_LEX_READ_TO;
1164 s = PL_parser->bufptr;
1165 if (ptr < s || ptr > PL_parser->bufend)
1166 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1167 for (; s != ptr; s++)
1168 if (*s == '\n') {
1169 CopLINE_inc(PL_curcop);
1170 PL_parser->linestart = s+1;
1171 }
1172 PL_parser->bufptr = ptr;
1173}
1174
1175/*
1176=for apidoc Amx|void|lex_discard_to|char *ptr
1177
1178Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1179up to I<ptr>. The remaining content of the buffer will be moved, and
1180all pointers into the buffer updated appropriately. I<ptr> must not
1181be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1182it is not permitted to discard text that has yet to be lexed.
1183
1184Normally it is not necessarily to do this directly, because it suffices to
1185use the implicit discarding behaviour of L</lex_next_chunk> and things
1186based on it. However, if a token stretches across multiple lines,
1f317c95 1187and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1188that purpose, then after completion of the token it would be wise to
1189explicitly discard the now-unneeded earlier lines, to avoid future
1190multi-line tokens growing the buffer without bound.
1191
1192=cut
1193*/
1194
1195void
1196Perl_lex_discard_to(pTHX_ char *ptr)
1197{
1198 char *buf;
1199 STRLEN discard_len;
1200 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1201 buf = SvPVX(PL_parser->linestr);
1202 if (ptr < buf)
1203 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1204 if (ptr == buf)
1205 return;
1206 if (ptr > PL_parser->bufptr)
1207 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1208 discard_len = ptr - buf;
1209 if (PL_parser->oldbufptr < ptr)
1210 PL_parser->oldbufptr = ptr;
1211 if (PL_parser->oldoldbufptr < ptr)
1212 PL_parser->oldoldbufptr = ptr;
1213 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1214 PL_parser->last_uni = NULL;
1215 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1216 PL_parser->last_lop = NULL;
1217 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1218 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1219 PL_parser->bufend -= discard_len;
1220 PL_parser->bufptr -= discard_len;
1221 PL_parser->oldbufptr -= discard_len;
1222 PL_parser->oldoldbufptr -= discard_len;
1223 if (PL_parser->last_uni)
1224 PL_parser->last_uni -= discard_len;
1225 if (PL_parser->last_lop)
1226 PL_parser->last_lop -= discard_len;
1227}
1228
1229/*
1230=for apidoc Amx|bool|lex_next_chunk|U32 flags
1231
1232Reads in the next chunk of text to be lexed, appending it to
1233L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1234looked to the end of the current chunk and wants to know more. It is
1235usual, but not necessary, for lexing to have consumed the entirety of
1236the current chunk at this time.
1237
1238If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1239chunk (i.e., the current chunk has been entirely consumed), normally the
1240current chunk will be discarded at the same time that the new chunk is
1241read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1242will not be discarded. If the current chunk has not been entirely
1243consumed, then it will not be discarded regardless of the flag.
1244
1245Returns true if some new text was added to the buffer, or false if the
1246buffer has reached the end of the input text.
1247
1248=cut
1249*/
1250
1251#define LEX_FAKE_EOF 0x80000000
112d1284 1252#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1253
1254bool
1255Perl_lex_next_chunk(pTHX_ U32 flags)
1256{
1257 SV *linestr;
1258 char *buf;
1259 STRLEN old_bufend_pos, new_bufend_pos;
1260 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1261 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1262 bool got_some_for_debugger = 0;
f0e67a1d 1263 bool got_some;
112d1284 1264 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1265 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1266 linestr = PL_parser->linestr;
1267 buf = SvPVX(linestr);
1268 if (!(flags & LEX_KEEP_PREVIOUS) &&
1269 PL_parser->bufptr == PL_parser->bufend) {
1270 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1271 linestart_pos = 0;
1272 if (PL_parser->last_uni != PL_parser->bufend)
1273 PL_parser->last_uni = NULL;
1274 if (PL_parser->last_lop != PL_parser->bufend)
1275 PL_parser->last_lop = NULL;
1276 last_uni_pos = last_lop_pos = 0;
1277 *buf = 0;
1278 SvCUR(linestr) = 0;
1279 } else {
1280 old_bufend_pos = PL_parser->bufend - buf;
1281 bufptr_pos = PL_parser->bufptr - buf;
1282 oldbufptr_pos = PL_parser->oldbufptr - buf;
1283 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1284 linestart_pos = PL_parser->linestart - buf;
1285 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1286 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1287 }
1288 if (flags & LEX_FAKE_EOF) {
1289 goto eof;
60d63348 1290 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1291 got_some = 0;
1292 } else if (filter_gets(linestr, old_bufend_pos)) {
1293 got_some = 1;
17cc9359 1294 got_some_for_debugger = 1;
112d1284
FC
1295 } else if (flags & LEX_NO_TERM) {
1296 got_some = 0;
f0e67a1d 1297 } else {
580561a3
Z
1298 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1299 sv_setpvs(linestr, "");
f0e67a1d
Z
1300 eof:
1301 /* End of real input. Close filehandle (unless it was STDIN),
1302 * then add implicit termination.
1303 */
87606032 1304 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1305 PerlIO_clearerr(PL_parser->rsfp);
1306 else if (PL_parser->rsfp)
1307 (void)PerlIO_close(PL_parser->rsfp);
1308 PL_parser->rsfp = NULL;
60d63348 1309 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1310#ifdef PERL_MAD
1311 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1312 PL_faketokens = 1;
1313#endif
1314 if (!PL_in_eval && PL_minus_p) {
1315 sv_catpvs(linestr,
1316 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1317 PL_minus_n = PL_minus_p = 0;
1318 } else if (!PL_in_eval && PL_minus_n) {
1319 sv_catpvs(linestr, /*{*/";}");
1320 PL_minus_n = 0;
1321 } else
1322 sv_catpvs(linestr, ";");
1323 got_some = 1;
1324 }
1325 buf = SvPVX(linestr);
1326 new_bufend_pos = SvCUR(linestr);
1327 PL_parser->bufend = buf + new_bufend_pos;
1328 PL_parser->bufptr = buf + bufptr_pos;
1329 PL_parser->oldbufptr = buf + oldbufptr_pos;
1330 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1331 PL_parser->linestart = buf + linestart_pos;
1332 if (PL_parser->last_uni)
1333 PL_parser->last_uni = buf + last_uni_pos;
1334 if (PL_parser->last_lop)
1335 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1336 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1337 PL_curstash != PL_debstash) {
1338 /* debugger active and we're not compiling the debugger code,
1339 * so store the line into the debugger's array of lines
1340 */
1341 update_debugger_info(NULL, buf+old_bufend_pos,
1342 new_bufend_pos-old_bufend_pos);
1343 }
1344 return got_some;
1345}
1346
1347/*
1348=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1349
1350Looks ahead one (Unicode) character in the text currently being lexed.
1351Returns the codepoint (unsigned integer value) of the next character,
1352or -1 if lexing has reached the end of the input text. To consume the
1353peeked character, use L</lex_read_unichar>.
1354
1355If the next character is in (or extends into) the next chunk of input
1356text, the next chunk will be read in. Normally the current chunk will be
1357discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1358then the current chunk will not be discarded.
1359
1360If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1361is encountered, an exception is generated.
1362
1363=cut
1364*/
1365
1366I32
1367Perl_lex_peek_unichar(pTHX_ U32 flags)
1368{
749123ff 1369 dVAR;
f0e67a1d
Z
1370 char *s, *bufend;
1371 if (flags & ~(LEX_KEEP_PREVIOUS))
1372 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1373 s = PL_parser->bufptr;
1374 bufend = PL_parser->bufend;
1375 if (UTF) {
1376 U8 head;
1377 I32 unichar;
1378 STRLEN len, retlen;
1379 if (s == bufend) {
1380 if (!lex_next_chunk(flags))
1381 return -1;
1382 s = PL_parser->bufptr;
1383 bufend = PL_parser->bufend;
1384 }
1385 head = (U8)*s;
1386 if (!(head & 0x80))
1387 return head;
1388 if (head & 0x40) {
1389 len = PL_utf8skip[head];
1390 while ((STRLEN)(bufend-s) < len) {
1391 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1392 break;
1393 s = PL_parser->bufptr;
1394 bufend = PL_parser->bufend;
1395 }
1396 }
1397 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1398 if (retlen == (STRLEN)-1) {
1399 /* malformed UTF-8 */
1400 ENTER;
1401 SAVESPTR(PL_warnhook);
1402 PL_warnhook = PERL_WARNHOOK_FATAL;
1403 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1404 LEAVE;
1405 }
1406 return unichar;
1407 } else {
1408 if (s == bufend) {
1409 if (!lex_next_chunk(flags))
1410 return -1;
1411 s = PL_parser->bufptr;
1412 }
1413 return (U8)*s;
1414 }
1415}
1416
1417/*
1418=for apidoc Amx|I32|lex_read_unichar|U32 flags
1419
1420Reads the next (Unicode) character in the text currently being lexed.
1421Returns the codepoint (unsigned integer value) of the character read,
1422and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1423if lexing has reached the end of the input text. To non-destructively
1424examine the next character, use L</lex_peek_unichar> instead.
1425
1426If the next character is in (or extends into) the next chunk of input
1427text, the next chunk will be read in. Normally the current chunk will be
1428discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1429then the current chunk will not be discarded.
1430
1431If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1432is encountered, an exception is generated.
1433
1434=cut
1435*/
1436
1437I32
1438Perl_lex_read_unichar(pTHX_ U32 flags)
1439{
1440 I32 c;
1441 if (flags & ~(LEX_KEEP_PREVIOUS))
1442 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1443 c = lex_peek_unichar(flags);
1444 if (c != -1) {
1445 if (c == '\n')
1446 CopLINE_inc(PL_curcop);
d9018cbe
EB
1447 if (UTF)
1448 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1449 else
1450 ++(PL_parser->bufptr);
f0e67a1d
Z
1451 }
1452 return c;
1453}
1454
1455/*
1456=for apidoc Amx|void|lex_read_space|U32 flags
1457
1458Reads optional spaces, in Perl style, in the text currently being
1459lexed. The spaces may include ordinary whitespace characters and
1460Perl-style comments. C<#line> directives are processed if encountered.
1461L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1462at a non-space character (or the end of the input text).
1463
1464If spaces extend into the next chunk of input text, the next chunk will
1465be read in. Normally the current chunk will be discarded at the same
1466time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1467chunk will not be discarded.
1468
1469=cut
1470*/
1471
f0998909
Z
1472#define LEX_NO_NEXT_CHUNK 0x80000000
1473
f0e67a1d
Z
1474void
1475Perl_lex_read_space(pTHX_ U32 flags)
1476{
1477 char *s, *bufend;
1478 bool need_incline = 0;
f0998909 1479 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1480 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1481#ifdef PERL_MAD
1482 if (PL_skipwhite) {
1483 sv_free(PL_skipwhite);
1484 PL_skipwhite = NULL;
1485 }
1486 if (PL_madskills)
1487 PL_skipwhite = newSVpvs("");
1488#endif /* PERL_MAD */
1489 s = PL_parser->bufptr;
1490 bufend = PL_parser->bufend;
1491 while (1) {
1492 char c = *s;
1493 if (c == '#') {
1494 do {
1495 c = *++s;
1496 } while (!(c == '\n' || (c == 0 && s == bufend)));
1497 } else if (c == '\n') {
1498 s++;
1499 PL_parser->linestart = s;
1500 if (s == bufend)
1501 need_incline = 1;
1502 else
1503 incline(s);
1504 } else if (isSPACE(c)) {
1505 s++;
1506 } else if (c == 0 && s == bufend) {
1507 bool got_more;
1508#ifdef PERL_MAD
1509 if (PL_madskills)
1510 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1511#endif /* PERL_MAD */
f0998909
Z
1512 if (flags & LEX_NO_NEXT_CHUNK)
1513 break;
f0e67a1d
Z
1514 PL_parser->bufptr = s;
1515 CopLINE_inc(PL_curcop);
1516 got_more = lex_next_chunk(flags);
1517 CopLINE_dec(PL_curcop);
1518 s = PL_parser->bufptr;
1519 bufend = PL_parser->bufend;
1520 if (!got_more)
1521 break;
1522 if (need_incline && PL_parser->rsfp) {
1523 incline(s);
1524 need_incline = 0;
1525 }
1526 } else {
1527 break;
1528 }
1529 }
1530#ifdef PERL_MAD
1531 if (PL_madskills)
1532 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1533#endif /* PERL_MAD */
1534 PL_parser->bufptr = s;
1535}
1536
1537/*
ffb4593c
NT
1538 * S_incline
1539 * This subroutine has nothing to do with tilting, whether at windmills
1540 * or pinball tables. Its name is short for "increment line". It
57843af0 1541 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1542 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1543 * # line 500 "foo.pm"
1544 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1545 */
1546
76e3520e 1547STATIC void
d9095cec 1548S_incline(pTHX_ const char *s)
463ee0b2 1549{
97aff369 1550 dVAR;
d9095cec
NC
1551 const char *t;
1552 const char *n;
1553 const char *e;
8818d409 1554 line_t line_num;
463ee0b2 1555
7918f24d
NC
1556 PERL_ARGS_ASSERT_INCLINE;
1557
57843af0 1558 CopLINE_inc(PL_curcop);
463ee0b2
LW
1559 if (*s++ != '#')
1560 return;
d4c19fe8
AL
1561 while (SPACE_OR_TAB(*s))
1562 s++;
73659bf1
GS
1563 if (strnEQ(s, "line", 4))
1564 s += 4;
1565 else
1566 return;
084592ab 1567 if (SPACE_OR_TAB(*s))
73659bf1 1568 s++;
4e553d73 1569 else
73659bf1 1570 return;
d4c19fe8
AL
1571 while (SPACE_OR_TAB(*s))
1572 s++;
463ee0b2
LW
1573 if (!isDIGIT(*s))
1574 return;
d4c19fe8 1575
463ee0b2
LW
1576 n = s;
1577 while (isDIGIT(*s))
1578 s++;
07714eb4 1579 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1580 return;
bf4acbe4 1581 while (SPACE_OR_TAB(*s))
463ee0b2 1582 s++;
73659bf1 1583 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1584 s++;
73659bf1
GS
1585 e = t + 1;
1586 }
463ee0b2 1587 else {
c35e046a
AL
1588 t = s;
1589 while (!isSPACE(*t))
1590 t++;
73659bf1 1591 e = t;
463ee0b2 1592 }
bf4acbe4 1593 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1594 e++;
1595 if (*e != '\n' && *e != '\0')
1596 return; /* false alarm */
1597
8818d409
FC
1598 line_num = atoi(n)-1;
1599
f4dd75d9 1600 if (t - s > 0) {
d9095cec 1601 const STRLEN len = t - s;
19bad673
NC
1602 SV *const temp_sv = CopFILESV(PL_curcop);
1603 const char *cf;
1604 STRLEN tmplen;
1605
1606 if (temp_sv) {
1607 cf = SvPVX(temp_sv);
1608 tmplen = SvCUR(temp_sv);
1609 } else {
1610 cf = NULL;
1611 tmplen = 0;
1612 }
1613
d1299d44 1614 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1615 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1616 * to *{"::_<newfilename"} */
44867030
NC
1617 /* However, the long form of evals is only turned on by the
1618 debugger - usually they're "(eval %lu)" */
1619 char smallbuf[128];
1620 char *tmpbuf;
1621 GV **gvp;
d9095cec 1622 STRLEN tmplen2 = len;
798b63bc 1623 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1624 tmpbuf = smallbuf;
1625 else
2ae0db35 1626 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1627 tmpbuf[0] = '_';
1628 tmpbuf[1] = '<';
2ae0db35 1629 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1630 tmplen += 2;
8a5ee598
RGS
1631 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1632 if (gvp) {
44867030
NC
1633 char *tmpbuf2;
1634 GV *gv2;
1635
1636 if (tmplen2 + 2 <= sizeof smallbuf)
1637 tmpbuf2 = smallbuf;
1638 else
1639 Newx(tmpbuf2, tmplen2 + 2, char);
1640
1641 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1642 /* Either they malloc'd it, or we malloc'd it,
1643 so no prefix is present in ours. */
1644 tmpbuf2[0] = '_';
1645 tmpbuf2[1] = '<';
1646 }
1647
1648 memcpy(tmpbuf2 + 2, s, tmplen2);
1649 tmplen2 += 2;
1650
8a5ee598 1651 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1652 if (!isGV(gv2)) {
8a5ee598 1653 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1654 /* adjust ${"::_<newfilename"} to store the new file name */
1655 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1656 /* The line number may differ. If that is the case,
1657 alias the saved lines that are in the array.
1658 Otherwise alias the whole array. */
1659 if (CopLINE(PL_curcop) == line_num) {
1660 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1661 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1662 }
1663 else if (GvAV(*gvp)) {
1664 AV * const av = GvAV(*gvp);
1665 const I32 start = CopLINE(PL_curcop)+1;
1666 I32 items = AvFILLp(av) - start;
1667 if (items > 0) {
1668 AV * const av2 = GvAVn(gv2);
1669 SV **svp = AvARRAY(av) + start;
1670 I32 l = (I32)line_num+1;
1671 while (items--)
1672 av_store(av2, l++, SvREFCNT_inc(*svp++));
1673 }
1674 }
e5527e4b 1675 }
44867030
NC
1676
1677 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1678 }
e66cf94c 1679 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1680 }
05ec9bb3 1681 CopFILE_free(PL_curcop);
d9095cec 1682 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1683 }
8818d409 1684 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1685}
1686
29595ff2 1687#ifdef PERL_MAD
cd81e915 1688/* skip space before PL_thistoken */
29595ff2
NC
1689
1690STATIC char *
1691S_skipspace0(pTHX_ register char *s)
1692{
7918f24d
NC
1693 PERL_ARGS_ASSERT_SKIPSPACE0;
1694
29595ff2
NC
1695 s = skipspace(s);
1696 if (!PL_madskills)
1697 return s;
cd81e915
NC
1698 if (PL_skipwhite) {
1699 if (!PL_thiswhite)
6b29d1f5 1700 PL_thiswhite = newSVpvs("");
cd81e915
NC
1701 sv_catsv(PL_thiswhite, PL_skipwhite);
1702 sv_free(PL_skipwhite);
1703 PL_skipwhite = 0;
1704 }
1705 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1706 return s;
1707}
1708
cd81e915 1709/* skip space after PL_thistoken */
29595ff2
NC
1710
1711STATIC char *
1712S_skipspace1(pTHX_ register char *s)
1713{
d4c19fe8 1714 const char *start = s;
29595ff2
NC
1715 I32 startoff = start - SvPVX(PL_linestr);
1716
7918f24d
NC
1717 PERL_ARGS_ASSERT_SKIPSPACE1;
1718
29595ff2
NC
1719 s = skipspace(s);
1720 if (!PL_madskills)
1721 return s;
1722 start = SvPVX(PL_linestr) + startoff;
cd81e915 1723 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1724 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1725 PL_thistoken = newSVpvn(tstart, start - tstart);
1726 }
1727 PL_realtokenstart = -1;
1728 if (PL_skipwhite) {
1729 if (!PL_nextwhite)
6b29d1f5 1730 PL_nextwhite = newSVpvs("");
cd81e915
NC
1731 sv_catsv(PL_nextwhite, PL_skipwhite);
1732 sv_free(PL_skipwhite);
1733 PL_skipwhite = 0;
29595ff2
NC
1734 }
1735 return s;
1736}
1737
1738STATIC char *
1739S_skipspace2(pTHX_ register char *s, SV **svp)
1740{
c35e046a
AL
1741 char *start;
1742 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1743 const I32 startoff = s - SvPVX(PL_linestr);
1744
7918f24d
NC
1745 PERL_ARGS_ASSERT_SKIPSPACE2;
1746
29595ff2
NC
1747 s = skipspace(s);
1748 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1749 if (!PL_madskills || !svp)
1750 return s;
1751 start = SvPVX(PL_linestr) + startoff;
cd81e915 1752 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1753 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1754 PL_thistoken = newSVpvn(tstart, start - tstart);
1755 PL_realtokenstart = -1;
29595ff2 1756 }
cd81e915 1757 if (PL_skipwhite) {
29595ff2 1758 if (!*svp)
6b29d1f5 1759 *svp = newSVpvs("");
cd81e915
NC
1760 sv_setsv(*svp, PL_skipwhite);
1761 sv_free(PL_skipwhite);
1762 PL_skipwhite = 0;
29595ff2
NC
1763 }
1764
1765 return s;
1766}
1767#endif
1768
80a702cd 1769STATIC void
15f169a1 1770S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1771{
1772 AV *av = CopFILEAVx(PL_curcop);
1773 if (av) {
b9f83d2f 1774 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1775 if (orig_sv)
1776 sv_setsv(sv, orig_sv);
1777 else
1778 sv_setpvn(sv, buf, len);
80a702cd
RGS
1779 (void)SvIOK_on(sv);
1780 SvIV_set(sv, 0);
1781 av_store(av, (I32)CopLINE(PL_curcop), sv);
1782 }
1783}
1784
ffb4593c
NT
1785/*
1786 * S_skipspace
1787 * Called to gobble the appropriate amount and type of whitespace.
1788 * Skips comments as well.
1789 */
1790
76e3520e 1791STATIC char *
cea2e8a9 1792S_skipspace(pTHX_ register char *s)
a687059c 1793{
5db06880 1794#ifdef PERL_MAD
f0e67a1d
Z
1795 char *start = s;
1796#endif /* PERL_MAD */
7918f24d 1797 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1798#ifdef PERL_MAD
cd81e915
NC
1799 if (PL_skipwhite) {
1800 sv_free(PL_skipwhite);
f0e67a1d 1801 PL_skipwhite = NULL;
5db06880 1802 }
f0e67a1d 1803#endif /* PERL_MAD */
3280af22 1804 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1805 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1806 s++;
f0e67a1d
Z
1807 } else {
1808 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1809 PL_bufptr = s;
f0998909
Z
1810 lex_read_space(LEX_KEEP_PREVIOUS |
1811 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1812 LEX_NO_NEXT_CHUNK : 0));
3280af22 1813 s = PL_bufptr;
f0e67a1d
Z
1814 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1815 if (PL_linestart > PL_bufptr)
1816 PL_bufptr = PL_linestart;
1817 return s;
463ee0b2 1818 }
5db06880 1819#ifdef PERL_MAD
f0e67a1d
Z
1820 if (PL_madskills)
1821 PL_skipwhite = newSVpvn(start, s-start);
1822#endif /* PERL_MAD */
5db06880 1823 return s;
a687059c 1824}
378cc40b 1825
ffb4593c
NT
1826/*
1827 * S_check_uni
1828 * Check the unary operators to ensure there's no ambiguity in how they're
1829 * used. An ambiguous piece of code would be:
1830 * rand + 5
1831 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1832 * the +5 is its argument.
1833 */
1834
76e3520e 1835STATIC void
cea2e8a9 1836S_check_uni(pTHX)
ba106d47 1837{
97aff369 1838 dVAR;
d4c19fe8
AL
1839 const char *s;
1840 const char *t;
2f3197b3 1841
3280af22 1842 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1843 return;
3280af22
NIS
1844 while (isSPACE(*PL_last_uni))
1845 PL_last_uni++;
c35e046a
AL
1846 s = PL_last_uni;
1847 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1848 s++;
3280af22 1849 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1850 return;
6136c704 1851
9b387841
NC
1852 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1853 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1854 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1855}
1856
ffb4593c
NT
1857/*
1858 * LOP : macro to build a list operator. Its behaviour has been replaced
1859 * with a subroutine, S_lop() for which LOP is just another name.
1860 */
1861
a0d0e21e
LW
1862#define LOP(f,x) return lop(f,x,s)
1863
ffb4593c
NT
1864/*
1865 * S_lop
1866 * Build a list operator (or something that might be one). The rules:
1867 * - if we have a next token, then it's a list operator [why?]
1868 * - if the next thing is an opening paren, then it's a function
1869 * - else it's a list operator
1870 */
1871
76e3520e 1872STATIC I32
a0be28da 1873S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1874{
97aff369 1875 dVAR;
7918f24d
NC
1876
1877 PERL_ARGS_ASSERT_LOP;
1878
6154021b 1879 pl_yylval.ival = f;
35c8bce7 1880 CLINE;
3280af22
NIS
1881 PL_expect = x;
1882 PL_bufptr = s;
1883 PL_last_lop = PL_oldbufptr;
eb160463 1884 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1885#ifdef PERL_MAD
1886 if (PL_lasttoke)
78cdf107 1887 goto lstop;
5db06880 1888#else
3280af22 1889 if (PL_nexttoke)
78cdf107 1890 goto lstop;
5db06880 1891#endif
79072805 1892 if (*s == '(')
bbf60fe6 1893 return REPORT(FUNC);
29595ff2 1894 s = PEEKSPACE(s);
79072805 1895 if (*s == '(')
bbf60fe6 1896 return REPORT(FUNC);
78cdf107
Z
1897 else {
1898 lstop:
1899 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1900 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1901 return REPORT(LSTOP);
78cdf107 1902 }
79072805
LW
1903}
1904
5db06880
NC
1905#ifdef PERL_MAD
1906 /*
1907 * S_start_force
1908 * Sets up for an eventual force_next(). start_force(0) basically does
1909 * an unshift, while start_force(-1) does a push. yylex removes items
1910 * on the "pop" end.
1911 */
1912
1913STATIC void
1914S_start_force(pTHX_ int where)
1915{
1916 int i;
1917
cd81e915 1918 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1919 where = PL_lasttoke;
cd81e915
NC
1920 assert(PL_curforce < 0 || PL_curforce == where);
1921 if (PL_curforce != where) {
5db06880
NC
1922 for (i = PL_lasttoke; i > where; --i) {
1923 PL_nexttoke[i] = PL_nexttoke[i-1];
1924 }
1925 PL_lasttoke++;
1926 }
cd81e915 1927 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1928 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1929 PL_curforce = where;
1930 if (PL_nextwhite) {
5db06880 1931 if (PL_madskills)
6b29d1f5 1932 curmad('^', newSVpvs(""));
cd81e915 1933 CURMAD('_', PL_nextwhite);
5db06880
NC
1934 }
1935}
1936
1937STATIC void
1938S_curmad(pTHX_ char slot, SV *sv)
1939{
1940 MADPROP **where;
1941
1942 if (!sv)
1943 return;
cd81e915
NC
1944 if (PL_curforce < 0)
1945 where = &PL_thismad;
5db06880 1946 else
cd81e915 1947 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1948
cd81e915 1949 if (PL_faketokens)
76f68e9b 1950 sv_setpvs(sv, "");
5db06880
NC
1951 else {
1952 if (!IN_BYTES) {
1953 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1954 SvUTF8_on(sv);
1955 else if (PL_encoding) {
1956 sv_recode_to_utf8(sv, PL_encoding);
1957 }
1958 }
1959 }
1960
1961 /* keep a slot open for the head of the list? */
1962 if (slot != '_' && *where && (*where)->mad_key == '^') {
1963 (*where)->mad_key = slot;
daba3364 1964 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1965 (*where)->mad_val = (void*)sv;
1966 }
1967 else
1968 addmad(newMADsv(slot, sv), where, 0);
1969}
1970#else
b3f24c00
MHM
1971# define start_force(where) NOOP
1972# define curmad(slot, sv) NOOP
5db06880
NC
1973#endif
1974
ffb4593c
NT
1975/*
1976 * S_force_next
9cbb5ea2 1977 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1978 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1979 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1980 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1981 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1982 */
1983
4e553d73 1984STATIC void
cea2e8a9 1985S_force_next(pTHX_ I32 type)
79072805 1986{
97aff369 1987 dVAR;
704d4215
GG
1988#ifdef DEBUGGING
1989 if (DEBUG_T_TEST) {
1990 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1991 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1992 }
1993#endif
6c7ae946
FC
1994 /* Don’t let opslab_force_free snatch it */
1995 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
1996 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
1997 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
1998 }
5db06880 1999#ifdef PERL_MAD
cd81e915 2000 if (PL_curforce < 0)
5db06880 2001 start_force(PL_lasttoke);
cd81e915 2002 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2003 if (PL_lex_state != LEX_KNOWNEXT)
2004 PL_lex_defer = PL_lex_state;
2005 PL_lex_state = LEX_KNOWNEXT;
2006 PL_lex_expect = PL_expect;
cd81e915 2007 PL_curforce = -1;
5db06880 2008#else
3280af22
NIS
2009 PL_nexttype[PL_nexttoke] = type;
2010 PL_nexttoke++;
2011 if (PL_lex_state != LEX_KNOWNEXT) {
2012 PL_lex_defer = PL_lex_state;
2013 PL_lex_expect = PL_expect;
2014 PL_lex_state = LEX_KNOWNEXT;
79072805 2015 }
5db06880 2016#endif
79072805
LW
2017}
2018
28ac2b49
Z
2019void
2020Perl_yyunlex(pTHX)
2021{
a7aaec61
Z
2022 int yyc = PL_parser->yychar;
2023 if (yyc != YYEMPTY) {
2024 if (yyc) {
2025 start_force(-1);
2026 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2027 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2028 PL_lex_allbrackets--;
a7aaec61 2029 PL_lex_brackets--;
78cdf107
Z
2030 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2031 } else if (yyc == '('/*)*/) {
2032 PL_lex_allbrackets--;
2033 yyc |= (2<<24);
a7aaec61
Z
2034 }
2035 force_next(yyc);
2036 }
28ac2b49
Z
2037 PL_parser->yychar = YYEMPTY;
2038 }
2039}
2040
d0a148a6 2041STATIC SV *
15f169a1 2042S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2043{
97aff369 2044 dVAR;
740cce10 2045 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2046 !IN_BYTES
2047 && UTF
2048 && !is_ascii_string((const U8*)start, len)
740cce10 2049 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2050 return sv;
2051}
2052
ffb4593c
NT
2053/*
2054 * S_force_word
2055 * When the lexer knows the next thing is a word (for instance, it has
2056 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2057 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2058 * lookahead.
ffb4593c
NT
2059 *
2060 * Arguments:
b1b65b59 2061 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2062 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2063 * int check_keyword : if true, Perl checks to make sure the word isn't
2064 * a keyword (do this if the word is a label, e.g. goto FOO)
2065 * int allow_pack : if true, : characters will also be allowed (require,
2066 * use, etc. do this)
9cbb5ea2 2067 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2068 */
2069
76e3520e 2070STATIC char *
cea2e8a9 2071S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2072{
97aff369 2073 dVAR;
eb578fdb 2074 char *s;
463ee0b2 2075 STRLEN len;
4e553d73 2076
7918f24d
NC
2077 PERL_ARGS_ASSERT_FORCE_WORD;
2078
29595ff2 2079 start = SKIPSPACE1(start);
463ee0b2 2080 s = start;
7e2040f0 2081 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2082 (allow_pack && *s == ':') ||
15f0808c 2083 (allow_initial_tick && *s == '\'') )
a0d0e21e 2084 {
3280af22 2085 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2086 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2087 return start;
cd81e915 2088 start_force(PL_curforce);
5db06880
NC
2089 if (PL_madskills)
2090 curmad('X', newSVpvn(start,s-start));
463ee0b2 2091 if (token == METHOD) {
29595ff2 2092 s = SKIPSPACE1(s);
463ee0b2 2093 if (*s == '(')
3280af22 2094 PL_expect = XTERM;
463ee0b2 2095 else {
3280af22 2096 PL_expect = XOPERATOR;
463ee0b2 2097 }
79072805 2098 }
e74e6b3d 2099 if (PL_madskills)
63575281 2100 curmad('g', newSVpvs( "forced" ));
9ded7720 2101 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2102 = (OP*)newSVOP(OP_CONST,0,
2103 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2104 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2105 force_next(token);
2106 }
2107 return s;
2108}
2109
ffb4593c
NT
2110/*
2111 * S_force_ident
9cbb5ea2 2112 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2113 * text only contains the "foo" portion. The first argument is a pointer
2114 * to the "foo", and the second argument is the type symbol to prefix.
2115 * Forces the next token to be a "WORD".
9cbb5ea2 2116 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2117 */
2118
76e3520e 2119STATIC void
bfed75c6 2120S_force_ident(pTHX_ register const char *s, int kind)
79072805 2121{
97aff369 2122 dVAR;
7918f24d
NC
2123
2124 PERL_ARGS_ASSERT_FORCE_IDENT;
2125
c35e046a 2126 if (*s) {
90e5519e 2127 const STRLEN len = strlen(s);
728847b1
BF
2128 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2129 UTF ? SVf_UTF8 : 0));
cd81e915 2130 start_force(PL_curforce);
9ded7720 2131 NEXTVAL_NEXTTOKE.opval = o;
79072805 2132 force_next(WORD);
748a9306 2133 if (kind) {
11343788 2134 o->op_private = OPpCONST_ENTERED;
55497cff 2135 /* XXX see note in pp_entereval() for why we forgo typo
2136 warnings if the symbol must be introduced in an eval.
2137 GSAR 96-10-12 */
90e5519e 2138 gv_fetchpvn_flags(s, len,
728847b1
BF
2139 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2140 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2141 kind == '$' ? SVt_PV :
2142 kind == '@' ? SVt_PVAV :
2143 kind == '%' ? SVt_PVHV :
a0d0e21e 2144 SVt_PVGV
90e5519e 2145 );
748a9306 2146 }
79072805
LW
2147 }
2148}
2149
1571675a
GS
2150NV
2151Perl_str_to_version(pTHX_ SV *sv)
2152{
2153 NV retval = 0.0;
2154 NV nshift = 1.0;
2155 STRLEN len;
cfd0369c 2156 const char *start = SvPV_const(sv,len);
9d4ba2ae 2157 const char * const end = start + len;
504618e9 2158 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2159
2160 PERL_ARGS_ASSERT_STR_TO_VERSION;
2161
1571675a 2162 while (start < end) {
ba210ebe 2163 STRLEN skip;
1571675a
GS
2164 UV n;
2165 if (utf)
9041c2e3 2166 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2167 else {
2168 n = *(U8*)start;
2169 skip = 1;
2170 }
2171 retval += ((NV)n)/nshift;
2172 start += skip;
2173 nshift *= 1000;
2174 }
2175 return retval;
2176}
2177
4e553d73 2178/*
ffb4593c
NT
2179 * S_force_version
2180 * Forces the next token to be a version number.
e759cc13
RGS
2181 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2182 * and if "guessing" is TRUE, then no new token is created (and the caller
2183 * must use an alternative parsing method).
ffb4593c
NT
2184 */
2185
76e3520e 2186STATIC char *
e759cc13 2187S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2188{
97aff369 2189 dVAR;
5f66b61c 2190 OP *version = NULL;
44dcb63b 2191 char *d;
5db06880
NC
2192#ifdef PERL_MAD
2193 I32 startoff = s - SvPVX(PL_linestr);
2194#endif
89bfa8cd 2195
7918f24d
NC
2196 PERL_ARGS_ASSERT_FORCE_VERSION;
2197
29595ff2 2198 s = SKIPSPACE1(s);
89bfa8cd 2199
44dcb63b 2200 d = s;
dd629d5b 2201 if (*d == 'v')
44dcb63b 2202 d++;
44dcb63b 2203 if (isDIGIT(*d)) {
e759cc13
RGS
2204 while (isDIGIT(*d) || *d == '_' || *d == '.')
2205 d++;
5db06880
NC
2206#ifdef PERL_MAD
2207 if (PL_madskills) {
cd81e915 2208 start_force(PL_curforce);
5db06880
NC
2209 curmad('X', newSVpvn(s,d-s));
2210 }
2211#endif
4e4da3ac 2212 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2213 SV *ver;
8d08d9ba 2214#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2215 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2216 setlocale(LC_NUMERIC, "C");
8d08d9ba 2217#endif
6154021b 2218 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2219#ifdef USE_LOCALE_NUMERIC
2220 setlocale(LC_NUMERIC, loc);
909d3787 2221 Safefree(loc);
8d08d9ba 2222#endif
6154021b 2223 version = pl_yylval.opval;
dd629d5b
GS
2224 ver = cSVOPx(version)->op_sv;
2225 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2226 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2227 SvNV_set(ver, str_to_version(ver));
1571675a 2228 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2229 }
89bfa8cd 2230 }
5db06880
NC
2231 else if (guessing) {
2232#ifdef PERL_MAD
2233 if (PL_madskills) {
cd81e915
NC
2234 sv_free(PL_nextwhite); /* let next token collect whitespace */
2235 PL_nextwhite = 0;
5db06880
NC
2236 s = SvPVX(PL_linestr) + startoff;
2237 }
2238#endif
e759cc13 2239 return s;
5db06880 2240 }
89bfa8cd 2241 }
2242
5db06880
NC
2243#ifdef PERL_MAD
2244 if (PL_madskills && !version) {
cd81e915
NC
2245 sv_free(PL_nextwhite); /* let next token collect whitespace */
2246 PL_nextwhite = 0;
5db06880
NC
2247 s = SvPVX(PL_linestr) + startoff;
2248 }
2249#endif
89bfa8cd 2250 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2251 start_force(PL_curforce);
9ded7720 2252 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2253 force_next(WORD);
89bfa8cd 2254
e759cc13 2255 return s;
89bfa8cd 2256}
2257
ffb4593c 2258/*
91152fc1
DG
2259 * S_force_strict_version
2260 * Forces the next token to be a version number using strict syntax rules.
2261 */
2262
2263STATIC char *
2264S_force_strict_version(pTHX_ char *s)
2265{
2266 dVAR;
2267 OP *version = NULL;
2268#ifdef PERL_MAD
2269 I32 startoff = s - SvPVX(PL_linestr);
2270#endif
2271 const char *errstr = NULL;
2272
2273 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2274
2275 while (isSPACE(*s)) /* leading whitespace */
2276 s++;
2277
2278 if (is_STRICT_VERSION(s,&errstr)) {
2279 SV *ver = newSV(0);
2280 s = (char *)scan_version(s, ver, 0);
2281 version = newSVOP(OP_CONST, 0, ver);
2282 }
4e4da3ac
Z
2283 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2284 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2285 {
91152fc1
DG
2286 PL_bufptr = s;
2287 if (errstr)
2288 yyerror(errstr); /* version required */
2289 return s;
2290 }
2291
2292#ifdef PERL_MAD
2293 if (PL_madskills && !version) {
2294 sv_free(PL_nextwhite); /* let next token collect whitespace */
2295 PL_nextwhite = 0;
2296 s = SvPVX(PL_linestr) + startoff;
2297 }
2298#endif
2299 /* NOTE: The parser sees the package name and the VERSION swapped */
2300 start_force(PL_curforce);
2301 NEXTVAL_NEXTTOKE.opval = version;
2302 force_next(WORD);
2303
2304 return s;
2305}
2306
2307/*
ffb4593c
NT
2308 * S_tokeq
2309 * Tokenize a quoted string passed in as an SV. It finds the next
2310 * chunk, up to end of string or a backslash. It may make a new
2311 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2312 * turns \\ into \.
2313 */
2314
76e3520e 2315STATIC SV *
cea2e8a9 2316S_tokeq(pTHX_ SV *sv)
79072805 2317{
97aff369 2318 dVAR;
eb578fdb
KW
2319 char *s;
2320 char *send;
2321 char *d;
b3ac6de7
IZ
2322 STRLEN len = 0;
2323 SV *pv = sv;
79072805 2324
7918f24d
NC
2325 PERL_ARGS_ASSERT_TOKEQ;
2326
79072805 2327 if (!SvLEN(sv))
b3ac6de7 2328 goto finish;
79072805 2329
a0d0e21e 2330 s = SvPV_force(sv, len);
21a311ee 2331 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2332 goto finish;
463ee0b2 2333 send = s + len;
dcb21ed6
NC
2334 /* This is relying on the SV being "well formed" with a trailing '\0' */
2335 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2336 s++;
2337 if (s == send)
b3ac6de7 2338 goto finish;
79072805 2339 d = s;
be4731d2 2340 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2341 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2342 }
79072805
LW
2343 while (s < send) {
2344 if (*s == '\\') {
a0d0e21e 2345 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2346 s++; /* all that, just for this */
2347 }
2348 *d++ = *s++;
2349 }
2350 *d = '\0';
95a20fc0 2351 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2352 finish:
3280af22 2353 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2354 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2355 return sv;
2356}
2357
ffb4593c
NT
2358/*
2359 * Now come three functions related to double-quote context,
2360 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2361 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2362 * interact with PL_lex_state, and create fake ( ... ) argument lists
2363 * to handle functions and concatenation.
ecd24171
DM
2364 * For example,
2365 * "foo\lbar"
2366 * is tokenised as
2367 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2368 */
2369
2370/*
2371 * S_sublex_start
6154021b 2372 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2373 *
2374 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2375 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2376 *
2377 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2378 *
2379 * Everything else becomes a FUNC.
2380 *
2381 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2382 * had an OP_CONST or OP_READLINE). This just sets us up for a
2383 * call to S_sublex_push().
2384 */
2385
76e3520e 2386STATIC I32
cea2e8a9 2387S_sublex_start(pTHX)
79072805 2388{
97aff369 2389 dVAR;
eb578fdb 2390 const I32 op_type = pl_yylval.ival;
79072805
LW
2391
2392 if (op_type == OP_NULL) {
6154021b 2393 pl_yylval.opval = PL_lex_op;
5f66b61c 2394 PL_lex_op = NULL;
79072805
LW
2395 return THING;
2396 }
2397 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2398 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2399
2400 if (SvTYPE(sv) == SVt_PVIV) {
2401 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2402 STRLEN len;
96a5add6 2403 const char * const p = SvPV_const(sv, len);
740cce10 2404 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2405 SvREFCNT_dec(sv);
2406 sv = nsv;
4e553d73 2407 }
6154021b 2408 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2409 PL_lex_stuff = NULL;
6f33ba73
RGS
2410 /* Allow <FH> // "foo" */
2411 if (op_type == OP_READLINE)
2412 PL_expect = XTERMORDORDOR;
79072805
LW
2413 return THING;
2414 }
e3f73d4e
RGS
2415 else if (op_type == OP_BACKTICK && PL_lex_op) {
2416 /* readpipe() vas overriden */
2417 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2418 pl_yylval.opval = PL_lex_op;
9b201d7d 2419 PL_lex_op = NULL;
e3f73d4e
RGS
2420 PL_lex_stuff = NULL;
2421 return THING;
2422 }
79072805 2423
3280af22 2424 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2425 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2426 PL_sublex_info.sub_op = PL_lex_op;
2427 PL_lex_state = LEX_INTERPPUSH;
55497cff 2428
3280af22
NIS
2429 PL_expect = XTERM;
2430 if (PL_lex_op) {
6154021b 2431 pl_yylval.opval = PL_lex_op;
5f66b61c 2432 PL_lex_op = NULL;
55497cff 2433 return PMFUNC;
2434 }
2435 else
2436 return FUNC;
2437}
2438
ffb4593c
NT
2439/*
2440 * S_sublex_push
2441 * Create a new scope to save the lexing state. The scope will be
2442 * ended in S_sublex_done. Returns a '(', starting the function arguments
2443 * to the uc, lc, etc. found before.
2444 * Sets PL_lex_state to LEX_INTERPCONCAT.
2445 */
2446
76e3520e 2447STATIC I32
cea2e8a9 2448S_sublex_push(pTHX)
55497cff 2449{
27da23d5 2450 dVAR;
f46d017c 2451 ENTER;
55497cff 2452
3280af22 2453 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2454 SAVEBOOL(PL_lex_dojoin);
3280af22 2455 SAVEI32(PL_lex_brackets);
78cdf107 2456 SAVEI32(PL_lex_allbrackets);
b27dce25 2457 SAVEI32(PL_lex_formbrack);
78cdf107 2458 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2459 SAVEI32(PL_lex_casemods);
2460 SAVEI32(PL_lex_starts);
651b5b28 2461 SAVEI8(PL_lex_state);
7cc34111 2462 SAVESPTR(PL_lex_repl);
9da1dd8f 2463 SAVEPPTR(PL_sublex_info.re_eval_start);
db444266 2464 SAVESPTR(PL_sublex_info.re_eval_str);
565b52df 2465 SAVEPPTR(PL_sublex_info.super_bufptr);
7766f137 2466 SAVEVPTR(PL_lex_inpat);
98246f1e 2467 SAVEI16(PL_lex_inwhat);
57843af0 2468 SAVECOPLINE(PL_curcop);
3280af22 2469 SAVEPPTR(PL_bufptr);
8452ff4b 2470 SAVEPPTR(PL_bufend);
3280af22
NIS
2471 SAVEPPTR(PL_oldbufptr);
2472 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2473 SAVEPPTR(PL_last_lop);
2474 SAVEPPTR(PL_last_uni);
3280af22
NIS
2475 SAVEPPTR(PL_linestart);
2476 SAVESPTR(PL_linestr);
8edd5f42
RGS
2477 SAVEGENERICPV(PL_lex_brackstack);
2478 SAVEGENERICPV(PL_lex_casestack);
3280af22 2479
99bd9d90
FC
2480 /* The here-doc parser needs to be able to peek into outer lexing
2481 scopes to find the body of the here-doc. We use SvIVX(PL_linestr)
2482 to store the outer PL_bufptr and SvNVX to store the outer
2483 PL_linestr. Since SvIVX already means something else, we use
2484 PL_sublex_info.super_bufptr for the innermost scope (the one we are
2485 now entering), and a localised SvIVX for outer scopes.
2486 */
2487 SvUPGRADE(PL_linestr, SVt_PVIV);
2488 /* A null super_bufptr means the outer lexing scope is not peekable,
2489 because it is a single line from an input stream. */
2490 SAVEIV(SvIVX(PL_linestr));
2491 SvIVX(PL_linestr) = PTR2IV(PL_sublex_info.super_bufptr);
2492 PL_sublex_info.super_bufptr =
2493 (SvTYPE(PL_linestr) < SVt_PVNV || !SvNVX(PL_linestr))
2494 && (PL_rsfp || PL_parser->filtered)
2495 ? NULL
2496 : PL_bufptr;
2497 SvUPGRADE(PL_lex_stuff, SVt_PVNV);
2498 SvNVX(PL_lex_stuff) = PTR2NV(PL_linestr);
2499
3280af22 2500 PL_linestr = PL_lex_stuff;
7cc34111 2501 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2502 PL_lex_stuff = NULL;
7cc34111 2503 PL_sublex_info.repl = NULL;
9da1dd8f 2504 PL_sublex_info.re_eval_start = NULL;
db444266 2505 PL_sublex_info.re_eval_str = NULL;
3280af22 2506
9cbb5ea2
GS
2507 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2508 = SvPVX(PL_linestr);
3280af22 2509 PL_bufend += SvCUR(PL_linestr);
bd61b366 2510 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2511 SAVEFREESV(PL_linestr);
2512
2513 PL_lex_dojoin = FALSE;
b27dce25 2514 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2515 PL_lex_allbrackets = 0;
2516 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2517 Newx(PL_lex_brackstack, 120, char);
2518 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2519 PL_lex_casemods = 0;
2520 *PL_lex_casestack = '\0';
2521 PL_lex_starts = 0;
2522 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2523 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2524
2525 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2526 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2527 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2528 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2529 else
5f66b61c 2530 PL_lex_inpat = NULL;
79072805 2531
55497cff 2532 return '(';
79072805
LW
2533}
2534
ffb4593c
NT
2535/*
2536 * S_sublex_done
2537 * Restores lexer state after a S_sublex_push.
2538 */
2539
76e3520e 2540STATIC I32
cea2e8a9 2541S_sublex_done(pTHX)
79072805 2542{
27da23d5 2543 dVAR;
3280af22 2544 if (!PL_lex_starts++) {
396482e1 2545 SV * const sv = newSVpvs("");
9aa983d2
JH
2546 if (SvUTF8(PL_linestr))
2547 SvUTF8_on(sv);
3280af22 2548 PL_expect = XOPERATOR;
6154021b 2549 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2550 return THING;
2551 }
2552
3280af22
NIS
2553 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2554 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2555 return yylex();
79072805
LW
2556 }
2557
ffb4593c 2558 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2559 assert(PL_lex_inwhat != OP_TRANSR);
3280af22 2560 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
99bd9d90
FC
2561 SvUPGRADE(PL_lex_repl, SVt_PVNV);
2562 SvNVX(PL_lex_repl) = SvNVX(PL_linestr);
3280af22
NIS
2563 PL_linestr = PL_lex_repl;
2564 PL_lex_inpat = 0;
2565 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2566 PL_bufend += SvCUR(PL_linestr);
bd61b366 2567 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2568 SAVEFREESV(PL_linestr);
2569 PL_lex_dojoin = FALSE;
2570 PL_lex_brackets = 0;
78cdf107
Z
2571 PL_lex_allbrackets = 0;
2572 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2573 PL_lex_casemods = 0;
2574 *PL_lex_casestack = '\0';
2575 PL_lex_starts = 0;
25da4f38 2576 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2577 PL_lex_state = LEX_INTERPNORMAL;
2578 PL_lex_starts++;
e9fa98b2
HS
2579 /* we don't clear PL_lex_repl here, so that we can check later
2580 whether this is an evalled subst; that means we rely on the
2581 logic to ensure sublex_done() is called again only via the
2582 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2583 }
e9fa98b2 2584 else {
3280af22 2585 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2586 PL_lex_repl = NULL;
e9fa98b2 2587 }
79072805 2588 return ',';
ffed7fef
LW
2589 }
2590 else {
5db06880
NC
2591#ifdef PERL_MAD
2592 if (PL_madskills) {
cd81e915
NC
2593 if (PL_thiswhite) {
2594 if (!PL_endwhite)
6b29d1f5 2595 PL_endwhite = newSVpvs("");
cd81e915
NC
2596 sv_catsv(PL_endwhite, PL_thiswhite);
2597 PL_thiswhite = 0;
2598 }
2599 if (PL_thistoken)
76f68e9b 2600 sv_setpvs(PL_thistoken,"");
5db06880 2601 else
cd81e915 2602 PL_realtokenstart = -1;
5db06880
NC
2603 }
2604#endif
f46d017c 2605 LEAVE;
3280af22
NIS
2606 PL_bufend = SvPVX(PL_linestr);
2607 PL_bufend += SvCUR(PL_linestr);
2608 PL_expect = XOPERATOR;
09bef843 2609 PL_sublex_info.sub_inwhat = 0;
79072805 2610 return ')';
ffed7fef
LW
2611 }
2612}
2613
02aa26ce
NT
2614/*
2615 scan_const
2616
9da1dd8f
DM
2617 Extracts the next constant part of a pattern, double-quoted string,
2618 or transliteration. This is terrifying code.
2619
2620 For example, in parsing the double-quoted string "ab\x63$d", it would
2621 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2622
94def140 2623 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2624 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2625 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2626
94def140
TS
2627 Returns a pointer to the character scanned up to. If this is
2628 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2629 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2630 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2631 by looking at the next characters herself.
2632
02aa26ce 2633 In patterns:
9da1dd8f
DM
2634 expand:
2635 \N{ABC} => \N{U+41.42.43}
2636
2637 pass through:
2638 all other \-char, including \N and \N{ apart from \N{ABC}
2639
2640 stops on:
2641 @ and $ where it appears to be a var, but not for $ as tail anchor
2642 \l \L \u \U \Q \E
2643 (?{ or (??{
2644
02aa26ce
NT
2645
2646 In transliterations:
2647 characters are VERY literal, except for - not at the start or end
94def140
TS
2648 of the string, which indicates a range. If the range is in bytes,
2649 scan_const expands the range to the full set of intermediate
2650 characters. If the range is in utf8, the hyphen is replaced with
2651 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2652
2653 In double-quoted strings:
2654 backslashes:
2655 double-quoted style: \r and \n
ff3f963a 2656 constants: \x31, etc.
94def140 2657 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2658 case and quoting: \U \Q \E
2659 stops on @ and $
2660
2661 scan_const does *not* construct ops to handle interpolated strings.
2662 It stops processing as soon as it finds an embedded $ or @ variable
2663 and leaves it to the caller to work out what's going on.
2664
94def140
TS
2665 embedded arrays (whether in pattern or not) could be:
2666 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2667
2668 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2669
2670 $ in pattern could be $foo or could be tail anchor. Assumption:
2671 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2672 followed by one of "()| \r\n\t"
02aa26ce 2673
9da1dd8f 2674 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2675
2676 The structure of the code is
2677 while (there's a character to process) {
94def140
TS
2678 handle transliteration ranges
2679 skip regexp comments /(?#comment)/ and codes /(?{code})/
2680 skip #-initiated comments in //x patterns
2681 check for embedded arrays
02aa26ce
NT
2682 check for embedded scalars
2683 if (backslash) {
94def140 2684 deprecate \1 in substitution replacements
02aa26ce
NT
2685 handle string-changing backslashes \l \U \Q \E, etc.
2686 switch (what was escaped) {
94def140 2687 handle \- in a transliteration (becomes a literal -)
ff3f963a 2688 if a pattern and not \N{, go treat as regular character
94def140
TS
2689 handle \132 (octal characters)
2690 handle \x15 and \x{1234} (hex characters)
ff3f963a 2691 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2692 handle \cV (control characters)
2693 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2694 } (end switch)
77a135fe 2695 continue
02aa26ce 2696 } (end if backslash)
77a135fe 2697 handle regular character
02aa26ce 2698 } (end while character to read)
4e553d73 2699
02aa26ce
NT
2700*/
2701
76e3520e 2702STATIC char *
cea2e8a9 2703S_scan_const(pTHX_ char *start)
79072805 2704{
97aff369 2705 dVAR;
eb578fdb 2706 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2707 SV *sv = newSV(send - start); /* sv for the constant. See
2708 note below on sizing. */
eb578fdb
KW
2709 char *s = start; /* start of the constant */
2710 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2711 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2712 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2713 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2714 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2715 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2716 to be UTF8? But, this can
2717 show as true when the source
2718 isn't utf8, as for example
2719 when it is entirely composed
2720 of hex constants */
2721
2722 /* Note on sizing: The scanned constant is placed into sv, which is
2723 * initialized by newSV() assuming one byte of output for every byte of
2724 * input. This routine expects newSV() to allocate an extra byte for a
2725 * trailing NUL, which this routine will append if it gets to the end of
2726 * the input. There may be more bytes of input than output (eg., \N{LATIN
2727 * CAPITAL LETTER A}), or more output than input if the constant ends up
2728 * recoded to utf8, but each time a construct is found that might increase
2729 * the needed size, SvGROW() is called. Its size parameter each time is
2730 * based on the best guess estimate at the time, namely the length used so
2731 * far, plus the length the current construct will occupy, plus room for
2732 * the trailing NUL, plus one byte for every input byte still unscanned */
2733
012bcf8d 2734 UV uv;
4c3a8340
TS
2735#ifdef EBCDIC
2736 UV literal_endpoint = 0;
e294cc5d 2737 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2738#endif
012bcf8d 2739
7918f24d
NC
2740 PERL_ARGS_ASSERT_SCAN_CONST;
2741
bb16bae8 2742 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2743 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2744 /* If we are doing a trans and we know we want UTF8 set expectation */
2745 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2746 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2747 }
2748
2749
79072805 2750 while (s < send || dorange) {
ff3f963a 2751
02aa26ce 2752 /* get transliterations out of the way (they're most literal) */
3280af22 2753 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2754 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2755 if (dorange) {
1ba5c669
JH
2756 I32 i; /* current expanded character */
2757 I32 min; /* first character in range */
2758 I32 max; /* last character in range */
02aa26ce 2759
e294cc5d
JH
2760#ifdef EBCDIC
2761 UV uvmax = 0;
2762#endif
2763
2764 if (has_utf8
2765#ifdef EBCDIC
2766 && !native_range
2767#endif
2768 ) {
9d4ba2ae 2769 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2770 char *e = d++;
2771 while (e-- > c)
2772 *(e + 1) = *e;
25716404 2773 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2774 /* mark the range as done, and continue */
2775 dorange = FALSE;
2776 didrange = TRUE;
2777 continue;
2778 }
2b9d42f0 2779
95a20fc0 2780 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2781#ifdef EBCDIC
2782 SvGROW(sv,
2783 SvLEN(sv) + (has_utf8 ?
2784 (512 - UTF_CONTINUATION_MARK +
2785 UNISKIP(0x100))
2786 : 256));
2787 /* How many two-byte within 0..255: 128 in UTF-8,
2788 * 96 in UTF-8-mod. */
2789#else
9cbb5ea2 2790 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2791#endif
9cbb5ea2 2792 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2793#ifdef EBCDIC
2794 if (has_utf8) {
2795 int j;
2796 for (j = 0; j <= 1; j++) {
2797 char * const c = (char*)utf8_hop((U8*)d, -1);
2798 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2799 if (j)
2800 min = (U8)uv;
2801 else if (uv < 256)
2802 max = (U8)uv;
2803 else {
2804 max = (U8)0xff; /* only to \xff */
2805 uvmax = uv; /* \x{100} to uvmax */
2806 }
2807 d = c; /* eat endpoint chars */
2808 }
2809 }
2810 else {
2811#endif
2812 d -= 2; /* eat the first char and the - */
2813 min = (U8)*d; /* first char in range */
2814 max = (U8)d[1]; /* last char in range */
2815#ifdef EBCDIC
2816 }
2817#endif
8ada0baa 2818
c2e66d9e 2819 if (min > max) {
01ec43d0 2820 Perl_croak(aTHX_
d1573ac7 2821 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2822 (char)min, (char)max);
c2e66d9e
GS
2823 }
2824
c7f1f016 2825#ifdef EBCDIC
4c3a8340
TS
2826 if (literal_endpoint == 2 &&
2827 ((isLOWER(min) && isLOWER(max)) ||
2828 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2829 if (isLOWER(min)) {
2830 for (i = min; i <= max; i++)
2831 if (isLOWER(i))
db42d148 2832 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2833 } else {
2834 for (i = min; i <= max; i++)
2835 if (isUPPER(i))
db42d148 2836 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2837 }
2838 }
2839 else
2840#endif
2841 for (i = min; i <= max; i++)
e294cc5d
JH
2842#ifdef EBCDIC
2843 if (has_utf8) {
2844 const U8 ch = (U8)NATIVE_TO_UTF(i);
2845 if (UNI_IS_INVARIANT(ch))
2846 *d++ = (U8)i;
2847 else {
2848 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2849 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2850 }
2851 }
2852 else
2853#endif
2854 *d++ = (char)i;
2855
2856#ifdef EBCDIC
2857 if (uvmax) {
2858 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2859 if (uvmax > 0x101)
2860 *d++ = (char)UTF_TO_NATIVE(0xff);
2861 if (uvmax > 0x100)
2862 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2863 }
2864#endif
02aa26ce
NT
2865
2866 /* mark the range as done, and continue */
79072805 2867 dorange = FALSE;
01ec43d0 2868 didrange = TRUE;
4c3a8340
TS
2869#ifdef EBCDIC
2870 literal_endpoint = 0;
2871#endif
79072805 2872 continue;
4e553d73 2873 }
02aa26ce
NT
2874
2875 /* range begins (ignore - as first or last char) */
79072805 2876 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2877 if (didrange) {
1fafa243 2878 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2879 }
e294cc5d
JH
2880 if (has_utf8
2881#ifdef EBCDIC
2882 && !native_range
2883#endif
2884 ) {
25716404 2885 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2886 s++;
2887 continue;
2888 }
79072805
LW
2889 dorange = TRUE;
2890 s++;
01ec43d0
GS
2891 }
2892 else {
2893 didrange = FALSE;
4c3a8340
TS
2894#ifdef EBCDIC
2895 literal_endpoint = 0;
e294cc5d 2896 native_range = TRUE;
4c3a8340 2897#endif
01ec43d0 2898 }
79072805 2899 }
02aa26ce
NT
2900
2901 /* if we get here, we're not doing a transliteration */
2902
e4a2df84
DM
2903 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2904 char *s1 = s-1;
2905 int esc = 0;
2906 while (s1 >= start && *s1-- == '\\')
2907 esc = !esc;
2908 if (!esc)
2909 in_charclass = TRUE;
2910 }
2866decb 2911
e4a2df84
DM
2912 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2913 char *s1 = s-1;
2914 int esc = 0;
2915 while (s1 >= start && *s1-- == '\\')
2916 esc = !esc;
2917 if (!esc)
2918 in_charclass = FALSE;
2919 }
2866decb 2920
9da1dd8f
DM
2921 /* skip for regexp comments /(?#comment)/, except for the last
2922 * char, which will be done separately.
2923 * Stop on (?{..}) and friends */
2924
3280af22 2925 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2926 if (s[2] == '#') {
e994fd66 2927 while (s+1 < send && *s != ')')
db42d148 2928 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2929 }
2866decb 2930 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2931 ( s[2] == '{' /* This should match regcomp.c */
2932 || (s[2] == '?' && s[3] == '{')))
155aba94 2933 {
9da1dd8f 2934 break;
cc6b7395 2935 }
748a9306 2936 }
02aa26ce
NT
2937
2938 /* likewise skip #-initiated comments in //x patterns */
3280af22 2939 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2940 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2941 while (s+1 < send && *s != '\n')
db42d148 2942 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2943 }
02aa26ce 2944
9da1dd8f
DM
2945 /* no further processing of single-quoted regex */
2946 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2947 goto default_action;
2948
5d1d4326 2949 /* check for embedded arrays
da6eedaa 2950 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2951 */
1749ea0d
TS
2952 else if (*s == '@' && s[1]) {
2953 if (isALNUM_lazy_if(s+1,UTF))
2954 break;
2955 if (strchr(":'{$", s[1]))
2956 break;
2957 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2958 break; /* in regexp, neither @+ nor @- are interpolated */
2959 }
02aa26ce
NT
2960
2961 /* check for embedded scalars. only stop if we're sure it's a
2962 variable.
2963 */
79072805 2964 else if (*s == '$') {
3280af22 2965 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2966 break;
77772344 2967 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2968 if (s[1] == '\\') {
2969 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2970 "Possible unintended interpolation of $\\ in regex");
77772344 2971 }
79072805 2972 break; /* in regexp, $ might be tail anchor */
77772344 2973 }
79072805 2974 }
02aa26ce 2975
2b9d42f0
NIS
2976 /* End of else if chain - OP_TRANS rejoin rest */
2977
02aa26ce 2978 /* backslashes */
79072805 2979 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2980 char* e; /* Can be used for ending '}', etc. */
2981
79072805 2982 s++;
02aa26ce 2983
7d0fc23c
KW
2984 /* warn on \1 - \9 in substitution replacements, but note that \11
2985 * is an octal; and \19 is \1 followed by '9' */
3280af22 2986 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2987 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2988 {
a2a5de95 2989 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2990 *--s = '$';
2991 break;
2992 }
02aa26ce
NT
2993
2994 /* string-change backslash escapes */
838f2281 2995 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
2996 --s;
2997 break;
2998 }
ff3f963a
KW
2999 /* In a pattern, process \N, but skip any other backslash escapes.
3000 * This is because we don't want to translate an escape sequence
3001 * into a meta symbol and have the regex compiler use the meta
3002 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3003 * in spite of this, we do have to process \N here while the proper
3004 * charnames handler is in scope. See bugs #56444 and #62056.
3005 * There is a complication because \N in a pattern may also stand
3006 * for 'match a non-nl', and not mean a charname, in which case its
3007 * processing should be deferred to the regex compiler. To be a
3008 * charname it must be followed immediately by a '{', and not look
3009 * like \N followed by a curly quantifier, i.e., not something like
3010 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3011 * quantifier */
3012 else if (PL_lex_inpat
3013 && (*s != 'N'
3014 || s[1] != '{'
3015 || regcurly(s + 1)))
3016 {
cc74c5bd
TS
3017 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3018 goto default_action;
3019 }
02aa26ce 3020
79072805 3021 switch (*s) {
02aa26ce
NT
3022
3023 /* quoted - in transliterations */
79072805 3024 case '-':
3280af22 3025 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3026 *d++ = *s++;
3027 continue;
3028 }
3029 /* FALL THROUGH */
3030 default:
11b8faa4 3031 {
e4ca4584 3032 if ((isALNUMC(*s)))
a2a5de95
NC
3033 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3034 "Unrecognized escape \\%c passed through",
3035 *s);
11b8faa4 3036 /* default action is to copy the quoted character */
f9a63242 3037 goto default_action;
11b8faa4 3038 }
02aa26ce 3039
632403cc 3040 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3041 case '0': case '1': case '2': case '3':
3042 case '4': case '5': case '6': case '7':
ba210ebe 3043 {
53305cf1
NC
3044 I32 flags = 0;
3045 STRLEN len = 3;
77a135fe 3046 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3047 s += len;
3048 }
012bcf8d 3049 goto NUM_ESCAPE_INSERT;
02aa26ce 3050
f0a2b745
KW
3051 /* eg. \o{24} indicates the octal constant \024 */
3052 case 'o':
3053 {
3054 STRLEN len;
454155d9 3055 const char* error;
f0a2b745 3056
454155d9 3057 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3058 s += len;
454155d9 3059 if (! valid) {
f0a2b745
KW
3060 yyerror(error);
3061 continue;
3062 }
3063 goto NUM_ESCAPE_INSERT;
3064 }
3065
77a135fe 3066 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3067 case 'x':
a0481293 3068 {
53305cf1 3069 STRLEN len;
a0481293 3070 const char* error;
355860ce 3071
a0481293
KW
3072 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3073 s += len;
3074 if (! valid) {
3075 yyerror(error);
355860ce 3076 continue;
ba210ebe 3077 }
012bcf8d
GS
3078 }
3079
3080 NUM_ESCAPE_INSERT:
ff3f963a
KW
3081 /* Insert oct or hex escaped character. There will always be
3082 * enough room in sv since such escapes will be longer than any
3083 * UTF-8 sequence they can end up as, except if they force us
3084 * to recode the rest of the string into utf8 */
ba7cea30 3085
77a135fe 3086 /* Here uv is the ordinal of the next character being added in
ff3f963a 3087 * unicode (converted from native). */
77a135fe 3088 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3089 if (!has_utf8 && uv > 255) {
77a135fe
KW
3090 /* Might need to recode whatever we have accumulated so
3091 * far if it contains any chars variant in utf8 or
3092 * utf-ebcdic. */
3093
3094 SvCUR_set(sv, d - SvPVX_const(sv));
3095 SvPOK_on(sv);
3096 *d = '\0';
77a135fe 3097 /* See Note on sizing above. */
7bf79863
KW
3098 sv_utf8_upgrade_flags_grow(sv,
3099 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3100 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3101 d = SvPVX(sv) + SvCUR(sv);
3102 has_utf8 = TRUE;
012bcf8d
GS
3103 }
3104
77a135fe
KW
3105 if (has_utf8) {
3106 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3107 if (PL_lex_inwhat == OP_TRANS &&
3108 PL_sublex_info.sub_op) {
3109 PL_sublex_info.sub_op->op_private |=
3110 (PL_lex_repl ? OPpTRANS_FROM_UTF
3111 : OPpTRANS_TO_UTF);
f9a63242 3112 }
e294cc5d
JH
3113#ifdef EBCDIC
3114 if (uv > 255 && !dorange)
3115 native_range = FALSE;
3116#endif
012bcf8d 3117 }
a0ed51b3 3118 else {
012bcf8d 3119 *d++ = (char)uv;
a0ed51b3 3120 }
012bcf8d
GS
3121 }
3122 else {
c4d5f83a 3123 *d++ = (char) uv;
a0ed51b3 3124 }
79072805 3125 continue;
02aa26ce 3126
4a2d328f 3127 case 'N':
ff3f963a
KW
3128 /* In a non-pattern \N must be a named character, like \N{LATIN
3129 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3130 * mean to match a non-newline. For non-patterns, named
3131 * characters are converted to their string equivalents. In
3132 * patterns, named characters are not converted to their
3133 * ultimate forms for the same reasons that other escapes
3134 * aren't. Instead, they are converted to the \N{U+...} form
3135 * to get the value from the charnames that is in effect right
3136 * now, while preserving the fact that it was a named character
3137 * so that the regex compiler knows this */
3138
3139 /* This section of code doesn't generally use the
3140 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3141 * a close examination of this macro and determined it is a
3142 * no-op except on utfebcdic variant characters. Every
3143 * character generated by this that would normally need to be
3144 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3145 * needed, and would complicate use of copy(). XXX There are
3146 * other parts of this file where the macro is used
3147 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3148
3149 /* The structure of this section of code (besides checking for
3150 * errors and upgrading to utf8) is:
3151 * Further disambiguate between the two meanings of \N, and if
3152 * not a charname, go process it elsewhere
0a96133f
KW
3153 * If of form \N{U+...}, pass it through if a pattern;
3154 * otherwise convert to utf8
3155 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3156 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3157
3158 /* Here, s points to the 'N'; the test below is guaranteed to
3159 * succeed if we are being called on a pattern as we already
3160 * know from a test above that the next character is a '{'.
3161 * On a non-pattern \N must mean 'named sequence, which
3162 * requires braces */
3163 s++;
3164 if (*s != '{') {
3165 yyerror("Missing braces on \\N{}");
3166 continue;
3167 }
3168 s++;
3169
0a96133f 3170 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3171 if (! (e = strchr(s, '}'))) {
3172 if (! PL_lex_inpat) {
5777a3f7 3173 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3174 } else {
3175 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3176 }
0a96133f 3177 continue;
ff3f963a 3178 }
cddc7ef4 3179
ff3f963a 3180 /* Here it looks like a named character */
cddc7ef4 3181
ff3f963a
KW
3182 if (PL_lex_inpat) {
3183
3184 /* XXX This block is temporary code. \N{} implies that the
3185 * pattern is to have Unicode semantics, and therefore
3186 * currently has to be encoded in utf8. By putting it in
3187 * utf8 now, we save a whole pass in the regular expression
3188 * compiler. Once that code is changed so Unicode
3189 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3190 * block should be removed. However, the code that parses
3191 * the output of this would have to be changed to not
3192 * necessarily expect utf8 */
ff3f963a 3193 if (!has_utf8) {
77a135fe 3194 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3195 SvPOK_on(sv);
e4f3eed8 3196 *d = '\0';
77a135fe 3197 /* See Note on sizing above. */
7bf79863 3198 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3199 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3200 /* 5 = '\N{' + cur char + NUL */
3201 (STRLEN)(send - s) + 5);
f08d6ad9 3202 d = SvPVX(sv) + SvCUR(sv);
89491803 3203 has_utf8 = TRUE;
ff3f963a
KW
3204 }
3205 }
423cee85 3206
ff3f963a
KW
3207 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3208 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3209 | PERL_SCAN_DISALLOW_PREFIX;
3210 STRLEN len;
3211
3212 /* For \N{U+...}, the '...' is a unicode value even on
3213 * EBCDIC machines */
3214 s += 2; /* Skip to next char after the 'U+' */
3215 len = e - s;
3216 uv = grok_hex(s, &len, &flags, NULL);
3217 if (len == 0 || len != (STRLEN)(e - s)) {
3218 yyerror("Invalid hexadecimal number in \\N{U+...}");
3219 s = e + 1;
3220 continue;
3221 }
3222
3223 if (PL_lex_inpat) {
3224
e2a7e165
KW
3225 /* On non-EBCDIC platforms, pass through to the regex
3226 * compiler unchanged. The reason we evaluated the
3227 * number above is to make sure there wasn't a syntax
3228 * error. But on EBCDIC we convert to native so
3229 * downstream code can continue to assume it's native
3230 */
ff3f963a 3231 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3232#ifdef EBCDIC
3233 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3234 and the \0 */
3235 "\\N{U+%X}",
3236 (unsigned int) UNI_TO_NATIVE(uv));
3237#else
ff3f963a
KW
3238 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3239 d += e - s + 1;
e2a7e165 3240#endif
ff3f963a
KW
3241 }
3242 else { /* Not a pattern: convert the hex to string */
3243
3244 /* If destination is not in utf8, unconditionally
3245 * recode it to be so. This is because \N{} implies
3246 * Unicode semantics, and scalars have to be in utf8
3247 * to guarantee those semantics */
3248 if (! has_utf8) {
3249 SvCUR_set(sv, d - SvPVX_const(sv));
3250 SvPOK_on(sv);
3251 *d = '\0';
3252 /* See Note on sizing above. */
3253 sv_utf8_upgrade_flags_grow(
3254 sv,
3255 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3256 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3257 d = SvPVX(sv) + SvCUR(sv);
3258 has_utf8 = TRUE;
3259 }
3260
3261 /* Add the string to the output */
3262 if (UNI_IS_INVARIANT(uv)) {
3263 *d++ = (char) uv;
3264 }
3265 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3266 }
3267 }
3268 else { /* Here is \N{NAME} but not \N{U+...}. */
3269
3270 SV *res; /* result from charnames */
3271 const char *str; /* the string in 'res' */
3272 STRLEN len; /* its length */
3273
3274 /* Get the value for NAME */
3275 res = newSVpvn(s, e - s);
3276 res = new_constant( NULL, 0, "charnames",
3277 /* includes all of: \N{...} */
3278 res, NULL, s - 3, e - s + 4 );
3279
3280 /* Most likely res will be in utf8 already since the
3281 * standard charnames uses pack U, but a custom translator
3282 * can leave it otherwise, so make sure. XXX This can be
3283 * revisited to not have charnames use utf8 for characters
3284 * that don't need it when regexes don't have to be in utf8
3285 * for Unicode semantics. If doing so, remember EBCDIC */
3286 sv_utf8_upgrade(res);
3287 str = SvPV_const(res, len);
3288
3289 /* Don't accept malformed input */
3290 if (! is_utf8_string((U8 *) str, len)) {
3291 yyerror("Malformed UTF-8 returned by \\N");
3292 }
3293 else if (PL_lex_inpat) {
3294
3295 if (! len) { /* The name resolved to an empty string */
3296 Copy("\\N{}", d, 4, char);
3297 d += 4;
3298 }
3299 else {
3300 /* In order to not lose information for the regex
3301 * compiler, pass the result in the specially made
3302 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3303 * the code points in hex of each character
3304 * returned by charnames */
3305
3306 const char *str_end = str + len;
3307 STRLEN char_length; /* cur char's byte length */
3308 STRLEN output_length; /* and the number of bytes
3309 after this is translated
3310 into hex digits */
3311 const STRLEN off = d - SvPVX_const(sv);
3312
3313 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3314 * max('U+', '.'); and 1 for NUL */
3315 char hex_string[2 * UTF8_MAXBYTES + 5];
3316
3317 /* Get the first character of the result. */
3318 U32 uv = utf8n_to_uvuni((U8 *) str,
3319 len,
3320 &char_length,
3321 UTF8_ALLOW_ANYUV);
3322
3323 /* The call to is_utf8_string() above hopefully
3324 * guarantees that there won't be an error. But
3325 * it's easy here to make sure. The function just
3326 * above warns and returns 0 if invalid utf8, but
3327 * it can also return 0 if the input is validly a
3328 * NUL. Disambiguate */
3329 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3330 uv = UNICODE_REPLACEMENT;
3331 }
3332
3333 /* Convert first code point to hex, including the
e2a7e165
KW
3334 * boiler plate before it. For all these, we
3335 * convert to native format so that downstream code
3336 * can continue to assume the input is native */
78c35590 3337 output_length =
3353de27 3338 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3339 "\\N{U+%X",
3340 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3341
3342 /* Make sure there is enough space to hold it */
3343 d = off + SvGROW(sv, off
3344 + output_length
3345 + (STRLEN)(send - e)
3346 + 2); /* '}' + NUL */
3347 /* And output it */
3348 Copy(hex_string, d, output_length, char);
3349 d += output_length;
3350
3351 /* For each subsequent character, append dot and
3352 * its ordinal in hex */
3353 while ((str += char_length) < str_end) {
3354 const STRLEN off = d - SvPVX_const(sv);
3355 U32 uv = utf8n_to_uvuni((U8 *) str,
3356 str_end - str,
3357 &char_length,
3358 UTF8_ALLOW_ANYUV);
3359 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3360 uv = UNICODE_REPLACEMENT;
3361 }
3362
78c35590 3363 output_length =
3353de27 3364 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3365 ".%X",
3366 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3367
3368 d = off + SvGROW(sv, off
3369 + output_length
3370 + (STRLEN)(send - e)
3371 + 2); /* '}' + NUL */
3372 Copy(hex_string, d, output_length, char);
3373 d += output_length;
3374 }
3375
3376 *d++ = '}'; /* Done. Add the trailing brace */
3377 }
3378 }
3379 else { /* Here, not in a pattern. Convert the name to a
3380 * string. */
3381
3382 /* If destination is not in utf8, unconditionally
3383 * recode it to be so. This is because \N{} implies
3384 * Unicode semantics, and scalars have to be in utf8
3385 * to guarantee those semantics */
3386 if (! has_utf8) {
3387 SvCUR_set(sv, d - SvPVX_const(sv));
3388 SvPOK_on(sv);
3389 *d = '\0';
3390 /* See Note on sizing above. */
3391 sv_utf8_upgrade_flags_grow(sv,
3392 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3393 len + (STRLEN)(send - s) + 1);
3394 d = SvPVX(sv) + SvCUR(sv);
3395 has_utf8 = TRUE;
3396 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3397
3398 /* See Note on sizing above. (NOTE: SvCUR() is not
3399 * set correctly here). */
3400 const STRLEN off = d - SvPVX_const(sv);
3401 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3402 }
3403 Copy(str, d, len, char);
3404 d += len;
423cee85 3405 }
423cee85 3406 SvREFCNT_dec(res);
cb233ae3
KW
3407
3408 /* Deprecate non-approved name syntax */
3409 if (ckWARN_d(WARN_DEPRECATED)) {
3410 bool problematic = FALSE;
3411 char* i = s;
3412
3413 /* For non-ut8 input, look to see that the first
3414 * character is an alpha, then loop through the rest
3415 * checking that each is a continuation */
3416 if (! this_utf8) {
3417 if (! isALPHAU(*i)) problematic = TRUE;
3418 else for (i = s + 1; i < e; i++) {
3419 if (isCHARNAME_CONT(*i)) continue;
3420 problematic = TRUE;
3421 break;
3422 }
3423 }
3424 else {
3425 /* Similarly for utf8. For invariants can check
3426 * directly. We accept anything above the latin1
3427 * range because it is immaterial to Perl if it is
3428 * correct or not, and is expensive to check. But
3429 * it is fairly easy in the latin1 range to convert
3430 * the variants into a single character and check
3431 * those */
3432 if (UTF8_IS_INVARIANT(*i)) {
3433 if (! isALPHAU(*i)) problematic = TRUE;
3434 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3435 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3436 *(i+1)))))
3437 {
3438 problematic = TRUE;
3439 }
3440 }
3441 if (! problematic) for (i = s + UTF8SKIP(s);
3442 i < e;
3443 i+= UTF8SKIP(i))
3444 {
3445 if (UTF8_IS_INVARIANT(*i)) {
3446 if (isCHARNAME_CONT(*i)) continue;
3447 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3448 continue;
3449 } else if (isCHARNAME_CONT(
3450 UNI_TO_NATIVE(
81c14aa2 3451 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3452 {
3453 continue;
3454 }
3455 problematic = TRUE;
3456 break;
3457 }
3458 }
3459 if (problematic) {
6e1bad6c
KW
3460 /* The e-i passed to the final %.*s makes sure that
3461 * should the trailing NUL be missing that this
3462 * print won't run off the end of the string */
cb233ae3 3463 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3464 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3465 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3466 }
3467 }
3468 } /* End \N{NAME} */
ff3f963a
KW
3469#ifdef EBCDIC
3470 if (!dorange)
3471 native_range = FALSE; /* \N{} is defined to be Unicode */
3472#endif
3473 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3474 continue;
3475
02aa26ce 3476 /* \c is a control character */
79072805
LW
3477 case 'c':
3478 s++;
961ce445 3479 if (s < send) {
17a3df4c 3480 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3481 }
961ce445
RGS
3482 else {
3483 yyerror("Missing control char name in \\c");
3484 }
79072805 3485 continue;
02aa26ce
NT
3486
3487 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3488 case 'b':
db42d148 3489 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3490 break;
3491 case 'n':
db42d148 3492 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3493 break;
3494 case 'r':
db42d148 3495 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3496 break;
3497 case 'f':
db42d148 3498 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3499 break;
3500 case 't':
db42d148 3501 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3502 break;
34a3fe2a 3503 case 'e':
db42d148 3504 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3505 break;
3506 case 'a':
db42d148 3507 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3508 break;
02aa26ce
NT
3509 } /* end switch */
3510
79072805
LW
3511 s++;
3512 continue;
02aa26ce 3513 } /* end if (backslash) */
4c3a8340
TS
3514#ifdef EBCDIC
3515 else
3516 literal_endpoint++;
3517#endif
02aa26ce 3518
f9a63242 3519 default_action:
77a135fe
KW
3520 /* If we started with encoded form, or already know we want it,
3521 then encode the next character */
3522 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3523 STRLEN len = 1;
77a135fe
KW
3524
3525
3526 /* One might think that it is wasted effort in the case of the
3527 * source being utf8 (this_utf8 == TRUE) to take the next character
3528 * in the source, convert it to an unsigned value, and then convert
3529 * it back again. But the source has not been validated here. The
3530 * routine that does the conversion checks for errors like
3531 * malformed utf8 */
3532
5f66b61c
AL
3533 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3534 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3535 if (!has_utf8) {
3536 SvCUR_set(sv, d - SvPVX_const(sv));
3537 SvPOK_on(sv);
3538 *d = '\0';
77a135fe 3539 /* See Note on sizing above. */
7bf79863
KW
3540 sv_utf8_upgrade_flags_grow(sv,
3541 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3542 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3543 d = SvPVX(sv) + SvCUR(sv);
3544 has_utf8 = TRUE;
3545 } else if (need > len) {
3546 /* encoded value larger than old, may need extra space (NOTE:
3547 * SvCUR() is not set correctly here). See Note on sizing
3548 * above. */
9d4ba2ae 3549 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3550 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3551 }
77a135fe
KW
3552 s += len;
3553
5f66b61c 3554 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3555#ifdef EBCDIC
3556 if (uv > 255 && !dorange)
3557 native_range = FALSE;
3558#endif
2b9d42f0
NIS
3559 }
3560 else {
3561 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3562 }
02aa26ce
NT
3563 } /* while loop to process each character */
3564
3565 /* terminate the string and set up the sv */
79072805 3566 *d = '\0';
95a20fc0 3567 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3568 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3569 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3570 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3571
79072805 3572 SvPOK_on(sv);
9f4817db 3573 if (PL_encoding && !has_utf8) {
d0063567
DK
3574 sv_recode_to_utf8(sv, PL_encoding);
3575 if (SvUTF8(sv))
3576 has_utf8 = TRUE;
9f4817db 3577 }
2b9d42f0 3578 if (has_utf8) {
7e2040f0 3579 SvUTF8_on(sv);
2b9d42f0 3580 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3581 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3582 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3583 }
3584 }
79072805 3585
02aa26ce 3586 /* shrink the sv if we allocated more than we used */
79072805 3587 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3588 SvPV_shrink_to_cur(sv);
79072805 3589 }
02aa26ce 3590
6154021b 3591 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3592 if (s > PL_bufptr) {
eb0d8d16
NC
3593 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3594 const char *const key = PL_lex_inpat ? "qr" : "q";
3595 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3596 const char *type;
3597 STRLEN typelen;
3598
3599 if (PL_lex_inwhat == OP_TRANS) {
3600 type = "tr";
3601 typelen = 2;
3602 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3603 type = "s";
3604 typelen = 1;
9da1dd8f
DM
3605 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3606 type = "q";
3607 typelen = 1;
eb0d8d16
NC
3608 } else {
3609 type = "qq";
3610 typelen = 2;
3611 }
3612
3613 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3614 type, typelen);
3615 }
6154021b 3616 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3617 } else
8990e307 3618 SvREFCNT_dec(sv);
79072805
LW
3619 return s;
3620}
3621
ffb4593c
NT
3622/* S_intuit_more
3623 * Returns TRUE if there's more to the expression (e.g., a subscript),
3624 * FALSE otherwise.
ffb4593c
NT
3625 *
3626 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3627 *
3628 * ->[ and ->{ return TRUE
3629 * { and [ outside a pattern are always subscripts, so return TRUE
3630 * if we're outside a pattern and it's not { or [, then return FALSE
3631 * if we're in a pattern and the first char is a {
3632 * {4,5} (any digits around the comma) returns FALSE
3633 * if we're in a pattern and the first char is a [
3634 * [] returns FALSE
3635 * [SOMETHING] has a funky algorithm to decide whether it's a
3636 * character class or not. It has to deal with things like
3637 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3638 * anything else returns TRUE
3639 */
3640
9cbb5ea2
GS
3641/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3642
76e3520e 3643STATIC int
cea2e8a9 3644S_intuit_more(pTHX_ register char *s)
79072805 3645{
97aff369 3646 dVAR;
7918f24d
NC
3647
3648 PERL_ARGS_ASSERT_INTUIT_MORE;
3649
3280af22 3650 if (PL_lex_brackets)
79072805
LW
3651 return TRUE;
3652 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3653 return TRUE;
3654 if (*s != '{' && *s != '[')
3655 return FALSE;
3280af22 3656 if (!PL_lex_inpat)
79072805
LW
3657 return TRUE;
3658
3659 /* In a pattern, so maybe we have {n,m}. */
3660 if (*s == '{') {
b3155d95 3661 if (regcurly(s)) {
79072805 3662 return FALSE;
b3155d95 3663 }
79072805 3664 return TRUE;
79072805
LW
3665 }
3666
3667 /* On the other hand, maybe we have a character class */
3668
3669 s++;
3670 if (*s == ']' || *s == '^')
3671 return FALSE;
3672 else {
ffb4593c 3673 /* this is terrifying, and it works */
79072805
LW
3674 int weight = 2; /* let's weigh the evidence */
3675 char seen[256];
f27ffc4a 3676 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3677 const char * const send = strchr(s,']');
3280af22 3678 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3679
3680 if (!send) /* has to be an expression */
3681 return TRUE;
3682
3683 Zero(seen,256,char);
3684 if (*s == '$')
3685 weight -= 3;
3686 else if (isDIGIT(*s)) {
3687 if (s[1] != ']') {
3688 if (isDIGIT(s[1]) && s[2] == ']')
3689 weight -= 10;
3690 }
3691 else
3692 weight -= 100;
3693 }
3694 for (; s < send; s++) {
3695 last_un_char = un_char;
3696 un_char = (unsigned char)*s;
3697 switch (*s) {
3698 case '@':
3699 case '&':
3700 case '$':
3701 weight -= seen[un_char] * 10;
7e2040f0 3702 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3703 int len;
8903cb82 3704 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3705 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3706 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3707 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3708 weight -= 100;
3709 else
3710 weight -= 10;
3711 }
3712 else if (*s == '$' && s[1] &&
93a17b20
LW
3713 strchr("[#!%*<>()-=",s[1])) {
3714 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3715 weight -= 10;
3716 else
3717 weight -= 1;
3718 }
3719 break;
3720 case '\\':
3721 un_char = 254;
3722 if (s[1]) {
93a17b20 3723 if (strchr("wds]",s[1]))
79072805 3724 weight += 100;
10edeb5d 3725 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3726 weight += 1;
93a17b20 3727 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3728 weight += 40;
3729 else if (isDIGIT(s[1])) {
3730 weight += 40;
3731 while (s[1] && isDIGIT(s[1]))
3732 s++;
3733 }
3734 }
3735 else
3736 weight += 100;
3737 break;
3738 case '-':
3739 if (s[1] == '\\')
3740 weight += 50;
93a17b20 3741 if (strchr("aA01! ",last_un_char))
79072805 3742 weight += 30;
93a17b20 3743 if (strchr("zZ79~",s[1]))
79072805 3744 weight += 30;
f27ffc4a
GS
3745 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3746 weight -= 5; /* cope with negative subscript */
79072805
LW
3747 break;
3748 default:
3792a11b
NC
3749 if (!isALNUM(last_un_char)
3750 && !(last_un_char == '$' || last_un_char == '@'
3751 || last_un_char == '&')
3752 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW