This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8n_to_uvuni() pod: Add clarifications
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
69#define PL_preambled (PL_parser->preambled)
70#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 71#define PL_linestr (PL_parser->linestr)
c2598295
DM
72#define PL_expect (PL_parser->expect)
73#define PL_copline (PL_parser->copline)
f06b5848
DM
74#define PL_bufptr (PL_parser->bufptr)
75#define PL_oldbufptr (PL_parser->oldbufptr)
76#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
77#define PL_linestart (PL_parser->linestart)
78#define PL_bufend (PL_parser->bufend)
79#define PL_last_uni (PL_parser->last_uni)
80#define PL_last_lop (PL_parser->last_lop)
81#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 82#define PL_lex_state (PL_parser->lex_state)
2f9285f8 83#define PL_rsfp (PL_parser->rsfp)
5486870f 84#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
85#define PL_in_my (PL_parser->in_my)
86#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 87#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 88#define PL_multi_end (PL_parser->multi_end)
13765c85 89#define PL_error_count (PL_parser->error_count)
199e78b7
DM
90
91#ifdef PERL_MAD
92# define PL_endwhite (PL_parser->endwhite)
93# define PL_faketokens (PL_parser->faketokens)
94# define PL_lasttoke (PL_parser->lasttoke)
95# define PL_nextwhite (PL_parser->nextwhite)
96# define PL_realtokenstart (PL_parser->realtokenstart)
97# define PL_skipwhite (PL_parser->skipwhite)
98# define PL_thisclose (PL_parser->thisclose)
99# define PL_thismad (PL_parser->thismad)
100# define PL_thisopen (PL_parser->thisopen)
101# define PL_thisstuff (PL_parser->thisstuff)
102# define PL_thistoken (PL_parser->thistoken)
103# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
104# define PL_thiswhite (PL_parser->thiswhite)
105# define PL_nexttoke (PL_parser->nexttoke)
106# define PL_curforce (PL_parser->curforce)
107#else
108# define PL_nexttoke (PL_parser->nexttoke)
109# define PL_nexttype (PL_parser->nexttype)
110# define PL_nextval (PL_parser->nextval)
199e78b7
DM
111#endif
112
0bd48802 113static const char ident_too_long[] = "Identifier too long";
8903cb82 114
29595ff2 115#ifdef PERL_MAD
29595ff2 116# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 117# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 118#else
5db06880 119# define CURMAD(slot,sv)
9ded7720 120# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
121#endif
122
a7aaec61
Z
123#define XENUMMASK 0x3f
124#define XFAKEEOF 0x40
125#define XFAKEBRACK 0x80
9059aa12 126
39e02b42
JH
127#ifdef USE_UTF8_SCRIPTS
128# define UTF (!IN_BYTES)
2b9d42f0 129#else
802a15e9 130# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 131#endif
a0ed51b3 132
b1fc3636
CJ
133/* The maximum number of characters preceding the unrecognized one to display */
134#define UNRECOGNIZED_PRECEDE_COUNT 10
135
61f0cdd9 136/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
137 * 1999-02-27 mjd-perl-patch@plover.com */
138#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
139
bf4acbe4 140#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 141
ffb4593c
NT
142/* LEX_* are values for PL_lex_state, the state of the lexer.
143 * They are arranged oddly so that the guard on the switch statement
79072805 144 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
145 *
146 * These values refer to the various states within a sublex parse,
147 * i.e. within a double quotish string
79072805
LW
148 */
149
fb73857a 150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805
LW
185#ifdef ff_next
186#undef ff_next
d48672a2
LW
187#endif
188
79072805 189#include "keywords.h"
fe14fcc3 190
ffb4593c
NT
191/* CLINE is a macro that ensures PL_copline has a sane value */
192
ae986130
LW
193#ifdef CLINE
194#undef CLINE
195#endif
57843af0 196#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 197
5db06880 198#ifdef PERL_MAD
29595ff2
NC
199# define SKIPSPACE0(s) skipspace0(s)
200# define SKIPSPACE1(s) skipspace1(s)
201# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
202# define PEEKSPACE(s) skipspace2(s,0)
203#else
204# define SKIPSPACE0(s) skipspace(s)
205# define SKIPSPACE1(s) skipspace(s)
206# define SKIPSPACE2(s,tsv) skipspace(s)
207# define PEEKSPACE(s) skipspace(s)
208#endif
209
ffb4593c
NT
210/*
211 * Convenience functions to return different tokens and prime the
9cbb5ea2 212 * lexer for the next token. They all take an argument.
ffb4593c
NT
213 *
214 * TOKEN : generic token (used for '(', DOLSHARP, etc)
215 * OPERATOR : generic operator
216 * AOPERATOR : assignment operator
217 * PREBLOCK : beginning the block after an if, while, foreach, ...
218 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
219 * PREREF : *EXPR where EXPR is not a simple identifier
220 * TERM : expression term
221 * LOOPX : loop exiting command (goto, last, dump, etc)
222 * FTST : file test operator
223 * FUN0 : zero-argument function
7eb971ee 224 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 225 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
226 * BOop : bitwise or or xor
227 * BAop : bitwise and
228 * SHop : shift operator
229 * PWop : power operator
9cbb5ea2 230 * PMop : pattern-matching operator
ffb4593c
NT
231 * Aop : addition-level operator
232 * Mop : multiplication-level operator
233 * Eop : equality-testing operator
e5edeb50 234 * Rop : relational operator <= != gt
ffb4593c
NT
235 *
236 * Also see LOP and lop() below.
237 */
238
998054bd 239#ifdef DEBUGGING /* Serve -DT. */
704d4215 240# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 241#else
bbf60fe6 242# define REPORT(retval) (retval)
998054bd
SC
243#endif
244
bbf60fe6
DM
245#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
246#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
247#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
248#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
249#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
250#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
251#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
252#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
253#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
254#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 255#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
256#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
257#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
258#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
259#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
260#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
261#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
262#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
263#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
264#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
265#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 266
a687059c
LW
267/* This bit of chicanery makes a unary function followed by
268 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
269 * The UNIDOR macro is for unary functions that can be followed by the //
270 * operator (such as C<shift // 0>).
a687059c 271 */
d68ce4ac 272#define UNI3(f,x,have_x) { \
6154021b 273 pl_yylval.ival = f; \
d68ce4ac 274 if (have_x) PL_expect = x; \
376fcdbf
AL
275 PL_bufptr = s; \
276 PL_last_uni = PL_oldbufptr; \
277 PL_last_lop_op = f; \
278 if (*s == '(') \
279 return REPORT( (int)FUNC1 ); \
29595ff2 280 s = PEEKSPACE(s); \
376fcdbf
AL
281 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
282 }
d68ce4ac
FC
283#define UNI(f) UNI3(f,XTERM,1)
284#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
285#define UNIPROTO(f,optional) { \
286 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
287 OPERATOR(f); \
288 }
a687059c 289
d68ce4ac 290#define UNIBRACK(f) UNI3(f,0,0)
79072805 291
9f68db38 292/* grandfather return to old style */
78cdf107
Z
293#define OLDLOP(f) \
294 do { \
295 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
296 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
297 pl_yylval.ival = (f); \
298 PL_expect = XTERM; \
299 PL_bufptr = s; \
300 return (int)LSTOP; \
301 } while(0)
79072805 302
83944c01
FC
303#define COPLINE_INC_WITH_HERELINES \
304 STMT_START { \
305 CopLINE_inc(PL_curcop); \
d794b522
FC
306 if (PL_parser->lex_shared->herelines) \
307 CopLINE(PL_curcop) += PL_parser->lex_shared->herelines, \
308 PL_parser->lex_shared->herelines = 0; \
83944c01
FC
309 } STMT_END
310
311
8fa7f367
JH
312#ifdef DEBUGGING
313
6154021b 314/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
315enum token_type {
316 TOKENTYPE_NONE,
317 TOKENTYPE_IVAL,
6154021b 318 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 319 TOKENTYPE_PVAL,
aeaef349 320 TOKENTYPE_OPVAL
bbf60fe6
DM
321};
322
6d4a66ac
NC
323static struct debug_tokens {
324 const int token;
325 enum token_type type;
326 const char *name;
327} const debug_tokens[] =
9041c2e3 328{
bbf60fe6
DM
329 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
330 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
331 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
332 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
333 { ARROW, TOKENTYPE_NONE, "ARROW" },
334 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
335 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
336 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
337 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
338 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 339 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
340 { DO, TOKENTYPE_NONE, "DO" },
341 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
342 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
343 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
344 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
345 { ELSE, TOKENTYPE_NONE, "ELSE" },
346 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
347 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
348 { FOR, TOKENTYPE_IVAL, "FOR" },
349 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
350 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
351 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
352 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
353 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 354 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
355 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
356 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
357 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 358 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
359 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
360 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 361 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
362 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
363 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
364 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
365 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
366 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
367 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
368 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
369 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
370 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
371 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
372 { OROP, TOKENTYPE_IVAL, "OROP" },
373 { OROR, TOKENTYPE_NONE, "OROR" },
374 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 375 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
376 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
377 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
378 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
379 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
380 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
381 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
382 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
383 { PREINC, TOKENTYPE_NONE, "PREINC" },
384 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 385 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
386 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
387 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 388 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
389 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
390 { SUB, TOKENTYPE_NONE, "SUB" },
391 { THING, TOKENTYPE_OPVAL, "THING" },
392 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
393 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
394 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
395 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
396 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
397 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 398 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
399 { WHILE, TOKENTYPE_IVAL, "WHILE" },
400 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 401 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 402 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
403};
404
6154021b 405/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 406
bbf60fe6 407STATIC int
704d4215 408S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 409{
97aff369 410 dVAR;
7918f24d
NC
411
412 PERL_ARGS_ASSERT_TOKEREPORT;
413
bbf60fe6 414 if (DEBUG_T_TEST) {
bd61b366 415 const char *name = NULL;
bbf60fe6 416 enum token_type type = TOKENTYPE_NONE;
f54cb97a 417 const struct debug_tokens *p;
396482e1 418 SV* const report = newSVpvs("<== ");
bbf60fe6 419
f54cb97a 420 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
421 if (p->token == (int)rv) {
422 name = p->name;
423 type = p->type;
424 break;
425 }
426 }
427 if (name)
54667de8 428 Perl_sv_catpv(aTHX_ report, name);
74736ae6 429 else if ((char)rv > ' ' && (char)rv <= '~')
bbf60fe6
DM
430 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
431 else if (!rv)
396482e1 432 sv_catpvs(report, "EOF");
bbf60fe6
DM
433 else
434 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
435 switch (type) {
436 case TOKENTYPE_NONE:
bbf60fe6
DM
437 break;
438 case TOKENTYPE_IVAL:
704d4215 439 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_OPNUM:
442 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 443 PL_op_name[lvalp->ival]);
bbf60fe6
DM
444 break;
445 case TOKENTYPE_PVAL:
704d4215 446 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_OPVAL:
704d4215 449 if (lvalp->opval) {
401441c0 450 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
451 PL_op_name[lvalp->opval->op_type]);
452 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 453 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 454 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
455 }
456
457 }
401441c0 458 else
396482e1 459 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
460 break;
461 }
b6007c36 462 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
463 };
464 return (int)rv;
998054bd
SC
465}
466
b6007c36
DM
467
468/* print the buffer with suitable escapes */
469
470STATIC void
15f169a1 471S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 472{
396482e1 473 SV* const tmp = newSVpvs("");
7918f24d
NC
474
475 PERL_ARGS_ASSERT_PRINTBUF;
476
b6007c36
DM
477 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
478 SvREFCNT_dec(tmp);
479}
480
8fa7f367
JH
481#endif
482
8290c323
NC
483static int
484S_deprecate_commaless_var_list(pTHX) {
485 PL_expect = XTERM;
486 deprecate("comma-less variable list");
487 return REPORT(','); /* grandfather non-comma-format format */
488}
489
ffb4593c
NT
490/*
491 * S_ao
492 *
c963b151
BD
493 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
494 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
495 */
496
76e3520e 497STATIC int
cea2e8a9 498S_ao(pTHX_ int toketype)
a0d0e21e 499{
97aff369 500 dVAR;
3280af22
NIS
501 if (*PL_bufptr == '=') {
502 PL_bufptr++;
a0d0e21e 503 if (toketype == ANDAND)
6154021b 504 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 505 else if (toketype == OROR)
6154021b 506 pl_yylval.ival = OP_ORASSIGN;
c963b151 507 else if (toketype == DORDOR)
6154021b 508 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
509 toketype = ASSIGNOP;
510 }
511 return toketype;
512}
513
ffb4593c
NT
514/*
515 * S_no_op
516 * When Perl expects an operator and finds something else, no_op
517 * prints the warning. It always prints "<something> found where
518 * operator expected. It prints "Missing semicolon on previous line?"
519 * if the surprise occurs at the start of the line. "do you need to
520 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
521 * where the compiler doesn't know if foo is a method call or a function.
522 * It prints "Missing operator before end of line" if there's nothing
523 * after the missing operator, or "... before <...>" if there is something
524 * after the missing operator.
525 */
526
76e3520e 527STATIC void
15f169a1 528S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 529{
97aff369 530 dVAR;
9d4ba2ae
AL
531 char * const oldbp = PL_bufptr;
532 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 533
7918f24d
NC
534 PERL_ARGS_ASSERT_NO_OP;
535
1189a94a
GS
536 if (!s)
537 s = oldbp;
07c798fb 538 else
1189a94a 539 PL_bufptr = s;
734ab321 540 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
541 if (ckWARN_d(WARN_SYNTAX)) {
542 if (is_first)
543 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
544 "\t(Missing semicolon on previous line?)\n");
545 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 546 const char *t;
734ab321
BF
547 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
548 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 549 NOOP;
56da5a46
RGS
550 if (t < PL_bufptr && isSPACE(*t))
551 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
552 "\t(Do you need to predeclare %"SVf"?)\n",
553 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
554 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
555 }
556 else {
557 assert(s >= oldbp);
558 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
559 "\t(Missing operator before %"SVf"?)\n",
560 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
561 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 562 }
07c798fb 563 }
3280af22 564 PL_bufptr = oldbp;
8990e307
LW
565}
566
ffb4593c
NT
567/*
568 * S_missingterm
569 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 570 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
571 * If we're in a delimited string and the delimiter is a control
572 * character, it's reformatted into a two-char sequence like ^C.
573 * This is fatal.
574 */
575
76e3520e 576STATIC void
cea2e8a9 577S_missingterm(pTHX_ char *s)
8990e307 578{
97aff369 579 dVAR;
8990e307
LW
580 char tmpbuf[3];
581 char q;
582 if (s) {
9d4ba2ae 583 char * const nl = strrchr(s,'\n');
d2719217 584 if (nl)
8990e307
LW
585 *nl = '\0';
586 }
463559e7 587 else if (isCNTRL(PL_multi_close)) {
8990e307 588 *tmpbuf = '^';
585ec06d 589 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
590 tmpbuf[2] = '\0';
591 s = tmpbuf;
592 }
593 else {
eb160463 594 *tmpbuf = (char)PL_multi_close;
8990e307
LW
595 tmpbuf[1] = '\0';
596 s = tmpbuf;
597 }
598 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 599 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 600}
79072805 601
dd0ac2b9
FC
602#include "feature.h"
603
0d863452 604/*
0d863452
RH
605 * Check whether the named feature is enabled.
606 */
26ea9e12 607bool
3fff3427 608Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 609{
97aff369 610 dVAR;
4a731d7b 611 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
612
613 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
614
615 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 616
26ea9e12
NC
617 if (namelen > MAX_FEATURE_LEN)
618 return FALSE;
3fff3427 619 memcpy(&he_name[8], name, namelen);
7d69d4a6 620
c8ca97b0
NC
621 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
622 REFCOUNTED_HE_EXISTS));
0d863452
RH
623}
624
ffb4593c 625/*
9cbb5ea2
GS
626 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
627 * utf16-to-utf8-reversed.
ffb4593c
NT
628 */
629
c39cd008
GS
630#ifdef PERL_CR_FILTER
631static void
632strip_return(SV *sv)
633{
eb578fdb
KW
634 const char *s = SvPVX_const(sv);
635 const char * const e = s + SvCUR(sv);
7918f24d
NC
636
637 PERL_ARGS_ASSERT_STRIP_RETURN;
638
c39cd008
GS
639 /* outer loop optimized to do nothing if there are no CR-LFs */
640 while (s < e) {
641 if (*s++ == '\r' && *s == '\n') {
642 /* hit a CR-LF, need to copy the rest */
eb578fdb 643 char *d = s - 1;
c39cd008
GS
644 *d++ = *s++;
645 while (s < e) {
646 if (*s == '\r' && s[1] == '\n')
647 s++;
648 *d++ = *s++;
649 }
650 SvCUR(sv) -= s - d;
651 return;
652 }
653 }
654}
a868473f 655
76e3520e 656STATIC I32
c39cd008 657S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 658{
f54cb97a 659 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
660 if (count > 0 && !maxlen)
661 strip_return(sv);
662 return count;
a868473f
NIS
663}
664#endif
665
ffb4593c 666/*
8eaa0acf
Z
667=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
668
669Creates and initialises a new lexer/parser state object, supplying
670a context in which to lex and parse from a new source of Perl code.
671A pointer to the new state object is placed in L</PL_parser>. An entry
672is made on the save stack so that upon unwinding the new state object
673will be destroyed and the former value of L</PL_parser> will be restored.
674Nothing else need be done to clean up the parsing context.
675
676The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
677non-null, provides a string (in SV form) containing code to be parsed.
678A copy of the string is made, so subsequent modification of I<line>
679does not affect parsing. I<rsfp>, if non-null, provides an input stream
680from which code will be read to be parsed. If both are non-null, the
681code in I<line> comes first and must consist of complete lines of input,
682and I<rsfp> supplies the remainder of the source.
683
e368b3bd
FC
684The I<flags> parameter is reserved for future use. Currently it is only
685used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
686
687=cut
688*/
ffb4593c 689
27fcb6ee 690/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
691 can share filters with the current parser.
692 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
693 caller, hence isn't owned by the parser, so shouldn't be closed on parser
694 destruction. This is used to handle the case of defaulting to reading the
695 script from the standard input because no filename was given on the command
696 line (without getting confused by situation where STDIN has been closed, so
697 the script handle is opened on fd 0) */
27fcb6ee 698
a0d0e21e 699void
8eaa0acf 700Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 701{
97aff369 702 dVAR;
6ef55633 703 const char *s = NULL;
5486870f 704 yy_parser *parser, *oparser;
60d63348 705 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 706 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
707
708 /* create and initialise a parser */
709
199e78b7 710 Newxz(parser, 1, yy_parser);
5486870f 711 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
712 PL_parser = parser;
713
28ac2b49
Z
714 parser->stack = NULL;
715 parser->ps = NULL;
716 parser->stack_size = 0;
acdf0a21 717
e3abe207
DM
718 /* on scope exit, free this parser and restore any outer one */
719 SAVEPARSER(parser);
7c4baf47 720 parser->saved_curcop = PL_curcop;
e3abe207 721
acdf0a21 722 /* initialise lexer state */
8990e307 723
fb205e7a
DM
724#ifdef PERL_MAD
725 parser->curforce = -1;
726#else
727 parser->nexttoke = 0;
728#endif
ca4cfd28 729 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 730 parser->copline = NOLINE;
5afb0a62 731 parser->lex_state = LEX_NORMAL;
c2598295 732 parser->expect = XSTATE;
2f9285f8 733 parser->rsfp = rsfp;
27fcb6ee
FC
734 parser->rsfp_filters =
735 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
736 ? NULL
737 : MUTABLE_AV(SvREFCNT_inc(
738 oparser->rsfp_filters
739 ? oparser->rsfp_filters
740 : (oparser->rsfp_filters = newAV())
741 ));
2f9285f8 742
199e78b7
DM
743 Newx(parser->lex_brackstack, 120, char);
744 Newx(parser->lex_casestack, 12, char);
745 *parser->lex_casestack = '\0';
d794b522 746 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 747
10efb74f 748 if (line) {
0528fd32 749 STRLEN len;
10efb74f 750 s = SvPV_const(line, len);
0abcdfa4
FC
751 parser->linestr = flags & LEX_START_COPIED
752 ? SvREFCNT_inc_simple_NN(line)
753 : newSVpvn_flags(s, len, SvUTF8(line));
11076590 754 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
755 } else {
756 parser->linestr = newSVpvs("\n;");
8990e307 757 }
f06b5848
DM
758 parser->oldoldbufptr =
759 parser->oldbufptr =
760 parser->bufptr =
761 parser->linestart = SvPVX(parser->linestr);
762 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
763 parser->last_lop = parser->last_uni = NULL;
87606032
NC
764 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
765 |LEX_DONT_CLOSE_RSFP);
737c24fc 766
60d63348 767 parser->in_pod = parser->filtered = 0;
79072805 768}
a687059c 769
e3abe207
DM
770
771/* delete a parser object */
772
773void
774Perl_parser_free(pTHX_ const yy_parser *parser)
775{
3ce3dcd9
FC
776#ifdef PERL_MAD
777 I32 nexttoke = parser->lasttoke;
778#else
779 I32 nexttoke = parser->nexttoke;
780#endif
781
7918f24d
NC
782 PERL_ARGS_ASSERT_PARSER_FREE;
783
7c4baf47 784 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
785 SvREFCNT_dec(parser->linestr);
786
87606032 787 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 788 PerlIO_clearerr(parser->rsfp);
799361c3
SH
789 else if (parser->rsfp && (!parser->old_parser ||
790 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 791 PerlIO_close(parser->rsfp);
5486870f 792 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
793 SvREFCNT_dec(parser->lex_stuff);
794 SvREFCNT_dec(parser->sublex_info.repl);
3ce3dcd9
FC
795 while (nexttoke--) {
796#ifdef PERL_MAD
797 if (S_is_opval_token(parser->nexttoke[nexttoke].next_type
798 & 0xffff))
799 op_free(parser->nexttoke[nexttoke].next_val.opval);
800#else
801 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff))
802 op_free(parser->nextval[nexttoke].opval);
803#endif
804 }
2f9285f8 805
e3abe207
DM
806 Safefree(parser->lex_brackstack);
807 Safefree(parser->lex_casestack);
d794b522 808 Safefree(parser->lex_shared);
e3abe207
DM
809 PL_parser = parser->old_parser;
810 Safefree(parser);
811}
812
813
ffb4593c 814/*
f0e67a1d
Z
815=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
816
817Buffer scalar containing the chunk currently under consideration of the
818text currently being lexed. This is always a plain string scalar (for
819which C<SvPOK> is true). It is not intended to be used as a scalar by
820normal scalar means; instead refer to the buffer directly by the pointer
821variables described below.
822
823The lexer maintains various C<char*> pointers to things in the
824C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
825reallocated, all of these pointers must be updated. Don't attempt to
826do this manually, but rather use L</lex_grow_linestr> if you need to
827reallocate the buffer.
828
829The content of the text chunk in the buffer is commonly exactly one
830complete line of input, up to and including a newline terminator,
831but there are situations where it is otherwise. The octets of the
832buffer may be intended to be interpreted as either UTF-8 or Latin-1.
833The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
834flag on this scalar, which may disagree with it.
835
836For direct examination of the buffer, the variable
837L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
838lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
839of these pointers is usually preferable to examination of the scalar
840through normal scalar means.
841
842=for apidoc AmxU|char *|PL_parser-E<gt>bufend
843
844Direct pointer to the end of the chunk of text currently being lexed, the
845end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
846+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
847always located at the end of the buffer, and does not count as part of
848the buffer's contents.
849
850=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
851
852Points to the current position of lexing inside the lexer buffer.
853Characters around this point may be freely examined, within
854the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
855L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
856interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
857
858Lexing code (whether in the Perl core or not) moves this pointer past
859the characters that it consumes. It is also expected to perform some
860bookkeeping whenever a newline character is consumed. This movement
861can be more conveniently performed by the function L</lex_read_to>,
862which handles newlines appropriately.
863
864Interpretation of the buffer's octets can be abstracted out by
865using the slightly higher-level functions L</lex_peek_unichar> and
866L</lex_read_unichar>.
867
868=for apidoc AmxU|char *|PL_parser-E<gt>linestart
869
870Points to the start of the current line inside the lexer buffer.
871This is useful for indicating at which column an error occurred, and
872not much else. This must be updated by any lexing code that consumes
873a newline; the function L</lex_read_to> handles this detail.
874
875=cut
876*/
877
878/*
879=for apidoc Amx|bool|lex_bufutf8
880
881Indicates whether the octets in the lexer buffer
882(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
883of Unicode characters. If not, they should be interpreted as Latin-1
884characters. This is analogous to the C<SvUTF8> flag for scalars.
885
886In UTF-8 mode, it is not guaranteed that the lexer buffer actually
887contains valid UTF-8. Lexing code must be robust in the face of invalid
888encoding.
889
890The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
891is significant, but not the whole story regarding the input character
892encoding. Normally, when a file is being read, the scalar contains octets
893and its C<SvUTF8> flag is off, but the octets should be interpreted as
894UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
895however, the scalar may have the C<SvUTF8> flag on, and in this case its
896octets should be interpreted as UTF-8 unless the C<use bytes> pragma
897is in effect. This logic may change in the future; use this function
898instead of implementing the logic yourself.
899
900=cut
901*/
902
903bool
904Perl_lex_bufutf8(pTHX)
905{
906 return UTF;
907}
908
909/*
910=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
911
912Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
913at least I<len> octets (including terminating NUL). Returns a
914pointer to the reallocated buffer. This is necessary before making
915any direct modification of the buffer that would increase its length.
916L</lex_stuff_pvn> provides a more convenient way to insert text into
917the buffer.
918
919Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
920this function updates all of the lexer's variables that point directly
921into the buffer.
922
923=cut
924*/
925
926char *
927Perl_lex_grow_linestr(pTHX_ STRLEN len)
928{
929 SV *linestr;
930 char *buf;
931 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 932 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
933 linestr = PL_parser->linestr;
934 buf = SvPVX(linestr);
935 if (len <= SvLEN(linestr))
936 return buf;
937 bufend_pos = PL_parser->bufend - buf;
938 bufptr_pos = PL_parser->bufptr - buf;
939 oldbufptr_pos = PL_parser->oldbufptr - buf;
940 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
941 linestart_pos = PL_parser->linestart - buf;
942 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
943 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
944 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
945 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 946
f0e67a1d 947 buf = sv_grow(linestr, len);
c7641931 948
f0e67a1d
Z
949 PL_parser->bufend = buf + bufend_pos;
950 PL_parser->bufptr = buf + bufptr_pos;
951 PL_parser->oldbufptr = buf + oldbufptr_pos;
952 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
953 PL_parser->linestart = buf + linestart_pos;
954 if (PL_parser->last_uni)
955 PL_parser->last_uni = buf + last_uni_pos;
956 if (PL_parser->last_lop)
957 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
958 if (PL_parser->lex_shared->re_eval_start)
959 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
960 return buf;
961}
962
963/*
83aa740e 964=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
965
966Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
967immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
968reallocating the buffer if necessary. This means that lexing code that
969runs later will see the characters as if they had appeared in the input.
970It is not recommended to do this as part of normal parsing, and most
971uses of this facility run the risk of the inserted characters being
972interpreted in an unintended manner.
973
974The string to be inserted is represented by I<len> octets starting
975at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
976according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
977The characters are recoded for the lexer buffer, according to how the
978buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 979to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
980function is more convenient.
981
982=cut
983*/
984
985void
83aa740e 986Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 987{
749123ff 988 dVAR;
f0e67a1d
Z
989 char *bufptr;
990 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
991 if (flags & ~(LEX_STUFF_UTF8))
992 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
993 if (UTF) {
994 if (flags & LEX_STUFF_UTF8) {
995 goto plain_copy;
996 } else {
997 STRLEN highhalf = 0;
83aa740e 998 const char *p, *e = pv+len;
f0e67a1d
Z
999 for (p = pv; p != e; p++)
1000 highhalf += !!(((U8)*p) & 0x80);
1001 if (!highhalf)
1002 goto plain_copy;
1003 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1004 bufptr = PL_parser->bufptr;
1005 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1006 SvCUR_set(PL_parser->linestr,
1007 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1008 PL_parser->bufend += len+highhalf;
1009 for (p = pv; p != e; p++) {
1010 U8 c = (U8)*p;
1011 if (c & 0x80) {
1012 *bufptr++ = (char)(0xc0 | (c >> 6));
1013 *bufptr++ = (char)(0x80 | (c & 0x3f));
1014 } else {
1015 *bufptr++ = (char)c;
1016 }
1017 }
1018 }
1019 } else {
1020 if (flags & LEX_STUFF_UTF8) {
1021 STRLEN highhalf = 0;
83aa740e 1022 const char *p, *e = pv+len;
f0e67a1d
Z
1023 for (p = pv; p != e; p++) {
1024 U8 c = (U8)*p;
1025 if (c >= 0xc4) {
1026 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1027 "non-Latin-1 character into Latin-1 input");
1028 } else if (c >= 0xc2 && p+1 != e &&
1029 (((U8)p[1]) & 0xc0) == 0x80) {
1030 p++;
1031 highhalf++;
1032 } else if (c >= 0x80) {
1033 /* malformed UTF-8 */
1034 ENTER;
1035 SAVESPTR(PL_warnhook);
1036 PL_warnhook = PERL_WARNHOOK_FATAL;
1037 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1038 LEAVE;
1039 }
1040 }
1041 if (!highhalf)
1042 goto plain_copy;
1043 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1044 bufptr = PL_parser->bufptr;
1045 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1046 SvCUR_set(PL_parser->linestr,
1047 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1048 PL_parser->bufend += len-highhalf;
1049 for (p = pv; p != e; p++) {
1050 U8 c = (U8)*p;
1051 if (c & 0x80) {
1052 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1053 p++;
1054 } else {
1055 *bufptr++ = (char)c;
1056 }
1057 }
1058 } else {
1059 plain_copy:
1060 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1061 bufptr = PL_parser->bufptr;
1062 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1063 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1064 PL_parser->bufend += len;
1065 Copy(pv, bufptr, len, char);
1066 }
1067 }
1068}
1069
1070/*
9dcc53ea
Z
1071=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1072
1073Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1074immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1075reallocating the buffer if necessary. This means that lexing code that
1076runs later will see the characters as if they had appeared in the input.
1077It is not recommended to do this as part of normal parsing, and most
1078uses of this facility run the risk of the inserted characters being
1079interpreted in an unintended manner.
1080
1081The string to be inserted is represented by octets starting at I<pv>
1082and continuing to the first nul. These octets are interpreted as either
1083UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1084in I<flags>. The characters are recoded for the lexer buffer, according
1085to how the buffer is currently being interpreted (L</lex_bufutf8>).
1086If it is not convenient to nul-terminate a string to be inserted, the
1087L</lex_stuff_pvn> function is more appropriate.
1088
1089=cut
1090*/
1091
1092void
1093Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1094{
1095 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1096 lex_stuff_pvn(pv, strlen(pv), flags);
1097}
1098
1099/*
f0e67a1d
Z
1100=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1101
1102Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1103immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1104reallocating the buffer if necessary. This means that lexing code that
1105runs later will see the characters as if they had appeared in the input.
1106It is not recommended to do this as part of normal parsing, and most
1107uses of this facility run the risk of the inserted characters being
1108interpreted in an unintended manner.
1109
1110The string to be inserted is the string value of I<sv>. The characters
1111are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1112being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1113not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1114need to construct a scalar.
1115
1116=cut
1117*/
1118
1119void
1120Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1121{
1122 char *pv;
1123 STRLEN len;
1124 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1125 if (flags)
1126 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1127 pv = SvPV(sv, len);
1128 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1129}
1130
1131/*
1132=for apidoc Amx|void|lex_unstuff|char *ptr
1133
1134Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1135I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1136This hides the discarded text from any lexing code that runs later,
1137as if the text had never appeared.
1138
1139This is not the normal way to consume lexed text. For that, use
1140L</lex_read_to>.
1141
1142=cut
1143*/
1144
1145void
1146Perl_lex_unstuff(pTHX_ char *ptr)
1147{
1148 char *buf, *bufend;
1149 STRLEN unstuff_len;
1150 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1151 buf = PL_parser->bufptr;
1152 if (ptr < buf)
1153 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1154 if (ptr == buf)
1155 return;
1156 bufend = PL_parser->bufend;
1157 if (ptr > bufend)
1158 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1159 unstuff_len = ptr - buf;
1160 Move(ptr, buf, bufend+1-ptr, char);
1161 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1162 PL_parser->bufend = bufend - unstuff_len;
1163}
1164
1165/*
1166=for apidoc Amx|void|lex_read_to|char *ptr
1167
1168Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1169to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1170performing the correct bookkeeping whenever a newline character is passed.
1171This is the normal way to consume lexed text.
1172
1173Interpretation of the buffer's octets can be abstracted out by
1174using the slightly higher-level functions L</lex_peek_unichar> and
1175L</lex_read_unichar>.
1176
1177=cut
1178*/
1179
1180void
1181Perl_lex_read_to(pTHX_ char *ptr)
1182{
1183 char *s;
1184 PERL_ARGS_ASSERT_LEX_READ_TO;
1185 s = PL_parser->bufptr;
1186 if (ptr < s || ptr > PL_parser->bufend)
1187 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1188 for (; s != ptr; s++)
1189 if (*s == '\n') {
83944c01 1190 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1191 PL_parser->linestart = s+1;
1192 }
1193 PL_parser->bufptr = ptr;
1194}
1195
1196/*
1197=for apidoc Amx|void|lex_discard_to|char *ptr
1198
1199Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1200up to I<ptr>. The remaining content of the buffer will be moved, and
1201all pointers into the buffer updated appropriately. I<ptr> must not
1202be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1203it is not permitted to discard text that has yet to be lexed.
1204
1205Normally it is not necessarily to do this directly, because it suffices to
1206use the implicit discarding behaviour of L</lex_next_chunk> and things
1207based on it. However, if a token stretches across multiple lines,
1f317c95 1208and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1209that purpose, then after completion of the token it would be wise to
1210explicitly discard the now-unneeded earlier lines, to avoid future
1211multi-line tokens growing the buffer without bound.
1212
1213=cut
1214*/
1215
1216void
1217Perl_lex_discard_to(pTHX_ char *ptr)
1218{
1219 char *buf;
1220 STRLEN discard_len;
1221 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1222 buf = SvPVX(PL_parser->linestr);
1223 if (ptr < buf)
1224 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1225 if (ptr == buf)
1226 return;
1227 if (ptr > PL_parser->bufptr)
1228 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1229 discard_len = ptr - buf;
1230 if (PL_parser->oldbufptr < ptr)
1231 PL_parser->oldbufptr = ptr;
1232 if (PL_parser->oldoldbufptr < ptr)
1233 PL_parser->oldoldbufptr = ptr;
1234 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1235 PL_parser->last_uni = NULL;
1236 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1237 PL_parser->last_lop = NULL;
1238 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1239 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1240 PL_parser->bufend -= discard_len;
1241 PL_parser->bufptr -= discard_len;
1242 PL_parser->oldbufptr -= discard_len;
1243 PL_parser->oldoldbufptr -= discard_len;
1244 if (PL_parser->last_uni)
1245 PL_parser->last_uni -= discard_len;
1246 if (PL_parser->last_lop)
1247 PL_parser->last_lop -= discard_len;
1248}
1249
1250/*
1251=for apidoc Amx|bool|lex_next_chunk|U32 flags
1252
1253Reads in the next chunk of text to be lexed, appending it to
1254L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1255looked to the end of the current chunk and wants to know more. It is
1256usual, but not necessary, for lexing to have consumed the entirety of
1257the current chunk at this time.
1258
1259If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1260chunk (i.e., the current chunk has been entirely consumed), normally the
1261current chunk will be discarded at the same time that the new chunk is
1262read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1263will not be discarded. If the current chunk has not been entirely
1264consumed, then it will not be discarded regardless of the flag.
1265
1266Returns true if some new text was added to the buffer, or false if the
1267buffer has reached the end of the input text.
1268
1269=cut
1270*/
1271
1272#define LEX_FAKE_EOF 0x80000000
112d1284 1273#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1274
1275bool
1276Perl_lex_next_chunk(pTHX_ U32 flags)
1277{
1278 SV *linestr;
1279 char *buf;
1280 STRLEN old_bufend_pos, new_bufend_pos;
1281 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1282 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1283 bool got_some_for_debugger = 0;
f0e67a1d 1284 bool got_some;
112d1284 1285 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1286 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1287 linestr = PL_parser->linestr;
1288 buf = SvPVX(linestr);
1289 if (!(flags & LEX_KEEP_PREVIOUS) &&
1290 PL_parser->bufptr == PL_parser->bufend) {
1291 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1292 linestart_pos = 0;
1293 if (PL_parser->last_uni != PL_parser->bufend)
1294 PL_parser->last_uni = NULL;
1295 if (PL_parser->last_lop != PL_parser->bufend)
1296 PL_parser->last_lop = NULL;
1297 last_uni_pos = last_lop_pos = 0;
1298 *buf = 0;
1299 SvCUR(linestr) = 0;
1300 } else {
1301 old_bufend_pos = PL_parser->bufend - buf;
1302 bufptr_pos = PL_parser->bufptr - buf;
1303 oldbufptr_pos = PL_parser->oldbufptr - buf;
1304 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1305 linestart_pos = PL_parser->linestart - buf;
1306 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1307 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1308 }
1309 if (flags & LEX_FAKE_EOF) {
1310 goto eof;
60d63348 1311 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1312 got_some = 0;
1313 } else if (filter_gets(linestr, old_bufend_pos)) {
1314 got_some = 1;
17cc9359 1315 got_some_for_debugger = 1;
112d1284
FC
1316 } else if (flags & LEX_NO_TERM) {
1317 got_some = 0;
f0e67a1d 1318 } else {
580561a3
Z
1319 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1320 sv_setpvs(linestr, "");
f0e67a1d
Z
1321 eof:
1322 /* End of real input. Close filehandle (unless it was STDIN),
1323 * then add implicit termination.
1324 */
87606032 1325 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1326 PerlIO_clearerr(PL_parser->rsfp);
1327 else if (PL_parser->rsfp)
1328 (void)PerlIO_close(PL_parser->rsfp);
1329 PL_parser->rsfp = NULL;
60d63348 1330 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1331#ifdef PERL_MAD
1332 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1333 PL_faketokens = 1;
1334#endif
1335 if (!PL_in_eval && PL_minus_p) {
1336 sv_catpvs(linestr,
1337 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1338 PL_minus_n = PL_minus_p = 0;
1339 } else if (!PL_in_eval && PL_minus_n) {
1340 sv_catpvs(linestr, /*{*/";}");
1341 PL_minus_n = 0;
1342 } else
1343 sv_catpvs(linestr, ";");
1344 got_some = 1;
1345 }
1346 buf = SvPVX(linestr);
1347 new_bufend_pos = SvCUR(linestr);
1348 PL_parser->bufend = buf + new_bufend_pos;
1349 PL_parser->bufptr = buf + bufptr_pos;
1350 PL_parser->oldbufptr = buf + oldbufptr_pos;
1351 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1352 PL_parser->linestart = buf + linestart_pos;
1353 if (PL_parser->last_uni)
1354 PL_parser->last_uni = buf + last_uni_pos;
1355 if (PL_parser->last_lop)
1356 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1357 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1358 PL_curstash != PL_debstash) {
1359 /* debugger active and we're not compiling the debugger code,
1360 * so store the line into the debugger's array of lines
1361 */
1362 update_debugger_info(NULL, buf+old_bufend_pos,
1363 new_bufend_pos-old_bufend_pos);
1364 }
1365 return got_some;
1366}
1367
1368/*
1369=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1370
1371Looks ahead one (Unicode) character in the text currently being lexed.
1372Returns the codepoint (unsigned integer value) of the next character,
1373or -1 if lexing has reached the end of the input text. To consume the
1374peeked character, use L</lex_read_unichar>.
1375
1376If the next character is in (or extends into) the next chunk of input
1377text, the next chunk will be read in. Normally the current chunk will be
1378discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1379then the current chunk will not be discarded.
1380
1381If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1382is encountered, an exception is generated.
1383
1384=cut
1385*/
1386
1387I32
1388Perl_lex_peek_unichar(pTHX_ U32 flags)
1389{
749123ff 1390 dVAR;
f0e67a1d
Z
1391 char *s, *bufend;
1392 if (flags & ~(LEX_KEEP_PREVIOUS))
1393 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1394 s = PL_parser->bufptr;
1395 bufend = PL_parser->bufend;
1396 if (UTF) {
1397 U8 head;
1398 I32 unichar;
1399 STRLEN len, retlen;
1400 if (s == bufend) {
1401 if (!lex_next_chunk(flags))
1402 return -1;
1403 s = PL_parser->bufptr;
1404 bufend = PL_parser->bufend;
1405 }
1406 head = (U8)*s;
1407 if (!(head & 0x80))
1408 return head;
1409 if (head & 0x40) {
1410 len = PL_utf8skip[head];
1411 while ((STRLEN)(bufend-s) < len) {
1412 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1413 break;
1414 s = PL_parser->bufptr;
1415 bufend = PL_parser->bufend;
1416 }
1417 }
1418 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1419 if (retlen == (STRLEN)-1) {
1420 /* malformed UTF-8 */
1421 ENTER;
1422 SAVESPTR(PL_warnhook);
1423 PL_warnhook = PERL_WARNHOOK_FATAL;
1424 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1425 LEAVE;
1426 }
1427 return unichar;
1428 } else {
1429 if (s == bufend) {
1430 if (!lex_next_chunk(flags))
1431 return -1;
1432 s = PL_parser->bufptr;
1433 }
1434 return (U8)*s;
1435 }
1436}
1437
1438/*
1439=for apidoc Amx|I32|lex_read_unichar|U32 flags
1440
1441Reads the next (Unicode) character in the text currently being lexed.
1442Returns the codepoint (unsigned integer value) of the character read,
1443and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1444if lexing has reached the end of the input text. To non-destructively
1445examine the next character, use L</lex_peek_unichar> instead.
1446
1447If the next character is in (or extends into) the next chunk of input
1448text, the next chunk will be read in. Normally the current chunk will be
1449discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1450then the current chunk will not be discarded.
1451
1452If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1453is encountered, an exception is generated.
1454
1455=cut
1456*/
1457
1458I32
1459Perl_lex_read_unichar(pTHX_ U32 flags)
1460{
1461 I32 c;
1462 if (flags & ~(LEX_KEEP_PREVIOUS))
1463 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1464 c = lex_peek_unichar(flags);
1465 if (c != -1) {
1466 if (c == '\n')
83944c01 1467 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1468 if (UTF)
1469 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1470 else
1471 ++(PL_parser->bufptr);
f0e67a1d
Z
1472 }
1473 return c;
1474}
1475
1476/*
1477=for apidoc Amx|void|lex_read_space|U32 flags
1478
1479Reads optional spaces, in Perl style, in the text currently being
1480lexed. The spaces may include ordinary whitespace characters and
1481Perl-style comments. C<#line> directives are processed if encountered.
1482L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1483at a non-space character (or the end of the input text).
1484
1485If spaces extend into the next chunk of input text, the next chunk will
1486be read in. Normally the current chunk will be discarded at the same
1487time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1488chunk will not be discarded.
1489
1490=cut
1491*/
1492
f0998909
Z
1493#define LEX_NO_NEXT_CHUNK 0x80000000
1494
f0e67a1d
Z
1495void
1496Perl_lex_read_space(pTHX_ U32 flags)
1497{
1498 char *s, *bufend;
1499 bool need_incline = 0;
f0998909 1500 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1501 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1502#ifdef PERL_MAD
1503 if (PL_skipwhite) {
1504 sv_free(PL_skipwhite);
1505 PL_skipwhite = NULL;
1506 }
1507 if (PL_madskills)
1508 PL_skipwhite = newSVpvs("");
1509#endif /* PERL_MAD */
1510 s = PL_parser->bufptr;
1511 bufend = PL_parser->bufend;
1512 while (1) {
1513 char c = *s;
1514 if (c == '#') {
1515 do {
1516 c = *++s;
1517 } while (!(c == '\n' || (c == 0 && s == bufend)));
1518 } else if (c == '\n') {
1519 s++;
1520 PL_parser->linestart = s;
1521 if (s == bufend)
1522 need_incline = 1;
1523 else
1524 incline(s);
1525 } else if (isSPACE(c)) {
1526 s++;
1527 } else if (c == 0 && s == bufend) {
1528 bool got_more;
1529#ifdef PERL_MAD
1530 if (PL_madskills)
1531 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1532#endif /* PERL_MAD */
f0998909
Z
1533 if (flags & LEX_NO_NEXT_CHUNK)
1534 break;
f0e67a1d 1535 PL_parser->bufptr = s;
83944c01 1536 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1537 got_more = lex_next_chunk(flags);
1538 CopLINE_dec(PL_curcop);
1539 s = PL_parser->bufptr;
1540 bufend = PL_parser->bufend;
1541 if (!got_more)
1542 break;
1543 if (need_incline && PL_parser->rsfp) {
1544 incline(s);
1545 need_incline = 0;
1546 }
1547 } else {
1548 break;
1549 }
1550 }
1551#ifdef PERL_MAD
1552 if (PL_madskills)
1553 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1554#endif /* PERL_MAD */
1555 PL_parser->bufptr = s;
1556}
1557
1558/*
ffb4593c
NT
1559 * S_incline
1560 * This subroutine has nothing to do with tilting, whether at windmills
1561 * or pinball tables. Its name is short for "increment line". It
57843af0 1562 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1563 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1564 * # line 500 "foo.pm"
1565 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1566 */
1567
76e3520e 1568STATIC void
d9095cec 1569S_incline(pTHX_ const char *s)
463ee0b2 1570{
97aff369 1571 dVAR;
d9095cec
NC
1572 const char *t;
1573 const char *n;
1574 const char *e;
8818d409 1575 line_t line_num;
463ee0b2 1576
7918f24d
NC
1577 PERL_ARGS_ASSERT_INCLINE;
1578
83944c01 1579 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1580 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1581 && s+1 == PL_bufend && *s == ';') {
1582 /* fake newline in string eval */
1583 CopLINE_dec(PL_curcop);
1584 return;
1585 }
463ee0b2
LW
1586 if (*s++ != '#')
1587 return;
d4c19fe8
AL
1588 while (SPACE_OR_TAB(*s))
1589 s++;
73659bf1
GS
1590 if (strnEQ(s, "line", 4))
1591 s += 4;
1592 else
1593 return;
084592ab 1594 if (SPACE_OR_TAB(*s))
73659bf1 1595 s++;
4e553d73 1596 else
73659bf1 1597 return;
d4c19fe8
AL
1598 while (SPACE_OR_TAB(*s))
1599 s++;
463ee0b2
LW
1600 if (!isDIGIT(*s))
1601 return;
d4c19fe8 1602
463ee0b2
LW
1603 n = s;
1604 while (isDIGIT(*s))
1605 s++;
07714eb4 1606 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1607 return;
bf4acbe4 1608 while (SPACE_OR_TAB(*s))
463ee0b2 1609 s++;
73659bf1 1610 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1611 s++;
73659bf1
GS
1612 e = t + 1;
1613 }
463ee0b2 1614 else {
c35e046a
AL
1615 t = s;
1616 while (!isSPACE(*t))
1617 t++;
73659bf1 1618 e = t;
463ee0b2 1619 }
bf4acbe4 1620 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1621 e++;
1622 if (*e != '\n' && *e != '\0')
1623 return; /* false alarm */
1624
8818d409
FC
1625 line_num = atoi(n)-1;
1626
f4dd75d9 1627 if (t - s > 0) {
d9095cec 1628 const STRLEN len = t - s;
19bad673
NC
1629 SV *const temp_sv = CopFILESV(PL_curcop);
1630 const char *cf;
1631 STRLEN tmplen;
1632
1633 if (temp_sv) {
1634 cf = SvPVX(temp_sv);
1635 tmplen = SvCUR(temp_sv);
1636 } else {
1637 cf = NULL;
1638 tmplen = 0;
1639 }
1640
d1299d44 1641 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1642 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1643 * to *{"::_<newfilename"} */
44867030
NC
1644 /* However, the long form of evals is only turned on by the
1645 debugger - usually they're "(eval %lu)" */
1646 char smallbuf[128];
1647 char *tmpbuf;
1648 GV **gvp;
d9095cec 1649 STRLEN tmplen2 = len;
798b63bc 1650 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1651 tmpbuf = smallbuf;
1652 else
2ae0db35 1653 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1654 tmpbuf[0] = '_';
1655 tmpbuf[1] = '<';
2ae0db35 1656 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1657 tmplen += 2;
8a5ee598
RGS
1658 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1659 if (gvp) {
44867030
NC
1660 char *tmpbuf2;
1661 GV *gv2;
1662
1663 if (tmplen2 + 2 <= sizeof smallbuf)
1664 tmpbuf2 = smallbuf;
1665 else
1666 Newx(tmpbuf2, tmplen2 + 2, char);
1667
1668 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1669 /* Either they malloc'd it, or we malloc'd it,
1670 so no prefix is present in ours. */
1671 tmpbuf2[0] = '_';
1672 tmpbuf2[1] = '<';
1673 }
1674
1675 memcpy(tmpbuf2 + 2, s, tmplen2);
1676 tmplen2 += 2;
1677
8a5ee598 1678 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1679 if (!isGV(gv2)) {
8a5ee598 1680 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1681 /* adjust ${"::_<newfilename"} to store the new file name */
1682 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1683 /* The line number may differ. If that is the case,
1684 alias the saved lines that are in the array.
1685 Otherwise alias the whole array. */
1686 if (CopLINE(PL_curcop) == line_num) {
1687 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1688 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1689 }
1690 else if (GvAV(*gvp)) {
1691 AV * const av = GvAV(*gvp);
1692 const I32 start = CopLINE(PL_curcop)+1;
1693 I32 items = AvFILLp(av) - start;
1694 if (items > 0) {
1695 AV * const av2 = GvAVn(gv2);
1696 SV **svp = AvARRAY(av) + start;
1697 I32 l = (I32)line_num+1;
1698 while (items--)
1699 av_store(av2, l++, SvREFCNT_inc(*svp++));
1700 }
1701 }
e5527e4b 1702 }
44867030
NC
1703
1704 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1705 }
e66cf94c 1706 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1707 }
05ec9bb3 1708 CopFILE_free(PL_curcop);
d9095cec 1709 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1710 }
8818d409 1711 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1712}
1713
29595ff2 1714#ifdef PERL_MAD
cd81e915 1715/* skip space before PL_thistoken */
29595ff2
NC
1716
1717STATIC char *
1718S_skipspace0(pTHX_ register char *s)
1719{
7918f24d
NC
1720 PERL_ARGS_ASSERT_SKIPSPACE0;
1721
29595ff2
NC
1722 s = skipspace(s);
1723 if (!PL_madskills)
1724 return s;
cd81e915
NC
1725 if (PL_skipwhite) {
1726 if (!PL_thiswhite)
6b29d1f5 1727 PL_thiswhite = newSVpvs("");
cd81e915
NC
1728 sv_catsv(PL_thiswhite, PL_skipwhite);
1729 sv_free(PL_skipwhite);
1730 PL_skipwhite = 0;
1731 }
1732 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1733 return s;
1734}
1735
cd81e915 1736/* skip space after PL_thistoken */
29595ff2
NC
1737
1738STATIC char *
1739S_skipspace1(pTHX_ register char *s)
1740{
d4c19fe8 1741 const char *start = s;
29595ff2
NC
1742 I32 startoff = start - SvPVX(PL_linestr);
1743
7918f24d
NC
1744 PERL_ARGS_ASSERT_SKIPSPACE1;
1745
29595ff2
NC
1746 s = skipspace(s);
1747 if (!PL_madskills)
1748 return s;
1749 start = SvPVX(PL_linestr) + startoff;
cd81e915 1750 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1751 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1752 PL_thistoken = newSVpvn(tstart, start - tstart);
1753 }
1754 PL_realtokenstart = -1;
1755 if (PL_skipwhite) {
1756 if (!PL_nextwhite)
6b29d1f5 1757 PL_nextwhite = newSVpvs("");
cd81e915
NC
1758 sv_catsv(PL_nextwhite, PL_skipwhite);
1759 sv_free(PL_skipwhite);
1760 PL_skipwhite = 0;
29595ff2
NC
1761 }
1762 return s;
1763}
1764
1765STATIC char *
1766S_skipspace2(pTHX_ register char *s, SV **svp)
1767{
c35e046a
AL
1768 char *start;
1769 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1770 const I32 startoff = s - SvPVX(PL_linestr);
1771
7918f24d
NC
1772 PERL_ARGS_ASSERT_SKIPSPACE2;
1773
29595ff2
NC
1774 s = skipspace(s);
1775 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1776 if (!PL_madskills || !svp)
1777 return s;
1778 start = SvPVX(PL_linestr) + startoff;
cd81e915 1779 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1780 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1781 PL_thistoken = newSVpvn(tstart, start - tstart);
1782 PL_realtokenstart = -1;
29595ff2 1783 }
cd81e915 1784 if (PL_skipwhite) {
29595ff2 1785 if (!*svp)
6b29d1f5 1786 *svp = newSVpvs("");
cd81e915
NC
1787 sv_setsv(*svp, PL_skipwhite);
1788 sv_free(PL_skipwhite);
1789 PL_skipwhite = 0;
29595ff2
NC
1790 }
1791
1792 return s;
1793}
1794#endif
1795
80a702cd 1796STATIC void
15f169a1 1797S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1798{
1799 AV *av = CopFILEAVx(PL_curcop);
1800 if (av) {
b9f83d2f 1801 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1802 if (orig_sv)
1803 sv_setsv(sv, orig_sv);
1804 else
1805 sv_setpvn(sv, buf, len);
80a702cd
RGS
1806 (void)SvIOK_on(sv);
1807 SvIV_set(sv, 0);
1808 av_store(av, (I32)CopLINE(PL_curcop), sv);
1809 }
1810}
1811
ffb4593c
NT
1812/*
1813 * S_skipspace
1814 * Called to gobble the appropriate amount and type of whitespace.
1815 * Skips comments as well.
1816 */
1817
76e3520e 1818STATIC char *
cea2e8a9 1819S_skipspace(pTHX_ register char *s)
a687059c 1820{
5db06880 1821#ifdef PERL_MAD
f0e67a1d
Z
1822 char *start = s;
1823#endif /* PERL_MAD */
7918f24d 1824 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1825#ifdef PERL_MAD
cd81e915
NC
1826 if (PL_skipwhite) {
1827 sv_free(PL_skipwhite);
f0e67a1d 1828 PL_skipwhite = NULL;
5db06880 1829 }
f0e67a1d 1830#endif /* PERL_MAD */
3280af22 1831 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1832 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1833 s++;
f0e67a1d
Z
1834 } else {
1835 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1836 PL_bufptr = s;
f0998909
Z
1837 lex_read_space(LEX_KEEP_PREVIOUS |
1838 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1839 LEX_NO_NEXT_CHUNK : 0));
3280af22 1840 s = PL_bufptr;
f0e67a1d
Z
1841 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1842 if (PL_linestart > PL_bufptr)
1843 PL_bufptr = PL_linestart;
1844 return s;
463ee0b2 1845 }
5db06880 1846#ifdef PERL_MAD
f0e67a1d
Z
1847 if (PL_madskills)
1848 PL_skipwhite = newSVpvn(start, s-start);
1849#endif /* PERL_MAD */
5db06880 1850 return s;
a687059c 1851}
378cc40b 1852
ffb4593c
NT
1853/*
1854 * S_check_uni
1855 * Check the unary operators to ensure there's no ambiguity in how they're
1856 * used. An ambiguous piece of code would be:
1857 * rand + 5
1858 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1859 * the +5 is its argument.
1860 */
1861
76e3520e 1862STATIC void
cea2e8a9 1863S_check_uni(pTHX)
ba106d47 1864{
97aff369 1865 dVAR;
d4c19fe8
AL
1866 const char *s;
1867 const char *t;
2f3197b3 1868
3280af22 1869 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1870 return;
3280af22
NIS
1871 while (isSPACE(*PL_last_uni))
1872 PL_last_uni++;
c35e046a
AL
1873 s = PL_last_uni;
1874 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1875 s++;
3280af22 1876 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1877 return;
6136c704 1878
9b387841
NC
1879 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1880 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1881 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1882}
1883
ffb4593c
NT
1884/*
1885 * LOP : macro to build a list operator. Its behaviour has been replaced
1886 * with a subroutine, S_lop() for which LOP is just another name.
1887 */
1888
a0d0e21e
LW
1889#define LOP(f,x) return lop(f,x,s)
1890
ffb4593c
NT
1891/*
1892 * S_lop
1893 * Build a list operator (or something that might be one). The rules:
1894 * - if we have a next token, then it's a list operator [why?]
1895 * - if the next thing is an opening paren, then it's a function
1896 * - else it's a list operator
1897 */
1898
76e3520e 1899STATIC I32
a0be28da 1900S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1901{
97aff369 1902 dVAR;
7918f24d
NC
1903
1904 PERL_ARGS_ASSERT_LOP;
1905
6154021b 1906 pl_yylval.ival = f;
35c8bce7 1907 CLINE;
3280af22
NIS
1908 PL_expect = x;
1909 PL_bufptr = s;
1910 PL_last_lop = PL_oldbufptr;
eb160463 1911 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1912#ifdef PERL_MAD
1913 if (PL_lasttoke)
78cdf107 1914 goto lstop;
5db06880 1915#else
3280af22 1916 if (PL_nexttoke)
78cdf107 1917 goto lstop;
5db06880 1918#endif
79072805 1919 if (*s == '(')
bbf60fe6 1920 return REPORT(FUNC);
29595ff2 1921 s = PEEKSPACE(s);
79072805 1922 if (*s == '(')
bbf60fe6 1923 return REPORT(FUNC);
78cdf107
Z
1924 else {
1925 lstop:
1926 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1927 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1928 return REPORT(LSTOP);
78cdf107 1929 }
79072805
LW
1930}
1931
5db06880
NC
1932#ifdef PERL_MAD
1933 /*
1934 * S_start_force
1935 * Sets up for an eventual force_next(). start_force(0) basically does
1936 * an unshift, while start_force(-1) does a push. yylex removes items
1937 * on the "pop" end.
1938 */
1939
1940STATIC void
1941S_start_force(pTHX_ int where)
1942{
1943 int i;
1944
cd81e915 1945 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1946 where = PL_lasttoke;
cd81e915
NC
1947 assert(PL_curforce < 0 || PL_curforce == where);
1948 if (PL_curforce != where) {
5db06880
NC
1949 for (i = PL_lasttoke; i > where; --i) {
1950 PL_nexttoke[i] = PL_nexttoke[i-1];
1951 }
1952 PL_lasttoke++;
1953 }
cd81e915 1954 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1955 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1956 PL_curforce = where;
1957 if (PL_nextwhite) {
5db06880 1958 if (PL_madskills)
6b29d1f5 1959 curmad('^', newSVpvs(""));
cd81e915 1960 CURMAD('_', PL_nextwhite);
5db06880
NC
1961 }
1962}
1963
1964STATIC void
1965S_curmad(pTHX_ char slot, SV *sv)
1966{
1967 MADPROP **where;
1968
1969 if (!sv)
1970 return;
cd81e915
NC
1971 if (PL_curforce < 0)
1972 where = &PL_thismad;
5db06880 1973 else
cd81e915 1974 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1975
cd81e915 1976 if (PL_faketokens)
76f68e9b 1977 sv_setpvs(sv, "");
5db06880
NC
1978 else {
1979 if (!IN_BYTES) {
1980 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1981 SvUTF8_on(sv);
1982 else if (PL_encoding) {
1983 sv_recode_to_utf8(sv, PL_encoding);
1984 }
1985 }
1986 }
1987
1988 /* keep a slot open for the head of the list? */
1989 if (slot != '_' && *where && (*where)->mad_key == '^') {
1990 (*where)->mad_key = slot;
daba3364 1991 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1992 (*where)->mad_val = (void*)sv;
1993 }
1994 else
1995 addmad(newMADsv(slot, sv), where, 0);
1996}
1997#else
b3f24c00
MHM
1998# define start_force(where) NOOP
1999# define curmad(slot, sv) NOOP
5db06880
NC
2000#endif
2001
ffb4593c
NT
2002/*
2003 * S_force_next
9cbb5ea2 2004 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2005 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2006 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
2007 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
2008 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
2009 */
2010
4e553d73 2011STATIC void
cea2e8a9 2012S_force_next(pTHX_ I32 type)
79072805 2013{
97aff369 2014 dVAR;
704d4215
GG
2015#ifdef DEBUGGING
2016 if (DEBUG_T_TEST) {
2017 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2018 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2019 }
2020#endif
6c7ae946
FC
2021 /* Don’t let opslab_force_free snatch it */
2022 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
2023 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
2024 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
2025 }
5db06880 2026#ifdef PERL_MAD
cd81e915 2027 if (PL_curforce < 0)
5db06880 2028 start_force(PL_lasttoke);
cd81e915 2029 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2030 if (PL_lex_state != LEX_KNOWNEXT)
2031 PL_lex_defer = PL_lex_state;
2032 PL_lex_state = LEX_KNOWNEXT;
2033 PL_lex_expect = PL_expect;
cd81e915 2034 PL_curforce = -1;
5db06880 2035#else
3280af22
NIS
2036 PL_nexttype[PL_nexttoke] = type;
2037 PL_nexttoke++;
2038 if (PL_lex_state != LEX_KNOWNEXT) {
2039 PL_lex_defer = PL_lex_state;
2040 PL_lex_expect = PL_expect;
2041 PL_lex_state = LEX_KNOWNEXT;
79072805 2042 }
5db06880 2043#endif
79072805
LW
2044}
2045
28ac2b49
Z
2046void
2047Perl_yyunlex(pTHX)
2048{
a7aaec61
Z
2049 int yyc = PL_parser->yychar;
2050 if (yyc != YYEMPTY) {
2051 if (yyc) {
2052 start_force(-1);
2053 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2054 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2055 PL_lex_allbrackets--;
a7aaec61 2056 PL_lex_brackets--;
78cdf107
Z
2057 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2058 } else if (yyc == '('/*)*/) {
2059 PL_lex_allbrackets--;
2060 yyc |= (2<<24);
a7aaec61
Z
2061 }
2062 force_next(yyc);
2063 }
28ac2b49
Z
2064 PL_parser->yychar = YYEMPTY;
2065 }
2066}
2067
d0a148a6 2068STATIC SV *
15f169a1 2069S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2070{
97aff369 2071 dVAR;
740cce10 2072 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2073 !IN_BYTES
2074 && UTF
2075 && !is_ascii_string((const U8*)start, len)
740cce10 2076 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2077 return sv;
2078}
2079
ffb4593c
NT
2080/*
2081 * S_force_word
2082 * When the lexer knows the next thing is a word (for instance, it has
2083 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2084 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2085 * lookahead.
ffb4593c
NT
2086 *
2087 * Arguments:
b1b65b59 2088 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2089 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2090 * int check_keyword : if true, Perl checks to make sure the word isn't
2091 * a keyword (do this if the word is a label, e.g. goto FOO)
2092 * int allow_pack : if true, : characters will also be allowed (require,
2093 * use, etc. do this)
9cbb5ea2 2094 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2095 */
2096
76e3520e 2097STATIC char *
cea2e8a9 2098S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2099{
97aff369 2100 dVAR;
eb578fdb 2101 char *s;
463ee0b2 2102 STRLEN len;
4e553d73 2103
7918f24d
NC
2104 PERL_ARGS_ASSERT_FORCE_WORD;
2105
29595ff2 2106 start = SKIPSPACE1(start);
463ee0b2 2107 s = start;
7e2040f0 2108 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2109 (allow_pack && *s == ':') ||
15f0808c 2110 (allow_initial_tick && *s == '\'') )
a0d0e21e 2111 {
3280af22 2112 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2113 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2114 return start;
cd81e915 2115 start_force(PL_curforce);
5db06880
NC
2116 if (PL_madskills)
2117 curmad('X', newSVpvn(start,s-start));
463ee0b2 2118 if (token == METHOD) {
29595ff2 2119 s = SKIPSPACE1(s);
463ee0b2 2120 if (*s == '(')
3280af22 2121 PL_expect = XTERM;
463ee0b2 2122 else {
3280af22 2123 PL_expect = XOPERATOR;
463ee0b2 2124 }
79072805 2125 }
e74e6b3d 2126 if (PL_madskills)
63575281 2127 curmad('g', newSVpvs( "forced" ));
9ded7720 2128 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2129 = (OP*)newSVOP(OP_CONST,0,
2130 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2131 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2132 force_next(token);
2133 }
2134 return s;
2135}
2136
ffb4593c
NT
2137/*
2138 * S_force_ident
9cbb5ea2 2139 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2140 * text only contains the "foo" portion. The first argument is a pointer
2141 * to the "foo", and the second argument is the type symbol to prefix.
2142 * Forces the next token to be a "WORD".
9cbb5ea2 2143 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2144 */
2145
76e3520e 2146STATIC void
bfed75c6 2147S_force_ident(pTHX_ register const char *s, int kind)
79072805 2148{
97aff369 2149 dVAR;
7918f24d
NC
2150
2151 PERL_ARGS_ASSERT_FORCE_IDENT;
2152
c35e046a 2153 if (*s) {
90e5519e 2154 const STRLEN len = strlen(s);
728847b1
BF
2155 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2156 UTF ? SVf_UTF8 : 0));
cd81e915 2157 start_force(PL_curforce);
9ded7720 2158 NEXTVAL_NEXTTOKE.opval = o;
79072805 2159 force_next(WORD);
748a9306 2160 if (kind) {
11343788 2161 o->op_private = OPpCONST_ENTERED;
55497cff 2162 /* XXX see note in pp_entereval() for why we forgo typo
2163 warnings if the symbol must be introduced in an eval.
2164 GSAR 96-10-12 */
90e5519e 2165 gv_fetchpvn_flags(s, len,
728847b1
BF
2166 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2167 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2168 kind == '$' ? SVt_PV :
2169 kind == '@' ? SVt_PVAV :
2170 kind == '%' ? SVt_PVHV :
a0d0e21e 2171 SVt_PVGV
90e5519e 2172 );
748a9306 2173 }
79072805
LW
2174 }
2175}
2176
3f33d153
FC
2177static void
2178S_force_ident_maybe_lex(pTHX_ char pit)
2179{
2180 start_force(PL_curforce);
2181 NEXTVAL_NEXTTOKE.ival = pit;
2182 force_next('p');
2183}
2184
1571675a
GS
2185NV
2186Perl_str_to_version(pTHX_ SV *sv)
2187{
2188 NV retval = 0.0;
2189 NV nshift = 1.0;
2190 STRLEN len;
cfd0369c 2191 const char *start = SvPV_const(sv,len);
9d4ba2ae 2192 const char * const end = start + len;
504618e9 2193 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2194
2195 PERL_ARGS_ASSERT_STR_TO_VERSION;
2196
1571675a 2197 while (start < end) {
ba210ebe 2198 STRLEN skip;
1571675a
GS
2199 UV n;
2200 if (utf)
9041c2e3 2201 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2202 else {
2203 n = *(U8*)start;
2204 skip = 1;
2205 }
2206 retval += ((NV)n)/nshift;
2207 start += skip;
2208 nshift *= 1000;
2209 }
2210 return retval;
2211}
2212
4e553d73 2213/*
ffb4593c
NT
2214 * S_force_version
2215 * Forces the next token to be a version number.
e759cc13
RGS
2216 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2217 * and if "guessing" is TRUE, then no new token is created (and the caller
2218 * must use an alternative parsing method).
ffb4593c
NT
2219 */
2220
76e3520e 2221STATIC char *
e759cc13 2222S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2223{
97aff369 2224 dVAR;
5f66b61c 2225 OP *version = NULL;
44dcb63b 2226 char *d;
5db06880
NC
2227#ifdef PERL_MAD
2228 I32 startoff = s - SvPVX(PL_linestr);
2229#endif
89bfa8cd 2230
7918f24d
NC
2231 PERL_ARGS_ASSERT_FORCE_VERSION;
2232
29595ff2 2233 s = SKIPSPACE1(s);
89bfa8cd 2234
44dcb63b 2235 d = s;
dd629d5b 2236 if (*d == 'v')
44dcb63b 2237 d++;
44dcb63b 2238 if (isDIGIT(*d)) {
e759cc13
RGS
2239 while (isDIGIT(*d) || *d == '_' || *d == '.')
2240 d++;
5db06880
NC
2241#ifdef PERL_MAD
2242 if (PL_madskills) {
cd81e915 2243 start_force(PL_curforce);
5db06880
NC
2244 curmad('X', newSVpvn(s,d-s));
2245 }
2246#endif
4e4da3ac 2247 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2248 SV *ver;
8d08d9ba 2249#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2250 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2251 setlocale(LC_NUMERIC, "C");
8d08d9ba 2252#endif
6154021b 2253 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2254#ifdef USE_LOCALE_NUMERIC
2255 setlocale(LC_NUMERIC, loc);
909d3787 2256 Safefree(loc);
8d08d9ba 2257#endif
6154021b 2258 version = pl_yylval.opval;
dd629d5b
GS
2259 ver = cSVOPx(version)->op_sv;
2260 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2261 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2262 SvNV_set(ver, str_to_version(ver));
1571675a 2263 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2264 }
89bfa8cd 2265 }
5db06880
NC
2266 else if (guessing) {
2267#ifdef PERL_MAD
2268 if (PL_madskills) {
cd81e915
NC
2269 sv_free(PL_nextwhite); /* let next token collect whitespace */
2270 PL_nextwhite = 0;
5db06880
NC
2271 s = SvPVX(PL_linestr) + startoff;
2272 }
2273#endif
e759cc13 2274 return s;
5db06880 2275 }
89bfa8cd 2276 }
2277
5db06880
NC
2278#ifdef PERL_MAD
2279 if (PL_madskills && !version) {
cd81e915
NC
2280 sv_free(PL_nextwhite); /* let next token collect whitespace */
2281 PL_nextwhite = 0;
5db06880
NC
2282 s = SvPVX(PL_linestr) + startoff;
2283 }
2284#endif
89bfa8cd 2285 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2286 start_force(PL_curforce);
9ded7720 2287 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2288 force_next(WORD);
89bfa8cd 2289
e759cc13 2290 return s;
89bfa8cd 2291}
2292
ffb4593c 2293/*
91152fc1
DG
2294 * S_force_strict_version
2295 * Forces the next token to be a version number using strict syntax rules.
2296 */
2297
2298STATIC char *
2299S_force_strict_version(pTHX_ char *s)
2300{
2301 dVAR;
2302 OP *version = NULL;
2303#ifdef PERL_MAD
2304 I32 startoff = s - SvPVX(PL_linestr);
2305#endif
2306 const char *errstr = NULL;
2307
2308 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2309
2310 while (isSPACE(*s)) /* leading whitespace */
2311 s++;
2312
2313 if (is_STRICT_VERSION(s,&errstr)) {
2314 SV *ver = newSV(0);
2315 s = (char *)scan_version(s, ver, 0);
2316 version = newSVOP(OP_CONST, 0, ver);
2317 }
4e4da3ac
Z
2318 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2319 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2320 {
91152fc1
DG
2321 PL_bufptr = s;
2322 if (errstr)
2323 yyerror(errstr); /* version required */
2324 return s;
2325 }
2326
2327#ifdef PERL_MAD
2328 if (PL_madskills && !version) {
2329 sv_free(PL_nextwhite); /* let next token collect whitespace */
2330 PL_nextwhite = 0;
2331 s = SvPVX(PL_linestr) + startoff;
2332 }
2333#endif
2334 /* NOTE: The parser sees the package name and the VERSION swapped */
2335 start_force(PL_curforce);
2336 NEXTVAL_NEXTTOKE.opval = version;
2337 force_next(WORD);
2338
2339 return s;
2340}
2341
2342/*
ffb4593c
NT
2343 * S_tokeq
2344 * Tokenize a quoted string passed in as an SV. It finds the next
2345 * chunk, up to end of string or a backslash. It may make a new
2346 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2347 * turns \\ into \.
2348 */
2349
76e3520e 2350STATIC SV *
cea2e8a9 2351S_tokeq(pTHX_ SV *sv)
79072805 2352{
97aff369 2353 dVAR;
eb578fdb
KW
2354 char *s;
2355 char *send;
2356 char *d;
b3ac6de7
IZ
2357 STRLEN len = 0;
2358 SV *pv = sv;
79072805 2359
7918f24d
NC
2360 PERL_ARGS_ASSERT_TOKEQ;
2361
79072805 2362 if (!SvLEN(sv))
b3ac6de7 2363 goto finish;
79072805 2364
a0d0e21e 2365 s = SvPV_force(sv, len);
21a311ee 2366 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2367 goto finish;
463ee0b2 2368 send = s + len;
dcb21ed6
NC
2369 /* This is relying on the SV being "well formed" with a trailing '\0' */
2370 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2371 s++;
2372 if (s == send)
b3ac6de7 2373 goto finish;
79072805 2374 d = s;
be4731d2 2375 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2376 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2377 }
79072805
LW
2378 while (s < send) {
2379 if (*s == '\\') {
a0d0e21e 2380 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2381 s++; /* all that, just for this */
2382 }
2383 *d++ = *s++;
2384 }
2385 *d = '\0';
95a20fc0 2386 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2387 finish:
3280af22 2388 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2389 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2390 return sv;
2391}
2392
ffb4593c
NT
2393/*
2394 * Now come three functions related to double-quote context,
2395 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2396 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2397 * interact with PL_lex_state, and create fake ( ... ) argument lists
2398 * to handle functions and concatenation.
ecd24171
DM
2399 * For example,
2400 * "foo\lbar"
2401 * is tokenised as
2402 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2403 */
2404
2405/*
2406 * S_sublex_start
6154021b 2407 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2408 *
2409 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2410 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2411 *
2412 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2413 *
2414 * Everything else becomes a FUNC.
2415 *
2416 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2417 * had an OP_CONST or OP_READLINE). This just sets us up for a
2418 * call to S_sublex_push().
2419 */
2420
76e3520e 2421STATIC I32
cea2e8a9 2422S_sublex_start(pTHX)
79072805 2423{
97aff369 2424 dVAR;
eb578fdb 2425 const I32 op_type = pl_yylval.ival;
79072805
LW
2426
2427 if (op_type == OP_NULL) {
6154021b 2428 pl_yylval.opval = PL_lex_op;
5f66b61c 2429 PL_lex_op = NULL;
79072805
LW
2430 return THING;
2431 }
2432 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2433 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2434
2435 if (SvTYPE(sv) == SVt_PVIV) {
2436 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2437 STRLEN len;
96a5add6 2438 const char * const p = SvPV_const(sv, len);
740cce10 2439 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2440 SvREFCNT_dec(sv);
2441 sv = nsv;
4e553d73 2442 }
6154021b 2443 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2444 PL_lex_stuff = NULL;
6f33ba73
RGS
2445 /* Allow <FH> // "foo" */
2446 if (op_type == OP_READLINE)
2447 PL_expect = XTERMORDORDOR;
79072805
LW
2448 return THING;
2449 }
e3f73d4e
RGS
2450 else if (op_type == OP_BACKTICK && PL_lex_op) {
2451 /* readpipe() vas overriden */
2452 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2453 pl_yylval.opval = PL_lex_op;
9b201d7d 2454 PL_lex_op = NULL;
e3f73d4e
RGS
2455 PL_lex_stuff = NULL;
2456 return THING;
2457 }
79072805 2458
3280af22 2459 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2460 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2461 PL_sublex_info.sub_op = PL_lex_op;
2462 PL_lex_state = LEX_INTERPPUSH;
55497cff 2463
3280af22
NIS
2464 PL_expect = XTERM;
2465 if (PL_lex_op) {
6154021b 2466 pl_yylval.opval = PL_lex_op;
5f66b61c 2467 PL_lex_op = NULL;
55497cff 2468 return PMFUNC;
2469 }
2470 else
2471 return FUNC;
2472}
2473
ffb4593c
NT
2474/*
2475 * S_sublex_push
2476 * Create a new scope to save the lexing state. The scope will be
2477 * ended in S_sublex_done. Returns a '(', starting the function arguments
2478 * to the uc, lc, etc. found before.
2479 * Sets PL_lex_state to LEX_INTERPCONCAT.
2480 */
2481
76e3520e 2482STATIC I32
cea2e8a9 2483S_sublex_push(pTHX)
55497cff 2484{
27da23d5 2485 dVAR;
78a635de 2486 LEXSHARED *shared;
f46d017c 2487 ENTER;
55497cff 2488
3280af22 2489 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2490 SAVEBOOL(PL_lex_dojoin);
3280af22 2491 SAVEI32(PL_lex_brackets);
78cdf107 2492 SAVEI32(PL_lex_allbrackets);
b27dce25 2493 SAVEI32(PL_lex_formbrack);
78cdf107 2494 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2495 SAVEI32(PL_lex_casemods);
2496 SAVEI32(PL_lex_starts);
651b5b28 2497 SAVEI8(PL_lex_state);
7cc34111 2498 SAVESPTR(PL_lex_repl);
7766f137 2499 SAVEVPTR(PL_lex_inpat);
98246f1e 2500 SAVEI16(PL_lex_inwhat);
57843af0 2501 SAVECOPLINE(PL_curcop);
3280af22 2502 SAVEPPTR(PL_bufptr);
8452ff4b 2503 SAVEPPTR(PL_bufend);
3280af22
NIS
2504 SAVEPPTR(PL_oldbufptr);
2505 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2506 SAVEPPTR(PL_last_lop);
2507 SAVEPPTR(PL_last_uni);
3280af22
NIS
2508 SAVEPPTR(PL_linestart);
2509 SAVESPTR(PL_linestr);
8edd5f42
RGS
2510 SAVEGENERICPV(PL_lex_brackstack);
2511 SAVEGENERICPV(PL_lex_casestack);
78a635de 2512 SAVEGENERICPV(PL_parser->lex_shared);
3280af22 2513
99bd9d90 2514 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2515 scopes to find the body of the here-doc. So we put PL_linestr and
2516 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2517 */
60f40a38
FC
2518 PL_parser->lex_shared->ls_linestr = PL_linestr;
2519 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2520
3280af22 2521 PL_linestr = PL_lex_stuff;
7cc34111 2522 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2523 PL_lex_stuff = NULL;
7cc34111 2524 PL_sublex_info.repl = NULL;
3280af22 2525
9cbb5ea2
GS
2526 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2527 = SvPVX(PL_linestr);
3280af22 2528 PL_bufend += SvCUR(PL_linestr);
bd61b366 2529 PL_last_lop = PL_last_uni = NULL;
3280af22 2530 SAVEFREESV(PL_linestr);
4dc843bc 2531 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2532
2533 PL_lex_dojoin = FALSE;
b27dce25 2534 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2535 PL_lex_allbrackets = 0;
2536 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2537 Newx(PL_lex_brackstack, 120, char);
2538 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2539 PL_lex_casemods = 0;
2540 *PL_lex_casestack = '\0';
2541 PL_lex_starts = 0;
2542 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2543 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
78a635de
FC
2544
2545 Newxz(shared, 1, LEXSHARED);
2546 shared->ls_prev = PL_parser->lex_shared;
2547 PL_parser->lex_shared = shared;
3280af22
NIS
2548
2549 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2550 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2551 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2552 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2553 else
5f66b61c 2554 PL_lex_inpat = NULL;
79072805 2555
55497cff 2556 return '(';
79072805
LW
2557}
2558
ffb4593c
NT
2559/*
2560 * S_sublex_done
2561 * Restores lexer state after a S_sublex_push.
2562 */
2563
76e3520e 2564STATIC I32
cea2e8a9 2565S_sublex_done(pTHX)
79072805 2566{
27da23d5 2567 dVAR;
3280af22 2568 if (!PL_lex_starts++) {
396482e1 2569 SV * const sv = newSVpvs("");
9aa983d2
JH
2570 if (SvUTF8(PL_linestr))
2571 SvUTF8_on(sv);
3280af22 2572 PL_expect = XOPERATOR;
6154021b 2573 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2574 return THING;
2575 }
2576
3280af22
NIS
2577 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2578 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2579 return yylex();
79072805
LW
2580 }
2581
ffb4593c 2582 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2583 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2584 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2585 PL_linestr = PL_lex_repl;
2586 PL_lex_inpat = 0;
2587 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2588 PL_bufend += SvCUR(PL_linestr);
bd61b366 2589 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2590 PL_lex_dojoin = FALSE;
2591 PL_lex_brackets = 0;
78cdf107
Z
2592 PL_lex_allbrackets = 0;
2593 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2594 PL_lex_casemods = 0;
2595 *PL_lex_casestack = '\0';
2596 PL_lex_starts = 0;
25da4f38 2597 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2598 PL_lex_state = LEX_INTERPNORMAL;
2599 PL_lex_starts++;
e9fa98b2
HS
2600 /* we don't clear PL_lex_repl here, so that we can check later
2601 whether this is an evalled subst; that means we rely on the
2602 logic to ensure sublex_done() is called again only via the
2603 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2604 }
e9fa98b2 2605 else {
3280af22 2606 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2607 PL_lex_repl = NULL;
e9fa98b2 2608 }
79072805 2609 return ',';
ffed7fef
LW
2610 }
2611 else {
5db06880
NC
2612#ifdef PERL_MAD
2613 if (PL_madskills) {
cd81e915
NC
2614 if (PL_thiswhite) {
2615 if (!PL_endwhite)
6b29d1f5 2616 PL_endwhite = newSVpvs("");
cd81e915
NC
2617 sv_catsv(PL_endwhite, PL_thiswhite);
2618 PL_thiswhite = 0;
2619 }
2620 if (PL_thistoken)
76f68e9b 2621 sv_setpvs(PL_thistoken,"");
5db06880 2622 else
cd81e915 2623 PL_realtokenstart = -1;
5db06880
NC
2624 }
2625#endif
f46d017c 2626 LEAVE;
3280af22
NIS
2627 PL_bufend = SvPVX(PL_linestr);
2628 PL_bufend += SvCUR(PL_linestr);
2629 PL_expect = XOPERATOR;
09bef843 2630 PL_sublex_info.sub_inwhat = 0;
79072805 2631 return ')';
ffed7fef
LW
2632 }
2633}
2634
02aa26ce
NT
2635/*
2636 scan_const
2637
9da1dd8f
DM
2638 Extracts the next constant part of a pattern, double-quoted string,
2639 or transliteration. This is terrifying code.
2640
2641 For example, in parsing the double-quoted string "ab\x63$d", it would
2642 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2643
94def140 2644 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2645 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2646 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2647
94def140
TS
2648 Returns a pointer to the character scanned up to. If this is
2649 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2650 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2651 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2652 by looking at the next characters herself.
2653
02aa26ce 2654 In patterns:
9da1dd8f
DM
2655 expand:
2656 \N{ABC} => \N{U+41.42.43}
2657
2658 pass through:
2659 all other \-char, including \N and \N{ apart from \N{ABC}
2660
2661 stops on:
2662 @ and $ where it appears to be a var, but not for $ as tail anchor
2663 \l \L \u \U \Q \E
2664 (?{ or (??{
2665
02aa26ce
NT
2666
2667 In transliterations:
2668 characters are VERY literal, except for - not at the start or end
94def140
TS
2669 of the string, which indicates a range. If the range is in bytes,
2670 scan_const expands the range to the full set of intermediate
2671 characters. If the range is in utf8, the hyphen is replaced with
2672 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2673
2674 In double-quoted strings:
2675 backslashes:
2676 double-quoted style: \r and \n
ff3f963a 2677 constants: \x31, etc.
94def140 2678 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2679 case and quoting: \U \Q \E
2680 stops on @ and $
2681
2682 scan_const does *not* construct ops to handle interpolated strings.
2683 It stops processing as soon as it finds an embedded $ or @ variable
2684 and leaves it to the caller to work out what's going on.
2685
94def140
TS
2686 embedded arrays (whether in pattern or not) could be:
2687 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2688
2689 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2690
2691 $ in pattern could be $foo or could be tail anchor. Assumption:
2692 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2693 followed by one of "()| \r\n\t"
02aa26ce 2694
9da1dd8f 2695 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2696
2697 The structure of the code is
2698 while (there's a character to process) {
94def140
TS
2699 handle transliteration ranges
2700 skip regexp comments /(?#comment)/ and codes /(?{code})/
2701 skip #-initiated comments in //x patterns
2702 check for embedded arrays
02aa26ce
NT
2703 check for embedded scalars
2704 if (backslash) {
94def140 2705 deprecate \1 in substitution replacements
02aa26ce
NT
2706 handle string-changing backslashes \l \U \Q \E, etc.
2707 switch (what was escaped) {
94def140 2708 handle \- in a transliteration (becomes a literal -)
ff3f963a 2709 if a pattern and not \N{, go treat as regular character
94def140
TS
2710 handle \132 (octal characters)
2711 handle \x15 and \x{1234} (hex characters)
ff3f963a 2712 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2713 handle \cV (control characters)
2714 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2715 } (end switch)
77a135fe 2716 continue
02aa26ce 2717 } (end if backslash)
77a135fe 2718 handle regular character
02aa26ce 2719 } (end while character to read)
4e553d73 2720
02aa26ce
NT
2721*/
2722
76e3520e 2723STATIC char *
cea2e8a9 2724S_scan_const(pTHX_ char *start)
79072805 2725{
97aff369 2726 dVAR;
eb578fdb 2727 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2728 SV *sv = newSV(send - start); /* sv for the constant. See
2729 note below on sizing. */
eb578fdb
KW
2730 char *s = start; /* start of the constant */
2731 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 2732 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2733 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2734 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2735 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2736 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2737 to be UTF8? But, this can
2738 show as true when the source
2739 isn't utf8, as for example
2740 when it is entirely composed
2741 of hex constants */
2742
2743 /* Note on sizing: The scanned constant is placed into sv, which is
2744 * initialized by newSV() assuming one byte of output for every byte of
2745 * input. This routine expects newSV() to allocate an extra byte for a
2746 * trailing NUL, which this routine will append if it gets to the end of
2747 * the input. There may be more bytes of input than output (eg., \N{LATIN
2748 * CAPITAL LETTER A}), or more output than input if the constant ends up
2749 * recoded to utf8, but each time a construct is found that might increase
2750 * the needed size, SvGROW() is called. Its size parameter each time is
2751 * based on the best guess estimate at the time, namely the length used so
2752 * far, plus the length the current construct will occupy, plus room for
2753 * the trailing NUL, plus one byte for every input byte still unscanned */
2754
012bcf8d 2755 UV uv;
4c3a8340
TS
2756#ifdef EBCDIC
2757 UV literal_endpoint = 0;
e294cc5d 2758 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2759#endif
012bcf8d 2760
7918f24d
NC
2761 PERL_ARGS_ASSERT_SCAN_CONST;
2762
bb16bae8 2763 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2764 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2765 /* If we are doing a trans and we know we want UTF8 set expectation */
2766 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2767 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2768 }
2769
2770
79072805 2771 while (s < send || dorange) {
ff3f963a 2772
02aa26ce 2773 /* get transliterations out of the way (they're most literal) */
3280af22 2774 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2775 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2776 if (dorange) {
1ba5c669
JH
2777 I32 i; /* current expanded character */
2778 I32 min; /* first character in range */
2779 I32 max; /* last character in range */
02aa26ce 2780
e294cc5d
JH
2781#ifdef EBCDIC
2782 UV uvmax = 0;
2783#endif
2784
2785 if (has_utf8
2786#ifdef EBCDIC
2787 && !native_range
2788#endif
2789 ) {
9d4ba2ae 2790 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2791 char *e = d++;
2792 while (e-- > c)
2793 *(e + 1) = *e;
25716404 2794 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2795 /* mark the range as done, and continue */
2796 dorange = FALSE;
2797 didrange = TRUE;
2798 continue;
2799 }
2b9d42f0 2800
95a20fc0 2801 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2802#ifdef EBCDIC
2803 SvGROW(sv,
2804 SvLEN(sv) + (has_utf8 ?
2805 (512 - UTF_CONTINUATION_MARK +
2806 UNISKIP(0x100))
2807 : 256));
2808 /* How many two-byte within 0..255: 128 in UTF-8,
2809 * 96 in UTF-8-mod. */
2810#else
9cbb5ea2 2811 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2812#endif
9cbb5ea2 2813 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2814#ifdef EBCDIC
2815 if (has_utf8) {
2816 int j;
2817 for (j = 0; j <= 1; j++) {
2818 char * const c = (char*)utf8_hop((U8*)d, -1);
2819 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2820 if (j)
2821 min = (U8)uv;
2822 else if (uv < 256)
2823 max = (U8)uv;
2824 else {
2825 max = (U8)0xff; /* only to \xff */
2826 uvmax = uv; /* \x{100} to uvmax */
2827 }
2828 d = c; /* eat endpoint chars */
2829 }
2830 }
2831 else {
2832#endif
2833 d -= 2; /* eat the first char and the - */
2834 min = (U8)*d; /* first char in range */
2835 max = (U8)d[1]; /* last char in range */
2836#ifdef EBCDIC
2837 }
2838#endif
8ada0baa 2839
c2e66d9e 2840 if (min > max) {
4dc843bc 2841 SvREFCNT_dec(sv);
01ec43d0 2842 Perl_croak(aTHX_
d1573ac7 2843 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2844 (char)min, (char)max);
c2e66d9e
GS
2845 }
2846
c7f1f016 2847#ifdef EBCDIC
4c3a8340
TS
2848 if (literal_endpoint == 2 &&
2849 ((isLOWER(min) && isLOWER(max)) ||
2850 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2851 if (isLOWER(min)) {
2852 for (i = min; i <= max; i++)
2853 if (isLOWER(i))
db42d148 2854 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2855 } else {
2856 for (i = min; i <= max; i++)
2857 if (isUPPER(i))
db42d148 2858 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2859 }
2860 }
2861 else
2862#endif
2863 for (i = min; i <= max; i++)
e294cc5d
JH
2864#ifdef EBCDIC
2865 if (has_utf8) {
2866 const U8 ch = (U8)NATIVE_TO_UTF(i);
2867 if (UNI_IS_INVARIANT(ch))
2868 *d++ = (U8)i;
2869 else {
2870 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2871 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2872 }
2873 }
2874 else
2875#endif
2876 *d++ = (char)i;
2877
2878#ifdef EBCDIC
2879 if (uvmax) {
2880 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2881 if (uvmax > 0x101)
2882 *d++ = (char)UTF_TO_NATIVE(0xff);
2883 if (uvmax > 0x100)
2884 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2885 }
2886#endif
02aa26ce
NT
2887
2888 /* mark the range as done, and continue */
79072805 2889 dorange = FALSE;
01ec43d0 2890 didrange = TRUE;
4c3a8340
TS
2891#ifdef EBCDIC
2892 literal_endpoint = 0;
2893#endif
79072805 2894 continue;
4e553d73 2895 }
02aa26ce
NT
2896
2897 /* range begins (ignore - as first or last char) */
79072805 2898 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2899 if (didrange) {
4dc843bc 2900 SvREFCNT_dec(sv);
1fafa243 2901 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2902 }
e294cc5d
JH
2903 if (has_utf8
2904#ifdef EBCDIC
2905 && !native_range
2906#endif
2907 ) {
25716404 2908 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2909 s++;
2910 continue;
2911 }
79072805
LW
2912 dorange = TRUE;
2913 s++;
01ec43d0
GS
2914 }
2915 else {
2916 didrange = FALSE;
4c3a8340
TS
2917#ifdef EBCDIC
2918 literal_endpoint = 0;
e294cc5d 2919 native_range = TRUE;
4c3a8340 2920#endif
01ec43d0 2921 }
79072805 2922 }
02aa26ce
NT
2923
2924 /* if we get here, we're not doing a transliteration */
2925
e4a2df84
DM
2926 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2927 char *s1 = s-1;
2928 int esc = 0;
2929 while (s1 >= start && *s1-- == '\\')
2930 esc = !esc;
2931 if (!esc)
2932 in_charclass = TRUE;
2933 }
2866decb 2934
e4a2df84
DM
2935 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2936 char *s1 = s-1;
2937 int esc = 0;
2938 while (s1 >= start && *s1-- == '\\')
2939 esc = !esc;
2940 if (!esc)
2941 in_charclass = FALSE;
2942 }
2866decb 2943
9da1dd8f
DM
2944 /* skip for regexp comments /(?#comment)/, except for the last
2945 * char, which will be done separately.
2946 * Stop on (?{..}) and friends */
2947
3280af22 2948 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2949 if (s[2] == '#') {
e994fd66 2950 while (s+1 < send && *s != ')')
db42d148 2951 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2952 }
2866decb 2953 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2954 ( s[2] == '{' /* This should match regcomp.c */
2955 || (s[2] == '?' && s[3] == '{')))
155aba94 2956 {
9da1dd8f 2957 break;
cc6b7395 2958 }
748a9306 2959 }
02aa26ce
NT
2960
2961 /* likewise skip #-initiated comments in //x patterns */
3280af22 2962 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2963 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2964 while (s+1 < send && *s != '\n')
db42d148 2965 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2966 }
02aa26ce 2967
9da1dd8f
DM
2968 /* no further processing of single-quoted regex */
2969 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2970 goto default_action;
2971
5d1d4326 2972 /* check for embedded arrays
da6eedaa 2973 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2974 */
1749ea0d
TS
2975 else if (*s == '@' && s[1]) {
2976 if (isALNUM_lazy_if(s+1,UTF))
2977 break;
2978 if (strchr(":'{$", s[1]))
2979 break;
2980 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2981 break; /* in regexp, neither @+ nor @- are interpolated */
2982 }
02aa26ce
NT
2983
2984 /* check for embedded scalars. only stop if we're sure it's a
2985 variable.
2986 */
79072805 2987 else if (*s == '$') {
3280af22 2988 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2989 break;
77772344 2990 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2991 if (s[1] == '\\') {
2992 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2993 "Possible unintended interpolation of $\\ in regex");
77772344 2994 }
79072805 2995 break; /* in regexp, $ might be tail anchor */
77772344 2996 }
79072805 2997 }
02aa26ce 2998
2b9d42f0
NIS
2999 /* End of else if chain - OP_TRANS rejoin rest */
3000
02aa26ce 3001 /* backslashes */
79072805 3002 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3003 char* e; /* Can be used for ending '}', etc. */
3004
79072805 3005 s++;
02aa26ce 3006
7d0fc23c
KW
3007 /* warn on \1 - \9 in substitution replacements, but note that \11
3008 * is an octal; and \19 is \1 followed by '9' */
3280af22 3009 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3010 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3011 {
a2a5de95 3012 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3013 *--s = '$';
3014 break;
3015 }
02aa26ce
NT
3016
3017 /* string-change backslash escapes */
838f2281 3018 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3019 --s;
3020 break;
3021 }
ff3f963a
KW
3022 /* In a pattern, process \N, but skip any other backslash escapes.
3023 * This is because we don't want to translate an escape sequence
3024 * into a meta symbol and have the regex compiler use the meta
3025 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3026 * in spite of this, we do have to process \N here while the proper
3027 * charnames handler is in scope. See bugs #56444 and #62056.
3028 * There is a complication because \N in a pattern may also stand
3029 * for 'match a non-nl', and not mean a charname, in which case its
3030 * processing should be deferred to the regex compiler. To be a
3031 * charname it must be followed immediately by a '{', and not look
3032 * like \N followed by a curly quantifier, i.e., not something like
3033 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3034 * quantifier */
3035 else if (PL_lex_inpat
3036 && (*s != 'N'
3037 || s[1] != '{'
3038 || regcurly(s + 1)))
3039 {
cc74c5bd
TS
3040 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3041 goto default_action;
3042 }
02aa26ce 3043
79072805 3044 switch (*s) {
02aa26ce
NT
3045
3046 /* quoted - in transliterations */
79072805 3047 case '-':
3280af22 3048 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3049 *d++ = *s++;
3050 continue;
3051 }
3052 /* FALL THROUGH */
3053 default:
11b8faa4 3054 {
e4ca4584 3055 if ((isALNUMC(*s)))
a2a5de95
NC
3056 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3057 "Unrecognized escape \\%c passed through",
3058 *s);
11b8faa4 3059 /* default action is to copy the quoted character */
f9a63242 3060 goto default_action;
11b8faa4 3061 }
02aa26ce 3062
632403cc 3063 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3064 case '0': case '1': case '2': case '3':
3065 case '4': case '5': case '6': case '7':
ba210ebe 3066 {
53305cf1
NC
3067 I32 flags = 0;
3068 STRLEN len = 3;
77a135fe 3069 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3070 s += len;
3071 }
012bcf8d 3072 goto NUM_ESCAPE_INSERT;
02aa26ce 3073
f0a2b745
KW
3074 /* eg. \o{24} indicates the octal constant \024 */
3075 case 'o':
3076 {
3077 STRLEN len;
454155d9 3078 const char* error;
f0a2b745 3079
454155d9 3080 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3081 s += len;
454155d9 3082 if (! valid) {
f0a2b745
KW
3083 yyerror(error);
3084 continue;
3085 }
3086 goto NUM_ESCAPE_INSERT;
3087 }
3088
77a135fe 3089 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3090 case 'x':
a0481293 3091 {
53305cf1 3092 STRLEN len;
a0481293 3093 const char* error;
355860ce 3094
a0481293
KW
3095 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3096 s += len;
3097 if (! valid) {
3098 yyerror(error);
355860ce 3099 continue;
ba210ebe 3100 }
012bcf8d
GS
3101 }
3102
3103 NUM_ESCAPE_INSERT:
ff3f963a
KW
3104 /* Insert oct or hex escaped character. There will always be
3105 * enough room in sv since such escapes will be longer than any
3106 * UTF-8 sequence they can end up as, except if they force us
3107 * to recode the rest of the string into utf8 */
ba7cea30 3108
77a135fe 3109 /* Here uv is the ordinal of the next character being added in
ff3f963a 3110 * unicode (converted from native). */
77a135fe 3111 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3112 if (!has_utf8 && uv > 255) {
77a135fe
KW
3113 /* Might need to recode whatever we have accumulated so
3114 * far if it contains any chars variant in utf8 or
3115 * utf-ebcdic. */
3116
3117 SvCUR_set(sv, d - SvPVX_const(sv));
3118 SvPOK_on(sv);
3119 *d = '\0';
77a135fe 3120 /* See Note on sizing above. */
7bf79863
KW
3121 sv_utf8_upgrade_flags_grow(sv,
3122 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3123 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3124 d = SvPVX(sv) + SvCUR(sv);
3125 has_utf8 = TRUE;
012bcf8d
GS
3126 }
3127
77a135fe
KW
3128 if (has_utf8) {
3129 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3130 if (PL_lex_inwhat == OP_TRANS &&
3131 PL_sublex_info.sub_op) {
3132 PL_sublex_info.sub_op->op_private |=
3133 (PL_lex_repl ? OPpTRANS_FROM_UTF
3134 : OPpTRANS_TO_UTF);
f9a63242 3135 }
e294cc5d
JH
3136#ifdef EBCDIC
3137 if (uv > 255 && !dorange)
3138 native_range = FALSE;
3139#endif
012bcf8d 3140 }
a0ed51b3 3141 else {
012bcf8d 3142 *d++ = (char)uv;
a0ed51b3 3143 }
012bcf8d
GS
3144 }
3145 else {
c4d5f83a 3146 *d++ = (char) uv;
a0ed51b3 3147 }
79072805 3148 continue;
02aa26ce 3149
4a2d328f 3150 case 'N':
ff3f963a
KW
3151 /* In a non-pattern \N must be a named character, like \N{LATIN
3152 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3153 * mean to match a non-newline. For non-patterns, named
3154 * characters are converted to their string equivalents. In
3155 * patterns, named characters are not converted to their
3156 * ultimate forms for the same reasons that other escapes
3157 * aren't. Instead, they are converted to the \N{U+...} form
3158 * to get the value from the charnames that is in effect right
3159 * now, while preserving the fact that it was a named character
3160 * so that the regex compiler knows this */
3161
3162 /* This section of code doesn't generally use the
3163 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3164 * a close examination of this macro and determined it is a
3165 * no-op except on utfebcdic variant characters. Every
3166 * character generated by this that would normally need to be
3167 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3168 * needed, and would complicate use of copy(). XXX There are
3169 * other parts of this file where the macro is used
3170 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3171
3172 /* The structure of this section of code (besides checking for
3173 * errors and upgrading to utf8) is:
3174 * Further disambiguate between the two meanings of \N, and if
3175 * not a charname, go process it elsewhere
0a96133f
KW
3176 * If of form \N{U+...}, pass it through if a pattern;
3177 * otherwise convert to utf8
3178 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3179 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3180
3181 /* Here, s points to the 'N'; the test below is guaranteed to
3182 * succeed if we are being called on a pattern as we already
3183 * know from a test above that the next character is a '{'.
3184 * On a non-pattern \N must mean 'named sequence, which
3185 * requires braces */
3186 s++;
3187 if (*s != '{') {
3188 yyerror("Missing braces on \\N{}");
3189 continue;
3190 }
3191 s++;
3192
0a96133f 3193 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3194 if (! (e = strchr(s, '}'))) {
3195 if (! PL_lex_inpat) {
5777a3f7 3196 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3197 } else {
3198 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3199 }
0a96133f 3200 continue;
ff3f963a 3201 }
cddc7ef4 3202
ff3f963a 3203 /* Here it looks like a named character */
cddc7ef4 3204
ff3f963a
KW
3205 if (PL_lex_inpat) {
3206
3207 /* XXX This block is temporary code. \N{} implies that the
3208 * pattern is to have Unicode semantics, and therefore
3209 * currently has to be encoded in utf8. By putting it in
3210 * utf8 now, we save a whole pass in the regular expression
3211 * compiler. Once that code is changed so Unicode
3212 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3213 * block should be removed. However, the code that parses
3214 * the output of this would have to be changed to not
3215 * necessarily expect utf8 */
ff3f963a 3216 if (!has_utf8) {
77a135fe 3217 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3218 SvPOK_on(sv);
e4f3eed8 3219 *d = '\0';
77a135fe 3220 /* See Note on sizing above. */
7bf79863 3221 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3222 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3223 /* 5 = '\N{' + cur char + NUL */
3224 (STRLEN)(send - s) + 5);
f08d6ad9 3225 d = SvPVX(sv) + SvCUR(sv);
89491803 3226 has_utf8 = TRUE;
ff3f963a
KW
3227 }
3228 }
423cee85 3229
ff3f963a
KW
3230 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3231 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3232 | PERL_SCAN_DISALLOW_PREFIX;
3233 STRLEN len;
3234
3235 /* For \N{U+...}, the '...' is a unicode value even on
3236 * EBCDIC machines */
3237 s += 2; /* Skip to next char after the 'U+' */
3238 len = e - s;
3239 uv = grok_hex(s, &len, &flags, NULL);
3240 if (len == 0 || len != (STRLEN)(e - s)) {
3241 yyerror("Invalid hexadecimal number in \\N{U+...}");
3242 s = e + 1;
3243 continue;
3244 }
3245
3246 if (PL_lex_inpat) {
3247
e2a7e165
KW
3248 /* On non-EBCDIC platforms, pass through to the regex
3249 * compiler unchanged. The reason we evaluated the
3250 * number above is to make sure there wasn't a syntax
3251 * error. But on EBCDIC we convert to native so
3252 * downstream code can continue to assume it's native
3253 */
ff3f963a 3254 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3255#ifdef EBCDIC
3256 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3257 and the \0 */
3258 "\\N{U+%X}",
3259 (unsigned int) UNI_TO_NATIVE(uv));
3260#else
ff3f963a
KW
3261 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3262 d += e - s + 1;
e2a7e165 3263#endif
ff3f963a
KW
3264 }
3265 else { /* Not a pattern: convert the hex to string */
3266
3267 /* If destination is not in utf8, unconditionally
3268 * recode it to be so. This is because \N{} implies
3269 * Unicode semantics, and scalars have to be in utf8
3270 * to guarantee those semantics */
3271 if (! has_utf8) {
3272 SvCUR_set(sv, d - SvPVX_const(sv));
3273 SvPOK_on(sv);
3274 *d = '\0';
3275 /* See Note on sizing above. */
3276 sv_utf8_upgrade_flags_grow(
3277 sv,
3278 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3279 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3280 d = SvPVX(sv) + SvCUR(sv);
3281 has_utf8 = TRUE;
3282 }
3283
3284 /* Add the string to the output */
3285 if (UNI_IS_INVARIANT(uv)) {
3286 *d++ = (char) uv;
3287 }
3288 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3289 }
3290 }
3291 else { /* Here is \N{NAME} but not \N{U+...}. */
3292
3293 SV *res; /* result from charnames */
3294 const char *str; /* the string in 'res' */
3295 STRLEN len; /* its length */
3296
3297 /* Get the value for NAME */
3298 res = newSVpvn(s, e - s);
3299 res = new_constant( NULL, 0, "charnames",
3300 /* includes all of: \N{...} */
3301 res, NULL, s - 3, e - s + 4 );
3302
3303 /* Most likely res will be in utf8 already since the
3304 * standard charnames uses pack U, but a custom translator
3305 * can leave it otherwise, so make sure. XXX This can be
3306 * revisited to not have charnames use utf8 for characters
3307 * that don't need it when regexes don't have to be in utf8
3308 * for Unicode semantics. If doing so, remember EBCDIC */
1104029a 3309 if (SvPOK(res)) {
ff3f963a
KW
3310 sv_utf8_upgrade(res);
3311 str = SvPV_const(res, len);
3312
3313 /* Don't accept malformed input */
3314 if (! is_utf8_string((U8 *) str, len)) {
3315 yyerror("Malformed UTF-8 returned by \\N");
3316 }
3317 else if (PL_lex_inpat) {
3318
3319 if (! len) { /* The name resolved to an empty string */
3320 Copy("\\N{}", d, 4, char);
3321 d += 4;
3322 }
3323 else {
3324 /* In order to not lose information for the regex
3325 * compiler, pass the result in the specially made
3326 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3327 * the code points in hex of each character
3328 * returned by charnames */
3329
3330 const char *str_end = str + len;
3331 STRLEN char_length; /* cur char's byte length */
3332 STRLEN output_length; /* and the number of bytes
3333 after this is translated
3334 into hex digits */
3335 const STRLEN off = d - SvPVX_const(sv);
3336
3337 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3338 * max('U+', '.'); and 1 for NUL */
3339 char hex_string[2 * UTF8_MAXBYTES + 5];
3340
3341 /* Get the first character of the result. */
3342 U32 uv = utf8n_to_uvuni((U8 *) str,
3343 len,
3344 &char_length,
3345 UTF8_ALLOW_ANYUV);
3346
3347 /* The call to is_utf8_string() above hopefully
3348 * guarantees that there won't be an error. But
3349 * it's easy here to make sure. The function just
3350 * above warns and returns 0 if invalid utf8, but
3351 * it can also return 0 if the input is validly a
3352 * NUL. Disambiguate */
3353 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3354 uv = UNICODE_REPLACEMENT;
3355 }
3356
3357 /* Convert first code point to hex, including the
e2a7e165
KW
3358 * boiler plate before it. For all these, we
3359 * convert to native format so that downstream code
3360 * can continue to assume the input is native */
78c35590 3361 output_length =
3353de27 3362 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3363 "\\N{U+%X",
3364 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3365
3366 /* Make sure there is enough space to hold it */
3367 d = off + SvGROW(sv, off
3368 + output_length
3369 + (STRLEN)(send - e)
3370 + 2); /* '}' + NUL */
3371 /* And output it */
3372 Copy(hex_string, d, output_length, char);
3373 d += output_length;
3374
3375 /* For each subsequent character, append dot and
3376 * its ordinal in hex */
3377 while ((str += char_length) < str_end) {
3378 const STRLEN off = d - SvPVX_const(sv);
3379 U32 uv = utf8n_to_uvuni((U8 *) str,
3380 str_end - str,
3381 &char_length,
3382 UTF8_ALLOW_ANYUV);
3383 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3384 uv = UNICODE_REPLACEMENT;
3385 }
3386
78c35590 3387 output_length =
3353de27 3388 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3389 ".%X",
3390 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3391
3392 d = off + SvGROW(sv, off
3393 + output_length
3394 + (STRLEN)(send - e)
3395 + 2); /* '}' + NUL */
3396 Copy(hex_string, d, output_length, char);
3397 d += output_length;
3398 }
3399
3400 *d++ = '}'; /* Done. Add the trailing brace */
3401 }
3402 }
3403 else { /* Here, not in a pattern. Convert the name to a
3404 * string. */
3405
3406 /* If destination is not in utf8, unconditionally
3407 * recode it to be so. This is because \N{} implies
3408 * Unicode semantics, and scalars have to be in utf8
3409 * to guarantee those semantics */
3410 if (! has_utf8) {
3411 SvCUR_set(sv, d - SvPVX_const(sv));
3412 SvPOK_on(sv);
3413 *d = '\0';
3414 /* See Note on sizing above. */
3415 sv_utf8_upgrade_flags_grow(sv,
3416 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3417 len + (STRLEN)(send - s) + 1);
3418 d = SvPVX(sv) + SvCUR(sv);
3419 has_utf8 = TRUE;
3420 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3421
3422 /* See Note on sizing above. (NOTE: SvCUR() is not
3423 * set correctly here). */
3424 const STRLEN off = d - SvPVX_const(sv);
3425 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3426 }
3427 Copy(str, d, len, char);
3428 d += len;
423cee85 3429 }
423cee85 3430 SvREFCNT_dec(res);
cb233ae3
KW
3431
3432 /* Deprecate non-approved name syntax */
3433 if (ckWARN_d(WARN_DEPRECATED)) {
3434 bool problematic = FALSE;
3435 char* i = s;
3436
3437 /* For non-ut8 input, look to see that the first
3438 * character is an alpha, then loop through the rest
3439 * checking that each is a continuation */
3440 if (! this_utf8) {
3441 if (! isALPHAU(*i)) problematic = TRUE;
3442 else for (i = s + 1; i < e; i++) {
3443 if (isCHARNAME_CONT(*i)) continue;
3444 problematic = TRUE;
3445 break;
3446 }
3447 }
3448 else {
3449 /* Similarly for utf8. For invariants can check
3450 * directly. We accept anything above the latin1
3451 * range because it is immaterial to Perl if it is
3452 * correct or not, and is expensive to check. But
3453 * it is fairly easy in the latin1 range to convert
3454 * the variants into a single character and check
3455 * those */
3456 if (UTF8_IS_INVARIANT(*i)) {
3457 if (! isALPHAU(*i)) problematic = TRUE;
3458 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3459 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3460 *(i+1)))))
3461 {
3462 problematic = TRUE;
3463 }
3464 }
3465 if (! problematic) for (i = s + UTF8SKIP(s);
3466 i < e;
3467 i+= UTF8SKIP(i))
3468 {
3469 if (UTF8_IS_INVARIANT(*i)) {
3470 if (isCHARNAME_CONT(*i)) continue;
3471 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3472 continue;
3473 } else if (isCHARNAME_CONT(
3474 UNI_TO_NATIVE(
81c14aa2 3475 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3476 {
3477 continue;
3478 }
3479 problematic = TRUE;
3480 break;
3481 }
3482 }
3483 if (problematic) {
6e1bad6c
KW
3484 /* The e-i passed to the final %.*s makes sure that
3485 * should the trailing NUL be missing that this
3486 * print won't run off the end of the string */
cb233ae3 3487 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3488 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3489 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3490 }
3491 }
1104029a 3492 }
cb233ae3 3493 } /* End \N{NAME} */
ff3f963a
KW
3494#ifdef EBCDIC
3495 if (!dorange)
3496 native_range = FALSE; /* \N{} is defined to be Unicode */
3497#endif
3498 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3499 continue;
3500
02aa26ce 3501 /* \c is a control character */
79072805
LW
3502 case 'c':
3503 s++;
961ce445 3504 if (s < send) {
17a3df4c 3505 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3506 }
961ce445
RGS
3507 else {
3508 yyerror("Missing control char name in \\c");
3509 }
79072805 3510 continue;
02aa26ce
NT
3511
3512 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3513 case 'b':
db42d148 3514 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3515 break;
3516 case 'n':
db42d148 3517 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3518 break;
3519 case 'r':
db42d148 3520 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3521 break;
3522 case 'f':
db42d148 3523 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3524 break;
3525 case 't':
db42d148 3526 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3527 break;
34a3fe2a 3528 case 'e':
db42d148 3529 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3530 break;
3531 case 'a':
db42d148 3532 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3533 break;
02aa26ce
NT
3534 } /* end switch */
3535
79072805
LW
3536 s++;
3537 continue;
02aa26ce 3538 } /* end if (backslash) */
4c3a8340
TS
3539#ifdef EBCDIC
3540 else
3541 literal_endpoint++;
3542#endif
02aa26ce 3543
f9a63242 3544 default_action:
77a135fe
KW
3545 /* If we started with encoded form, or already know we want it,
3546 then encode the next character */
3547 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3548 STRLEN len = 1;
77a135fe
KW
3549
3550
3551 /* One might think that it is wasted effort in the case of the
3552 * source being utf8 (this_utf8 == TRUE) to take the next character
3553 * in the source, convert it to an unsigned value, and then convert
3554 * it back again. But the source has not been validated here. The
3555 * routine that does the conversion checks for errors like
3556 * malformed utf8 */
3557
5f66b61c
AL
3558 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3559 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3560 if (!has_utf8) {
3561 SvCUR_set(sv, d - SvPVX_const(sv));
3562 SvPOK_on(sv);
3563 *d = '\0';
77a135fe 3564 /* See Note on sizing above. */
7bf79863
KW
3565 sv_utf8_upgrade_flags_grow(sv,
3566 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3567 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3568 d = SvPVX(sv) + SvCUR(sv);
3569 has_utf8 = TRUE;
3570 } else if (need > len) {
3571 /* encoded value larger than old, may need extra space (NOTE:
3572 * SvCUR() is not set correctly here). See Note on sizing
3573 * above. */
9d4ba2ae 3574 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3575 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3576 }
77a135fe
KW
3577 s += len;
3578
5f66b61c 3579 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3580#ifdef EBCDIC
3581 if (uv > 255 && !dorange)
3582 native_range = FALSE;
3583#endif
2b9d42f0
NIS
3584 }
3585 else {
3586 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3587 }
02aa26ce
NT
3588 } /* while loop to process each character */
3589
3590 /* terminate the string and set up the sv */
79072805 3591 *d = '\0';
95a20fc0 3592 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3593 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3594 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3595 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3596
79072805 3597 SvPOK_on(sv);
9f4817db 3598 if (PL_encoding && !has_utf8) {
d0063567
DK
3599 sv_recode_to_utf8(sv, PL_encoding);
3600 if (SvUTF8(sv))
3601 has_utf8 = TRUE;
9f4817db 3602 }
2b9d42f0 3603 if (has_utf8) {
7e2040f0 3604 SvUTF8_on(sv);
2b9d42f0 3605 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3606 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3607 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3608 }
3609 }
79072805 3610
02aa26ce 3611 /* shrink the sv if we allocated more than we used */
79072805 3612 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3613 SvPV_shrink_to_cur(sv);
79072805 3614 }
02aa26ce 3615
6154021b 3616 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3617 if (s > PL_bufptr) {
eb0d8d16
NC
3618 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3619 const char *const key = PL_lex_inpat ? "qr" : "q";
3620 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3621 const char *type;
3622 STRLEN typelen;
3623
3624 if (PL_lex_inwhat == OP_TRANS) {
3625 type = "tr";
3626 typelen = 2;
3627 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3628 type = "s";
3629 typelen = 1;
9da1dd8f
DM
3630 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3631 type = "q";
3632 typelen = 1;
eb0d8d16
NC
3633 } else {
3634 type = "qq";
3635 typelen = 2;
3636 }
3637
3638 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3639 type, typelen);
3640 }
6154021b 3641 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3642 } else
8990e307 3643 SvREFCNT_dec(sv);
79072805
LW
3644 return s;
3645}
3646
ffb4593c
NT
3647/* S_intuit_more
3648 * Returns TRUE if there's more to the expression (e.g., a subscript),
3649 * FALSE otherwise.
ffb4593c
NT
3650 *
3651 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3652 *
3653 * ->[ and ->{ return TRUE
3654 * { and [ outside a pattern are always subscripts, so return TRUE
3655 * if we're outside a pattern and it's not { or [, then return FALSE
3656 * if we're in a pattern and the first char is a {
3657 * {4,5} (any digits around the comma) returns FALSE
3658 * if we're in a pattern and the first char is a [
3659 * [] returns FALSE
3660 * [SOMETHING] has a funky algorithm to decide whether it's a
3661 * character class or not. It has to deal with things like
3662 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3663 * anything else returns TRUE
3664 */
3665
9cbb5ea2
GS
3666/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3667
76e3520e 3668STATIC int
cea2e8a9 3669S_intuit_more(pTHX_ register char *s)
79072805 3670{
97aff369 3671 dVAR;
7918f24d
NC
3672
3673 PERL_ARGS_ASSERT_INTUIT_MORE;
3674
3280af22 3675 if (PL_lex_brackets)
79072805
LW
3676 return TRUE;
3677 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3678 return TRUE;
3679 if (*s != '{' && *s != '[')
3680 return FALSE;
3280af22 3681 if (!PL_lex_inpat)
79072805
LW
3682 return TRUE;
3683
3684 /* In a pattern, so maybe we have {n,m}. */
3685 if (*s == '{') {
b3155d95 3686 if (regcurly(s)) {
79072805 3687 return FALSE;
b3155d95 3688 }
79072805 3689 return TRUE;
79072805
LW
3690 }
3691
3692 /* On the other hand, maybe we have a character class */
3693
3694 s++;
3695 if (*s == ']' || *s == '^')
3696 return FALSE;
3697 else {
ffb4593c 3698 /* this is terrifying, and it works */
79072805
LW
3699 int weight = 2; /* let's weigh the evidence */
3700 char seen[256];
f27ffc4a 3701 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3702 const char * const send = strchr(s,']');
3280af22 3703 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3704
3705 if (!send) /* has to be an expression */
3706 return TRUE;
3707
3708 Zero(seen,256,char);
3709 if (*s == '$')
3710 weight -= 3;
3711 else if (isDIGIT(*s)) {
3712 if (s[1] != ']') {
3713 if (isDIGIT(s[1]) && s[2] == ']')
3714 weight -= 10;
3715 }
3716 else
3717 weight -= 100;
3718 }
3719 for (; s < send; s++) {
3720 last_un_char = un_char;
3721 un_char = (unsigned char)*s;
3722 switch (*s) {
3723 case '@':
3724 case '&':
3725 case '$':
3726 weight -= seen[un_char] * 10;
7e2040f0 3727 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3728 int len;
8903cb82 3729 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3730 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3731 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3732 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3733 weight -= 100;
3734 else
3735 weight -= 10;
3736 }
3737 else if (*s == '$' && s[1] &&
93a17b20
LW
3738 strchr("[#!%*<>()-=",s[1])) {
3739 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3740 weight -= 10;
3741 else
3742 weight -= 1;