This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Distinguish C- and perly- literals - PERLY_PAREN_OPEN
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
78342678 28=for apidoc AmnU|yy_parser *|PL_parser
f0e67a1d
Z
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f1bcae08 41#include "invlist_inline.h"
378cc40b 42
164e423c
KW
43#define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
eb0d8d16 45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
60267e1d 97static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
8903cb82 98
9ded7720 99# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 100
a7aaec61
Z
101#define XENUMMASK 0x3f
102#define XFAKEEOF 0x40
103#define XFAKEBRACK 0x80
9059aa12 104
39e02b42 105#ifdef USE_UTF8_SCRIPTS
b3041197 106# define UTF cBOOL(!IN_BYTES)
2b9d42f0 107#else
b3041197 108# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 109#endif
a0ed51b3 110
b1fc3636
CJ
111/* The maximum number of characters preceding the unrecognized one to display */
112#define UNRECOGNIZED_PRECEDE_COUNT 10
113
61f0cdd9 114/* In variables named $^X, these are the legal values for X.
2b92dfce 115 * 1999-02-27 mjd-perl-patch@plover.com */
4aada8b9 116#define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
2b92dfce 117
14bd96d0 118#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 119
9ff909cf
JH
120#define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
124
ffb4593c
NT
125/* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
79072805 127 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
79072805
LW
131 */
132
fb73857a
PP
133/* #define LEX_NOTPARSING 11 is done in perl.h. */
134
b6007c36
DM
135#define LEX_NORMAL 10 /* normal code (ie not within "...") */
136#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139#define LEX_INTERPSTART 6 /* expecting the start of a $var */
140
141 /* at end of code, eg "$x" followed by: */
142#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144
145#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147#define LEX_INTERPCONST 2 /* NOT USED */
148#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 149
56d9fe2f
TC
150/* returned to yyl_try() to request it to retry the parse loop, expected to only
151 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
152 can also return it.
153
154 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
155 other token values are 258 or higher (see perly.h), so -1 should be
156 a safe value here.
157*/
158#define YYL_RETRY (-1)
79072805 159
bbf60fe6 160#ifdef DEBUGGING
27da23d5 161static const char* const lex_state_names[] = {
bbf60fe6
DM
162 "KNOWNEXT",
163 "FORMLINE",
164 "INTERPCONST",
165 "INTERPCONCAT",
166 "INTERPENDMAYBE",
167 "INTERPEND",
168 "INTERPSTART",
169 "INTERPPUSH",
170 "INTERPCASEMOD",
171 "INTERPNORMAL",
172 "NORMAL"
173};
174#endif
175
79072805 176#include "keywords.h"
fe14fcc3 177
ffb4593c
NT
178/* CLINE is a macro that ensures PL_copline has a sane value */
179
57843af0 180#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 181
ffb4593c
NT
182/*
183 * Convenience functions to return different tokens and prime the
9cbb5ea2 184 * lexer for the next token. They all take an argument.
ffb4593c
NT
185 *
186 * TOKEN : generic token (used for '(', DOLSHARP, etc)
187 * OPERATOR : generic operator
188 * AOPERATOR : assignment operator
189 * PREBLOCK : beginning the block after an if, while, foreach, ...
190 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
191 * PREREF : *EXPR where EXPR is not a simple identifier
192 * TERM : expression term
89f35911 193 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
194 * LOOPX : loop exiting command (goto, last, dump, etc)
195 * FTST : file test operator
196 * FUN0 : zero-argument function
7eb971ee 197 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 198 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
199 * BOop : bitwise or or xor
200 * BAop : bitwise and
8823cb89 201 * BCop : bitwise complement
ffb4593c
NT
202 * SHop : shift operator
203 * PWop : power operator
9cbb5ea2 204 * PMop : pattern-matching operator
ffb4593c 205 * Aop : addition-level operator
e4916dd1 206 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c 207 * Mop : multiplication-level operator
02b85d3d
Z
208 * ChEop : chaining equality-testing operator
209 * NCEop : non-chaining comparison operator at equality precedence
210 * ChRop : chaining relational operator <= != gt
211 * NCRop : non-chaining relational operator isa
ffb4593c
NT
212 *
213 * Also see LOP and lop() below.
214 */
215
998054bd 216#ifdef DEBUGGING /* Serve -DT. */
704d4215 217# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 218#else
bbf60fe6 219# define REPORT(retval) (retval)
998054bd
SC
220#endif
221
bbf60fe6
DM
222#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
223#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 224#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
225#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
226#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
227#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
228#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 229#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 230#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 231 pl_yylval.ival=f, \
a49203fd 232 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 233 REPORT((int)LOOPEX))
6154021b
RGS
234#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
235#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 236#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 237#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
238#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
239#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89 240#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
3d92c6b8 241 REPORT(PERLY_TILDE)
b1764551
FC
242#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
243#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 244#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 245#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 246#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 247#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
02b85d3d
Z
248#define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
249#define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
250#define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
251#define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
2f3197b3 252
a687059c
LW
253/* This bit of chicanery makes a unary function followed by
254 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
255 * The UNIDOR macro is for unary functions that can be followed by the //
256 * operator (such as C<shift // 0>).
a687059c 257 */
d68ce4ac 258#define UNI3(f,x,have_x) { \
6154021b 259 pl_yylval.ival = f; \
d68ce4ac 260 if (have_x) PL_expect = x; \
376fcdbf
AL
261 PL_bufptr = s; \
262 PL_last_uni = PL_oldbufptr; \
0af40c75 263 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
264 if (*s == '(') \
265 return REPORT( (int)FUNC1 ); \
294a536f 266 s = skipspace(s); \
376fcdbf
AL
267 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
268 }
d68ce4ac
FC
269#define UNI(f) UNI3(f,XTERM,1)
270#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
271#define UNIPROTO(f,optional) { \
272 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
273 OPERATOR(f); \
274 }
a687059c 275
d68ce4ac 276#define UNIBRACK(f) UNI3(f,0,0)
79072805 277
9f68db38 278/* grandfather return to old style */
78cdf107
Z
279#define OLDLOP(f) \
280 do { \
281 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
282 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
283 pl_yylval.ival = (f); \
284 PL_expect = XTERM; \
285 PL_bufptr = s; \
286 return (int)LSTOP; \
287 } while(0)
79072805 288
83944c01
FC
289#define COPLINE_INC_WITH_HERELINES \
290 STMT_START { \
291 CopLINE_inc(PL_curcop); \
851b527a
FC
292 if (PL_parser->herelines) \
293 CopLINE(PL_curcop) += PL_parser->herelines, \
294 PL_parser->herelines = 0; \
83944c01 295 } STMT_END
ffdb8b16
FC
296/* Called after scan_str to update CopLINE(PL_curcop), but only when there
297 * is no sublex_push to follow. */
298#define COPLINE_SET_FROM_MULTI_END \
299 STMT_START { \
300 CopLINE_set(PL_curcop, PL_multi_end); \
301 if (PL_multi_end != PL_multi_start) \
851b527a 302 PL_parser->herelines = 0; \
ffdb8b16 303 } STMT_END
83944c01
FC
304
305
539c839f
AC
306/* A file-local structure for passing around information about subroutines and
307 * related definable words */
308struct code {
309 SV *sv;
310 CV *cv;
311 GV *gv, **gvp;
312 OP *rv2cv_op;
313 PADOFFSET off;
314 bool lex;
315};
316
317static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
318
8fa7f367
JH
319#ifdef DEBUGGING
320
6154021b 321/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
322enum token_type {
323 TOKENTYPE_NONE,
324 TOKENTYPE_IVAL,
6154021b 325 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 326 TOKENTYPE_PVAL,
aeaef349 327 TOKENTYPE_OPVAL
bbf60fe6
DM
328};
329
c588171e
BZ
330#define DEBUG_TOKEN(Type, Name) \
331 { Name, TOKENTYPE_##Type, #Name }
332
6d4a66ac
NC
333static struct debug_tokens {
334 const int token;
335 enum token_type type;
336 const char *name;
337} const debug_tokens[] =
9041c2e3 338{
bbf60fe6
DM
339 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
340 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
341 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
342 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 343 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
344 { ARROW, TOKENTYPE_NONE, "ARROW" },
345 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
346 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
347 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
02b85d3d
Z
348 { CHEQOP, TOKENTYPE_OPNUM, "CHEQOP" },
349 { CHRELOP, TOKENTYPE_OPNUM, "CHRELOP" },
bbf60fe6
DM
350 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
351 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 352 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
353 { DO, TOKENTYPE_NONE, "DO" },
354 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
355 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
356 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
357 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
358 { ELSE, TOKENTYPE_NONE, "ELSE" },
359 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
bbf60fe6
DM
360 { FOR, TOKENTYPE_IVAL, "FOR" },
361 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
362 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
363 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
364 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
365 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 366 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
367 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
368 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
369 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 370 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
371 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
372 { IF, TOKENTYPE_IVAL, "IF" },
01719201 373 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
374 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
375 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
376 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
377 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
378 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
379 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
380 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
381 { MY, TOKENTYPE_IVAL, "MY" },
02b85d3d
Z
382 { NCEQOP, TOKENTYPE_OPNUM, "NCEQOP" },
383 { NCRELOP, TOKENTYPE_OPNUM, "NCRELOP" },
bbf60fe6
DM
384 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
385 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
386 { OROP, TOKENTYPE_IVAL, "OROP" },
387 { OROR, TOKENTYPE_NONE, "OROR" },
388 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
25a50500 389 DEBUG_TOKEN (IVAL, PERLY_AMPERSAND),
d0a6a9c7 390 DEBUG_TOKEN (IVAL, PERLY_BRACE_CLOSE),
c588171e 391 DEBUG_TOKEN (IVAL, PERLY_BRACE_OPEN),
fceeeb77 392 DEBUG_TOKEN (IVAL, PERLY_BRACKET_CLOSE),
669dd22c 393 DEBUG_TOKEN (IVAL, PERLY_BRACKET_OPEN),
3d9ccdfc 394 DEBUG_TOKEN (IVAL, PERLY_COLON),
581f9a7a 395 DEBUG_TOKEN (IVAL, PERLY_COMMA),
da4bce7d 396 DEBUG_TOKEN (IVAL, PERLY_DOT),
db83e45c 397 DEBUG_TOKEN (IVAL, PERLY_EQUAL_SIGN),
1c2e9449 398 DEBUG_TOKEN (IVAL, PERLY_EXCLAMATION_MARK),
68a66a8b 399 DEBUG_TOKEN (IVAL, PERLY_MINUS),
ee67f254 400 DEBUG_TOKEN (IVAL, PERLY_PAREN_OPEN),
0ba95c59 401 DEBUG_TOKEN (IVAL, PERLY_PERCENT_SIGN),
5776f3e5 402 DEBUG_TOKEN (IVAL, PERLY_PLUS),
a1ad62bf 403 DEBUG_TOKEN (IVAL, PERLY_QUESTION_MARK),
5adeeefb 404 DEBUG_TOKEN (IVAL, PERLY_SEMICOLON),
9086c946 405 DEBUG_TOKEN (IVAL, PERLY_SNAIL),
3d92c6b8 406 DEBUG_TOKEN (IVAL, PERLY_TILDE),
88e1f1a2
JV
407 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
408 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 409 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 410 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
411 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
412 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
413 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
414 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
415 { PREINC, TOKENTYPE_NONE, "PREINC" },
416 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 417 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6 418 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
f3f204dc 419 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 420 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 421 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6 422 { SUB, TOKENTYPE_NONE, "SUB" },
69afcc21
TC
423 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
424 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
bbf60fe6
DM
425 { THING, TOKENTYPE_OPVAL, "THING" },
426 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
427 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
428 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
429 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
430 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
431 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 432 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 433 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 434 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 435 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 436 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
437};
438
c588171e
BZ
439#undef DEBUG_TOKEN
440
6154021b 441/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 442
bbf60fe6 443STATIC int
704d4215 444S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 445{
7918f24d
NC
446 PERL_ARGS_ASSERT_TOKEREPORT;
447
bbf60fe6 448 if (DEBUG_T_TEST) {
bd61b366 449 const char *name = NULL;
bbf60fe6 450 enum token_type type = TOKENTYPE_NONE;
f54cb97a 451 const struct debug_tokens *p;
396482e1 452 SV* const report = newSVpvs("<== ");
bbf60fe6 453
f54cb97a 454 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
455 if (p->token == (int)rv) {
456 name = p->name;
457 type = p->type;
458 break;
459 }
460 }
461 if (name)
54667de8 462 Perl_sv_catpv(aTHX_ report, name);
239f83d5 463 else if (isGRAPH(rv))
4ebc7986 464 {
bbf60fe6 465 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
466 if ((char)rv == 'p')
467 sv_catpvs(report, " (pending identifier)");
468 }
bbf60fe6 469 else if (!rv)
396482e1 470 sv_catpvs(report, "EOF");
bbf60fe6 471 else
147e3846 472 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
473 switch (type) {
474 case TOKENTYPE_NONE:
bbf60fe6
DM
475 break;
476 case TOKENTYPE_IVAL:
147e3846 477 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
478 break;
479 case TOKENTYPE_OPNUM:
480 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 481 PL_op_name[lvalp->ival]);
bbf60fe6
DM
482 break;
483 case TOKENTYPE_PVAL:
704d4215 484 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
485 break;
486 case TOKENTYPE_OPVAL:
704d4215 487 if (lvalp->opval) {
401441c0 488 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
489 PL_op_name[lvalp->opval->op_type]);
490 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 491 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 492 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
493 }
494
495 }
401441c0 496 else
396482e1 497 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
498 break;
499 }
b6007c36 500 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
501 };
502 return (int)rv;
998054bd
SC
503}
504
b6007c36
DM
505
506/* print the buffer with suitable escapes */
507
508STATIC void
15f169a1 509S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 510{
396482e1 511 SV* const tmp = newSVpvs("");
7918f24d
NC
512
513 PERL_ARGS_ASSERT_PRINTBUF;
514
7347ee54 515 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 516 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 517 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
518 SvREFCNT_dec(tmp);
519}
520
8fa7f367
JH
521#endif
522
ffb4593c
NT
523/*
524 * S_ao
525 *
f393a21a
FC
526 * This subroutine looks for an '=' next to the operator that has just been
527 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
528 */
529
76e3520e 530STATIC int
cea2e8a9 531S_ao(pTHX_ int toketype)
a0d0e21e 532{
3280af22
NIS
533 if (*PL_bufptr == '=') {
534 PL_bufptr++;
a0d0e21e 535 if (toketype == ANDAND)
6154021b 536 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 537 else if (toketype == OROR)
6154021b 538 pl_yylval.ival = OP_ORASSIGN;
c963b151 539 else if (toketype == DORDOR)
6154021b 540 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
541 toketype = ASSIGNOP;
542 }
b1764551 543 return REPORT(toketype);
a0d0e21e
LW
544}
545
ffb4593c
NT
546/*
547 * S_no_op
548 * When Perl expects an operator and finds something else, no_op
549 * prints the warning. It always prints "<something> found where
550 * operator expected. It prints "Missing semicolon on previous line?"
551 * if the surprise occurs at the start of the line. "do you need to
552 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
553 * where the compiler doesn't know if foo is a method call or a function.
554 * It prints "Missing operator before end of line" if there's nothing
555 * after the missing operator, or "... before <...>" if there is something
556 * after the missing operator.
488bc579
FC
557 *
558 * PL_bufptr is expected to point to the start of the thing that was found,
559 * and s after the next token or partial token.
ffb4593c
NT
560 */
561
76e3520e 562STATIC void
15f169a1 563S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 564{
9d4ba2ae
AL
565 char * const oldbp = PL_bufptr;
566 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 567
7918f24d
NC
568 PERL_ARGS_ASSERT_NO_OP;
569
1189a94a
GS
570 if (!s)
571 s = oldbp;
07c798fb 572 else
1189a94a 573 PL_bufptr = s;
734ab321 574 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
575 if (ckWARN_d(WARN_SYNTAX)) {
576 if (is_first)
577 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
578 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
579 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
580 PL_bufend,
581 UTF))
582 {
f54cb97a 583 const char *t;
fac0f7a3
KW
584 for (t = PL_oldoldbufptr;
585 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
586 t += UTF ? UTF8SKIP(t) : 1)
587 {
c35e046a 588 NOOP;
fac0f7a3 589 }
56da5a46
RGS
590 if (t < PL_bufptr && isSPACE(*t))
591 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 592 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 593 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
594 }
595 else {
596 assert(s >= oldbp);
597 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 598 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 599 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 600 }
07c798fb 601 }
3280af22 602 PL_bufptr = oldbp;
8990e307
LW
603}
604
ffb4593c
NT
605/*
606 * S_missingterm
607 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 608 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
609 * If we're in a delimited string and the delimiter is a control
610 * character, it's reformatted into a two-char sequence like ^C.
611 * This is fatal.
612 */
613
76e3520e 614STATIC void
1b8d3e0e 615S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 616{
e487ff5e 617 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 618 char q;
cb650135
FC
619 bool uni = FALSE;
620 SV *sv;
8990e307 621 if (s) {
f88b23d7 622 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
623 if (nl) {
624 *nl = '\0';
625 len = nl - s;
626 }
7f7f7d08 627 uni = UTF;
8990e307 628 }
cb650135 629 else if (PL_multi_close < 32) {
8990e307 630 *tmpbuf = '^';
585ec06d 631 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
632 tmpbuf[2] = '\0';
633 s = tmpbuf;
1b8d3e0e 634 len = 2;
8990e307
LW
635 }
636 else {
cb650135
FC
637 if (LIKELY(PL_multi_close < 256)) {
638 *tmpbuf = (char)PL_multi_close;
639 tmpbuf[1] = '\0';
1b8d3e0e 640 len = 1;
cb650135
FC
641 }
642 else {
1b8d3e0e
LM
643 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
644 *end = '\0';
645 len = end - tmpbuf;
cb650135 646 uni = TRUE;
cb650135 647 }
8990e307
LW
648 s = tmpbuf;
649 }
9f8d27b7 650 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 651 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
652 if (uni)
653 SvUTF8_on(sv);
1b8d3e0e
LM
654 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
655 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 656}
79072805 657
dd0ac2b9
FC
658#include "feature.h"
659
0d863452 660/*
9cbb5ea2
GS
661 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
662 * utf16-to-utf8-reversed.
ffb4593c
NT
663 */
664
c39cd008
GS
665#ifdef PERL_CR_FILTER
666static void
667strip_return(SV *sv)
668{
eb578fdb
KW
669 const char *s = SvPVX_const(sv);
670 const char * const e = s + SvCUR(sv);
7918f24d
NC
671
672 PERL_ARGS_ASSERT_STRIP_RETURN;
673
c39cd008
GS
674 /* outer loop optimized to do nothing if there are no CR-LFs */
675 while (s < e) {
676 if (*s++ == '\r' && *s == '\n') {
677 /* hit a CR-LF, need to copy the rest */
eb578fdb 678 char *d = s - 1;
c39cd008
GS
679 *d++ = *s++;
680 while (s < e) {
681 if (*s == '\r' && s[1] == '\n')
682 s++;
683 *d++ = *s++;
684 }
685 SvCUR(sv) -= s - d;
686 return;
687 }
688 }
689}
a868473f 690
76e3520e 691STATIC I32
c39cd008 692S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 693{
f54cb97a 694 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
695 if (count > 0 && !maxlen)
696 strip_return(sv);
697 return count;
a868473f
NIS
698}
699#endif
700
ffb4593c 701/*
44170c9a 702=for apidoc lex_start
8eaa0acf
Z
703
704Creates and initialises a new lexer/parser state object, supplying
705a context in which to lex and parse from a new source of Perl code.
706A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 707is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
708will be destroyed and the former value of L</PL_parser> will be restored.
709Nothing else need be done to clean up the parsing context.
710
2d7f6611 711The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 712non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
713A copy of the string is made, so subsequent modification of C<line>
714does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 715from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
716code in C<line> comes first and must consist of complete lines of input,
717and C<rsfp> supplies the remainder of the source.
8eaa0acf 718
2d7f6611 719The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 720used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
721
722=cut
723*/
ffb4593c 724
27fcb6ee 725/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
726 can share filters with the current parser.
727 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
728 caller, hence isn't owned by the parser, so shouldn't be closed on parser
729 destruction. This is used to handle the case of defaulting to reading the
730 script from the standard input because no filename was given on the command
731 line (without getting confused by situation where STDIN has been closed, so
732 the script handle is opened on fd 0) */
27fcb6ee 733
a0d0e21e 734void
8eaa0acf 735Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 736{
6ef55633 737 const char *s = NULL;
5486870f 738 yy_parser *parser, *oparser;
90b58c70 739
60d63348 740 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 741 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
742
743 /* create and initialise a parser */
744
199e78b7 745 Newxz(parser, 1, yy_parser);
5486870f 746 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
747 PL_parser = parser;
748
28ac2b49 749 parser->stack = NULL;
df13534a 750 parser->stack_max1 = NULL;
28ac2b49 751 parser->ps = NULL;
acdf0a21 752
e3abe207
DM
753 /* on scope exit, free this parser and restore any outer one */
754 SAVEPARSER(parser);
7c4baf47 755 parser->saved_curcop = PL_curcop;
e3abe207 756
acdf0a21 757 /* initialise lexer state */
8990e307 758
fb205e7a 759 parser->nexttoke = 0;
ca4cfd28 760 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 761 parser->copline = parser->preambling = NOLINE;
5afb0a62 762 parser->lex_state = LEX_NORMAL;
c2598295 763 parser->expect = XSTATE;
2f9285f8 764 parser->rsfp = rsfp;
b7b52646 765 parser->recheck_utf8_validity = TRUE;
27fcb6ee
FC
766 parser->rsfp_filters =
767 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
768 ? NULL
769 : MUTABLE_AV(SvREFCNT_inc(
770 oparser->rsfp_filters
771 ? oparser->rsfp_filters
772 : (oparser->rsfp_filters = newAV())
773 ));
2f9285f8 774
199e78b7
DM
775 Newx(parser->lex_brackstack, 120, char);
776 Newx(parser->lex_casestack, 12, char);
777 *parser->lex_casestack = '\0';
d794b522 778 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 779
10efb74f 780 if (line) {
0528fd32 781 STRLEN len;
a6909bd2
KW
782 const U8* first_bad_char_loc;
783
10efb74f 784 s = SvPV_const(line, len);
90b58c70 785
0aab20f2
KW
786 if ( SvUTF8(line)
787 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
788 SvCUR(line),
789 &first_bad_char_loc)))
90b58c70
KW
790 {
791 _force_out_malformed_utf8_message(first_bad_char_loc,
792 (U8 *) s + SvCUR(line),
793 0,
794 1 /* 1 means die */ );
795 NOT_REACHED; /* NOTREACHED */
796 }
797
0abcdfa4
FC
798 parser->linestr = flags & LEX_START_COPIED
799 ? SvREFCNT_inc_simple_NN(line)
800 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
801 if (!rsfp)
802 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 803 } else {
bf1b738b 804 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 805 }
218304f9 806
f06b5848
DM
807 parser->oldoldbufptr =
808 parser->oldbufptr =
809 parser->bufptr =
810 parser->linestart = SvPVX(parser->linestr);
811 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
812 parser->last_lop = parser->last_uni = NULL;
b54f893d 813
6d59e610 814 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
815 |LEX_DONT_CLOSE_RSFP));
816 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
817 |LEX_DONT_CLOSE_RSFP));
737c24fc 818
60d63348 819 parser->in_pod = parser->filtered = 0;
79072805 820}
a687059c 821
e3abe207
DM
822
823/* delete a parser object */
824
825void
826Perl_parser_free(pTHX_ const yy_parser *parser)
827{
7918f24d
NC
828 PERL_ARGS_ASSERT_PARSER_FREE;
829
7c4baf47 830 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
831 SvREFCNT_dec(parser->linestr);
832
87606032 833 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 834 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
835 else if (parser->rsfp && (!parser->old_parser
836 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 837 PerlIO_close(parser->rsfp);
5486870f 838 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 839 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 840 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
841
842 Safefree(parser->lex_brackstack);
843 Safefree(parser->lex_casestack);
844 Safefree(parser->lex_shared);
845 PL_parser = parser->old_parser;
846 Safefree(parser);
847}
848
849void
850Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
851{
3ac7ff8f 852 I32 nexttoke = parser->nexttoke;
3ac7ff8f 853 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 854 while (nexttoke--) {
3ac7ff8f
FC
855 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
856 && parser->nextval[nexttoke].opval
857 && parser->nextval[nexttoke].opval->op_slabbed
858 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 859 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
860 parser->nextval[nexttoke].opval = NULL;
861 }
3ce3dcd9 862 }
e3abe207
DM
863}
864
865
ffb4593c 866/*
78342678 867=for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
f0e67a1d
Z
868
869Buffer scalar containing the chunk currently under consideration of the
870text currently being lexed. This is always a plain string scalar (for
871which C<SvPOK> is true). It is not intended to be used as a scalar by
872normal scalar means; instead refer to the buffer directly by the pointer
873variables described below.
874
875The lexer maintains various C<char*> pointers to things in the
876C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
877reallocated, all of these pointers must be updated. Don't attempt to
878do this manually, but rather use L</lex_grow_linestr> if you need to
879reallocate the buffer.
880
881The content of the text chunk in the buffer is commonly exactly one
882complete line of input, up to and including a newline terminator,
883but there are situations where it is otherwise. The octets of the
884buffer may be intended to be interpreted as either UTF-8 or Latin-1.
885The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
886flag on this scalar, which may disagree with it.
887
888For direct examination of the buffer, the variable
889L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
890lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
891of these pointers is usually preferable to examination of the scalar
892through normal scalar means.
893
78342678 894=for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
f0e67a1d
Z
895
896Direct pointer to the end of the chunk of text currently being lexed, the
897end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 898+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
899always located at the end of the buffer, and does not count as part of
900the buffer's contents.
901
78342678 902=for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
f0e67a1d
Z
903
904Points to the current position of lexing inside the lexer buffer.
905Characters around this point may be freely examined, within
906the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
907L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
908interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
909
910Lexing code (whether in the Perl core or not) moves this pointer past
911the characters that it consumes. It is also expected to perform some
912bookkeeping whenever a newline character is consumed. This movement
913can be more conveniently performed by the function L</lex_read_to>,
914which handles newlines appropriately.
915
916Interpretation of the buffer's octets can be abstracted out by
917using the slightly higher-level functions L</lex_peek_unichar> and
918L</lex_read_unichar>.
919
78342678 920=for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
f0e67a1d
Z
921
922Points to the start of the current line inside the lexer buffer.
923This is useful for indicating at which column an error occurred, and
924not much else. This must be updated by any lexing code that consumes
925a newline; the function L</lex_read_to> handles this detail.
926
927=cut
928*/
929
930/*
44170c9a 931=for apidoc lex_bufutf8
f0e67a1d
Z
932
933Indicates whether the octets in the lexer buffer
934(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
935of Unicode characters. If not, they should be interpreted as Latin-1
936characters. This is analogous to the C<SvUTF8> flag for scalars.
937
938In UTF-8 mode, it is not guaranteed that the lexer buffer actually
939contains valid UTF-8. Lexing code must be robust in the face of invalid
940encoding.
941
942The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
943is significant, but not the whole story regarding the input character
944encoding. Normally, when a file is being read, the scalar contains octets
945and its C<SvUTF8> flag is off, but the octets should be interpreted as
946UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
947however, the scalar may have the C<SvUTF8> flag on, and in this case its
948octets should be interpreted as UTF-8 unless the C<use bytes> pragma
949is in effect. This logic may change in the future; use this function
950instead of implementing the logic yourself.
951
952=cut
953*/
954
955bool
956Perl_lex_bufutf8(pTHX)
957{
958 return UTF;
959}
960
961/*
44170c9a 962=for apidoc lex_grow_linestr
f0e67a1d
Z
963
964Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 965at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
966pointer to the reallocated buffer. This is necessary before making
967any direct modification of the buffer that would increase its length.
968L</lex_stuff_pvn> provides a more convenient way to insert text into
969the buffer.
970
971Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
972this function updates all of the lexer's variables that point directly
973into the buffer.
974
975=cut
976*/
977
978char *
979Perl_lex_grow_linestr(pTHX_ STRLEN len)
980{
981 SV *linestr;
982 char *buf;
983 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 984 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
985 bool current;
986
f0e67a1d
Z
987 linestr = PL_parser->linestr;
988 buf = SvPVX(linestr);
989 if (len <= SvLEN(linestr))
990 return buf;
98d5e3ef
DM
991
992 /* Is the lex_shared linestr SV the same as the current linestr SV?
993 * Only in this case does re_eval_start need adjusting, since it
994 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
995 current = ( !PL_parser->lex_shared->ls_linestr
996 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 997
f0e67a1d
Z
998 bufend_pos = PL_parser->bufend - buf;
999 bufptr_pos = PL_parser->bufptr - buf;
1000 oldbufptr_pos = PL_parser->oldbufptr - buf;
1001 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1002 linestart_pos = PL_parser->linestart - buf;
1003 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1004 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 1005 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 1006 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 1007
f0e67a1d 1008 buf = sv_grow(linestr, len);
c7641931 1009
f0e67a1d
Z
1010 PL_parser->bufend = buf + bufend_pos;
1011 PL_parser->bufptr = buf + bufptr_pos;
1012 PL_parser->oldbufptr = buf + oldbufptr_pos;
1013 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1014 PL_parser->linestart = buf + linestart_pos;
1015 if (PL_parser->last_uni)
1016 PL_parser->last_uni = buf + last_uni_pos;
1017 if (PL_parser->last_lop)
1018 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 1019 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 1020 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
1021 return buf;
1022}
1023
1024/*
44170c9a 1025=for apidoc lex_stuff_pvn
f0e67a1d
Z
1026
1027Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1028immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1029reallocating the buffer if necessary. This means that lexing code that
1030runs later will see the characters as if they had appeared in the input.
1031It is not recommended to do this as part of normal parsing, and most
1032uses of this facility run the risk of the inserted characters being
1033interpreted in an unintended manner.
1034
2d7f6611
KW
1035The string to be inserted is represented by C<len> octets starting
1036at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1037according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1038The characters are recoded for the lexer buffer, according to how the
1039buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1040to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1041function is more convenient.
1042
5af38e47
KW
1043=for apidoc Amnh||LEX_STUFF_UTF8
1044
f0e67a1d
Z
1045=cut
1046*/
1047
1048void
83aa740e 1049Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d
Z
1050{
1051 char *bufptr;
1052 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1053 if (flags & ~(LEX_STUFF_UTF8))
1054 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1055 if (UTF) {
1056 if (flags & LEX_STUFF_UTF8) {
1057 goto plain_copy;
1058 } else {
01ccd497
KW
1059 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1060 (U8 *) pv + len);
1061 const char *p, *e = pv+len;;
f0e67a1d
Z
1062 if (!highhalf)
1063 goto plain_copy;
1064 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1065 bufptr = PL_parser->bufptr;
1066 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1067 SvCUR_set(PL_parser->linestr,
1068 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1069 PL_parser->bufend += len+highhalf;
1070 for (p = pv; p != e; p++) {
27d0a47c 1071 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1072 }
1073 }
1074 } else {
1075 if (flags & LEX_STUFF_UTF8) {
1076 STRLEN highhalf = 0;
83aa740e 1077 const char *p, *e = pv+len;
f0e67a1d
Z
1078 for (p = pv; p != e; p++) {
1079 U8 c = (U8)*p;
54d004e8 1080 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1081 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1082 "non-Latin-1 character into Latin-1 input");
54d004e8 1083 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1084 p++;
1085 highhalf++;
f88c6466 1086 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1087 }
1088 if (!highhalf)
1089 goto plain_copy;
1090 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1091 bufptr = PL_parser->bufptr;
1092 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1093 SvCUR_set(PL_parser->linestr,
1094 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1095 PL_parser->bufend += len-highhalf;
54d004e8
KW
1096 p = pv;
1097 while (p < e) {
1098 if (UTF8_IS_INVARIANT(*p)) {
1099 *bufptr++ = *p;
1100 p++;
f0e67a1d 1101 }
54d004e8
KW
1102 else {
1103 assert(p < e -1 );
a62b247b 1104 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1105 p += 2;
1106 }
f0e67a1d
Z
1107 }
1108 } else {
54d004e8 1109 plain_copy:
f0e67a1d
Z
1110 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1111 bufptr = PL_parser->bufptr;
1112 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1113 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1114 PL_parser->bufend += len;
1115 Copy(pv, bufptr, len, char);
1116 }
1117 }
1118}
1119
1120/*
44170c9a 1121=for apidoc lex_stuff_pv
9dcc53ea
Z
1122
1123Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1124immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1125reallocating the buffer if necessary. This means that lexing code that
1126runs later will see the characters as if they had appeared in the input.
1127It is not recommended to do this as part of normal parsing, and most
1128uses of this facility run the risk of the inserted characters being
1129interpreted in an unintended manner.
1130
2d7f6611 1131The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1132and continuing to the first nul. These octets are interpreted as either
1133UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1134in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1135to how the buffer is currently being interpreted (L</lex_bufutf8>).
1136If it is not convenient to nul-terminate a string to be inserted, the
1137L</lex_stuff_pvn> function is more appropriate.
1138
1139=cut
1140*/
1141
1142void
1143Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1144{
1145 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1146 lex_stuff_pvn(pv, strlen(pv), flags);
1147}
1148
1149/*
44170c9a 1150=for apidoc lex_stuff_sv
f0e67a1d
Z
1151
1152Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1153immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1154reallocating the buffer if necessary. This means that lexing code that
1155runs later will see the characters as if they had appeared in the input.
1156It is not recommended to do this as part of normal parsing, and most
1157uses of this facility run the risk of the inserted characters being
1158interpreted in an unintended manner.
1159
2d7f6611 1160The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1161are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1162being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1163not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1164need to construct a scalar.
1165
1166=cut
1167*/
1168
1169void
1170Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1171{
1172 char *pv;
1173 STRLEN len;
1174 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1175 if (flags)
1176 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1177 pv = SvPV(sv, len);
1178 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1179}
1180
1181/*
44170c9a 1182=for apidoc lex_unstuff
f0e67a1d
Z
1183
1184Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1185C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1186This hides the discarded text from any lexing code that runs later,
1187as if the text had never appeared.
1188
1189This is not the normal way to consume lexed text. For that, use
1190L</lex_read_to>.
1191
1192=cut
1193*/
1194
1195void
1196Perl_lex_unstuff(pTHX_ char *ptr)
1197{
1198 char *buf, *bufend;
1199 STRLEN unstuff_len;
1200 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1201 buf = PL_parser->bufptr;
1202 if (ptr < buf)
1203 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1204 if (ptr == buf)
1205 return;
1206 bufend = PL_parser->bufend;
1207 if (ptr > bufend)
1208 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1209 unstuff_len = ptr - buf;
1210 Move(ptr, buf, bufend+1-ptr, char);
1211 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1212 PL_parser->bufend = bufend - unstuff_len;
1213}
1214
1215/*
44170c9a 1216=for apidoc lex_read_to
f0e67a1d
Z
1217
1218Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1219to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1220performing the correct bookkeeping whenever a newline character is passed.
1221This is the normal way to consume lexed text.
1222
1223Interpretation of the buffer's octets can be abstracted out by
1224using the slightly higher-level functions L</lex_peek_unichar> and
1225L</lex_read_unichar>.
1226
1227=cut
1228*/
1229
1230void
1231Perl_lex_read_to(pTHX_ char *ptr)
1232{
1233 char *s;
1234 PERL_ARGS_ASSERT_LEX_READ_TO;
1235 s = PL_parser->bufptr;
1236 if (ptr < s || ptr > PL_parser->bufend)
1237 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1238 for (; s != ptr; s++)
1239 if (*s == '\n') {
83944c01 1240 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1241 PL_parser->linestart = s+1;
1242 }
1243 PL_parser->bufptr = ptr;
1244}
1245
1246/*
44170c9a 1247=for apidoc lex_discard_to
f0e67a1d
Z
1248
1249Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1250up to C<ptr>. The remaining content of the buffer will be moved, and
1251all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1252be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1253it is not permitted to discard text that has yet to be lexed.
1254
1255Normally it is not necessarily to do this directly, because it suffices to
1256use the implicit discarding behaviour of L</lex_next_chunk> and things
1257based on it. However, if a token stretches across multiple lines,
1f317c95 1258and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1259that purpose, then after completion of the token it would be wise to
1260explicitly discard the now-unneeded earlier lines, to avoid future
1261multi-line tokens growing the buffer without bound.
1262
1263=cut
1264*/
1265
1266void
1267Perl_lex_discard_to(pTHX_ char *ptr)
1268{
1269 char *buf;
1270 STRLEN discard_len;
1271 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1272 buf = SvPVX(PL_parser->linestr);
1273 if (ptr < buf)
1274 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1275 if (ptr == buf)
1276 return;
1277 if (ptr > PL_parser->bufptr)
1278 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1279 discard_len = ptr - buf;
1280 if (PL_parser->oldbufptr < ptr)
1281 PL_parser->oldbufptr = ptr;
1282 if (PL_parser->oldoldbufptr < ptr)
1283 PL_parser->oldoldbufptr = ptr;
1284 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1285 PL_parser->last_uni = NULL;
1286 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1287 PL_parser->last_lop = NULL;
1288 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1289 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1290 PL_parser->bufend -= discard_len;
1291 PL_parser->bufptr -= discard_len;
1292 PL_parser->oldbufptr -= discard_len;
1293 PL_parser->oldoldbufptr -= discard_len;
1294 if (PL_parser->last_uni)
1295 PL_parser->last_uni -= discard_len;
1296 if (PL_parser->last_lop)
1297 PL_parser->last_lop -= discard_len;
1298}
1299
efa571ab
KW
1300void
1301Perl_notify_parser_that_changed_to_utf8(pTHX)
1302{
1303 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1304 * off to on. At compile time, this has the effect of entering a 'use
1305 * utf8' section. This means that any input was not previously checked for
1306 * UTF-8 (because it was off), but now we do need to check it, or our
1307 * assumptions about the input being sane could be wrong, and we could
1308 * segfault. This routine just sets a flag so that the next time we look
1309 * at the input we do the well-formed UTF-8 check. If we aren't in the
1310 * proper phase, there may not be a parser object, but if there is, setting
1311 * the flag is harmless */
1312
1313 if (PL_parser) {
1314 PL_parser->recheck_utf8_validity = TRUE;
1315 }
1316}
1317
f0e67a1d 1318/*
44170c9a 1319=for apidoc lex_next_chunk
f0e67a1d
Z
1320
1321Reads in the next chunk of text to be lexed, appending it to
1322L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1323looked to the end of the current chunk and wants to know more. It is
1324usual, but not necessary, for lexing to have consumed the entirety of
1325the current chunk at this time.
1326
1327If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1328chunk (i.e., the current chunk has been entirely consumed), normally the
1329current chunk will be discarded at the same time that the new chunk is
c5608a1f 1330read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1331will not be discarded. If the current chunk has not been entirely
1332consumed, then it will not be discarded regardless of the flag.
1333
1334Returns true if some new text was added to the buffer, or false if the
1335buffer has reached the end of the input text.
1336
5af38e47
KW
1337=for apidoc Amnh||LEX_KEEP_PREVIOUS
1338
f0e67a1d
Z
1339=cut
1340*/
1341
1342#define LEX_FAKE_EOF 0x80000000
e47d32dc 1343#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1344
1345bool
1346Perl_lex_next_chunk(pTHX_ U32 flags)
1347{
1348 SV *linestr;
1349 char *buf;
1350 STRLEN old_bufend_pos, new_bufend_pos;
1351 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1352 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1353 bool got_some_for_debugger = 0;
f0e67a1d 1354 bool got_some;
6cdc5cd8 1355
112d1284 1356 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1357 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1358 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1359 return FALSE;
f0e67a1d
Z
1360 linestr = PL_parser->linestr;
1361 buf = SvPVX(linestr);
407f8cf2
KW
1362 if (!(flags & LEX_KEEP_PREVIOUS)
1363 && PL_parser->bufptr == PL_parser->bufend)
1364 {
f0e67a1d
Z
1365 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1366 linestart_pos = 0;
1367 if (PL_parser->last_uni != PL_parser->bufend)
1368 PL_parser->last_uni = NULL;
1369 if (PL_parser->last_lop != PL_parser->bufend)
1370 PL_parser->last_lop = NULL;
1371 last_uni_pos = last_lop_pos = 0;
1372 *buf = 0;
2324bdb9 1373 SvCUR_set(linestr, 0);
f0e67a1d
Z
1374 } else {
1375 old_bufend_pos = PL_parser->bufend - buf;
1376 bufptr_pos = PL_parser->bufptr - buf;
1377 oldbufptr_pos = PL_parser->oldbufptr - buf;
1378 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1379 linestart_pos = PL_parser->linestart - buf;
1380 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1381 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1382 }
1383 if (flags & LEX_FAKE_EOF) {
1384 goto eof;
60d63348 1385 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1386 got_some = 0;
1387 } else if (filter_gets(linestr, old_bufend_pos)) {
1388 got_some = 1;
17cc9359 1389 got_some_for_debugger = 1;
112d1284
FC
1390 } else if (flags & LEX_NO_TERM) {
1391 got_some = 0;
f0e67a1d 1392 } else {
580561a3 1393 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1394 SvPVCLEAR(linestr);
f0e67a1d
Z
1395 eof:
1396 /* End of real input. Close filehandle (unless it was STDIN),
1397 * then add implicit termination.
1398 */
87606032 1399 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1400 PerlIO_clearerr(PL_parser->rsfp);
1401 else if (PL_parser->rsfp)
1402 (void)PerlIO_close(PL_parser->rsfp);
1403 PL_parser->rsfp = NULL;
60d63348 1404 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1405 if (!PL_in_eval && PL_minus_p) {
1406 sv_catpvs(linestr,
1407 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1408 PL_minus_n = PL_minus_p = 0;
1409 } else if (!PL_in_eval && PL_minus_n) {
1410 sv_catpvs(linestr, /*{*/";}");
1411 PL_minus_n = 0;
1412 } else
1413 sv_catpvs(linestr, ";");
1414 got_some = 1;
1415 }
1416 buf = SvPVX(linestr);
1417 new_bufend_pos = SvCUR(linestr);
1418 PL_parser->bufend = buf + new_bufend_pos;
1419 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1420
07337b95
KW
1421 if (UTF) {
1422 const U8* first_bad_char_loc;
1423 if (UNLIKELY(! is_utf8_string_loc(
1424 (U8 *) PL_parser->bufptr,
1425 PL_parser->bufend - PL_parser->bufptr,
1426 &first_bad_char_loc)))
1427 {
1428 _force_out_malformed_utf8_message(first_bad_char_loc,
1429 (U8 *) PL_parser->bufend,
1430 0,
1431 1 /* 1 means die */ );
1432 NOT_REACHED; /* NOTREACHED */
1433 }
6cdc5cd8
KW
1434 }
1435
f0e67a1d
Z
1436 PL_parser->oldbufptr = buf + oldbufptr_pos;
1437 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1438 PL_parser->linestart = buf + linestart_pos;
1439 if (PL_parser->last_uni)
1440 PL_parser->last_uni = buf + last_uni_pos;
1441 if (PL_parser->last_lop)
1442 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1443 if (PL_parser->preambling != NOLINE) {
1444 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1445 PL_parser->preambling = NOLINE;
1446 }
407f8cf2
KW
1447 if ( got_some_for_debugger
1448 && PERLDB_LINE_OR_SAVESRC
1449 && PL_curstash != PL_debstash)
1450 {
f0e67a1d
Z
1451 /* debugger active and we're not compiling the debugger code,
1452 * so store the line into the debugger's array of lines
1453 */
1454 update_debugger_info(NULL, buf+old_bufend_pos,
1455 new_bufend_pos-old_bufend_pos);
1456 }
1457 return got_some;
1458}
1459
1460/*
44170c9a 1461=for apidoc lex_peek_unichar
f0e67a1d
Z
1462
1463Looks ahead one (Unicode) character in the text currently being lexed.
1464Returns the codepoint (unsigned integer value) of the next character,
1465or -1 if lexing has reached the end of the input text. To consume the
1466peeked character, use L</lex_read_unichar>.
1467
1468If the next character is in (or extends into) the next chunk of input
1469text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1470discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1471bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1472
1473If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1474is encountered, an exception is generated.
1475
1476=cut
1477*/
1478
1479I32
1480Perl_lex_peek_unichar(pTHX_ U32 flags)
1481{
1482 char *s, *bufend;
1483 if (flags & ~(LEX_KEEP_PREVIOUS))
1484 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1485 s = PL_parser->bufptr;
1486 bufend = PL_parser->bufend;
1487 if (UTF) {
1488 U8 head;
1489 I32 unichar;
1490 STRLEN len, retlen;
1491 if (s == bufend) {
1492 if (!lex_next_chunk(flags))
1493 return -1;
1494 s = PL_parser->bufptr;
1495 bufend = PL_parser->bufend;
1496 }
1497 head = (U8)*s;
54d004e8 1498 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1499 return head;
54d004e8
KW
1500 if (UTF8_IS_START(head)) {
1501 len = UTF8SKIP(&head);
f0e67a1d
Z
1502 while ((STRLEN)(bufend-s) < len) {
1503 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1504 break;
1505 s = PL_parser->bufptr;
1506 bufend = PL_parser->bufend;
1507 }
1508 }
c80e42f3 1509 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1510 if (retlen == (STRLEN)-1) {
75219bac
KW
1511 _force_out_malformed_utf8_message((U8 *) s,
1512 (U8 *) bufend,
1513 0,
1514 1 /* 1 means die */ );
1515 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1516 }
1517 return unichar;
1518 } else {
1519 if (s == bufend) {
1520 if (!lex_next_chunk(flags))
1521 return -1;
1522 s = PL_parser->bufptr;
1523 }
1524 return (U8)*s;
1525 }
1526}
1527
1528/*
44170c9a 1529=for apidoc lex_read_unichar
f0e67a1d
Z
1530
1531Reads the next (Unicode) character in the text currently being lexed.
1532Returns the codepoint (unsigned integer value) of the character read,
1533and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1534if lexing has reached the end of the input text. To non-destructively
1535examine the next character, use L</lex_peek_unichar> instead.
1536
1537If the next character is in (or extends into) the next chunk of input
1538text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1539discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1540bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1541
1542If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1543is encountered, an exception is generated.
1544
1545=cut
1546*/
1547
1548I32
1549Perl_lex_read_unichar(pTHX_ U32 flags)
1550{
1551 I32 c;
1552 if (flags & ~(LEX_KEEP_PREVIOUS))
1553 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1554 c = lex_peek_unichar(flags);
1555 if (c != -1) {
1556 if (c == '\n')
83944c01 1557 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1558 if (UTF)
1559 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1560 else
1561 ++(PL_parser->bufptr);
f0e67a1d
Z
1562 }
1563 return c;
1564}
1565
1566/*
44170c9a 1567=for apidoc lex_read_space
f0e67a1d
Z
1568
1569Reads optional spaces, in Perl style, in the text currently being
1570lexed. The spaces may include ordinary whitespace characters and
1571Perl-style comments. C<#line> directives are processed if encountered.
1572L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1573at a non-space character (or the end of the input text).
1574
1575If spaces extend into the next chunk of input text, the next chunk will
1576be read in. Normally the current chunk will be discarded at the same
c5608a1f 1577time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1578chunk will not be discarded.
1579
1580=cut
1581*/
1582
21791330 1583#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1584#define LEX_NO_NEXT_CHUNK 0x80000000
1585
f0e67a1d
Z
1586void
1587Perl_lex_read_space(pTHX_ U32 flags)
1588{
1589 char *s, *bufend;
21791330 1590 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1591 bool need_incline = 0;
21791330 1592 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1593 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1594 s = PL_parser->bufptr;
1595 bufend = PL_parser->bufend;
1596 while (1) {
1597 char c = *s;
1598 if (c == '#') {
1599 do {
1600 c = *++s;
1601 } while (!(c == '\n' || (c == 0 && s == bufend)));
1602 } else if (c == '\n') {
1603 s++;
21791330
FC
1604 if (can_incline) {
1605 PL_parser->linestart = s;
1606 if (s == bufend)
1607 need_incline = 1;
1608 else
c6875f94 1609 incline(s, bufend);
21791330 1610 }
f0e67a1d
Z
1611 } else if (isSPACE(c)) {
1612 s++;
1613 } else if (c == 0 && s == bufend) {
1614 bool got_more;
65c68e17 1615 line_t l;
f0998909
Z
1616 if (flags & LEX_NO_NEXT_CHUNK)
1617 break;
f0e67a1d 1618 PL_parser->bufptr = s;
65c68e17 1619 l = CopLINE(PL_curcop);
851b527a 1620 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1621 got_more = lex_next_chunk(flags);
65c68e17 1622 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1623 s = PL_parser->bufptr;
1624 bufend = PL_parser->bufend;
1625 if (!got_more)
1626 break;
21791330 1627 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1628 incline(s, bufend);
f0e67a1d
Z
1629 need_incline = 0;
1630 }
3c47da3c
FC
1631 } else if (!c) {
1632 s++;
f0e67a1d
Z
1633 } else {
1634 break;
1635 }
1636 }
f0e67a1d
Z
1637 PL_parser->bufptr = s;
1638}
1639
1640/*
fe788d6b 1641
44170c9a 1642=for apidoc validate_proto
fe788d6b
PM
1643
1644This function performs syntax checking on a prototype, C<proto>.
1645If C<warn> is true, any illegal characters or mismatched brackets
1646will trigger illegalproto warnings, declaring that they were
1647detected in the prototype for C<name>.
1648
1649The return value is C<true> if this is a valid prototype, and
1650C<false> if it is not, regardless of whether C<warn> was C<true> or
1651C<false>.
1652
1653Note that C<NULL> is a valid C<proto> and will always return C<true>.
1654
1655=cut
1656
1657 */
1658
1659bool
5783dc51 1660Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1661{
1662 STRLEN len, origlen;
11327fa1 1663 char *p;
fe788d6b
PM
1664 bool bad_proto = FALSE;
1665 bool in_brackets = FALSE;
1666 bool after_slash = FALSE;
1667 char greedy_proto = ' ';
1668 bool proto_after_greedy_proto = FALSE;
1669 bool must_be_last = FALSE;
1670 bool underscore = FALSE;
f791a21a 1671 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1672
1673 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1674
1675 if (!proto)
1676 return TRUE;
1677
11327fa1 1678 p = SvPV(proto, len);
fe788d6b
PM
1679 origlen = len;
1680 for (; len--; p++) {
1681 if (!isSPACE(*p)) {
1682 if (must_be_last)
1683 proto_after_greedy_proto = TRUE;
f791a21a 1684 if (underscore) {
4aada8b9 1685 if (!memCHRs(";@%", *p))
f791a21a
PM
1686 bad_proto_after_underscore = TRUE;
1687 underscore = FALSE;
1688 }
4aada8b9 1689 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
fe788d6b
PM
1690 bad_proto = TRUE;
1691 }
1692 else {
fe788d6b
PM
1693 if (*p == '[')
1694 in_brackets = TRUE;
1695 else if (*p == ']')
1696 in_brackets = FALSE;
407f8cf2
KW
1697 else if ((*p == '@' || *p == '%')
1698 && !after_slash
1699 && !in_brackets )
1700 {
fe788d6b
PM
1701 must_be_last = TRUE;
1702 greedy_proto = *p;
1703 }
1704 else if (*p == '_')
f791a21a 1705 underscore = TRUE;
fe788d6b
PM
1706 }
1707 if (*p == '\\')
1708 after_slash = TRUE;
1709 else
1710 after_slash = FALSE;
1711 }
1712 }
1713
1714 if (warn) {
b54d603d 1715 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1716 p -= origlen;
b54d603d
PM
1717 p = SvUTF8(proto)
1718 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1719 origlen, UNI_DISPLAY_ISPRINT)
1720 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1721
cbf83791
FC
1722 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1723 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1724 sv_catpvs(name2, "::");
1725 sv_catsv(name2, (SV *)name);
1726 name = name2;
1727 }
1728
fe788d6b
PM
1729 if (proto_after_greedy_proto)
1730 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1731 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1732 greedy_proto, SVfARG(name), p);
50278ed0
PM
1733 if (in_brackets)
1734 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1735 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1736 SVfARG(name), p);
b54d603d 1737 if (bad_proto)
fe788d6b 1738 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1739 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1740 SVfARG(name), p);
1741 if (bad_proto_after_underscore)
1742 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1743 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1744 SVfARG(name), p);
fe788d6b
PM
1745 }
1746
1747 return (! (proto_after_greedy_proto || bad_proto) );
1748}
1749
1750/*
ffb4593c
NT
1751 * S_incline
1752 * This subroutine has nothing to do with tilting, whether at windmills
1753 * or pinball tables. Its name is short for "increment line". It
57843af0 1754 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1755 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1756 * # line 500 "foo.pm"
1757 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1758 */
1759
76e3520e 1760STATIC void
c6875f94 1761S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1762{
d9095cec
NC
1763 const char *t;
1764 const char *n;
1765 const char *e;
8818d409 1766 line_t line_num;
22ff3130 1767 UV uv;
463ee0b2 1768
7918f24d
NC
1769 PERL_ARGS_ASSERT_INCLINE;
1770
d77eff5d
KW
1771 assert(end >= s);
1772
83944c01 1773 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1774 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1775 && s+1 == PL_bufend && *s == ';') {
1776 /* fake newline in string eval */
1777 CopLINE_dec(PL_curcop);
1778 return;
1779 }
463ee0b2
LW
1780 if (*s++ != '#')
1781 return;
d4c19fe8
AL
1782 while (SPACE_OR_TAB(*s))
1783 s++;
d77eff5d
KW
1784 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1785 s += sizeof("line") - 1;
73659bf1
GS
1786 else
1787 return;
084592ab 1788 if (SPACE_OR_TAB(*s))
73659bf1 1789 s++;
4e553d73 1790 else
73659bf1 1791 return;
d4c19fe8
AL
1792 while (SPACE_OR_TAB(*s))
1793 s++;
463ee0b2
LW
1794 if (!isDIGIT(*s))
1795 return;
d4c19fe8 1796
463ee0b2
LW
1797 n = s;
1798 while (isDIGIT(*s))
1799 s++;
07714eb4 1800 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1801 return;
bf4acbe4 1802 while (SPACE_OR_TAB(*s))
463ee0b2 1803 s++;
85bb8b90 1804 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1805 s++;
73659bf1
GS
1806 e = t + 1;
1807 }
463ee0b2 1808 else {
c35e046a 1809 t = s;
1bb1a3d6 1810 while (*t && !isSPACE(*t))
c35e046a 1811 t++;
73659bf1 1812 e = t;
463ee0b2 1813 }
bf4acbe4 1814 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1815 e++;
1816 if (*e != '\n' && *e != '\0')
1817 return; /* false alarm */
1818
22ff3130
HS
1819 if (!grok_atoUV(n, &uv, &e))
1820 return;
1821 line_num = ((line_t)uv) - 1;
8818d409 1822
f4dd75d9 1823 if (t - s > 0) {
d9095cec 1824 const STRLEN len = t - s;
3df32bda 1825
d36ee5be 1826 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1827 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1828 * to *{"::_<newfilename"} */
44867030
NC
1829 /* However, the long form of evals is only turned on by the
1830 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1831 GV * const cfgv = CopFILEGV(PL_curcop);
1832 if (cfgv) {
38bd7ad8
FC
1833 char smallbuf[128];
1834 STRLEN tmplen2 = len;
44867030 1835 char *tmpbuf2;
449dd039 1836 GV *gv2;
44867030
NC
1837
1838 if (tmplen2 + 2 <= sizeof smallbuf)
1839 tmpbuf2 = smallbuf;
1840 else
1841 Newx(tmpbuf2, tmplen2 + 2, char);
1842
38bd7ad8
FC
1843 tmpbuf2[0] = '_';
1844 tmpbuf2[1] = '<';
44867030
NC
1845
1846 memcpy(tmpbuf2 + 2, s, tmplen2);
1847 tmplen2 += 2;
1848
8a5ee598 1849 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1850 if (!isGV(gv2)) {
8a5ee598 1851 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1852 /* adjust ${"::_<newfilename"} to store the new file name */
1853 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1854 /* The line number may differ. If that is the case,
1855 alias the saved lines that are in the array.
1856 Otherwise alias the whole array. */
1857 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1858 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1859 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1860 }
38bd7ad8
FC
1861 else if (GvAV(cfgv)) {
1862 AV * const av = GvAV(cfgv);
515c395b
TC
1863 const line_t start = CopLINE(PL_curcop)+1;
1864 SSize_t items = AvFILLp(av) - start;
8818d409
FC
1865 if (items > 0) {
1866 AV * const av2 = GvAVn(gv2);
1867 SV **svp = AvARRAY(av) + start;
515c395b
TC
1868 Size_t l = line_num+1;
1869 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1870 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
8818d409
FC
1871 }
1872 }
e5527e4b 1873 }
44867030
NC
1874
1875 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1876 }
e66cf94c 1877 }
05ec9bb3 1878 CopFILE_free(PL_curcop);
449dd039 1879 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1880 }
8818d409 1881 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1882}
1883
80a702cd 1884STATIC void
15f169a1 1885S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1886{
1887 AV *av = CopFILEAVx(PL_curcop);
1888 if (av) {
7f1c3e8c
FC
1889 SV * sv;
1890 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1891 else {
1892 sv = *av_fetch(av, 0, 1);
1893 SvUPGRADE(sv, SVt_PVMG);
1894 }
847cc851 1895 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1896 if (orig_sv)
7f1c3e8c 1897 sv_catsv(sv, orig_sv);
5fa550fb 1898 else
7f1c3e8c
FC
1899 sv_catpvn(sv, buf, len);
1900 if (!SvIOK(sv)) {
1901 (void)SvIOK_on(sv);
1902 SvIV_set(sv, 0);
1903 }
1904 if (PL_parser->preambling == NOLINE)
1905 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1906 }
1907}
1908
ffb4593c 1909/*
8c6b0c7d 1910 * skipspace
ffb4593c
NT
1911 * Called to gobble the appropriate amount and type of whitespace.
1912 * Skips comments as well.
71fff7cb 1913 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1914 *
1915 * peekspace
1916 * Same thing, but look ahead without incrementing line numbers or
1917 * adjusting PL_linestart.
ffb4593c
NT
1918 */
1919
3218e223 1920#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1921#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1922
aabfeadc
KW
1923char *
1924Perl_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1925{
21791330 1926 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1927 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1928 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1929 s++;
f0e67a1d
Z
1930 } else {
1931 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1932 PL_bufptr = s;
21791330 1933 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1934 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1935 LEX_NO_NEXT_CHUNK : 0));
3280af22 1936 s = PL_bufptr;
f0e67a1d
Z
1937 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1938 if (PL_linestart > PL_bufptr)
1939 PL_bufptr = PL_linestart;
1940 return s;
463ee0b2 1941 }
5db06880 1942 return s;
a687059c 1943}
378cc40b 1944
ffb4593c
NT
1945/*
1946 * S_check_uni
1947 * Check the unary operators to ensure there's no ambiguity in how they're
1948 * used. An ambiguous piece of code would be:
1949 * rand + 5
1950 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1951 * the +5 is its argument.
1952 */
1953
76e3520e 1954STATIC void
cea2e8a9 1955S_check_uni(pTHX)
ba106d47 1956{
d4c19fe8 1957 const char *s;
2f3197b3 1958
3280af22 1959 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1960 return;
3280af22
NIS
1961 while (isSPACE(*PL_last_uni))
1962 PL_last_uni++;
c35e046a 1963 s = PL_last_uni;
fac0f7a3 1964 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1965 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1966 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1967 return;
6136c704 1968
9b387841 1969 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1970 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1971 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1972}
1973
ffb4593c
NT
1974/*
1975 * LOP : macro to build a list operator. Its behaviour has been replaced
1976 * with a subroutine, S_lop() for which LOP is just another name.
1977 */
1978
a0d0e21e
LW
1979#define LOP(f,x) return lop(f,x,s)
1980
ffb4593c
NT
1981/*
1982 * S_lop
1983 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1984 * - if we have a next token, then it's a list operator (no parens) for
1985 * which the next token has already been parsed; e.g.,
1986 * sort foo @args
1987 * sort foo (@args)
ffb4593c
NT
1988 * - if the next thing is an opening paren, then it's a function
1989 * - else it's a list operator
1990 */
1991
76e3520e 1992STATIC I32
11288bb3 1993S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1994{
7918f24d
NC
1995 PERL_ARGS_ASSERT_LOP;
1996
6154021b 1997 pl_yylval.ival = f;
35c8bce7 1998 CLINE;
3280af22
NIS
1999 PL_bufptr = s;
2000 PL_last_lop = PL_oldbufptr;
eb160463 2001 PL_last_lop_op = (OPCODE)f;
3280af22 2002 if (PL_nexttoke)
78cdf107 2003 goto lstop;
19f1898a 2004 PL_expect = x;
79072805 2005 if (*s == '(')
bbf60fe6 2006 return REPORT(FUNC);
294a536f 2007 s = skipspace(s);
79072805 2008 if (*s == '(')
bbf60fe6 2009 return REPORT(FUNC);
78cdf107
Z
2010 else {
2011 lstop:
2012 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2013 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 2014 return REPORT(LSTOP);
78cdf107 2015 }
79072805
LW
2016}
2017
ffb4593c
NT
2018/*
2019 * S_force_next
9cbb5ea2 2020 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2021 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2022 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
2023 * will need to set PL_nextval[] and possibly PL_expect to ensure
2024 * the lexer handles the token correctly.
ffb4593c
NT
2025 */
2026
4e553d73 2027STATIC void
cea2e8a9 2028S_force_next(pTHX_ I32 type)
79072805 2029{
704d4215
GG
2030#ifdef DEBUGGING
2031 if (DEBUG_T_TEST) {
2032 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2033 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2034 }
2035#endif
1f7c3e7c 2036 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2037 PL_nexttype[PL_nexttoke] = type;
2038 PL_nexttoke++;
79072805
LW
2039}
2040
89f35911
FC
2041/*
2042 * S_postderef
2043 *
2044 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2045 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2046 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2047 * only the first, leaving yylex to find the next.
89f35911
FC
2048 */
2049
2050static int
ff25e5db 2051S_postderef(pTHX_ int const funny, char const next)
89f35911 2052{
9086c946
BZ
2053 assert(funny == DOLSHARP
2054 || memCHRs("$@%&*", funny)
2055 || funny == PERLY_SNAIL
0ba95c59 2056 || funny == PERLY_PERCENT_SIGN
9086c946
BZ
2057 || funny == PERLY_AMPERSAND
2058 );
89f35911
FC
2059 if (next == '*') {
2060 PL_expect = XOPERATOR;
cc624add 2061 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
9086c946 2062 assert(PERLY_SNAIL == funny || '$' == funny || DOLSHARP == funny);
cc624add 2063 PL_lex_state = LEX_INTERPEND;
9086c946 2064 if (PERLY_SNAIL == funny)
c3492809 2065 force_next(POSTJOIN);
cc624add 2066 }
89f35911
FC
2067 force_next(next);
2068 PL_bufptr+=2;
2069 }
2070 else {
9086c946 2071 if (PERLY_SNAIL == funny && PL_lex_state == LEX_INTERPNORMAL
760ca746
FC
2072 && !PL_lex_brackets)
2073 PL_lex_dojoin = 2;
89f35911
FC
2074 PL_expect = XOPERATOR;
2075 PL_bufptr++;
2076 }
2077 return funny;
2078}
2079
28ac2b49
Z
2080void
2081Perl_yyunlex(pTHX)
2082{
a7aaec61
Z
2083 int yyc = PL_parser->yychar;
2084 if (yyc != YYEMPTY) {
2085 if (yyc) {
a7aaec61 2086 NEXTVAL_NEXTTOKE = PL_parser->yylval;
669dd22c 2087 if (yyc == PERLY_BRACE_OPEN || yyc == HASHBRACK || yyc == PERLY_BRACKET_OPEN) {
78cdf107 2088 PL_lex_allbrackets--;
a7aaec61 2089 PL_lex_brackets--;
78cdf107 2090 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
ee67f254 2091 } else if (yyc == PERLY_PAREN_OPEN) {
78cdf107
Z
2092 PL_lex_allbrackets--;
2093 yyc |= (2<<24);
a7aaec61
Z
2094 }
2095 force_next(yyc);
2096 }
28ac2b49
Z
2097 PL_parser->yychar = YYEMPTY;
2098 }
2099}
2100
d0a148a6 2101STATIC SV *
15f169a1 2102S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2103{
740cce10 2104 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2105 ! IN_BYTES
2106 && UTF
53463649 2107 && len != 0
6d492fac 2108 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2109 return sv;
2110}
2111
ffb4593c
NT
2112/*
2113 * S_force_word
2114 * When the lexer knows the next thing is a word (for instance, it has
2115 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2116 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2117 * lookahead.
ffb4593c
NT
2118 *
2119 * Arguments:
b1b65b59 2120 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2121 * int token : PL_next* will be this type of bare word
2122 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2123 * int check_keyword : if true, Perl checks to make sure the word isn't
2124 * a keyword (do this if the word is a label, e.g. goto FOO)
2125 * int allow_pack : if true, : characters will also be allowed (require,
2126 * use, etc. do this)
ffb4593c
NT
2127 */
2128
76e3520e 2129STATIC char *
345b3785 2130S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2131{
eb578fdb 2132 char *s;
463ee0b2 2133 STRLEN len;
4e553d73 2134
7918f24d
NC
2135 PERL_ARGS_ASSERT_FORCE_WORD;
2136
294a536f 2137 start = skipspace(start);
463ee0b2 2138 s = start;
fac0f7a3 2139 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2140 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2141 {
3280af22 2142 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2143 if (check_keyword) {
2144 char *s2 = PL_tokenbuf;
487e470d 2145 STRLEN len2 = len;
de627158
KW
2146 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2147 s2 += sizeof("CORE::") - 1;
2148 len2 -= sizeof("CORE::") - 1;
2149 }
487e470d 2150 if (keyword(s2, len2, 0))
463ee0b2 2151 return start;
01b5ef50 2152 }
463ee0b2 2153 if (token == METHOD) {
294a536f 2154 s = skipspace(s);
463ee0b2 2155 if (*s == '(')
3280af22 2156 PL_expect = XTERM;
463ee0b2 2157 else {
3280af22 2158 PL_expect = XOPERATOR;
463ee0b2 2159 }
79072805 2160 }
9ded7720 2161 NEXTVAL_NEXTTOKE.opval
275103cd 2162 = newSVOP(OP_CONST,0,
d0a148a6 2163 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2164 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2165 force_next(token);
2166 }
2167 return s;
2168}
2169
ffb4593c
NT
2170/*
2171 * S_force_ident
9cbb5ea2 2172 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2173 * text only contains the "foo" portion. The first argument is a pointer
2174 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2175 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2176 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2177 */
2178
76e3520e 2179STATIC void
5aaab254 2180S_force_ident(pTHX_ const char *s, int kind)
79072805 2181{
7918f24d
NC
2182 PERL_ARGS_ASSERT_FORCE_IDENT;
2183
c9b48522
DD
2184 if (s[0]) {
2185 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2186 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2187 UTF ? SVf_UTF8 : 0));
9ded7720 2188 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2189 force_next(BAREWORD);
748a9306 2190 if (kind) {
11343788 2191 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2192 /* XXX see note in pp_entereval() for why we forgo typo
2193 warnings if the symbol must be introduced in an eval.
2194 GSAR 96-10-12 */
90e5519e 2195 gv_fetchpvn_flags(s, len,
4bff32c5 2196 (PL_in_eval ? GV_ADDMULTI
728847b1 2197 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e 2198 kind == '$' ? SVt_PV :
9086c946 2199 kind == PERLY_SNAIL ? SVt_PVAV :
0ba95c59 2200 kind == PERLY_PERCENT_SIGN ? SVt_PVHV :
a0d0e21e 2201 SVt_PVGV
90e5519e 2202 );
748a9306 2203 }
79072805
LW
2204 }
2205}
2206
3f33d153
FC
2207static void
2208S_force_ident_maybe_lex(pTHX_ char pit)
2209{
3f33d153
FC
2210 NEXTVAL_NEXTTOKE.ival = pit;
2211 force_next('p');
2212}
2213
1571675a
GS
2214NV
2215Perl_str_to_version(pTHX_ SV *sv)
2216{
2217 NV retval = 0.0;
2218 NV nshift = 1.0;
2219 STRLEN len;
cfd0369c 2220 const char *start = SvPV_const(sv,len);
9d4ba2ae 2221 const char * const end = start + len;
8298454c 2222 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2223
2224 PERL_ARGS_ASSERT_STR_TO_VERSION;
2225
1571675a 2226 while (start < end) {
ba210ebe 2227 STRLEN skip;
1571675a
GS
2228 UV n;
2229 if (utf)
9041c2e3 2230 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2231 else {
2232 n = *(U8*)start;
2233 skip = 1;
2234 }
2235 retval += ((NV)n)/nshift;
2236 start += skip;
2237 nshift *= 1000;
2238 }
2239 return retval;
2240}
2241
4e553d73 2242/*
ffb4593c
NT
2243 * S_force_version
2244 * Forces the next token to be a version number.
e759cc13
RGS
2245 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2246 * and if "guessing" is TRUE, then no new token is created (and the caller
2247 * must use an alternative parsing method).
ffb4593c
NT
2248 */
2249
76e3520e 2250STATIC char *
e759cc13 2251S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2252{
5f66b61c 2253 OP *version = NULL;
44dcb63b 2254 char *d;
89bfa8cd 2255
7918f24d
NC
2256 PERL_ARGS_ASSERT_FORCE_VERSION;
2257
294a536f 2258 s = skipspace(s);
89bfa8cd 2259
44dcb63b 2260 d = s;
dd629d5b 2261 if (*d == 'v')
44dcb63b 2262 d++;
44dcb63b 2263 if (isDIGIT(*d)) {
e759cc13
RGS
2264 while (isDIGIT(*d) || *d == '_' || *d == '.')
2265 d++;
4e4da3ac 2266 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2267 SV *ver;
6154021b
RGS
2268 s = scan_num(s, &pl_yylval);
2269 version = pl_yylval.opval;
dd629d5b
GS
2270 ver = cSVOPx(version)->op_sv;
2271 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2272 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2273 SvNV_set(ver, str_to_version(ver));
1571675a 2274 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2275 }
89bfa8cd 2276 }
5db06880 2277 else if (guessing) {
e759cc13 2278 return s;
5db06880 2279 }
89bfa8cd
PP
2280 }
2281
2282 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2283 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2284 force_next(BAREWORD);
89bfa8cd 2285
e759cc13 2286 return s;
89bfa8cd
PP
2287}
2288
ffb4593c 2289/*
91152fc1
DG
2290 * S_force_strict_version
2291 * Forces the next token to be a version number using strict syntax rules.
2292 */
2293
2294STATIC char *
2295S_force_strict_version(pTHX_ char *s)
2296{
91152fc1 2297 OP *version = NULL;
91152fc1
DG
2298 const char *errstr = NULL;
2299
2300 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2301
2302 while (isSPACE(*s)) /* leading whitespace */
2303 s++;
2304
2305 if (is_STRICT_VERSION(s,&errstr)) {
2306 SV *ver = newSV(0);
2307 s = (char *)scan_version(s, ver, 0);
2308 version = newSVOP(OP_CONST, 0, ver);
2309 }
407f8cf2
KW
2310 else if ((*s != ';' && *s != '{' && *s != '}' )
2311 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2312 {
91152fc1
DG
2313 PL_bufptr = s;
2314 if (errstr)
2315 yyerror(errstr); /* version required */
2316 return s;
2317 }
2318
91152fc1 2319 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2320 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2321 force_next(BAREWORD);
91152fc1
DG
2322
2323 return s;
2324}
2325
2326/*
ffb4593c 2327 * S_tokeq
ef3ff34d
KW
2328 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2329 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2330 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2331 */
2332
76e3520e 2333STATIC SV *
cea2e8a9 2334S_tokeq(pTHX_ SV *sv)
79072805 2335{
eb578fdb
KW
2336 char *s;
2337 char *send;
2338 char *d;
b3ac6de7 2339 SV *pv = sv;
79072805 2340
7918f24d
NC
2341 PERL_ARGS_ASSERT_TOKEQ;
2342
279b35ad
FC
2343 assert (SvPOK(sv));
2344 assert (SvLEN(sv));
2345 assert (!SvIsCOW(sv));
307ed071 2346 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2347 goto finish;
279b35ad
FC
2348 s = SvPVX(sv);
2349 send = SvEND(sv);
dcb21ed6
NC
2350 /* This is relying on the SV being "well formed" with a trailing '\0' */
2351 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2352 s++;
2353 if (s == send)
b3ac6de7 2354 goto finish;
79072805 2355 d = s;
be4731d2 2356 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2357 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2358 SVs_TEMP | SvUTF8(sv));
be4731d2 2359 }
79072805
LW
2360 while (s < send) {
2361 if (*s == '\\') {
a0d0e21e 2362 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2363 s++; /* all that, just for this */
2364 }
2365 *d++ = *s++;
2366 }
2367 *d = '\0';
95a20fc0 2368 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2369 finish:
3280af22 2370 if ( PL_hints & HINT_NEW_STRING )
164e423c 2371 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
79072805
LW
2372 return sv;
2373}
2374
ffb4593c
NT
2375/*
2376 * Now come three functions related to double-quote context,
2377 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2378 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2379 * interact with PL_lex_state, and create fake ( ... ) argument lists
2380 * to handle functions and concatenation.
ecd24171
DM
2381 * For example,
2382 * "foo\lbar"
2383 * is tokenised as
2384 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2385 */
2386
2387/*
2388 * S_sublex_start
6154021b 2389 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2390 *
2391 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2392 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2393 *
16d1d8bd 2394 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2395 *
2396 * Everything else becomes a FUNC.
2397 *
16d1d8bd
LM
2398 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2399 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2400 * call to S_sublex_push().
2401 */
2402
76e3520e 2403STATIC I32
cea2e8a9 2404S_sublex_start(pTHX)
79072805 2405{
eb578fdb 2406 const I32 op_type = pl_yylval.ival;
79072805
LW
2407
2408 if (op_type == OP_NULL) {
6154021b 2409 pl_yylval.opval = PL_lex_op;
5f66b61c 2410 PL_lex_op = NULL;
79072805
LW
2411 return THING;
2412 }
466112bb 2413 if (op_type == OP_CONST) {
67c71cbb
FC
2414 SV *sv = PL_lex_stuff;
2415 PL_lex_stuff = NULL;
2416 sv = tokeq(sv);
b3ac6de7
IZ
2417
2418 if (SvTYPE(sv) == SVt_PVIV) {
2419 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2420 STRLEN len;
96a5add6 2421 const char * const p = SvPV_const(sv, len);
740cce10 2422 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2423 SvREFCNT_dec(sv);
2424 sv = nsv;
4e553d73 2425 }
275103cd 2426 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2427 return THING;
2428 }
2429
7ef70b3d
FC
2430 PL_parser->lex_super_state = PL_lex_state;
2431 PL_parser->lex_sub_inwhat = (U16)op_type;
2432 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2433 PL_parser->sub_no_recover = FALSE;
2434 PL_parser->sub_error_count = PL_error_count;
3280af22 2435 PL_lex_state = LEX_INTERPPUSH;
55497cff 2436
3280af22
NIS
2437 PL_expect = XTERM;
2438 if (PL_lex_op) {
6154021b 2439 pl_yylval.opval = PL_lex_op;
5f66b61c 2440 PL_lex_op = NULL;
55497cff
PP
2441 return PMFUNC;
2442 }
2443 else
2444 return FUNC;
2445}
2446
ffb4593c
NT
2447/*
2448 * S_sublex_push
2449 * Create a new scope to save the lexing state. The scope will be
2450 * ended in S_sublex_done. Returns a '(', starting the function arguments
2451 * to the uc, lc, etc. found before.
2452 * Sets PL_lex_state to LEX_INTERPCONCAT.
2453 */
2454
76e3520e 2455STATIC I32
cea2e8a9 2456S_sublex_push(pTHX)
55497cff 2457{
78a635de 2458 LEXSHARED *shared;
801d32ac 2459 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2460 ENTER;
55497cff 2461
7ef70b3d 2462 PL_lex_state = PL_parser->lex_super_state;
cc624add 2463 SAVEI8(PL_lex_dojoin);
3280af22 2464 SAVEI32(PL_lex_brackets);
78cdf107 2465 SAVEI32(PL_lex_allbrackets);
b27dce25 2466 SAVEI32(PL_lex_formbrack);
78cdf107 2467 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2468 SAVEI32(PL_lex_casemods);
2469 SAVEI32(PL_lex_starts);
651b5b28 2470 SAVEI8(PL_lex_state);
7cc34111 2471 SAVESPTR(PL_lex_repl);
7766f137 2472 SAVEVPTR(PL_lex_inpat);
98246f1e 2473 SAVEI16(PL_lex_inwhat);
ffdb8b16 2474 if (is_heredoc)
b42366d4 2475 {
ffdb8b16 2476 SAVECOPLINE(PL_curcop);
b42366d4 2477 SAVEI32(PL_multi_end);
851b527a
FC
2478 SAVEI32(PL_parser->herelines);
2479 PL_parser->herelines = 0;
b42366d4 2480 }
2ca4363d 2481 SAVEIV(PL_multi_close);
3280af22 2482 SAVEPPTR(PL_bufptr);
8452ff4b 2483 SAVEPPTR(PL_bufend);
3280af22
NIS
2484 SAVEPPTR(PL_oldbufptr);
2485 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2486 SAVEPPTR(PL_last_lop);
2487 SAVEPPTR(PL_last_uni);
3280af22
NIS
2488 SAVEPPTR(PL_linestart);
2489 SAVESPTR(PL_linestr);
8edd5f42
RGS
2490 SAVEGENERICPV(PL_lex_brackstack);
2491 SAVEGENERICPV(PL_lex_casestack);
78a635de 2492 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2493 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2494 SAVEI32(PL_copline);
3280af22 2495
99bd9d90 2496 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2497 scopes to find the body of the here-doc. So we put PL_linestr and
2498 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2499 */
60f40a38
FC
2500 PL_parser->lex_shared->ls_linestr = PL_linestr;
2501 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2502
3280af22 2503 PL_linestr = PL_lex_stuff;
7ef70b3d 2504 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2505 PL_lex_stuff = NULL;
7ef70b3d 2506 PL_parser->lex_sub_repl = NULL;
3280af22 2507
eabab8bc
FC
2508 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2509 set for an inner quote-like operator and then an error causes scope-
2510 popping. We must not have a PL_lex_stuff value left dangling, as
2511 that breaks assumptions elsewhere. See bug #123617. */
2512 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2513 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2514
9cbb5ea2
GS
2515 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2516 = SvPVX(PL_linestr);
3280af22 2517 PL_bufend += SvCUR(PL_linestr);
bd61b366 2518 PL_last_lop = PL_last_uni = NULL;
3280af22 2519 SAVEFREESV(PL_linestr);
4dc843bc 2520 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2521
2522 PL_lex_dojoin = FALSE;
b27dce25 2523 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2524 PL_lex_allbrackets = 0;
2525 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2526 Newx(PL_lex_brackstack, 120, char);
2527 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2528 PL_lex_casemods = 0;
2529 *PL_lex_casestack = '\0';
2530 PL_lex_starts = 0;
2531 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2532 if (is_heredoc)
6ddcf93b 2533 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2534 PL_copline = NOLINE;
caae0700 2535
78a635de
FC
2536 Newxz(shared, 1, LEXSHARED);
2537 shared->ls_prev = PL_parser->lex_shared;
2538 PL_parser->lex_shared = shared;
3280af22 2539
7ef70b3d 2540 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2541 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2542 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2543 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2544 else
5f66b61c 2545 PL_lex_inpat = NULL;
79072805 2546
3a54fd60
DM
2547 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2548 PL_in_eval &= ~EVAL_RE_REPARSING;
2549
69afcc21 2550 return SUBLEXSTART;
79072805
LW
2551}
2552
ffb4593c
NT
2553/*
2554 * S_sublex_done
2555 * Restores lexer state after a S_sublex_push.
2556 */
2557
76e3520e 2558STATIC I32
cea2e8a9 2559S_sublex_done(pTHX)
79072805 2560{
3280af22 2561 if (!PL_lex_starts++) {
396482e1 2562 SV * const sv = newSVpvs("");
9aa983d2
JH
2563 if (SvUTF8(PL_linestr))
2564 SvUTF8_on(sv);
3280af22 2565 PL_expect = XOPERATOR;
275103cd 2566 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2567 return THING;
2568 }
2569
3280af22
NIS
2570 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2571 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2572 return yylex();
79072805
LW
2573 }
2574
ffb4593c 2575 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2576 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2577 if (PL_lex_repl) {
2578 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2579 PL_linestr = PL_lex_repl;
2580 PL_lex_inpat = 0;
2581 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2582 PL_bufend += SvCUR(PL_linestr);
bd61b366 2583 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2584 PL_lex_dojoin = FALSE;
2585 PL_lex_brackets = 0;
78cdf107
Z
2586 PL_lex_allbrackets = 0;
2587 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2588 PL_lex_casemods = 0;
2589 *PL_lex_casestack = '\0';
2590 PL_lex_starts = 0;
25da4f38 2591 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2592 PL_lex_state = LEX_INTERPNORMAL;
2593 PL_lex_starts++;
e9fa98b2
HS
2594 /* we don't clear PL_lex_repl here, so that we can check later
2595 whether this is an evalled subst; that means we rely on the
2596 logic to ensure sublex_done() is called again only via the
2597 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2598 }
e9fa98b2 2599 else {
3280af22 2600 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2601 PL_lex_repl = NULL;
e9fa98b2 2602 }
ffdb8b16
FC
2603 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2604 CopLINE(PL_curcop) +=
9420b268 2605 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2606 + PL_parser->herelines;
2607 PL_parser->herelines = 0;
ffdb8b16 2608 }
9b6b7be8 2609 return '/';
ffed7fef
LW
2610 }
2611 else {
b42366d4 2612 const line_t l = CopLINE(PL_curcop);
f46d017c 2613 LEAVE;
bb4e4c38 2614 if (PL_parser->sub_error_count != PL_error_count) {
bb4e4c38 2615 if (PL_parser->sub_no_recover) {
ad1ecdf7 2616 yyquit();
bb4e4c38
TC
2617 NOT_REACHED;
2618 }
2619 }
b42366d4 2620 if (PL_multi_close == '<')
851b527a 2621 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2622 PL_bufend = SvPVX(PL_linestr);
2623 PL_bufend += SvCUR(PL_linestr);
2624 PL_expect = XOPERATOR;
69afcc21 2625 return SUBLEXEND;
ffed7fef
LW
2626 }
2627}
2628
4e8ee35f
KW
2629HV *
2630Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2631 const STRLEN context_len, const char ** error_msg)
2632{
2633 /* Load the official _charnames module if not already there. The
2634 * parameters are just to give info for any error messages generated:
2635 * char_name a name to look up which is the reason for loading this
2636 * context 'char_name' in the context in the input in which it appears
2637 * context_len how many bytes 'context' occupies
2638 * error_msg *error_msg will be set to any error
2639 *
2640 * Returns the ^H table if success; otherwise NULL */
2641
2642 unsigned int i;
2643 HV * table;
2644 SV **cvp;
2645 SV * res;
2646
2647 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2648
2649 /* This loop is executed 1 1/2 times. On the first time through, if it
2650 * isn't already loaded, try loading it, and iterate just once to see if it
2651 * worked. */
2652 for (i = 0; i < 2; i++) {
2653 table = GvHV(PL_hintgv); /* ^H */
2654
2655 if ( table
2656 && (PL_hints & HINT_LOCALIZE_HH)
2657 && (cvp = hv_fetchs(table, "charnames", FALSE))
2658 && SvOK(*cvp))
2659 {
2660 return table; /* Quit if already loaded */
2661 }
2662
2663 if (i == 0) {
2664 Perl_load_module(aTHX_
2665 0,
2666 newSVpvs("_charnames"),
2667
2668 /* version parameter; no need to specify it, as if we get too early
2669 * a version, will fail anyway, not being able to find 'charnames'
2670 * */
2671 NULL,
2672 newSVpvs(":full"),
2673 newSVpvs(":short"),
2674 NULL);
2675 }
2676 }
2677
2678 /* Here, it failed; new_constant will give appropriate error messages */
2679 *error_msg = NULL;
2680 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2681 context, context_len, error_msg);
2682 SvREFCNT_dec(res);
2683
2684 return NULL;
2685}
2686
7a7d14f3 2687STATIC SV*
2c43c309
KW
2688S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2689{
2690 /* This justs wraps get_and_check_backslash_N_name() to output any error
2691 * message it returns. */
2692
2693 const char * error_msg = NULL;
25c7fb78 2694 SV * result;
2c43c309
KW
2695
2696 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2697
25c7fb78
KW
2698 /* charnames doesn't work well if there have been errors found */
2699 if (PL_error_count > 0) {
2700 return NULL;
2701 }
2702
2703 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2704
2c43c309
KW
2705 if (error_msg) {
2706 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2707 }
2708
2709 return result;
2710}
2711
25c7fb78
KW
2712SV*
2713Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2714 const char* const e,
2715 const bool is_utf8,
2716 const char ** error_msg)
6f613c73 2717{
140b12ad
KW
2718 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2719 * interior, hence to the "}". Finds what the name resolves to, returning
25c7fb78
KW
2720 * an SV* containing it; NULL if no valid one found.
2721 *
2722 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2723 * doesn't have to be. */
6f613c73 2724
7303cc1f 2725 SV* char_name;
25c7fb78 2726 SV* res;
0c415a79
KW
2727 HV * table;
2728 SV **cvp;
2729 SV *cv;
2730 SV *rv;
2731 HV *stash;
8b0cce63
KW
2732
2733 /* Points to the beginning of the \N{... so that any messages include the
2734 * context of what's failing*/
2735 const char* context = s - 3;
2736 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2737
0c415a79 2738
6f613c73
KW
2739 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2740
25c7fb78
KW
2741 assert(e >= s);
2742 assert(s > (char *) 3);
2743
7303cc1f 2744 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
25c7fb78 2745
7303cc1f
KW
2746 if (!SvCUR(char_name)) {
2747 SvREFCNT_dec_NN(char_name);
be332ba0 2748 /* diag_listed_as: Unknown charname '%s' */
2c43c309 2749 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
be332ba0 2750 return NULL;
d8d26cac 2751 }
b7e6151c 2752
7303cc1f 2753 /* Autoload the charnames module */
7303cc1f 2754
4e8ee35f
KW
2755 table = load_charnames(char_name, context, context_len, error_msg);
2756 if (table == NULL) {
2757 return NULL;
7303cc1f
KW
2758 }
2759
2760 *error_msg = NULL;
8b0cce63
KW
2761 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2762 context, context_len, error_msg);
7303cc1f 2763 if (*error_msg) {
4e8ee35f 2764 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
7303cc1f
KW
2765
2766 SvREFCNT_dec(res);
6f613c73
KW
2767 return NULL;
2768 }
2769
0c415a79
KW
2770 /* See if the charnames handler is the Perl core's, and if so, we can skip
2771 * the validation needed for a user-supplied one, as Perl's does its own
2772 * validation. */
0c415a79 2773 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2774 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2775 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2776 {
2777 const char * const name = HvNAME(stash);
b59bf0b2 2778 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2779 return res;
2780 }
2781 }
2782
bde9e88d
KW
2783 /* Here, it isn't Perl's charname handler. We can't rely on a
2784 * user-supplied handler to validate the input name. For non-ut8 input,
2785 * look to see that the first character is legal. Then loop through the
2786 * rest checking that each is a continuation */
6f613c73 2787
36897d64
KW
2788 /* This code makes the reasonable assumption that the only Latin1-range
2789 * characters that begin a character name alias are alphabetic, otherwise
2790 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2791
25c7fb78 2792 if (! is_utf8) {
bde9e88d 2793 if (! isALPHAU(*s)) {
b6ba1137
KW
2794 goto bad_charname;
2795 }
bde9e88d
KW
2796 s++;
2797 while (s < e) {
2798 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2799 goto bad_charname;
2800 }
2d8eb851
KW
2801 if (*s == ' ' && *(s-1) == ' ') {
2802 goto multi_spaces;
bd299e29 2803 }
bde9e88d 2804 s++;
b6ba1137
KW
2805 }
2806 }
2807 else {
bde9e88d 2808 /* Similarly for utf8. For invariants can check directly; for other
58aa6738
KW
2809 * Latin1, can calculate their code point and check; otherwise use an
2810 * inversion list */
bde9e88d
KW
2811 if (UTF8_IS_INVARIANT(*s)) {
2812 if (! isALPHAU(*s)) {
140b12ad
KW
2813 goto bad_charname;
2814 }
bde9e88d
KW
2815 s++;
2816 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2817 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2818 goto bad_charname;
6f613c73 2819 }
bde9e88d 2820 s += 2;
6f613c73 2821 }
bde9e88d 2822 else {
f1bcae08
KW
2823 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2824 utf8_to_uvchr_buf((U8 *) s,
2825 (U8 *) e,
2826 NULL)))
2827 {
bde9e88d
KW
2828 goto bad_charname;
2829 }
2830 s += UTF8SKIP(s);
2831 }
2832
2833 while (s < e) {
2834 if (UTF8_IS_INVARIANT(*s)) {
2835 if (! isCHARNAME_CONT(*s)) {
2836 goto bad_charname;
2837 }
2d8eb851
KW
2838 if (*s == ' ' && *(s-1) == ' ') {
2839 goto multi_spaces;
bd299e29 2840 }
bde9e88d
KW
2841 s++;
2842 }
2843 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2844 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2845 {
2846 goto bad_charname;
2847 }
2848 s += 2;
2849 }
2850 else {
f1bcae08
KW
2851 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2852 utf8_to_uvchr_buf((U8 *) s,
2853 (U8 *) e,
2854 NULL)))
2855 {
bde9e88d
KW
2856 goto bad_charname;
2857 }
2858 s += UTF8SKIP(s);
6f613c73
KW
2859 }
2860 }
2d8eb851
KW
2861 }
2862 if (*(s-1) == ' ') {
8d9d0498
FC
2863 /* diag_listed_as: charnames alias definitions may not contain
2864 trailing white-space; marked by <-- HERE in %s
2865 */
2c43c309 2866 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2867 "charnames alias definitions may not contain trailing "
2868 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2869 (int)(s - context + 1), context,
2c43c309 2870 (int)(e - s + 1), s + 1);
2d8eb851 2871 return NULL;
6f613c73
KW
2872 }
2873
870fe793 2874 if (SvUTF8(res)) { /* Don't accept malformed charname value */
bde9e88d
KW
2875 const U8* first_bad_char_loc;
2876 STRLEN len;
2877 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2878 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2879 &first_bad_char_loc)))
2880 {
75219bac
KW
2881 _force_out_malformed_utf8_message(first_bad_char_loc,
2882 (U8 *) PL_parser->bufend,
2883 0,
2884 0 /* 0 means don't die */ );
8d9d0498
FC
2885 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2886 immediately after '%s' */
2c43c309 2887 *error_msg = Perl_form(aTHX_
bde9e88d 2888 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
8b0cce63 2889 (int) context_len, context,
2c43c309 2890 (int) ((char *) first_bad_char_loc - str), str);
bde9e88d
KW
2891 return NULL;
2892 }
2893 }
140b12ad 2894
bde9e88d 2895 return res;
140b12ad 2896
bde9e88d 2897 bad_charname: {
bde9e88d
KW
2898
2899 /* The final %.*s makes sure that should the trailing NUL be missing
2900 * that this print won't run off the end of the string */
8d9d0498
FC
2901 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2902 in \N{%s} */
2c43c309 2903 *error_msg = Perl_form(aTHX_
bde9e88d 2904 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2905 (int)(s - context + 1), context,
2c43c309 2906 (int)(e - s + 1), s + 1);
bde9e88d
KW
2907 return NULL;
2908 }
2d8eb851
KW
2909
2910 multi_spaces:
8d9d0498
FC
2911 /* diag_listed_as: charnames alias definitions may not contain a
2912 sequence of multiple spaces; marked by <-- HERE
2913 in %s */
2c43c309 2914 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2915 "charnames alias definitions may not contain a sequence of "
2916 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2917 (int)(s - context + 1), context,
2c43c309 2918 (int)(e - s + 1), s + 1);
2d8eb851 2919 return NULL;
6f613c73
KW
2920}
2921
02aa26ce
NT
2922/*
2923 scan_const
2924
9da1dd8f
DM
2925 Extracts the next constant part of a pattern, double-quoted string,
2926 or transliteration. This is terrifying code.
2927
2928 For example, in parsing the double-quoted string "ab\x63$d", it would
2929 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2930
94def140 2931 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2932 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2933 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2934
94def140
ST
2935 Returns a pointer to the character scanned up to. If this is
2936 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2937 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2938 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2939 by looking at the next characters herself.
2940
02aa26ce 2941 In patterns:
9da1dd8f 2942 expand:
537124e4
KW
2943 \N{FOO} => \N{U+hex_for_character_FOO}
2944 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2945
2946 pass through:
2947 all other \-char, including \N and \N{ apart from \N{ABC}
2948
2949 stops on:
2950 @ and $ where it appears to be a var, but not for $ as tail anchor
2951 \l \L \u \U \Q \E
2952 (?{ or (??{
2953
02aa26ce
NT
2954 In transliterations:
2955 characters are VERY literal, except for - not at the start or end
66b09263
KW
2956 of the string, which indicates a range. However some backslash sequences
2957 are recognized: \r, \n, and the like
2958 \007 \o{}, \x{}, \N{}
2959 If all elements in the transliteration are below 256,
94def140
ST
2960 scan_const expands the range to the full set of intermediate
2961 characters. If the range is in utf8, the hyphen is replaced with
2962 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2963
2964 In double-quoted strings:
2965 backslashes:
66b09263 2966 all those recognized in transliterations
94def140 2967 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2968 case and quoting: \U \Q \E
2969 stops on @ and $
2970
2971 scan_const does *not* construct ops to handle interpolated strings.
2972 It stops processing as soon as it finds an embedded $ or @ variable
2973 and leaves it to the caller to work out what's going on.
2974
94def140
ST
2975 embedded arrays (whether in pattern or not) could be:
2976 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2977
2978 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2979
2980 $ in pattern could be $foo or could be tail anchor. Assumption:
2981 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2982 followed by one of "()| \r\n\t"
02aa26ce 2983
9da1dd8f 2984 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2985
2986 The structure of the code is
2987 while (there's a character to process) {
94def140
ST
2988 handle transliteration ranges
2989 skip regexp comments /(?#comment)/ and codes /(?{code})/
2990 skip #-initiated comments in //x patterns
2991 check for embedded arrays
02aa26ce
NT
2992 check for embedded scalars
2993 if (backslash) {
94def140 2994 deprecate \1 in substitution replacements
02aa26ce
NT
2995 handle string-changing backslashes \l \U \Q \E, etc.
2996 switch (what was escaped) {
94def140 2997 handle \- in a transliteration (becomes a literal -)
ff3f963a 2998 if a pattern and not \N{, go treat as regular character
94def140
ST
2999 handle \132 (octal characters)
3000 handle \x15 and \x{1234} (hex characters)
ff3f963a 3001 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
3002 handle \cV (control characters)
3003 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 3004 } (end switch)
77a135fe 3005 continue
02aa26ce 3006 } (end if backslash)
77a135fe 3007 handle regular character
02aa26ce 3008 } (end while character to read)
02cd137d 3009
02aa26ce
NT
3010*/
3011
76e3520e 3012STATIC char *
cea2e8a9 3013S_scan_const(pTHX_ char *start)
79072805 3014{
eb578fdb 3015 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
3016 SV *sv = newSV(send - start); /* sv for the constant. See note below
3017 on sizing. */
eb578fdb
KW
3018 char *s = start; /* start of the constant */
3019 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
3020 bool dorange = FALSE; /* are we in a translit range? */
3021 bool didrange = FALSE; /* did we just finish a range? */
3022 bool in_charclass = FALSE; /* within /[...]/ */
ed2893cb 3023 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
dc023dbb
KW
3024 UTF8? But, this can show as true
3025 when the source isn't utf8, as for
3026 example when it is entirely composed
3027 of hex constants */
f34acfec 3028 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
af9be36c
KW
3029 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3030 number of characters found so far
3031 that will expand (into 2 bytes)
3032 should we have to convert to
3033 UTF-8) */
6f613c73 3034 SV *res; /* result from charnames */
c89db733
JH
3035 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3036 high-end character is temporarily placed */
77a135fe 3037
fe2ba0a2
KW
3038 /* Does something require special handling in tr/// ? This avoids extra
3039 * work in a less likely case. As such, khw didn't feel it was worth
3040 * adding any branches to the more mainline code to handle this, which
3041 * means that this doesn't get set in some circumstances when things like
3042 * \x{100} get expanded out. As a result there needs to be extra testing
3043 * done in the tr code */
3044 bool has_above_latin1 = FALSE;
3045
77a135fe
KW
3046 /* Note on sizing: The scanned constant is placed into sv, which is
3047 * initialized by newSV() assuming one byte of output for every byte of
3048 * input. This routine expects newSV() to allocate an extra byte for a
3049 * trailing NUL, which this routine will append if it gets to the end of
3050 * the input. There may be more bytes of input than output (eg., \N{LATIN
3051 * CAPITAL LETTER A}), or more output than input if the constant ends up
3052 * recoded to utf8, but each time a construct is found that might increase
3053 * the needed size, SvGROW() is called. Its size parameter each time is
3054 * based on the best guess estimate at the time, namely the length used so
3055 * far, plus the length the current construct will occupy, plus room for
caae0700 3056 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 3057
c3320c2a
KW
3058 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3059 before set */
4c3a8340 3060#ifdef EBCDIC
f4240379
KW
3061 int backslash_N = 0; /* ? was the character from \N{} */
3062 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3063 platform-specific like \x65 */
4c3a8340 3064#endif
012bcf8d 3065
7918f24d
NC
3066 PERL_ARGS_ASSERT_SCAN_CONST;
3067
bb16bae8 3068 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0 3069
b899e89d
FC
3070 /* Protect sv from errors and fatal warnings. */
3071 ENTER_with_name("scan_const");
3072 SAVEFREESV(sv);
2b9d42f0 3073
1759517a
KW
3074 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3075 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3076 * valid */
3077 assert(*send == '\0');
3078
f4240379
KW
3079 while (s < send
3080 || dorange /* Handle tr/// range at right edge of input */
3081 ) {
ff3f963a 3082
02aa26ce 3083 /* get transliterations out of the way (they're most literal) */
3280af22 3084 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3085
f4240379
KW
3086 /* But there isn't any special handling necessary unless there is a
3087 * range, so for most cases we just drop down and handle the value
3088 * as any other. There are two exceptions.
3089 *
02cd137d
KW
3090 * 1. A hyphen indicates that we are actually going to have a
3091 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
3092 * down to handle what should be the end range value.
3093 * 2. After we've handled that value, the next time through, that
3094 * flag is set and we fix up the range.
3095 *
3096 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
3097 * order to make the transliteration a simple table look-up.
3098 * Ranges that extend above Latin1 have to be done differently, so
3099 * there is no advantage to expanding them here, so they are
dc8faf6b
KW
3100 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3101 * a byte that can't occur in legal UTF-8, and hence can signify a
3102 * hyphen without any possible ambiguity. On EBCDIC machines, if
3103 * the range is expressed as Unicode, the Latin1 portion is
3104 * expanded out even if the range extends above Latin1. This is
3105 * because each code point in it has to be processed here
3106 * individually to get its native translation */
f4240379
KW
3107
3108 if (! dorange) {
3109
02cd137d
KW
3110 /* Here, we don't think we're in a range. If the new character
3111 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
3112 * either edge to indicate a range, or if we haven't output any
3113 * characters yet then it's a regular character. */
04863ba1
KW
3114 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3115 {
f4240379
KW
3116
3117 /* A regular character. Process like any other, but first
3118 * clear any flags */
3119 didrange = FALSE;
3120 dorange = FALSE;
e294cc5d 3121#ifdef EBCDIC
f4240379
KW
3122 non_portable_endpoint = 0;
3123 backslash_N = 0;
e294cc5d 3124#endif
02cd137d
KW
3125 /* The tests here for being above Latin1 and similar ones
3126 * in the following 'else' suffice to find all such
3127 * occurences in the constant, except those added by a
fe2ba0a2 3128 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3129 * set 'has_above_latin1' as appropriate */
ed2893cb 3130 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3131 has_above_latin1 = TRUE;
3132 }
3133
f4240379
KW
3134 /* Drops down to generic code to process current byte */
3135 }
02cd137d 3136 else { /* Is a '-' in the context where it means a range */
f4240379 3137 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3138 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3139 " operator");
f4240379 3140 }
e294cc5d 3141
f4240379 3142 dorange = TRUE;
2b9d42f0 3143
02cd137d 3144 s++; /* Skip past the hyphen */
f4240379
KW
3145
3146 /* d now points to where the end-range character will be
d7f7b0e3
KW
3147 * placed. Drop down to get that character. We'll finish
3148 * processing the range the next time through the loop */
188d22cf 3149
ed2893cb 3150 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3151 has_above_latin1 = TRUE;
3152 }
02cd137d
KW
3153
3154 /* Drops down to generic code to process current byte */
f4240379
KW
3155 }
3156 } /* End of not a range */
3157 else {
3158 /* Here we have parsed a range. Now must handle it. At this
3159 * point:
3160 * 'sv' is a SV* that contains the output string we are
3161 * constructing. The final two characters in that string
3162 * are the range start and range end, in order.
3163 * 'd' points to just beyond the range end in the 'sv' string,
3164 * where we would next place something
f4240379 3165 */
0c311b7c 3166 char * max_ptr;
5ca37d54 3167 char * min_ptr;
f4240379
KW
3168 IV range_min;
3169 IV range_max; /* last character in range */
f4240379 3170 STRLEN grow;
5ca37d54
KW
3171 Size_t offset_to_min = 0;
3172 Size_t extras = 0;
11327fa1 3173#ifdef EBCDIC
f4240379
KW
3174 bool convert_unicode;
3175 IV real_range_max = 0;
e294cc5d 3176#endif
02cd137d 3177 /* Get the code point values of the range ends. */
0c311b7c
KW
3178 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3179 offset_to_max = max_ptr - SvPVX_const(sv);
ed2893cb 3180 if (d_is_utf8) {
f4240379
KW
3181 /* We know the utf8 is valid, because we just constructed
3182 * it ourselves in previous loop iterations */
3183 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3184 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3185 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3186
3187 /* This compensates for not all code setting
3188 * 'has_above_latin1', so that we don't skip stuff that
3189 * should be executed */
3190 if (range_max > 255) {
3191 has_above_latin1 = TRUE;
3192 }
e294cc5d 3193 }
f4240379
KW
3194 else {
3195 min_ptr = max_ptr - 1;
3196 range_min = * (U8*) min_ptr;
3197 range_max = * (U8*) max_ptr;
3198 }
3199
8efef67c
KW
3200 /* If the range is just a single code point, like tr/a-a/.../,
3201 * that code point is already in the output, twice. We can
3202 * just back up over the second instance and avoid all the rest
3203 * of the work. But if it is a variant character, it's been
218304f9
KW
3204 * counted twice, so decrement. (This unlikely scenario is
3205 * special cased, like the one for a range of 2 code points
3206 * below, only because the main-line code below needs a range
3207 * of 3 or more to work without special casing. Might as well
3208 * get it out of the way now.) */
8efef67c
KW
3209 if (UNLIKELY(range_max == range_min)) {
3210 d = max_ptr;
ed2893cb 3211 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
8efef67c
KW
3212 utf8_variant_count--;
3213 }
3214 goto range_done;
3215 }
3216
e294cc5d 3217#ifdef EBCDIC
f4240379
KW
3218 /* On EBCDIC platforms, we may have to deal with portable
3219 * ranges. These happen if at least one range endpoint is a
3220 * Unicode value (\N{...}), or if the range is a subset of
3221 * [A-Z] or [a-z], and both ends are literal characters,
3222 * like 'A', and not like \x{C1} */
87dd6ea7 3223 convert_unicode =
02cd137d
KW
3224 cBOOL(backslash_N) /* \N{} forces Unicode,
3225 hence portable range */
3226 || ( ! non_portable_endpoint
3227 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3228 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3229 if (convert_unicode) {
f4240379
KW
3230
3231 /* Special handling is needed for these portable ranges.
02cd137d
KW
3232 * They are defined to be in Unicode terms, which includes
3233 * all the Unicode code points between the end points.
f4240379
KW
3234 * Convert to Unicode to get the Unicode range. Later we
3235 * will convert each code point in the range back to
3236 * native. */
3237 range_min = NATIVE_TO_UNI(range_min);
3238 range_max = NATIVE_TO_UNI(range_max);
3239 }
e294cc5d 3240#endif
8ada0baa 3241
f4240379 3242 if (range_min > range_max) {
11327fa1 3243#ifdef EBCDIC
f4240379
KW
3244 if (convert_unicode) {
3245 /* Need to convert back to native for meaningful
3246 * messages for this platform */
3247 range_min = UNI_TO_NATIVE(range_min);
3248 range_max = UNI_TO_NATIVE(range_max);
3249 }
11327fa1 3250#endif
f4240379
KW
3251 /* Use the characters themselves for the error message if
3252 * ASCII printables; otherwise some visible representation
3253 * of them */
3254 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3255 Perl_croak(aTHX_
3256 "Invalid range \"%c-%c\" in transliteration operator",
3257 (char)range_min, (char)range_max);
3258 }
11327fa1 3259#ifdef EBCDIC
f4240379 3260 else if (convert_unicode) {
02cd137d 3261 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3262 Perl_croak(aTHX_
02cd137d
KW
3263 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3264 UVXf "}\" in transliteration operator",
3265 range_min, range_max);
f4240379 3266 }
11327fa1 3267#endif
f4240379 3268 else {
02cd137d 3269 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3270 Perl_croak(aTHX_
02cd137d
KW
3271 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3272 " in transliteration operator",
3273 range_min, range_max);
f4240379 3274 }
c2e66d9e
GS
3275 }
3276
aca41667
KW
3277 /* If the range is exactly two code points long, they are
3278 * already both in the output */
3279 if (UNLIKELY(range_min + 1 == range_max)) {
3280 goto range_done;
3281 }
3282
3283 /* Here the range contains at least 3 code points */
3284
ed2893cb 3285 if (d_is_utf8) {
f4240379 3286
188d22cf
KW
3287 /* If everything in the transliteration is below 256, we
3288 * can avoid special handling later. A translation table
02cd137d
KW
3289 * for each of those bytes is created by op.c. So we
3290 * expand out all ranges to their constituent code points.
3291 * But if we've encountered something above 255, the
3292 * expanding won't help, so skip doing that. But if it's
3293 * EBCDIC, we may have to look at each character below 256
3294 * if we have to convert to/from Unicode values */
188d22cf 3295 if ( has_above_latin1
c7f1f016 3296#ifdef EBCDIC
f4240379 3297 && (range_min > 255 || ! convert_unicode)
8ada0baa 3298#endif
f4240379 3299 ) {
3fdfceb3
KW
3300 const STRLEN off = d - SvPVX(sv);
3301 const STRLEN extra = 1 + (send - s) + 1;
3302 char *e;
3303
f4240379
KW
3304 /* Move the high character one byte to the right; then
3305 * insert between it and the range begin, an illegal
3306 * byte which serves to indicate this is a range (using
02cd137d 3307 * a '-' would be ambiguous). */
3fdfceb3
KW
3308
3309 if (off + extra > SvLEN(sv)) {
3310 d = off + SvGROW(sv, off + extra);
3311 max_ptr = d - off + offset_to_max;
3312 }
3313
3314 e = d++;
f4240379
KW
3315 while (e-- > max_ptr) {
3316 *(e + 1) = *e;
e294cc5d 3317 }
dc8faf6b 3318 *(e + 1) = (char) RANGE_INDICATOR;
f4240379
KW
3319 goto range_done;
3320 }
3321
3322 /* Here, we're going to expand out the range. For EBCDIC
3323 * the range can extend above 255 (not so in ASCII), so
3324 * for EBCDIC, split it into the parts above and below
3325 * 255/256 */
e294cc5d 3326#ifdef EBCDIC
f4240379
KW
3327 if (range_max > 255) {
3328 real_range_max = range_max;
3329 range_max = 255;
3330 }
e294cc5d 3331#endif
f4240379 3332 }
02aa26ce 3333
f4240379 3334 /* Here we need to expand out the string to contain each
5ca37d54
KW
3335 * character in the range. Grow the output to handle this.
3336 * For non-UTF8, we need a byte for each code point in the
3337 * range, minus the three that we've already allocated for: the
3338 * hyphen, the min, and the max. For UTF-8, we need this
3339 * plus an extra byte for each code point that occupies two
3340 * bytes (is variant) when in UTF-8 (except we've already
3341 * allocated for the end points, including if they are
3342 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3343 * platforms, it's easy to calculate a precise number. To
3344 * start, we count the variants in the range, which we need
3345 * elsewhere in this function anyway. (For the case where it
3346 * isn't easy to calculate, 'extras' has been initialized to 0,
3347 * and the calculation is done in a loop further down.) */
3348#ifdef EBCDIC
3349 if (convert_unicode)
3350#endif
3351 {
3352 /* This is executed unconditionally on ASCII, and for
3353 * Unicode ranges on EBCDIC. Under these conditions, all
3354 * code points above a certain value are variant; and none
3355 * under that value are. We just need to find out how much
3356 * of the range is above that value. We don't count the
3357 * end points here, as they will already have been counted
3358 * as they were parsed. */
3359 if (range_min >= UTF_CONTINUATION_MARK) {
3360
3361 /* The whole range is made up of variants */
3362 extras = (range_max - 1) - (range_min + 1) + 1;
3363 }
3364 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3365
5ca37d54
KW
3366 /* Only the higher portion of the range is variants */
3367 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3368 }
f4240379 3369
5ca37d54
KW
3370 utf8_variant_count += extras;
3371 }
3372
3373 /* The base growth is the number of code points in the range,
3374 * not including the endpoints, which have already been sized
3375 * for (and output). We don't subtract for the hyphen, as it
3376 * has been parsed but not output, and the SvGROW below is
3377 * based only on what's been output plus what's left to parse.
3378 * */
3379 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3380
ed2893cb 3381 if (d_is_utf8) {
4c3a8340 3382#ifdef EBCDIC
5ca37d54
KW
3383 /* In some cases in EBCDIC, we haven't yet calculated a
3384 * precise amount needed for the UTF-8 variants. Just
3385 * assume the worst case, that everything will expand by a
3386 * byte */
3387 if (! convert_unicode) {
f4240379
KW
3388 grow *= 2;
3389 }
5ca37d54 3390 else
4c3a8340 3391#endif
5ca37d54
KW
3392 {
3393 /* Otherwise we know exactly how many variants there
3394 * are in the range. */
3395 grow += extras;
3396 }
f4240379
KW
3397 }
3398
5ca37d54
KW
3399 /* Grow, but position the output to overwrite the range min end
3400 * point, because in some cases we overwrite that */
3401 SvCUR_set(sv, d - SvPVX_const(sv));
3402 offset_to_min = min_ptr - SvPVX_const(sv);
3403
3404 /* See Note on sizing above. */
3405 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3406 + (send - s)
3407 + grow
3408 + 1 /* Trailing NUL */ );
f4240379 3409
5ca37d54 3410 /* Now, we can expand out the range. */
11327fa1 3411#ifdef EBCDIC
f4240379 3412 if (convert_unicode) {
5ca37d54 3413 SSize_t i;
02aa26ce 3414
f4240379
KW
3415 /* Recall that the min and max are now in Unicode terms, so
3416 * we have to convert each character to its native
3417 * equivalent */
ed2893cb 3418 if (d_is_utf8) {
f4240379 3419 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3420 append_utf8_from_native_byte(
3421 LATIN1_TO_NATIVE((U8) i),
3422 (U8 **) &d);
f4240379
KW
3423 }
3424 }
3425 else {
3426 for (i = range_min; i <= range_max; i++) {
81324705 3427 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3428 }
3429 }
01ec43d0 3430 }
11327fa1
AL
3431 else
3432#endif
3433 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3434 {
f4240379
KW
3435 /* Here, no conversions are necessary, which means that the
3436 * first character in the range is already in 'd' and
3437 * valid, so we can skip overwriting it */
ed2893cb 3438 if (d_is_utf8) {
19742f39 3439 SSize_t i;
f4240379
KW
3440 d += UTF8SKIP(d);
3441 for (i = range_min + 1; i <= range_max; i++) {
81324705 3442 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3443 }
3444 }
3445 else {
19742f39 3446 SSize_t i;
f4240379 3447 d++;
5ca37d54
KW
3448 assert(range_min + 1 <= range_max);
3449 for (i = range_min + 1; i < range_max; i++) {
3450#ifdef EBCDIC
3451 /* In this case on EBCDIC, we haven't calculated
3452 * the variants. Do it here, as we go along */
3453 if (! UVCHR_IS_INVARIANT(i)) {
3454 utf8_variant_count++;
3455 }
3456#endif
f4240379
KW
3457 *d++ = (char)i;
3458 }
5ca37d54
KW
3459
3460 /* The range_max is done outside the loop so as to
3461 * avoid having to special case not incrementing
3462 * 'utf8_variant_count' on EBCDIC (it's already been
3463 * counted when originally parsed) */
3464 *d++ = (char) range_max;
f4240379 3465 }
a0ed51b3 3466 }
02aa26ce 3467
11327fa1 3468#ifdef EBCDIC
02cd137d
KW
3469 /* If the original range extended above 255, add in that
3470 * portion. */
f4240379
KW
3471 if (real_range_max) {
3472 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3473 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3474 if (real_range_max > 0x100) {
3475 if (real_range_max > 0x101) {
dc8faf6b 3476 *d++ = (char) RANGE_INDICATOR;
043af6f6 3477 }
f4240379 3478 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3479 }
f4240379 3480 }
11327fa1 3481#endif
02aa26ce 3482
f4240379
KW
3483 range_done:
3484 /* mark the range as done, and continue */
3485 didrange = TRUE;
3486 dorange = FALSE;
3487#ifdef EBCDIC
3488 non_portable_endpoint = 0;
3489 backslash_N = 0;
3490#endif
3491 continue;
3492 } /* End of is a range */
3493 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3494 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3495 char *s1 = s-1;
3496 int esc = 0;
3497 while (s1 >= start && *s1-- == '\\')
3498 esc = !esc;
3499 if (!esc)
3500 in_charclass = TRUE;
3501 }
1e02a175 3502 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3503 char *s1 = s-1;
3504 int esc = 0;
3505 while (s1 >= start && *s1-- == '\\')
3506 esc = !esc;
3507 if (!esc)
3508 in_charclass = FALSE;
3509 }
02cd137d
KW
3510 /* skip for regexp comments /(?#comment)/, except for the last
3511 * char, which will be done separately. Stop on (?{..}) and
3512 * friends */
c30fc27b 3513 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3514 if (s[2] == '#') {
f339d50e
KW
3515 if (s_is_utf8) {
3516 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3517
3518 while (s + len < send && *s != ')') {
3519 Copy(s, d, len, U8);
3520 d += len;
3521 s += len;
3522 len = UTF8_SAFE_SKIP(s, send);
3523 }
3524 }
3525 else while (s+1 < send && *s != ')') {
3526 *d++ = *s++;
3527 }
155aba94 3528 }