This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
update the links to freenode
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
f0e67a1d
Z
26This is the lower layer of the Perl parser, managing characters and tokens.
27
78342678 28=for apidoc AmnU|yy_parser *|PL_parser
f0e67a1d
Z
29
30Pointer to a structure encapsulating the state of the parsing operation
31currently in progress. The pointer can be locally changed to perform
32a nested parse without interfering with the state of an outer parse.
33Individual members of C<PL_parser> have their own documentation.
34
35=cut
36*/
37
378cc40b 38#include "EXTERN.h"
864dbfa3 39#define PERL_IN_TOKE_C
378cc40b 40#include "perl.h"
f1bcae08 41#include "invlist_inline.h"
378cc40b 42
164e423c
KW
43#define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
eb0d8d16 45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
199e78b7
DM
48/* XXX temporary backwards compatibility */
49#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
50#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
52#define PL_lex_brackstack (PL_parser->lex_brackstack)
53#define PL_lex_casemods (PL_parser->lex_casemods)
54#define PL_lex_casestack (PL_parser->lex_casestack)
199e78b7 55#define PL_lex_dojoin (PL_parser->lex_dojoin)
199e78b7
DM
56#define PL_lex_formbrack (PL_parser->lex_formbrack)
57#define PL_lex_inpat (PL_parser->lex_inpat)
58#define PL_lex_inwhat (PL_parser->lex_inwhat)
59#define PL_lex_op (PL_parser->lex_op)
60#define PL_lex_repl (PL_parser->lex_repl)
61#define PL_lex_starts (PL_parser->lex_starts)
62#define PL_lex_stuff (PL_parser->lex_stuff)
63#define PL_multi_start (PL_parser->multi_start)
64#define PL_multi_open (PL_parser->multi_open)
65#define PL_multi_close (PL_parser->multi_close)
199e78b7 66#define PL_preambled (PL_parser->preambled)
bdc0bf6f 67#define PL_linestr (PL_parser->linestr)
c2598295
DM
68#define PL_expect (PL_parser->expect)
69#define PL_copline (PL_parser->copline)
f06b5848
DM
70#define PL_bufptr (PL_parser->bufptr)
71#define PL_oldbufptr (PL_parser->oldbufptr)
72#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73#define PL_linestart (PL_parser->linestart)
74#define PL_bufend (PL_parser->bufend)
75#define PL_last_uni (PL_parser->last_uni)
76#define PL_last_lop (PL_parser->last_lop)
77#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 78#define PL_lex_state (PL_parser->lex_state)
2f9285f8 79#define PL_rsfp (PL_parser->rsfp)
5486870f 80#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
81#define PL_in_my (PL_parser->in_my)
82#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 83#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 84#define PL_multi_end (PL_parser->multi_end)
13765c85 85#define PL_error_count (PL_parser->error_count)
199e78b7 86
fb205e7a
DM
87# define PL_nexttoke (PL_parser->nexttoke)
88# define PL_nexttype (PL_parser->nexttype)
89# define PL_nextval (PL_parser->nextval)
199e78b7 90
6432a58a
DM
91
92#define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
a1894d81 96static const char* const ident_too_long = "Identifier too long";
60267e1d 97static const char* const ident_var_zero_multi_digit = "Numeric variables with more than one digit may not start with '0'";
8903cb82 98
9ded7720 99# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2 100
a7aaec61
Z
101#define XENUMMASK 0x3f
102#define XFAKEEOF 0x40
103#define XFAKEBRACK 0x80
9059aa12 104
39e02b42 105#ifdef USE_UTF8_SCRIPTS
b3041197 106# define UTF cBOOL(!IN_BYTES)
2b9d42f0 107#else
b3041197 108# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 109#endif
a0ed51b3 110
b1fc3636
CJ
111/* The maximum number of characters preceding the unrecognized one to display */
112#define UNRECOGNIZED_PRECEDE_COUNT 10
113
61f0cdd9 114/* In variables named $^X, these are the legal values for X.
2b92dfce 115 * 1999-02-27 mjd-perl-patch@plover.com */
4aada8b9 116#define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
2b92dfce 117
14bd96d0 118#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 119
9ff909cf
JH
120#define HEXFP_PEEK(s) \
121 (((s[0] == '.') && \
122 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
123 isALPHA_FOLD_EQ(s[0], 'p'))
124
ffb4593c
NT
125/* LEX_* are values for PL_lex_state, the state of the lexer.
126 * They are arranged oddly so that the guard on the switch statement
79072805 127 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
128 *
129 * These values refer to the various states within a sublex parse,
130 * i.e. within a double quotish string
79072805
LW
131 */
132
fb73857a
PP
133/* #define LEX_NOTPARSING 11 is done in perl.h. */
134
b6007c36
DM
135#define LEX_NORMAL 10 /* normal code (ie not within "...") */
136#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
137#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
138#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
139#define LEX_INTERPSTART 6 /* expecting the start of a $var */
140
141 /* at end of code, eg "$x" followed by: */
142#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
143#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
144
145#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
146 string or after \E, $foo, etc */
147#define LEX_INTERPCONST 2 /* NOT USED */
148#define LEX_FORMLINE 1 /* expecting a format line */
b6007c36 149
56d9fe2f
TC
150/* returned to yyl_try() to request it to retry the parse loop, expected to only
151 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
152 can also return it.
153
154 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
155 other token values are 258 or higher (see perly.h), so -1 should be
156 a safe value here.
157*/
158#define YYL_RETRY (-1)
79072805 159
bbf60fe6 160#ifdef DEBUGGING
27da23d5 161static const char* const lex_state_names[] = {
bbf60fe6
DM
162 "KNOWNEXT",
163 "FORMLINE",
164 "INTERPCONST",
165 "INTERPCONCAT",
166 "INTERPENDMAYBE",
167 "INTERPEND",
168 "INTERPSTART",
169 "INTERPPUSH",
170 "INTERPCASEMOD",
171 "INTERPNORMAL",
172 "NORMAL"
173};
174#endif
175
79072805 176#include "keywords.h"
fe14fcc3 177
ffb4593c
NT
178/* CLINE is a macro that ensures PL_copline has a sane value */
179
57843af0 180#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 181
ffb4593c
NT
182/*
183 * Convenience functions to return different tokens and prime the
9cbb5ea2 184 * lexer for the next token. They all take an argument.
ffb4593c
NT
185 *
186 * TOKEN : generic token (used for '(', DOLSHARP, etc)
187 * OPERATOR : generic operator
188 * AOPERATOR : assignment operator
189 * PREBLOCK : beginning the block after an if, while, foreach, ...
190 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
191 * PREREF : *EXPR where EXPR is not a simple identifier
192 * TERM : expression term
89f35911 193 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
ffb4593c
NT
194 * LOOPX : loop exiting command (goto, last, dump, etc)
195 * FTST : file test operator
196 * FUN0 : zero-argument function
7eb971ee 197 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 198 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
199 * BOop : bitwise or or xor
200 * BAop : bitwise and
8823cb89 201 * BCop : bitwise complement
ffb4593c
NT
202 * SHop : shift operator
203 * PWop : power operator
9cbb5ea2 204 * PMop : pattern-matching operator
ffb4593c 205 * Aop : addition-level operator
e4916dd1 206 * AopNOASSIGN : addition-level operator that is never part of .=
ffb4593c 207 * Mop : multiplication-level operator
02b85d3d
Z
208 * ChEop : chaining equality-testing operator
209 * NCEop : non-chaining comparison operator at equality precedence
210 * ChRop : chaining relational operator <= != gt
211 * NCRop : non-chaining relational operator isa
ffb4593c
NT
212 *
213 * Also see LOP and lop() below.
214 */
215
998054bd 216#ifdef DEBUGGING /* Serve -DT. */
704d4215 217# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 218#else
bbf60fe6 219# define REPORT(retval) (retval)
998054bd
SC
220#endif
221
bbf60fe6
DM
222#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
223#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
b1764551 224#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
bbf60fe6
DM
225#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
226#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
227#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
228#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
89f35911 229#define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
185c2e96 230#define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
7a61bf3c 231 pl_yylval.ival=f, \
a49203fd 232 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
7a61bf3c 233 REPORT((int)LOOPEX))
6154021b
RGS
234#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
235#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 236#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b 237#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
b1764551
FC
238#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
239#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
8823cb89 240#define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
3d92c6b8 241 REPORT(PERLY_TILDE)
b1764551
FC
242#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
243#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
6154021b 244#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
b1764551 245#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
e4916dd1 246#define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
b1764551 247#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
02b85d3d
Z
248#define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
249#define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
250#define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
251#define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
2f3197b3 252
a687059c
LW
253/* This bit of chicanery makes a unary function followed by
254 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
255 * The UNIDOR macro is for unary functions that can be followed by the //
256 * operator (such as C<shift // 0>).
a687059c 257 */
d68ce4ac 258#define UNI3(f,x,have_x) { \
6154021b 259 pl_yylval.ival = f; \
d68ce4ac 260 if (have_x) PL_expect = x; \
376fcdbf
AL
261 PL_bufptr = s; \
262 PL_last_uni = PL_oldbufptr; \
0af40c75 263 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
376fcdbf
AL
264 if (*s == '(') \
265 return REPORT( (int)FUNC1 ); \
294a536f 266 s = skipspace(s); \
376fcdbf
AL
267 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
268 }
d68ce4ac
FC
269#define UNI(f) UNI3(f,XTERM,1)
270#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
271#define UNIPROTO(f,optional) { \
272 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
273 OPERATOR(f); \
274 }
a687059c 275
d68ce4ac 276#define UNIBRACK(f) UNI3(f,0,0)
79072805 277
9f68db38 278/* grandfather return to old style */
78cdf107
Z
279#define OLDLOP(f) \
280 do { \
281 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
282 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
283 pl_yylval.ival = (f); \
284 PL_expect = XTERM; \
285 PL_bufptr = s; \
286 return (int)LSTOP; \
287 } while(0)
79072805 288
83944c01
FC
289#define COPLINE_INC_WITH_HERELINES \
290 STMT_START { \
291 CopLINE_inc(PL_curcop); \
851b527a
FC
292 if (PL_parser->herelines) \
293 CopLINE(PL_curcop) += PL_parser->herelines, \
294 PL_parser->herelines = 0; \
83944c01 295 } STMT_END
ffdb8b16
FC
296/* Called after scan_str to update CopLINE(PL_curcop), but only when there
297 * is no sublex_push to follow. */
298#define COPLINE_SET_FROM_MULTI_END \
299 STMT_START { \
300 CopLINE_set(PL_curcop, PL_multi_end); \
301 if (PL_multi_end != PL_multi_start) \
851b527a 302 PL_parser->herelines = 0; \
ffdb8b16 303 } STMT_END
83944c01
FC
304
305
539c839f
AC
306/* A file-local structure for passing around information about subroutines and
307 * related definable words */
308struct code {
309 SV *sv;
310 CV *cv;
311 GV *gv, **gvp;
312 OP *rv2cv_op;
313 PADOFFSET off;
314 bool lex;
315};
316
317static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
318
8fa7f367
JH
319#ifdef DEBUGGING
320
6154021b 321/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
322enum token_type {
323 TOKENTYPE_NONE,
324 TOKENTYPE_IVAL,
6154021b 325 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 326 TOKENTYPE_PVAL,
aeaef349 327 TOKENTYPE_OPVAL
bbf60fe6
DM
328};
329
c588171e
BZ
330#define DEBUG_TOKEN(Type, Name) \
331 { Name, TOKENTYPE_##Type, #Name }
332
6d4a66ac
NC
333static struct debug_tokens {
334 const int token;
335 enum token_type type;
336 const char *name;
337} const debug_tokens[] =
9041c2e3 338{
bbf60fe6
DM
339 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
340 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
341 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
342 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
436ddf68 343 { ANON_SIGSUB, TOKENTYPE_IVAL, "ANON_SIGSUB" },
bbf60fe6
DM
344 { ARROW, TOKENTYPE_NONE, "ARROW" },
345 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
346 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
347 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
02b85d3d
Z
348 { CHEQOP, TOKENTYPE_OPNUM, "CHEQOP" },
349 { CHRELOP, TOKENTYPE_OPNUM, "CHRELOP" },
bbf60fe6
DM
350 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
351 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
7896dde7 352 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
353 { DO, TOKENTYPE_NONE, "DO" },
354 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
355 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
bbf60fe6
DM
356 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
357 { ELSE, TOKENTYPE_NONE, "ELSE" },
358 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
bbf60fe6
DM
359 { FOR, TOKENTYPE_IVAL, "FOR" },
360 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
361 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
362 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
363 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
364 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 365 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
366 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
367 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
368 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 369 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
370 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
371 { IF, TOKENTYPE_IVAL, "IF" },
01719201 372 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
373 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
374 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
375 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
376 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
377 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
378 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
379 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
380 { MY, TOKENTYPE_IVAL, "MY" },
02b85d3d
Z
381 { NCEQOP, TOKENTYPE_OPNUM, "NCEQOP" },
382 { NCRELOP, TOKENTYPE_OPNUM, "NCRELOP" },
bbf60fe6
DM
383 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
384 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
385 { OROP, TOKENTYPE_IVAL, "OROP" },
386 { OROR, TOKENTYPE_NONE, "OROR" },
387 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
25a50500 388 DEBUG_TOKEN (IVAL, PERLY_AMPERSAND),
d0a6a9c7 389 DEBUG_TOKEN (IVAL, PERLY_BRACE_CLOSE),
c588171e 390 DEBUG_TOKEN (IVAL, PERLY_BRACE_OPEN),
fceeeb77 391 DEBUG_TOKEN (IVAL, PERLY_BRACKET_CLOSE),
669dd22c 392 DEBUG_TOKEN (IVAL, PERLY_BRACKET_OPEN),
3d9ccdfc 393 DEBUG_TOKEN (IVAL, PERLY_COLON),
581f9a7a 394 DEBUG_TOKEN (IVAL, PERLY_COMMA),
da4bce7d 395 DEBUG_TOKEN (IVAL, PERLY_DOT),
db83e45c 396 DEBUG_TOKEN (IVAL, PERLY_EQUAL_SIGN),
1c2e9449 397 DEBUG_TOKEN (IVAL, PERLY_EXCLAMATION_MARK),
68a66a8b 398 DEBUG_TOKEN (IVAL, PERLY_MINUS),
ee67f254 399 DEBUG_TOKEN (IVAL, PERLY_PAREN_OPEN),
0ba95c59 400 DEBUG_TOKEN (IVAL, PERLY_PERCENT_SIGN),
5776f3e5 401 DEBUG_TOKEN (IVAL, PERLY_PLUS),
a1ad62bf 402 DEBUG_TOKEN (IVAL, PERLY_QUESTION_MARK),
5adeeefb 403 DEBUG_TOKEN (IVAL, PERLY_SEMICOLON),
77b0379f 404 DEBUG_TOKEN (IVAL, PERLY_SLASH),
9086c946 405 DEBUG_TOKEN (IVAL, PERLY_SNAIL),
d02b2fbf 406 DEBUG_TOKEN (IVAL, PERLY_STAR),
3d92c6b8 407 DEBUG_TOKEN (IVAL, PERLY_TILDE),
88e1f1a2
JV
408 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
409 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6 410 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
cc624add 411 { POSTJOIN, TOKENTYPE_NONE, "POSTJOIN" },
bbf60fe6
DM
412 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
413 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
414 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
415 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
416 { PREINC, TOKENTYPE_NONE, "PREINC" },
417 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 418 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6 419 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
f3f204dc 420 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6 421 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
436ddf68 422 { SIGSUB, TOKENTYPE_NONE, "SIGSUB" },
bbf60fe6 423 { SUB, TOKENTYPE_NONE, "SUB" },
69afcc21
TC
424 { SUBLEXEND, TOKENTYPE_NONE, "SUBLEXEND" },
425 { SUBLEXSTART, TOKENTYPE_NONE, "SUBLEXSTART" },
bbf60fe6
DM
426 { THING, TOKENTYPE_OPVAL, "THING" },
427 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
428 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
429 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
430 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
431 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
432 { USE, TOKENTYPE_IVAL, "USE" },
7896dde7 433 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6 434 { WHILE, TOKENTYPE_IVAL, "WHILE" },
185c2e96 435 { BAREWORD, TOKENTYPE_OPVAL, "BAREWORD" },
be25f609 436 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 437 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
438};
439
c588171e
BZ
440#undef DEBUG_TOKEN
441
6154021b 442/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 443
bbf60fe6 444STATIC int
704d4215 445S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 446{
7918f24d
NC
447 PERL_ARGS_ASSERT_TOKEREPORT;
448
bbf60fe6 449 if (DEBUG_T_TEST) {
bd61b366 450 const char *name = NULL;
bbf60fe6 451 enum token_type type = TOKENTYPE_NONE;
f54cb97a 452 const struct debug_tokens *p;
396482e1 453 SV* const report = newSVpvs("<== ");
bbf60fe6 454
f54cb97a 455 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
456 if (p->token == (int)rv) {
457 name = p->name;
458 type = p->type;
459 break;
460 }
461 }
462 if (name)
54667de8 463 Perl_sv_catpv(aTHX_ report, name);
239f83d5 464 else if (isGRAPH(rv))
4ebc7986 465 {
bbf60fe6 466 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
467 if ((char)rv == 'p')
468 sv_catpvs(report, " (pending identifier)");
469 }
bbf60fe6 470 else if (!rv)
396482e1 471 sv_catpvs(report, "EOF");
bbf60fe6 472 else
147e3846 473 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
bbf60fe6
DM
474 switch (type) {
475 case TOKENTYPE_NONE:
bbf60fe6
DM
476 break;
477 case TOKENTYPE_IVAL:
147e3846 478 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
bbf60fe6
DM
479 break;
480 case TOKENTYPE_OPNUM:
481 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 482 PL_op_name[lvalp->ival]);
bbf60fe6
DM
483 break;
484 case TOKENTYPE_PVAL:
704d4215 485 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
486 break;
487 case TOKENTYPE_OPVAL:
704d4215 488 if (lvalp->opval) {
401441c0 489 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
490 PL_op_name[lvalp->opval->op_type]);
491 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 492 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 493 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
494 }
495
496 }
401441c0 497 else
396482e1 498 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
499 break;
500 }
b6007c36 501 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
502 };
503 return (int)rv;
998054bd
SC
504}
505
b6007c36
DM
506
507/* print the buffer with suitable escapes */
508
509STATIC void
15f169a1 510S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 511{
396482e1 512 SV* const tmp = newSVpvs("");
7918f24d
NC
513
514 PERL_ARGS_ASSERT_PRINTBUF;
515
7347ee54 516 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
b6007c36 517 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
7347ee54 518 GCC_DIAG_RESTORE_STMT;
b6007c36
DM
519 SvREFCNT_dec(tmp);
520}
521
8fa7f367
JH
522#endif
523
ffb4593c
NT
524/*
525 * S_ao
526 *
f393a21a
FC
527 * This subroutine looks for an '=' next to the operator that has just been
528 * parsed and turns it into an ASSIGNOP if it finds one.
ffb4593c
NT
529 */
530
76e3520e 531STATIC int
cea2e8a9 532S_ao(pTHX_ int toketype)
a0d0e21e 533{
3280af22
NIS
534 if (*PL_bufptr == '=') {
535 PL_bufptr++;
a0d0e21e 536 if (toketype == ANDAND)
6154021b 537 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 538 else if (toketype == OROR)
6154021b 539 pl_yylval.ival = OP_ORASSIGN;
c963b151 540 else if (toketype == DORDOR)
6154021b 541 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
542 toketype = ASSIGNOP;
543 }
b1764551 544 return REPORT(toketype);
a0d0e21e
LW
545}
546
ffb4593c
NT
547/*
548 * S_no_op
549 * When Perl expects an operator and finds something else, no_op
550 * prints the warning. It always prints "<something> found where
551 * operator expected. It prints "Missing semicolon on previous line?"
552 * if the surprise occurs at the start of the line. "do you need to
553 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
554 * where the compiler doesn't know if foo is a method call or a function.
555 * It prints "Missing operator before end of line" if there's nothing
556 * after the missing operator, or "... before <...>" if there is something
557 * after the missing operator.
488bc579
FC
558 *
559 * PL_bufptr is expected to point to the start of the thing that was found,
560 * and s after the next token or partial token.
ffb4593c
NT
561 */
562
76e3520e 563STATIC void
15f169a1 564S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 565{
9d4ba2ae
AL
566 char * const oldbp = PL_bufptr;
567 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 568
7918f24d
NC
569 PERL_ARGS_ASSERT_NO_OP;
570
1189a94a
GS
571 if (!s)
572 s = oldbp;
07c798fb 573 else
1189a94a 574 PL_bufptr = s;
734ab321 575 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
576 if (ckWARN_d(WARN_SYNTAX)) {
577 if (is_first)
578 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
579 "\t(Missing semicolon on previous line?)\n");
fac0f7a3
KW
580 else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
581 PL_bufend,
582 UTF))
583 {
f54cb97a 584 const char *t;
fac0f7a3
KW
585 for (t = PL_oldoldbufptr;
586 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
587 t += UTF ? UTF8SKIP(t) : 1)
588 {
c35e046a 589 NOOP;
fac0f7a3 590 }
56da5a46
RGS
591 if (t < PL_bufptr && isSPACE(*t))
592 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 593 "\t(Do you need to predeclare %" UTF8f "?)\n",
b17a0679 594 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
595 }
596 else {
597 assert(s >= oldbp);
598 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
147e3846 599 "\t(Missing operator before %" UTF8f "?)\n",
b17a0679 600 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 601 }
07c798fb 602 }
3280af22 603 PL_bufptr = oldbp;
8990e307
LW
604}
605
ffb4593c
NT
606/*
607 * S_missingterm
608 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 609 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
610 * If we're in a delimited string and the delimiter is a control
611 * character, it's reformatted into a two-char sequence like ^C.
612 * This is fatal.
613 */
614
76e3520e 615STATIC void
1b8d3e0e 616S_missingterm(pTHX_ char *s, STRLEN len)
8990e307 617{
e487ff5e 618 char tmpbuf[UTF8_MAXBYTES + 1];
8990e307 619 char q;
cb650135
FC
620 bool uni = FALSE;
621 SV *sv;
8990e307 622 if (s) {
f88b23d7 623 char * const nl = (char *) my_memrchr(s, '\n', len);
1b8d3e0e
LM
624 if (nl) {
625 *nl = '\0';
626 len = nl - s;
627 }
7f7f7d08 628 uni = UTF;
8990e307 629 }
cb650135 630 else if (PL_multi_close < 32) {
8990e307 631 *tmpbuf = '^';
585ec06d 632 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
633 tmpbuf[2] = '\0';
634 s = tmpbuf;
1b8d3e0e 635 len = 2;
8990e307
LW
636 }
637 else {
cb650135
FC
638 if (LIKELY(PL_multi_close < 256)) {
639 *tmpbuf = (char)PL_multi_close;
640 tmpbuf[1] = '\0';
1b8d3e0e 641 len = 1;
cb650135
FC
642 }
643 else {
1b8d3e0e
LM
644 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
645 *end = '\0';
646 len = end - tmpbuf;
cb650135 647 uni = TRUE;
cb650135 648 }
8990e307
LW
649 s = tmpbuf;
650 }
9f8d27b7 651 q = memchr(s, '"', len) ? '\'' : '"';
1b8d3e0e 652 sv = sv_2mortal(newSVpvn(s, len));
cb650135
FC
653 if (uni)
654 SvUTF8_on(sv);
1b8d3e0e
LM
655 Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
656 " anywhere before EOF", q, SVfARG(sv), q);
463ee0b2 657}
79072805 658
dd0ac2b9
FC
659#include "feature.h"
660
0d863452 661/*
9cbb5ea2
GS
662 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
663 * utf16-to-utf8-reversed.
ffb4593c
NT
664 */
665
c39cd008
GS
666#ifdef PERL_CR_FILTER
667static void
668strip_return(SV *sv)
669{
eb578fdb
KW
670 const char *s = SvPVX_const(sv);
671 const char * const e = s + SvCUR(sv);
7918f24d
NC
672
673 PERL_ARGS_ASSERT_STRIP_RETURN;
674
c39cd008
GS
675 /* outer loop optimized to do nothing if there are no CR-LFs */
676 while (s < e) {
677 if (*s++ == '\r' && *s == '\n') {
678 /* hit a CR-LF, need to copy the rest */
eb578fdb 679 char *d = s - 1;
c39cd008
GS
680 *d++ = *s++;
681 while (s < e) {
682 if (*s == '\r' && s[1] == '\n')
683 s++;
684 *d++ = *s++;
685 }
686 SvCUR(sv) -= s - d;
687 return;
688 }
689 }
690}
a868473f 691
76e3520e 692STATIC I32
c39cd008 693S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 694{
f54cb97a 695 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
696 if (count > 0 && !maxlen)
697 strip_return(sv);
698 return count;
a868473f
NIS
699}
700#endif
701
ffb4593c 702/*
44170c9a 703=for apidoc lex_start
8eaa0acf
Z
704
705Creates and initialises a new lexer/parser state object, supplying
706a context in which to lex and parse from a new source of Perl code.
707A pointer to the new state object is placed in L</PL_parser>. An entry
c83d5090 708is made on the save stack so that upon unwinding, the new state object
8eaa0acf
Z
709will be destroyed and the former value of L</PL_parser> will be restored.
710Nothing else need be done to clean up the parsing context.
711
2d7f6611 712The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
8eaa0acf 713non-null, provides a string (in SV form) containing code to be parsed.
2d7f6611
KW
714A copy of the string is made, so subsequent modification of C<line>
715does not affect parsing. C<rsfp>, if non-null, provides an input stream
8eaa0acf 716from which code will be read to be parsed. If both are non-null, the
2d7f6611
KW
717code in C<line> comes first and must consist of complete lines of input,
718and C<rsfp> supplies the remainder of the source.
8eaa0acf 719
2d7f6611 720The C<flags> parameter is reserved for future use. Currently it is only
e368b3bd 721used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
722
723=cut
724*/
ffb4593c 725
27fcb6ee 726/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
727 can share filters with the current parser.
728 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
729 caller, hence isn't owned by the parser, so shouldn't be closed on parser
730 destruction. This is used to handle the case of defaulting to reading the
731 script from the standard input because no filename was given on the command
732 line (without getting confused by situation where STDIN has been closed, so
733 the script handle is opened on fd 0) */
27fcb6ee 734
a0d0e21e 735void
8eaa0acf 736Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 737{
6ef55633 738 const char *s = NULL;
5486870f 739 yy_parser *parser, *oparser;
90b58c70 740
60d63348 741 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 742 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
743
744 /* create and initialise a parser */
745
199e78b7 746 Newxz(parser, 1, yy_parser);
5486870f 747 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
748 PL_parser = parser;
749
28ac2b49 750 parser->stack = NULL;
df13534a 751 parser->stack_max1 = NULL;
28ac2b49 752 parser->ps = NULL;
acdf0a21 753
e3abe207
DM
754 /* on scope exit, free this parser and restore any outer one */
755 SAVEPARSER(parser);
7c4baf47 756 parser->saved_curcop = PL_curcop;
e3abe207 757
acdf0a21 758 /* initialise lexer state */
8990e307 759
fb205e7a 760 parser->nexttoke = 0;
ca4cfd28 761 parser->error_count = oparser ? oparser->error_count : 0;
7f1c3e8c 762 parser->copline = parser->preambling = NOLINE;
5afb0a62 763 parser->lex_state = LEX_NORMAL;
c2598295 764 parser->expect = XSTATE;
2f9285f8 765 parser->rsfp = rsfp;
b7b52646 766 parser->recheck_utf8_validity = TRUE;
27fcb6ee
FC
767 parser->rsfp_filters =
768 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
769 ? NULL
770 : MUTABLE_AV(SvREFCNT_inc(
771 oparser->rsfp_filters
772 ? oparser->rsfp_filters
773 : (oparser->rsfp_filters = newAV())
774 ));
2f9285f8 775
199e78b7
DM
776 Newx(parser->lex_brackstack, 120, char);
777 Newx(parser->lex_casestack, 12, char);
778 *parser->lex_casestack = '\0';
d794b522 779 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 780
10efb74f 781 if (line) {
0528fd32 782 STRLEN len;
a6909bd2
KW
783 const U8* first_bad_char_loc;
784
10efb74f 785 s = SvPV_const(line, len);
90b58c70 786
0aab20f2
KW
787 if ( SvUTF8(line)
788 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
789 SvCUR(line),
790 &first_bad_char_loc)))
90b58c70
KW
791 {
792 _force_out_malformed_utf8_message(first_bad_char_loc,
793 (U8 *) s + SvCUR(line),
794 0,
795 1 /* 1 means die */ );
796 NOT_REACHED; /* NOTREACHED */
797 }
798
0abcdfa4
FC
799 parser->linestr = flags & LEX_START_COPIED
800 ? SvREFCNT_inc_simple_NN(line)
801 : newSVpvn_flags(s, len, SvUTF8(line));
b3dd0aba
FC
802 if (!rsfp)
803 sv_catpvs(parser->linestr, "\n;");
0abcdfa4 804 } else {
bf1b738b 805 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
8990e307 806 }
218304f9 807
f06b5848
DM
808 parser->oldoldbufptr =
809 parser->oldbufptr =
810 parser->bufptr =
811 parser->linestart = SvPVX(parser->linestr);
812 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
813 parser->last_lop = parser->last_uni = NULL;
b54f893d 814
6d59e610 815 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
b54f893d
KW
816 |LEX_DONT_CLOSE_RSFP));
817 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
818 |LEX_DONT_CLOSE_RSFP));
737c24fc 819
60d63348 820 parser->in_pod = parser->filtered = 0;
79072805 821}
a687059c 822
e3abe207
DM
823
824/* delete a parser object */
825
826void
827Perl_parser_free(pTHX_ const yy_parser *parser)
828{
7918f24d
NC
829 PERL_ARGS_ASSERT_PARSER_FREE;
830
7c4baf47 831 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
832 SvREFCNT_dec(parser->linestr);
833
87606032 834 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 835 PerlIO_clearerr(parser->rsfp);
407f8cf2
KW
836 else if (parser->rsfp && (!parser->old_parser
837 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 838 PerlIO_close(parser->rsfp);
5486870f 839 SvREFCNT_dec(parser->rsfp_filters);
10002bc1 840 SvREFCNT_dec(parser->lex_stuff);
7ef70b3d 841 SvREFCNT_dec(parser->lex_sub_repl);
3ac7ff8f
FC
842
843 Safefree(parser->lex_brackstack);
844 Safefree(parser->lex_casestack);
845 Safefree(parser->lex_shared);
846 PL_parser = parser->old_parser;
847 Safefree(parser);
848}
849
850void
851Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
852{
3ac7ff8f 853 I32 nexttoke = parser->nexttoke;
3ac7ff8f 854 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9 855 while (nexttoke--) {
3ac7ff8f
FC
856 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
857 && parser->nextval[nexttoke].opval
858 && parser->nextval[nexttoke].opval->op_slabbed
859 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 860 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
861 parser->nextval[nexttoke].opval = NULL;
862 }
3ce3dcd9 863 }
e3abe207
DM
864}
865
866
ffb4593c 867/*
78342678 868=for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
f0e67a1d
Z
869
870Buffer scalar containing the chunk currently under consideration of the
871text currently being lexed. This is always a plain string scalar (for
872which C<SvPOK> is true). It is not intended to be used as a scalar by
873normal scalar means; instead refer to the buffer directly by the pointer
874variables described below.
875
876The lexer maintains various C<char*> pointers to things in the
877C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
878reallocated, all of these pointers must be updated. Don't attempt to
879do this manually, but rather use L</lex_grow_linestr> if you need to
880reallocate the buffer.
881
882The content of the text chunk in the buffer is commonly exactly one
883complete line of input, up to and including a newline terminator,
884but there are situations where it is otherwise. The octets of the
885buffer may be intended to be interpreted as either UTF-8 or Latin-1.
886The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
887flag on this scalar, which may disagree with it.
888
889For direct examination of the buffer, the variable
890L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
891lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
892of these pointers is usually preferable to examination of the scalar
893through normal scalar means.
894
78342678 895=for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
f0e67a1d
Z
896
897Direct pointer to the end of the chunk of text currently being lexed, the
898end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
6602b933 899+ SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
f0e67a1d
Z
900always located at the end of the buffer, and does not count as part of
901the buffer's contents.
902
78342678 903=for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
f0e67a1d
Z
904
905Points to the current position of lexing inside the lexer buffer.
906Characters around this point may be freely examined, within
907the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
908L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
909interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
910
911Lexing code (whether in the Perl core or not) moves this pointer past
912the characters that it consumes. It is also expected to perform some
913bookkeeping whenever a newline character is consumed. This movement
914can be more conveniently performed by the function L</lex_read_to>,
915which handles newlines appropriately.
916
917Interpretation of the buffer's octets can be abstracted out by
918using the slightly higher-level functions L</lex_peek_unichar> and
919L</lex_read_unichar>.
920
78342678 921=for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
f0e67a1d
Z
922
923Points to the start of the current line inside the lexer buffer.
924This is useful for indicating at which column an error occurred, and
925not much else. This must be updated by any lexing code that consumes
926a newline; the function L</lex_read_to> handles this detail.
927
928=cut
929*/
930
931/*
44170c9a 932=for apidoc lex_bufutf8
f0e67a1d
Z
933
934Indicates whether the octets in the lexer buffer
935(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
936of Unicode characters. If not, they should be interpreted as Latin-1
937characters. This is analogous to the C<SvUTF8> flag for scalars.
938
939In UTF-8 mode, it is not guaranteed that the lexer buffer actually
940contains valid UTF-8. Lexing code must be robust in the face of invalid
941encoding.
942
943The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
944is significant, but not the whole story regarding the input character
945encoding. Normally, when a file is being read, the scalar contains octets
946and its C<SvUTF8> flag is off, but the octets should be interpreted as
947UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
948however, the scalar may have the C<SvUTF8> flag on, and in this case its
949octets should be interpreted as UTF-8 unless the C<use bytes> pragma
950is in effect. This logic may change in the future; use this function
951instead of implementing the logic yourself.
952
953=cut
954*/
955
956bool
957Perl_lex_bufutf8(pTHX)
958{
959 return UTF;
960}
961
962/*
44170c9a 963=for apidoc lex_grow_linestr
f0e67a1d
Z
964
965Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
2d7f6611 966at least C<len> octets (including terminating C<NUL>). Returns a
f0e67a1d
Z
967pointer to the reallocated buffer. This is necessary before making
968any direct modification of the buffer that would increase its length.
969L</lex_stuff_pvn> provides a more convenient way to insert text into
970the buffer.
971
972Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
973this function updates all of the lexer's variables that point directly
974into the buffer.
975
976=cut
977*/
978
979char *
980Perl_lex_grow_linestr(pTHX_ STRLEN len)
981{
982 SV *linestr;
983 char *buf;
984 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 985 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
98d5e3ef
DM
986 bool current;
987
f0e67a1d
Z
988 linestr = PL_parser->linestr;
989 buf = SvPVX(linestr);
990 if (len <= SvLEN(linestr))
991 return buf;
98d5e3ef
DM
992
993 /* Is the lex_shared linestr SV the same as the current linestr SV?
994 * Only in this case does re_eval_start need adjusting, since it
995 * points within lex_shared->ls_linestr's buffer */
b1b8fb6a
DM
996 current = ( !PL_parser->lex_shared->ls_linestr
997 || linestr == PL_parser->lex_shared->ls_linestr);
98d5e3ef 998
f0e67a1d
Z
999 bufend_pos = PL_parser->bufend - buf;
1000 bufptr_pos = PL_parser->bufptr - buf;
1001 oldbufptr_pos = PL_parser->oldbufptr - buf;
1002 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1003 linestart_pos = PL_parser->linestart - buf;
1004 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1005 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
98d5e3ef 1006 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
3328ab5a 1007 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 1008
f0e67a1d 1009 buf = sv_grow(linestr, len);
c7641931 1010
f0e67a1d
Z
1011 PL_parser->bufend = buf + bufend_pos;
1012 PL_parser->bufptr = buf + bufptr_pos;
1013 PL_parser->oldbufptr = buf + oldbufptr_pos;
1014 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1015 PL_parser->linestart = buf + linestart_pos;
1016 if (PL_parser->last_uni)
1017 PL_parser->last_uni = buf + last_uni_pos;
1018 if (PL_parser->last_lop)
1019 PL_parser->last_lop = buf + last_lop_pos;
98d5e3ef 1020 if (current && PL_parser->lex_shared->re_eval_start)
3328ab5a 1021 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
1022 return buf;
1023}
1024
1025/*
44170c9a 1026=for apidoc lex_stuff_pvn
f0e67a1d
Z
1027
1028Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1029immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1030reallocating the buffer if necessary. This means that lexing code that
1031runs later will see the characters as if they had appeared in the input.
1032It is not recommended to do this as part of normal parsing, and most
1033uses of this facility run the risk of the inserted characters being
1034interpreted in an unintended manner.
1035
2d7f6611
KW
1036The string to be inserted is represented by C<len> octets starting
1037at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1038according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
f0e67a1d
Z
1039The characters are recoded for the lexer buffer, according to how the
1040buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 1041to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
1042function is more convenient.
1043
5af38e47
KW
1044=for apidoc Amnh||LEX_STUFF_UTF8
1045
f0e67a1d
Z
1046=cut
1047*/
1048
1049void
83aa740e 1050Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d
Z
1051{
1052 char *bufptr;
1053 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1054 if (flags & ~(LEX_STUFF_UTF8))
1055 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1056 if (UTF) {
1057 if (flags & LEX_STUFF_UTF8) {
1058 goto plain_copy;
1059 } else {
01ccd497
KW
1060 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1061 (U8 *) pv + len);
1062 const char *p, *e = pv+len;;
f0e67a1d
Z
1063 if (!highhalf)
1064 goto plain_copy;
1065 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1066 bufptr = PL_parser->bufptr;
1067 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1068 SvCUR_set(PL_parser->linestr,
1069 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1070 PL_parser->bufend += len+highhalf;
1071 for (p = pv; p != e; p++) {
27d0a47c 1072 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
f0e67a1d
Z
1073 }
1074 }
1075 } else {
1076 if (flags & LEX_STUFF_UTF8) {
1077 STRLEN highhalf = 0;
83aa740e 1078 const char *p, *e = pv+len;
f0e67a1d
Z
1079 for (p = pv; p != e; p++) {
1080 U8 c = (U8)*p;
54d004e8 1081 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1082 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1083 "non-Latin-1 character into Latin-1 input");
54d004e8 1084 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1085 p++;
1086 highhalf++;
f88c6466 1087 } else assert(UTF8_IS_INVARIANT(c));
f0e67a1d
Z
1088 }
1089 if (!highhalf)
1090 goto plain_copy;
1091 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1092 bufptr = PL_parser->bufptr;
1093 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1094 SvCUR_set(PL_parser->linestr,
1095 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1096 PL_parser->bufend += len-highhalf;
54d004e8
KW
1097 p = pv;
1098 while (p < e) {
1099 if (UTF8_IS_INVARIANT(*p)) {
1100 *bufptr++ = *p;
1101 p++;
f0e67a1d 1102 }
54d004e8
KW
1103 else {
1104 assert(p < e -1 );
a62b247b 1105 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
54d004e8
KW
1106 p += 2;
1107 }
f0e67a1d
Z
1108 }
1109 } else {
54d004e8 1110 plain_copy:
f0e67a1d
Z
1111 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1112 bufptr = PL_parser->bufptr;
1113 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1114 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1115 PL_parser->bufend += len;
1116 Copy(pv, bufptr, len, char);
1117 }
1118 }
1119}
1120
1121/*
44170c9a 1122=for apidoc lex_stuff_pv
9dcc53ea
Z
1123
1124Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1125immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1126reallocating the buffer if necessary. This means that lexing code that
1127runs later will see the characters as if they had appeared in the input.
1128It is not recommended to do this as part of normal parsing, and most
1129uses of this facility run the risk of the inserted characters being
1130interpreted in an unintended manner.
1131
2d7f6611 1132The string to be inserted is represented by octets starting at C<pv>
9dcc53ea
Z
1133and continuing to the first nul. These octets are interpreted as either
1134UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
2d7f6611 1135in C<flags>. The characters are recoded for the lexer buffer, according
9dcc53ea
Z
1136to how the buffer is currently being interpreted (L</lex_bufutf8>).
1137If it is not convenient to nul-terminate a string to be inserted, the
1138L</lex_stuff_pvn> function is more appropriate.
1139
1140=cut
1141*/
1142
1143void
1144Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1145{
1146 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1147 lex_stuff_pvn(pv, strlen(pv), flags);
1148}
1149
1150/*
44170c9a 1151=for apidoc lex_stuff_sv
f0e67a1d
Z
1152
1153Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1154immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1155reallocating the buffer if necessary. This means that lexing code that
1156runs later will see the characters as if they had appeared in the input.
1157It is not recommended to do this as part of normal parsing, and most
1158uses of this facility run the risk of the inserted characters being
1159interpreted in an unintended manner.
1160
2d7f6611 1161The string to be inserted is the string value of C<sv>. The characters
f0e67a1d 1162are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1163being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1164not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1165need to construct a scalar.
1166
1167=cut
1168*/
1169
1170void
1171Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1172{
1173 char *pv;
1174 STRLEN len;
1175 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1176 if (flags)
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1178 pv = SvPV(sv, len);
1179 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1180}
1181
1182/*
44170c9a 1183=for apidoc lex_unstuff
f0e67a1d
Z
1184
1185Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
2d7f6611 1186C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
f0e67a1d
Z
1187This hides the discarded text from any lexing code that runs later,
1188as if the text had never appeared.
1189
1190This is not the normal way to consume lexed text. For that, use
1191L</lex_read_to>.
1192
1193=cut
1194*/
1195
1196void
1197Perl_lex_unstuff(pTHX_ char *ptr)
1198{
1199 char *buf, *bufend;
1200 STRLEN unstuff_len;
1201 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1202 buf = PL_parser->bufptr;
1203 if (ptr < buf)
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1205 if (ptr == buf)
1206 return;
1207 bufend = PL_parser->bufend;
1208 if (ptr > bufend)
1209 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1210 unstuff_len = ptr - buf;
1211 Move(ptr, buf, bufend+1-ptr, char);
1212 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1213 PL_parser->bufend = bufend - unstuff_len;
1214}
1215
1216/*
44170c9a 1217=for apidoc lex_read_to
f0e67a1d
Z
1218
1219Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
2d7f6611 1220to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
f0e67a1d
Z
1221performing the correct bookkeeping whenever a newline character is passed.
1222This is the normal way to consume lexed text.
1223
1224Interpretation of the buffer's octets can be abstracted out by
1225using the slightly higher-level functions L</lex_peek_unichar> and
1226L</lex_read_unichar>.
1227
1228=cut
1229*/
1230
1231void
1232Perl_lex_read_to(pTHX_ char *ptr)
1233{
1234 char *s;
1235 PERL_ARGS_ASSERT_LEX_READ_TO;
1236 s = PL_parser->bufptr;
1237 if (ptr < s || ptr > PL_parser->bufend)
1238 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1239 for (; s != ptr; s++)
1240 if (*s == '\n') {
83944c01 1241 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1242 PL_parser->linestart = s+1;
1243 }
1244 PL_parser->bufptr = ptr;
1245}
1246
1247/*
44170c9a 1248=for apidoc lex_discard_to
f0e67a1d
Z
1249
1250Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
2d7f6611
KW
1251up to C<ptr>. The remaining content of the buffer will be moved, and
1252all pointers into the buffer updated appropriately. C<ptr> must not
f0e67a1d
Z
1253be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1254it is not permitted to discard text that has yet to be lexed.
1255
1256Normally it is not necessarily to do this directly, because it suffices to
1257use the implicit discarding behaviour of L</lex_next_chunk> and things
1258based on it. However, if a token stretches across multiple lines,
1f317c95 1259and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1260that purpose, then after completion of the token it would be wise to
1261explicitly discard the now-unneeded earlier lines, to avoid future
1262multi-line tokens growing the buffer without bound.
1263
1264=cut
1265*/
1266
1267void
1268Perl_lex_discard_to(pTHX_ char *ptr)
1269{
1270 char *buf;
1271 STRLEN discard_len;
1272 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1273 buf = SvPVX(PL_parser->linestr);
1274 if (ptr < buf)
1275 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1276 if (ptr == buf)
1277 return;
1278 if (ptr > PL_parser->bufptr)
1279 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1280 discard_len = ptr - buf;
1281 if (PL_parser->oldbufptr < ptr)
1282 PL_parser->oldbufptr = ptr;
1283 if (PL_parser->oldoldbufptr < ptr)
1284 PL_parser->oldoldbufptr = ptr;
1285 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1286 PL_parser->last_uni = NULL;
1287 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1288 PL_parser->last_lop = NULL;
1289 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1290 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1291 PL_parser->bufend -= discard_len;
1292 PL_parser->bufptr -= discard_len;
1293 PL_parser->oldbufptr -= discard_len;
1294 PL_parser->oldoldbufptr -= discard_len;
1295 if (PL_parser->last_uni)
1296 PL_parser->last_uni -= discard_len;
1297 if (PL_parser->last_lop)
1298 PL_parser->last_lop -= discard_len;
1299}
1300
efa571ab
KW
1301void
1302Perl_notify_parser_that_changed_to_utf8(pTHX)
1303{
1304 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1305 * off to on. At compile time, this has the effect of entering a 'use
1306 * utf8' section. This means that any input was not previously checked for
1307 * UTF-8 (because it was off), but now we do need to check it, or our
1308 * assumptions about the input being sane could be wrong, and we could
1309 * segfault. This routine just sets a flag so that the next time we look
1310 * at the input we do the well-formed UTF-8 check. If we aren't in the
1311 * proper phase, there may not be a parser object, but if there is, setting
1312 * the flag is harmless */
1313
1314 if (PL_parser) {
1315 PL_parser->recheck_utf8_validity = TRUE;
1316 }
1317}
1318
f0e67a1d 1319/*
44170c9a 1320=for apidoc lex_next_chunk
f0e67a1d
Z
1321
1322Reads in the next chunk of text to be lexed, appending it to
1323L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1324looked to the end of the current chunk and wants to know more. It is
1325usual, but not necessary, for lexing to have consumed the entirety of
1326the current chunk at this time.
1327
1328If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1329chunk (i.e., the current chunk has been entirely consumed), normally the
1330current chunk will be discarded at the same time that the new chunk is
c5608a1f 1331read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
f0e67a1d
Z
1332will not be discarded. If the current chunk has not been entirely
1333consumed, then it will not be discarded regardless of the flag.
1334
1335Returns true if some new text was added to the buffer, or false if the
1336buffer has reached the end of the input text.
1337
5af38e47
KW
1338=for apidoc Amnh||LEX_KEEP_PREVIOUS
1339
f0e67a1d
Z
1340=cut
1341*/
1342
1343#define LEX_FAKE_EOF 0x80000000
e47d32dc 1344#define LEX_NO_TERM 0x40000000 /* here-doc */
f0e67a1d
Z
1345
1346bool
1347Perl_lex_next_chunk(pTHX_ U32 flags)
1348{
1349 SV *linestr;
1350 char *buf;
1351 STRLEN old_bufend_pos, new_bufend_pos;
1352 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1353 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1354 bool got_some_for_debugger = 0;
f0e67a1d 1355 bool got_some;
6cdc5cd8 1356
112d1284 1357 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1358 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
d27f4b91 1359 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
e47d32dc 1360 return FALSE;
f0e67a1d
Z
1361 linestr = PL_parser->linestr;
1362 buf = SvPVX(linestr);
407f8cf2
KW
1363 if (!(flags & LEX_KEEP_PREVIOUS)
1364 && PL_parser->bufptr == PL_parser->bufend)
1365 {
f0e67a1d
Z
1366 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1367 linestart_pos = 0;
1368 if (PL_parser->last_uni != PL_parser->bufend)
1369 PL_parser->last_uni = NULL;
1370 if (PL_parser->last_lop != PL_parser->bufend)
1371 PL_parser->last_lop = NULL;
1372 last_uni_pos = last_lop_pos = 0;
1373 *buf = 0;
2324bdb9 1374 SvCUR_set(linestr, 0);
f0e67a1d
Z
1375 } else {
1376 old_bufend_pos = PL_parser->bufend - buf;
1377 bufptr_pos = PL_parser->bufptr - buf;
1378 oldbufptr_pos = PL_parser->oldbufptr - buf;
1379 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1380 linestart_pos = PL_parser->linestart - buf;
1381 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1382 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1383 }
1384 if (flags & LEX_FAKE_EOF) {
1385 goto eof;
60d63348 1386 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1387 got_some = 0;
1388 } else if (filter_gets(linestr, old_bufend_pos)) {
1389 got_some = 1;
17cc9359 1390 got_some_for_debugger = 1;
112d1284
FC
1391 } else if (flags & LEX_NO_TERM) {
1392 got_some = 0;
f0e67a1d 1393 } else {
580561a3 1394 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
847cc851 1395 SvPVCLEAR(linestr);
f0e67a1d
Z
1396 eof:
1397 /* End of real input. Close filehandle (unless it was STDIN),
1398 * then add implicit termination.
1399 */
87606032 1400 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1401 PerlIO_clearerr(PL_parser->rsfp);
1402 else if (PL_parser->rsfp)
1403 (void)PerlIO_close(PL_parser->rsfp);
1404 PL_parser->rsfp = NULL;
60d63348 1405 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1406 if (!PL_in_eval && PL_minus_p) {
1407 sv_catpvs(linestr,
1408 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1409 PL_minus_n = PL_minus_p = 0;
1410 } else if (!PL_in_eval && PL_minus_n) {
1411 sv_catpvs(linestr, /*{*/";}");
1412 PL_minus_n = 0;
1413 } else
1414 sv_catpvs(linestr, ";");
1415 got_some = 1;
1416 }
1417 buf = SvPVX(linestr);
1418 new_bufend_pos = SvCUR(linestr);
1419 PL_parser->bufend = buf + new_bufend_pos;
1420 PL_parser->bufptr = buf + bufptr_pos;
6cdc5cd8 1421
07337b95
KW
1422 if (UTF) {
1423 const U8* first_bad_char_loc;
1424 if (UNLIKELY(! is_utf8_string_loc(
1425 (U8 *) PL_parser->bufptr,
1426 PL_parser->bufend - PL_parser->bufptr,
1427 &first_bad_char_loc)))
1428 {
1429 _force_out_malformed_utf8_message(first_bad_char_loc,
1430 (U8 *) PL_parser->bufend,
1431 0,
1432 1 /* 1 means die */ );
1433 NOT_REACHED; /* NOTREACHED */
1434 }
6cdc5cd8
KW
1435 }
1436
f0e67a1d
Z
1437 PL_parser->oldbufptr = buf + oldbufptr_pos;
1438 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1439 PL_parser->linestart = buf + linestart_pos;
1440 if (PL_parser->last_uni)
1441 PL_parser->last_uni = buf + last_uni_pos;
1442 if (PL_parser->last_lop)
1443 PL_parser->last_lop = buf + last_lop_pos;
7f1c3e8c
FC
1444 if (PL_parser->preambling != NOLINE) {
1445 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1446 PL_parser->preambling = NOLINE;
1447 }
407f8cf2
KW
1448 if ( got_some_for_debugger
1449 && PERLDB_LINE_OR_SAVESRC
1450 && PL_curstash != PL_debstash)
1451 {
f0e67a1d
Z
1452 /* debugger active and we're not compiling the debugger code,
1453 * so store the line into the debugger's array of lines
1454 */
1455 update_debugger_info(NULL, buf+old_bufend_pos,
1456 new_bufend_pos-old_bufend_pos);
1457 }
1458 return got_some;
1459}
1460
1461/*
44170c9a 1462=for apidoc lex_peek_unichar
f0e67a1d
Z
1463
1464Looks ahead one (Unicode) character in the text currently being lexed.
1465Returns the codepoint (unsigned integer value) of the next character,
1466or -1 if lexing has reached the end of the input text. To consume the
1467peeked character, use L</lex_read_unichar>.
1468
1469If the next character is in (or extends into) the next chunk of input
1470text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1471discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1472bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1473
1474If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1475is encountered, an exception is generated.
1476
1477=cut
1478*/
1479
1480I32
1481Perl_lex_peek_unichar(pTHX_ U32 flags)
1482{
1483 char *s, *bufend;
1484 if (flags & ~(LEX_KEEP_PREVIOUS))
1485 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1486 s = PL_parser->bufptr;
1487 bufend = PL_parser->bufend;
1488 if (UTF) {
1489 U8 head;
1490 I32 unichar;
1491 STRLEN len, retlen;
1492 if (s == bufend) {
1493 if (!lex_next_chunk(flags))
1494 return -1;
1495 s = PL_parser->bufptr;
1496 bufend = PL_parser->bufend;
1497 }
1498 head = (U8)*s;
54d004e8 1499 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1500 return head;
54d004e8
KW
1501 if (UTF8_IS_START(head)) {
1502 len = UTF8SKIP(&head);
f0e67a1d
Z
1503 while ((STRLEN)(bufend-s) < len) {
1504 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1505 break;
1506 s = PL_parser->bufptr;
1507 bufend = PL_parser->bufend;
1508 }
1509 }
c80e42f3 1510 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
f0e67a1d 1511 if (retlen == (STRLEN)-1) {
75219bac
KW
1512 _force_out_malformed_utf8_message((U8 *) s,
1513 (U8 *) bufend,
1514 0,
1515 1 /* 1 means die */ );
1516 NOT_REACHED; /* NOTREACHED */
f0e67a1d
Z
1517 }
1518 return unichar;
1519 } else {
1520 if (s == bufend) {
1521 if (!lex_next_chunk(flags))
1522 return -1;
1523 s = PL_parser->bufptr;
1524 }
1525 return (U8)*s;
1526 }
1527}
1528
1529/*
44170c9a 1530=for apidoc lex_read_unichar
f0e67a1d
Z
1531
1532Reads the next (Unicode) character in the text currently being lexed.
1533Returns the codepoint (unsigned integer value) of the character read,
1534and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1535if lexing has reached the end of the input text. To non-destructively
1536examine the next character, use L</lex_peek_unichar> instead.
1537
1538If the next character is in (or extends into) the next chunk of input
1539text, the next chunk will be read in. Normally the current chunk will be
c5608a1f
KW
1540discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1541bit set, then the current chunk will not be discarded.
f0e67a1d
Z
1542
1543If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1544is encountered, an exception is generated.
1545
1546=cut
1547*/
1548
1549I32
1550Perl_lex_read_unichar(pTHX_ U32 flags)
1551{
1552 I32 c;
1553 if (flags & ~(LEX_KEEP_PREVIOUS))
1554 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1555 c = lex_peek_unichar(flags);
1556 if (c != -1) {
1557 if (c == '\n')
83944c01 1558 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1559 if (UTF)
1560 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1561 else
1562 ++(PL_parser->bufptr);
f0e67a1d
Z
1563 }
1564 return c;
1565}
1566
1567/*
44170c9a 1568=for apidoc lex_read_space
f0e67a1d
Z
1569
1570Reads optional spaces, in Perl style, in the text currently being
1571lexed. The spaces may include ordinary whitespace characters and
1572Perl-style comments. C<#line> directives are processed if encountered.
1573L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1574at a non-space character (or the end of the input text).
1575
1576If spaces extend into the next chunk of input text, the next chunk will
1577be read in. Normally the current chunk will be discarded at the same
c5608a1f 1578time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
f0e67a1d
Z
1579chunk will not be discarded.
1580
1581=cut
1582*/
1583
21791330 1584#define LEX_NO_INCLINE 0x40000000
f0998909
Z
1585#define LEX_NO_NEXT_CHUNK 0x80000000
1586
f0e67a1d
Z
1587void
1588Perl_lex_read_space(pTHX_ U32 flags)
1589{
1590 char *s, *bufend;
21791330 1591 const bool can_incline = !(flags & LEX_NO_INCLINE);
f0e67a1d 1592 bool need_incline = 0;
21791330 1593 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
f0e67a1d 1594 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
f0e67a1d
Z
1595 s = PL_parser->bufptr;
1596 bufend = PL_parser->bufend;
1597 while (1) {
1598 char c = *s;
1599 if (c == '#') {
1600 do {
1601 c = *++s;
1602 } while (!(c == '\n' || (c == 0 && s == bufend)));
1603 } else if (c == '\n') {
1604 s++;
21791330
FC
1605 if (can_incline) {
1606 PL_parser->linestart = s;
1607 if (s == bufend)
1608 need_incline = 1;
1609 else
c6875f94 1610 incline(s, bufend);
21791330 1611 }
f0e67a1d
Z
1612 } else if (isSPACE(c)) {
1613 s++;
1614 } else if (c == 0 && s == bufend) {
1615 bool got_more;
65c68e17 1616 line_t l;
f0998909
Z
1617 if (flags & LEX_NO_NEXT_CHUNK)
1618 break;
f0e67a1d 1619 PL_parser->bufptr = s;
65c68e17 1620 l = CopLINE(PL_curcop);
851b527a 1621 CopLINE(PL_curcop) += PL_parser->herelines + 1;
f0e67a1d 1622 got_more = lex_next_chunk(flags);
65c68e17 1623 CopLINE_set(PL_curcop, l);
f0e67a1d
Z
1624 s = PL_parser->bufptr;
1625 bufend = PL_parser->bufend;
1626 if (!got_more)
1627 break;
21791330 1628 if (can_incline && need_incline && PL_parser->rsfp) {
c6875f94 1629 incline(s, bufend);
f0e67a1d
Z
1630 need_incline = 0;
1631 }
3c47da3c
FC
1632 } else if (!c) {
1633 s++;
f0e67a1d
Z
1634 } else {
1635 break;
1636 }
1637 }
f0e67a1d
Z
1638 PL_parser->bufptr = s;
1639}
1640
1641/*
fe788d6b 1642
44170c9a 1643=for apidoc validate_proto
fe788d6b
PM
1644
1645This function performs syntax checking on a prototype, C<proto>.
1646If C<warn> is true, any illegal characters or mismatched brackets
1647will trigger illegalproto warnings, declaring that they were
1648detected in the prototype for C<name>.
1649
1650The return value is C<true> if this is a valid prototype, and
1651C<false> if it is not, regardless of whether C<warn> was C<true> or
1652C<false>.
1653
1654Note that C<NULL> is a valid C<proto> and will always return C<true>.
1655
1656=cut
1657
1658 */
1659
1660bool
5783dc51 1661Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
fe788d6b
PM
1662{
1663 STRLEN len, origlen;
11327fa1 1664 char *p;
fe788d6b
PM
1665 bool bad_proto = FALSE;
1666 bool in_brackets = FALSE;
1667 bool after_slash = FALSE;
1668 char greedy_proto = ' ';
1669 bool proto_after_greedy_proto = FALSE;
1670 bool must_be_last = FALSE;
1671 bool underscore = FALSE;
f791a21a 1672 bool bad_proto_after_underscore = FALSE;
fe788d6b
PM
1673
1674 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1675
1676 if (!proto)
1677 return TRUE;
1678
11327fa1 1679 p = SvPV(proto, len);
fe788d6b
PM
1680 origlen = len;
1681 for (; len--; p++) {
1682 if (!isSPACE(*p)) {
1683 if (must_be_last)
1684 proto_after_greedy_proto = TRUE;
f791a21a 1685 if (underscore) {
4aada8b9 1686 if (!memCHRs(";@%", *p))
f791a21a
PM
1687 bad_proto_after_underscore = TRUE;
1688 underscore = FALSE;
1689 }
4aada8b9 1690 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
fe788d6b
PM
1691 bad_proto = TRUE;
1692 }
1693 else {
fe788d6b
PM
1694 if (*p == '[')
1695 in_brackets = TRUE;
1696 else if (*p == ']')
1697 in_brackets = FALSE;
407f8cf2
KW
1698 else if ((*p == '@' || *p == '%')
1699 && !after_slash
1700 && !in_brackets )
1701 {
fe788d6b
PM
1702 must_be_last = TRUE;
1703 greedy_proto = *p;
1704 }
1705 else if (*p == '_')
f791a21a 1706 underscore = TRUE;
fe788d6b
PM
1707 }
1708 if (*p == '\\')
1709 after_slash = TRUE;
1710 else
1711 after_slash = FALSE;
1712 }
1713 }
1714
1715 if (warn) {
b54d603d 1716 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1717 p -= origlen;
b54d603d
PM
1718 p = SvUTF8(proto)
1719 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1720 origlen, UNI_DISPLAY_ISPRINT)
1721 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1722
cbf83791
FC
1723 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1724 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1725 sv_catpvs(name2, "::");
1726 sv_catsv(name2, (SV *)name);
1727 name = name2;
1728 }
1729
fe788d6b
PM
1730 if (proto_after_greedy_proto)
1731 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1732 "Prototype after '%c' for %" SVf " : %s",
fe788d6b 1733 greedy_proto, SVfARG(name), p);
50278ed0
PM
1734 if (in_brackets)
1735 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1736 "Missing ']' in prototype for %" SVf " : %s",
50278ed0 1737 SVfARG(name), p);
b54d603d 1738 if (bad_proto)
fe788d6b 1739 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1740 "Illegal character in prototype for %" SVf " : %s",
f791a21a
PM
1741 SVfARG(name), p);
1742 if (bad_proto_after_underscore)
1743 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
147e3846 1744 "Illegal character after '_' in prototype for %" SVf " : %s",
f791a21a 1745 SVfARG(name), p);
fe788d6b
PM
1746 }
1747
1748 return (! (proto_after_greedy_proto || bad_proto) );
1749}
1750
1751/*
ffb4593c
NT
1752 * S_incline
1753 * This subroutine has nothing to do with tilting, whether at windmills
1754 * or pinball tables. Its name is short for "increment line". It
57843af0 1755 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1756 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1757 * # line 500 "foo.pm"
1758 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1759 */
1760
76e3520e 1761STATIC void
c6875f94 1762S_incline(pTHX_ const char *s, const char *end)
463ee0b2 1763{
d9095cec
NC
1764 const char *t;
1765 const char *n;
1766 const char *e;
8818d409 1767 line_t line_num;
22ff3130 1768 UV uv;
463ee0b2 1769
7918f24d
NC
1770 PERL_ARGS_ASSERT_INCLINE;
1771
d77eff5d
KW
1772 assert(end >= s);
1773
83944c01 1774 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1775 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1776 && s+1 == PL_bufend && *s == ';') {
1777 /* fake newline in string eval */
1778 CopLINE_dec(PL_curcop);
1779 return;
1780 }
463ee0b2
LW
1781 if (*s++ != '#')
1782 return;
d4c19fe8
AL
1783 while (SPACE_OR_TAB(*s))
1784 s++;
d77eff5d
KW
1785 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1786 s += sizeof("line") - 1;
73659bf1
GS
1787 else
1788 return;
084592ab 1789 if (SPACE_OR_TAB(*s))
73659bf1 1790 s++;
4e553d73 1791 else
73659bf1 1792 return;
d4c19fe8
AL
1793 while (SPACE_OR_TAB(*s))
1794 s++;
463ee0b2
LW
1795 if (!isDIGIT(*s))
1796 return;
d4c19fe8 1797
463ee0b2
LW
1798 n = s;
1799 while (isDIGIT(*s))
1800 s++;
07714eb4 1801 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1802 return;
bf4acbe4 1803 while (SPACE_OR_TAB(*s))
463ee0b2 1804 s++;
85bb8b90 1805 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
463ee0b2 1806 s++;
73659bf1
GS
1807 e = t + 1;
1808 }
463ee0b2 1809 else {
c35e046a 1810 t = s;
1bb1a3d6 1811 while (*t && !isSPACE(*t))
c35e046a 1812 t++;
73659bf1 1813 e = t;
463ee0b2 1814 }
bf4acbe4 1815 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1816 e++;
1817 if (*e != '\n' && *e != '\0')
1818 return; /* false alarm */
1819
22ff3130
HS
1820 if (!grok_atoUV(n, &uv, &e))
1821 return;
1822 line_num = ((line_t)uv) - 1;
8818d409 1823
f4dd75d9 1824 if (t - s > 0) {
d9095cec 1825 const STRLEN len = t - s;
3df32bda 1826
d36ee5be 1827 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1828 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1829 * to *{"::_<newfilename"} */
44867030
NC
1830 /* However, the long form of evals is only turned on by the
1831 debugger - usually they're "(eval %lu)" */
d36ee5be
FC
1832 GV * const cfgv = CopFILEGV(PL_curcop);
1833 if (cfgv) {
38bd7ad8
FC
1834 char smallbuf[128];
1835 STRLEN tmplen2 = len;
44867030 1836 char *tmpbuf2;
449dd039 1837 GV *gv2;
44867030
NC
1838
1839 if (tmplen2 + 2 <= sizeof smallbuf)
1840 tmpbuf2 = smallbuf;
1841 else
1842 Newx(tmpbuf2, tmplen2 + 2, char);
1843
38bd7ad8
FC
1844 tmpbuf2[0] = '_';
1845 tmpbuf2[1] = '<';
44867030
NC
1846
1847 memcpy(tmpbuf2 + 2, s, tmplen2);
1848 tmplen2 += 2;
1849
8a5ee598 1850 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1851 if (!isGV(gv2)) {
8a5ee598 1852 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1853 /* adjust ${"::_<newfilename"} to store the new file name */
1854 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1855 /* The line number may differ. If that is the case,
1856 alias the saved lines that are in the array.
1857 Otherwise alias the whole array. */
1858 if (CopLINE(PL_curcop) == line_num) {
38bd7ad8
FC
1859 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1860 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
8818d409 1861 }
38bd7ad8
FC
1862 else if (GvAV(cfgv)) {
1863 AV * const av = GvAV(cfgv);
515c395b
TC
1864 const line_t start = CopLINE(PL_curcop)+1;
1865 SSize_t items = AvFILLp(av) - start;
8818d409
FC
1866 if (items > 0) {
1867 AV * const av2 = GvAVn(gv2);
1868 SV **svp = AvARRAY(av) + start;
515c395b
TC
1869 Size_t l = line_num+1;
1870 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1871 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
8818d409
FC
1872 }
1873 }
e5527e4b 1874 }
44867030
NC
1875
1876 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
d36ee5be 1877 }
e66cf94c 1878 }
05ec9bb3 1879 CopFILE_free(PL_curcop);
449dd039 1880 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1881 }
8818d409 1882 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1883}
1884
80a702cd 1885STATIC void
15f169a1 1886S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1887{
1888 AV *av = CopFILEAVx(PL_curcop);
1889 if (av) {
7f1c3e8c
FC
1890 SV * sv;
1891 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
1892 else {
1893 sv = *av_fetch(av, 0, 1);
1894 SvUPGRADE(sv, SVt_PVMG);
1895 }
847cc851 1896 if (!SvPOK(sv)) SvPVCLEAR(sv);
5fa550fb 1897 if (orig_sv)
7f1c3e8c 1898 sv_catsv(sv, orig_sv);
5fa550fb 1899 else
7f1c3e8c
FC
1900 sv_catpvn(sv, buf, len);
1901 if (!SvIOK(sv)) {
1902 (void)SvIOK_on(sv);
1903 SvIV_set(sv, 0);
1904 }
1905 if (PL_parser->preambling == NOLINE)
1906 av_store(av, CopLINE(PL_curcop), sv);
80a702cd
RGS
1907 }
1908}
1909
ffb4593c 1910/*
8c6b0c7d 1911 * skipspace
ffb4593c
NT
1912 * Called to gobble the appropriate amount and type of whitespace.
1913 * Skips comments as well.
71fff7cb 1914 * Returns the next character after the whitespace that is skipped.
8c6b0c7d
FC
1915 *
1916 * peekspace
1917 * Same thing, but look ahead without incrementing line numbers or
1918 * adjusting PL_linestart.
ffb4593c
NT
1919 */
1920
3218e223 1921#define skipspace(s) skipspace_flags(s, 0)
8c6b0c7d 1922#define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
3218e223 1923
aabfeadc
KW
1924char *
1925Perl_skipspace_flags(pTHX_ char *s, U32 flags)
a687059c 1926{
21791330 1927 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
3280af22 1928 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
3c47da3c 1929 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
463ee0b2 1930 s++;
f0e67a1d
Z
1931 } else {
1932 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1933 PL_bufptr = s;
21791330 1934 lex_read_space(flags | LEX_KEEP_PREVIOUS |
d27f4b91 1935 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
f0998909 1936 LEX_NO_NEXT_CHUNK : 0));
3280af22 1937 s = PL_bufptr;
f0e67a1d
Z
1938 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1939 if (PL_linestart > PL_bufptr)
1940 PL_bufptr = PL_linestart;
1941 return s;
463ee0b2 1942 }
5db06880 1943 return s;
a687059c 1944}
378cc40b 1945
ffb4593c
NT
1946/*
1947 * S_check_uni
1948 * Check the unary operators to ensure there's no ambiguity in how they're
1949 * used. An ambiguous piece of code would be:
1950 * rand + 5
1951 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1952 * the +5 is its argument.
1953 */
1954
76e3520e 1955STATIC void
cea2e8a9 1956S_check_uni(pTHX)
ba106d47 1957{
d4c19fe8 1958 const char *s;
2f3197b3 1959
3280af22 1960 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1961 return;
3280af22
NIS
1962 while (isSPACE(*PL_last_uni))
1963 PL_last_uni++;
c35e046a 1964 s = PL_last_uni;
fac0f7a3 1965 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
8ce2ba82 1966 s += UTF ? UTF8SKIP(s) : 1;
4efcdc02 1967 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
a0d0e21e 1968 return;
6136c704 1969
9b387841 1970 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
147e3846 1971 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
b59c097b 1972 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2f3197b3
LW
1973}
1974
ffb4593c
NT
1975/*
1976 * LOP : macro to build a list operator. Its behaviour has been replaced
1977 * with a subroutine, S_lop() for which LOP is just another name.
1978 */
1979
a0d0e21e
LW
1980#define LOP(f,x) return lop(f,x,s)
1981
ffb4593c
NT
1982/*
1983 * S_lop
1984 * Build a list operator (or something that might be one). The rules:
41e8cbf4
FC
1985 * - if we have a next token, then it's a list operator (no parens) for
1986 * which the next token has already been parsed; e.g.,
1987 * sort foo @args
1988 * sort foo (@args)
ffb4593c
NT
1989 * - if the next thing is an opening paren, then it's a function
1990 * - else it's a list operator
1991 */
1992
76e3520e 1993STATIC I32
11288bb3 1994S_lop(pTHX_ I32 f, U8 x, char *s)
ffed7fef 1995{
7918f24d
NC
1996 PERL_ARGS_ASSERT_LOP;
1997
6154021b 1998 pl_yylval.ival = f;
35c8bce7 1999 CLINE;
3280af22
NIS
2000 PL_bufptr = s;
2001 PL_last_lop = PL_oldbufptr;
eb160463 2002 PL_last_lop_op = (OPCODE)f;
3280af22 2003 if (PL_nexttoke)
78cdf107 2004 goto lstop;
19f1898a 2005 PL_expect = x;
79072805 2006 if (*s == '(')
bbf60fe6 2007 return REPORT(FUNC);
294a536f 2008 s = skipspace(s);
79072805 2009 if (*s == '(')
bbf60fe6 2010 return REPORT(FUNC);
78cdf107
Z
2011 else {
2012 lstop:
2013 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2014 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 2015 return REPORT(LSTOP);
78cdf107 2016 }
79072805
LW
2017}
2018
ffb4593c
NT
2019/*
2020 * S_force_next
9cbb5ea2 2021 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2022 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2023 * to know what token to return the next time the lexer is called. Caller
b5bbe64a
JH
2024 * will need to set PL_nextval[] and possibly PL_expect to ensure
2025 * the lexer handles the token correctly.
ffb4593c
NT
2026 */
2027
4e553d73 2028STATIC void
cea2e8a9 2029S_force_next(pTHX_ I32 type)
79072805 2030{
704d4215
GG
2031#ifdef DEBUGGING
2032 if (DEBUG_T_TEST) {
2033 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2034 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2035 }
2036#endif
1f7c3e7c 2037 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
3280af22
NIS
2038 PL_nexttype[PL_nexttoke] = type;
2039 PL_nexttoke++;
79072805
LW
2040}
2041
89f35911
FC
2042/*
2043 * S_postderef
2044 *
2045 * This subroutine handles postfix deref syntax after the arrow has already
b3f7b7ad 2046 * been emitted. @* $* etc. are emitted as two separate tokens right here.
89f35911
FC
2047 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2048 * only the first, leaving yylex to find the next.
89f35911
FC
2049 */
2050
2051static int
ff25e5db 2052S_postderef(pTHX_ int const funny, char const next)
89f35911 2053{
9086c946
BZ
2054 assert(funny == DOLSHARP
2055 || memCHRs("$@%&*", funny)
bfa838cc 2056 || funny == PERLY_DOLLAR
9086c946 2057 || funny == PERLY_SNAIL
0ba95c59 2058 || funny == PERLY_PERCENT_SIGN
9086c946 2059 || funny == PERLY_AMPERSAND
d02b2fbf 2060 || funny == PERLY_STAR
9086c946 2061 );
89f35911
FC
2062 if (next == '*') {
2063 PL_expect = XOPERATOR;
cc624add 2064 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
bfa838cc 2065 assert(PERLY_SNAIL == funny || PERLY_DOLLAR == funny || DOLSHARP == funny);
cc624add 2066 PL_lex_state = LEX_INTERPEND;
9086c946 2067 if (PERLY_SNAIL == funny)
c3492809 2068 force_next(POSTJOIN);
cc624add 2069 }
d02b2fbf 2070 force_next(PERLY_STAR);
89f35911
FC
2071 PL_bufptr+=2;
2072 }
2073 else {
9086c946 2074 if (PERLY_SNAIL == funny && PL_lex_state == LEX_INTERPNORMAL
760ca746
FC
2075 && !PL_lex_brackets)
2076 PL_lex_dojoin = 2;
89f35911
FC
2077 PL_expect = XOPERATOR;
2078 PL_bufptr++;
2079 }
2080 return funny;
2081}
2082
28ac2b49
Z
2083void
2084Perl_yyunlex(pTHX)
2085{
a7aaec61
Z
2086 int yyc = PL_parser->yychar;
2087 if (yyc != YYEMPTY) {
2088 if (yyc) {
a7aaec61 2089 NEXTVAL_NEXTTOKE = PL_parser->yylval;
669dd22c 2090 if (yyc == PERLY_BRACE_OPEN || yyc == HASHBRACK || yyc == PERLY_BRACKET_OPEN) {
78cdf107 2091 PL_lex_allbrackets--;
a7aaec61 2092 PL_lex_brackets--;
78cdf107 2093 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
ee67f254 2094 } else if (yyc == PERLY_PAREN_OPEN) {
78cdf107
Z
2095 PL_lex_allbrackets--;
2096 yyc |= (2<<24);
a7aaec61
Z
2097 }
2098 force_next(yyc);
2099 }
28ac2b49
Z
2100 PL_parser->yychar = YYEMPTY;
2101 }
2102}
2103
d0a148a6 2104STATIC SV *
15f169a1 2105S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2106{
740cce10 2107 SV * const sv = newSVpvn_utf8(start, len,
6d492fac
KW
2108 ! IN_BYTES
2109 && UTF
53463649 2110 && len != 0
6d492fac 2111 && is_utf8_non_invariant_string((const U8*)start, len));
d0a148a6
NC
2112 return sv;
2113}
2114
ffb4593c
NT
2115/*
2116 * S_force_word
2117 * When the lexer knows the next thing is a word (for instance, it has
2118 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2119 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2120 * lookahead.
ffb4593c
NT
2121 *
2122 * Arguments:
b1b65b59 2123 * char *start : buffer position (must be within PL_linestr)
185c2e96
DM
2124 * int token : PL_next* will be this type of bare word
2125 * (e.g., METHOD,BAREWORD)
ffb4593c
NT
2126 * int check_keyword : if true, Perl checks to make sure the word isn't
2127 * a keyword (do this if the word is a label, e.g. goto FOO)
2128 * int allow_pack : if true, : characters will also be allowed (require,
2129 * use, etc. do this)
ffb4593c
NT
2130 */
2131
76e3520e 2132STATIC char *
345b3785 2133S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2134{
eb578fdb 2135 char *s;
463ee0b2 2136 STRLEN len;
4e553d73 2137
7918f24d
NC
2138 PERL_ARGS_ASSERT_FORCE_WORD;
2139
294a536f 2140 start = skipspace(start);
463ee0b2 2141 s = start;
fac0f7a3 2142 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
e7127e21 2143 || (allow_pack && *s == ':' && s[1] == ':') )
a0d0e21e 2144 {
3280af22 2145 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2146 if (check_keyword) {
2147 char *s2 = PL_tokenbuf;
487e470d 2148 STRLEN len2 = len;
de627158
KW
2149 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2150 s2 += sizeof("CORE::") - 1;
2151 len2 -= sizeof("CORE::") - 1;
2152 }
487e470d 2153 if (keyword(s2, len2, 0))
463ee0b2 2154 return start;
01b5ef50 2155 }
463ee0b2 2156 if (token == METHOD) {
294a536f 2157 s = skipspace(s);
463ee0b2 2158 if (*s == '(')
3280af22 2159 PL_expect = XTERM;
463ee0b2 2160 else {
3280af22 2161 PL_expect = XOPERATOR;
463ee0b2 2162 }
79072805 2163 }
9ded7720 2164 NEXTVAL_NEXTTOKE.opval
275103cd 2165 = newSVOP(OP_CONST,0,
d0a148a6 2166 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2167 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2168 force_next(token);
2169 }
2170 return s;
2171}
2172
ffb4593c
NT
2173/*
2174 * S_force_ident
9cbb5ea2 2175 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2176 * text only contains the "foo" portion. The first argument is a pointer
2177 * to the "foo", and the second argument is the type symbol to prefix.
185c2e96 2178 * Forces the next token to be a "BAREWORD".
9cbb5ea2 2179 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2180 */
2181
76e3520e 2182STATIC void
5aaab254 2183S_force_ident(pTHX_ const char *s, int kind)
79072805 2184{
7918f24d
NC
2185 PERL_ARGS_ASSERT_FORCE_IDENT;
2186
c9b48522
DD
2187 if (s[0]) {
2188 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
275103cd 2189 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
728847b1 2190 UTF ? SVf_UTF8 : 0));
9ded7720 2191 NEXTVAL_NEXTTOKE.opval = o;
185c2e96 2192 force_next(BAREWORD);
748a9306 2193 if (kind) {
11343788 2194 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2195 /* XXX see note in pp_entereval() for why we forgo typo
2196 warnings if the symbol must be introduced in an eval.
2197 GSAR 96-10-12 */
90e5519e 2198 gv_fetchpvn_flags(s, len,
4bff32c5 2199 (PL_in_eval ? GV_ADDMULTI
728847b1 2200 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
bfa838cc 2201 kind == PERLY_DOLLAR ? SVt_PV :
9086c946 2202 kind == PERLY_SNAIL ? SVt_PVAV :
0ba95c59 2203 kind == PERLY_PERCENT_SIGN ? SVt_PVHV :
a0d0e21e 2204 SVt_PVGV
90e5519e 2205 );
748a9306 2206 }
79072805
LW
2207 }
2208}
2209
3f33d153
FC
2210static void
2211S_force_ident_maybe_lex(pTHX_ char pit)
2212{
3f33d153
FC
2213 NEXTVAL_NEXTTOKE.ival = pit;
2214 force_next('p');
2215}
2216
1571675a
GS
2217NV
2218Perl_str_to_version(pTHX_ SV *sv)
2219{
2220 NV retval = 0.0;
2221 NV nshift = 1.0;
2222 STRLEN len;
cfd0369c 2223 const char *start = SvPV_const(sv,len);
9d4ba2ae 2224 const char * const end = start + len;
8298454c 2225 const bool utf = cBOOL(SvUTF8(sv));
7918f24d
NC
2226
2227 PERL_ARGS_ASSERT_STR_TO_VERSION;
2228
1571675a 2229 while (start < end) {
ba210ebe 2230 STRLEN skip;
1571675a
GS
2231 UV n;
2232 if (utf)
9041c2e3 2233 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2234 else {
2235 n = *(U8*)start;
2236 skip = 1;
2237 }
2238 retval += ((NV)n)/nshift;
2239 start += skip;
2240 nshift *= 1000;
2241 }
2242 return retval;
2243}
2244
4e553d73 2245/*
ffb4593c
NT
2246 * S_force_version
2247 * Forces the next token to be a version number.
e759cc13
RGS
2248 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2249 * and if "guessing" is TRUE, then no new token is created (and the caller
2250 * must use an alternative parsing method).
ffb4593c
NT
2251 */
2252
76e3520e 2253STATIC char *
e759cc13 2254S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2255{
5f66b61c 2256 OP *version = NULL;
44dcb63b 2257 char *d;
89bfa8cd 2258
7918f24d
NC
2259 PERL_ARGS_ASSERT_FORCE_VERSION;
2260
294a536f 2261 s = skipspace(s);
89bfa8cd 2262
44dcb63b 2263 d = s;
dd629d5b 2264 if (*d == 'v')
44dcb63b 2265 d++;
44dcb63b 2266 if (isDIGIT(*d)) {
e759cc13
RGS
2267 while (isDIGIT(*d) || *d == '_' || *d == '.')
2268 d++;
4e4da3ac 2269 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2270 SV *ver;
6154021b
RGS
2271 s = scan_num(s, &pl_yylval);
2272 version = pl_yylval.opval;
dd629d5b
GS
2273 ver = cSVOPx(version)->op_sv;
2274 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2275 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2276 SvNV_set(ver, str_to_version(ver));
1571675a 2277 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2278 }
89bfa8cd 2279 }
5db06880 2280 else if (guessing) {
e759cc13 2281 return s;
5db06880 2282 }
89bfa8cd
PP
2283 }
2284
2285 /* NOTE: The parser sees the package name and the VERSION swapped */
9ded7720 2286 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2287 force_next(BAREWORD);
89bfa8cd 2288
e759cc13 2289 return s;
89bfa8cd
PP
2290}
2291
ffb4593c 2292/*
91152fc1
DG
2293 * S_force_strict_version
2294 * Forces the next token to be a version number using strict syntax rules.
2295 */
2296
2297STATIC char *
2298S_force_strict_version(pTHX_ char *s)
2299{
91152fc1 2300 OP *version = NULL;
91152fc1
DG
2301 const char *errstr = NULL;
2302
2303 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2304
2305 while (isSPACE(*s)) /* leading whitespace */
2306 s++;
2307
2308 if (is_STRICT_VERSION(s,&errstr)) {
2309 SV *ver = newSV(0);
2310 s = (char *)scan_version(s, ver, 0);
2311 version = newSVOP(OP_CONST, 0, ver);
2312 }
407f8cf2
KW
2313 else if ((*s != ';' && *s != '{' && *s != '}' )
2314 && (s = skipspace(s), (*s != ';' && *s != '{' && *s != '}' )))
4e4da3ac 2315 {
91152fc1
DG
2316 PL_bufptr = s;
2317 if (errstr)
2318 yyerror(errstr); /* version required */
2319 return s;
2320 }
2321
91152fc1 2322 /* NOTE: The parser sees the package name and the VERSION swapped */
91152fc1 2323 NEXTVAL_NEXTTOKE.opval = version;
185c2e96 2324 force_next(BAREWORD);
91152fc1
DG
2325
2326 return s;
2327}
2328
2329/*
ffb4593c 2330 * S_tokeq
ef3ff34d
KW
2331 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2332 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2333 * unchanged, and a new SV containing the modified input is returned.
ffb4593c
NT
2334 */
2335
76e3520e 2336STATIC SV *
cea2e8a9 2337S_tokeq(pTHX_ SV *sv)
79072805 2338{
eb578fdb
KW
2339 char *s;
2340 char *send;
2341 char *d;
b3ac6de7 2342 SV *pv = sv;
79072805 2343
7918f24d
NC
2344 PERL_ARGS_ASSERT_TOKEQ;
2345
279b35ad
FC
2346 assert (SvPOK(sv));
2347 assert (SvLEN(sv));
2348 assert (!SvIsCOW(sv));
307ed071 2349 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
b3ac6de7 2350 goto finish;
279b35ad
FC
2351 s = SvPVX(sv);
2352 send = SvEND(sv);
dcb21ed6
NC
2353 /* This is relying on the SV being "well formed" with a trailing '\0' */
2354 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2355 s++;
2356 if (s == send)
b3ac6de7 2357 goto finish;
79072805 2358 d = s;
be4731d2 2359 if ( PL_hints & HINT_NEW_STRING ) {
279b35ad
FC
2360 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2361 SVs_TEMP | SvUTF8(sv));
be4731d2 2362 }
79072805
LW
2363 while (s < send) {
2364 if (*s == '\\') {
a0d0e21e 2365 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2366 s++; /* all that, just for this */
2367 }
2368 *d++ = *s++;
2369 }
2370 *d = '\0';
95a20fc0 2371 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2372 finish:
3280af22 2373 if ( PL_hints & HINT_NEW_STRING )
164e423c 2374 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
79072805
LW
2375 return sv;
2376}
2377
ffb4593c
NT
2378/*
2379 * Now come three functions related to double-quote context,
2380 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2381 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2382 * interact with PL_lex_state, and create fake ( ... ) argument lists
2383 * to handle functions and concatenation.
ecd24171
DM
2384 * For example,
2385 * "foo\lbar"
2386 * is tokenised as
2387 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2388 */
2389
2390/*
2391 * S_sublex_start
6154021b 2392 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2393 *
2394 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2395 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c 2396 *
16d1d8bd 2397 * OP_CONST is easy--just make the new op and return.
ffb4593c
NT
2398 *
2399 * Everything else becomes a FUNC.
2400 *
16d1d8bd
LM
2401 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2402 * had an OP_CONST. This just sets us up for a
ffb4593c
NT
2403 * call to S_sublex_push().
2404 */
2405
76e3520e 2406STATIC I32
cea2e8a9 2407S_sublex_start(pTHX)
79072805 2408{
eb578fdb 2409 const I32 op_type = pl_yylval.ival;
79072805
LW
2410
2411 if (op_type == OP_NULL) {
6154021b 2412 pl_yylval.opval = PL_lex_op;
5f66b61c 2413 PL_lex_op = NULL;
79072805
LW
2414 return THING;
2415 }
466112bb 2416 if (op_type == OP_CONST) {
67c71cbb
FC
2417 SV *sv = PL_lex_stuff;
2418 PL_lex_stuff = NULL;
2419 sv = tokeq(sv);
b3ac6de7
IZ
2420
2421 if (SvTYPE(sv) == SVt_PVIV) {
2422 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2423 STRLEN len;
96a5add6 2424 const char * const p = SvPV_const(sv, len);
740cce10 2425 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2426 SvREFCNT_dec(sv);
2427 sv = nsv;
4e553d73 2428 }
275103cd 2429 pl_yylval.opval = newSVOP(op_type, 0, sv);
79072805
LW
2430 return THING;
2431 }
2432
7ef70b3d
FC
2433 PL_parser->lex_super_state = PL_lex_state;
2434 PL_parser->lex_sub_inwhat = (U16)op_type;
2435 PL_parser->lex_sub_op = PL_lex_op;
bb4e4c38
TC
2436 PL_parser->sub_no_recover = FALSE;
2437 PL_parser->sub_error_count = PL_error_count;
3280af22 2438 PL_lex_state = LEX_INTERPPUSH;
55497cff 2439
3280af22
NIS
2440 PL_expect = XTERM;
2441 if (PL_lex_op) {
6154021b 2442 pl_yylval.opval = PL_lex_op;
5f66b61c 2443 PL_lex_op = NULL;
55497cff
PP
2444 return PMFUNC;
2445 }
2446 else
2447 return FUNC;
2448}
2449
ffb4593c
NT
2450/*
2451 * S_sublex_push
2452 * Create a new scope to save the lexing state. The scope will be
2453 * ended in S_sublex_done. Returns a '(', starting the function arguments
2454 * to the uc, lc, etc. found before.
2455 * Sets PL_lex_state to LEX_INTERPCONCAT.
2456 */
2457
76e3520e 2458STATIC I32
cea2e8a9 2459S_sublex_push(pTHX)
55497cff 2460{
78a635de 2461 LEXSHARED *shared;
801d32ac 2462 const bool is_heredoc = PL_multi_close == '<';
f46d017c 2463 ENTER;
55497cff 2464
7ef70b3d 2465 PL_lex_state = PL_parser->lex_super_state;
cc624add 2466 SAVEI8(PL_lex_dojoin);
3280af22 2467 SAVEI32(PL_lex_brackets);
78cdf107 2468 SAVEI32(PL_lex_allbrackets);
b27dce25 2469 SAVEI32(PL_lex_formbrack);
78cdf107 2470 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2471 SAVEI32(PL_lex_casemods);
2472 SAVEI32(PL_lex_starts);
651b5b28 2473 SAVEI8(PL_lex_state);
7cc34111 2474 SAVESPTR(PL_lex_repl);
7766f137 2475 SAVEVPTR(PL_lex_inpat);
98246f1e 2476 SAVEI16(PL_lex_inwhat);
ffdb8b16 2477 if (is_heredoc)
b42366d4 2478 {
ffdb8b16 2479 SAVECOPLINE(PL_curcop);
b42366d4 2480 SAVEI32(PL_multi_end);
851b527a
FC
2481 SAVEI32(PL_parser->herelines);
2482 PL_parser->herelines = 0;
b42366d4 2483 }
2ca4363d 2484 SAVEIV(PL_multi_close);
3280af22 2485 SAVEPPTR(PL_bufptr);
8452ff4b 2486 SAVEPPTR(PL_bufend);
3280af22
NIS
2487 SAVEPPTR(PL_oldbufptr);
2488 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2489 SAVEPPTR(PL_last_lop);
2490 SAVEPPTR(PL_last_uni);
3280af22
NIS
2491 SAVEPPTR(PL_linestart);
2492 SAVESPTR(PL_linestr);
8edd5f42
RGS
2493 SAVEGENERICPV(PL_lex_brackstack);
2494 SAVEGENERICPV(PL_lex_casestack);
78a635de 2495 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2496 SAVEBOOL(PL_parser->lex_re_reparsing);
ffdb8b16 2497 SAVEI32(PL_copline);
3280af22 2498
99bd9d90 2499 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2500 scopes to find the body of the here-doc. So we put PL_linestr and
2501 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2502 */
60f40a38
FC
2503 PL_parser->lex_shared->ls_linestr = PL_linestr;
2504 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2505
3280af22 2506 PL_linestr = PL_lex_stuff;
7ef70b3d 2507 PL_lex_repl = PL_parser->lex_sub_repl;
a0714e2c 2508 PL_lex_stuff = NULL;
7ef70b3d 2509 PL_parser->lex_sub_repl = NULL;
3280af22 2510
eabab8bc
FC
2511 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2512 set for an inner quote-like operator and then an error causes scope-
2513 popping. We must not have a PL_lex_stuff value left dangling, as
2514 that breaks assumptions elsewhere. See bug #123617. */
2515 SAVEGENERICSV(PL_lex_stuff);
7ef70b3d 2516 SAVEGENERICSV(PL_parser->lex_sub_repl);
eabab8bc 2517
9cbb5ea2
GS
2518 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2519 = SvPVX(PL_linestr);
3280af22 2520 PL_bufend += SvCUR(PL_linestr);
bd61b366 2521 PL_last_lop = PL_last_uni = NULL;
3280af22 2522 SAVEFREESV(PL_linestr);
4dc843bc 2523 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2524
2525 PL_lex_dojoin = FALSE;
b27dce25 2526 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2527 PL_lex_allbrackets = 0;
2528 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2529 Newx(PL_lex_brackstack, 120, char);
2530 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2531 PL_lex_casemods = 0;
2532 *PL_lex_casestack = '\0';
2533 PL_lex_starts = 0;
2534 PL_lex_state = LEX_INTERPCONCAT;
ffdb8b16 2535 if (is_heredoc)
6ddcf93b 2536 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
ffdb8b16 2537 PL_copline = NOLINE;
caae0700 2538
78a635de
FC
2539 Newxz(shared, 1, LEXSHARED);
2540 shared->ls_prev = PL_parser->lex_shared;
2541 PL_parser->lex_shared = shared;
3280af22 2542
7ef70b3d 2543 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
bb16bae8 2544 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22 2545 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
7ef70b3d 2546 PL_lex_inpat = PL_parser->lex_sub_op;
79072805 2547 else
5f66b61c 2548 PL_lex_inpat = NULL;
79072805 2549
3a54fd60
DM
2550 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2551 PL_in_eval &= ~EVAL_RE_REPARSING;
2552
69afcc21 2553 return SUBLEXSTART;
79072805
LW
2554}
2555
ffb4593c
NT
2556/*
2557 * S_sublex_done
2558 * Restores lexer state after a S_sublex_push.
2559 */
2560
76e3520e 2561STATIC I32
cea2e8a9 2562S_sublex_done(pTHX)
79072805 2563{
3280af22 2564 if (!PL_lex_starts++) {
396482e1 2565 SV * const sv = newSVpvs("");
9aa983d2
JH
2566 if (SvUTF8(PL_linestr))
2567 SvUTF8_on(sv);
3280af22 2568 PL_expect = XOPERATOR;
275103cd 2569 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
79072805
LW
2570 return THING;
2571 }
2572
3280af22
NIS
2573 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2574 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2575 return yylex();
79072805
LW
2576 }
2577
ffb4593c 2578 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2579 assert(PL_lex_inwhat != OP_TRANSR);
5aa91856
FC
2580 if (PL_lex_repl) {
2581 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
3280af22
NIS
2582 PL_linestr = PL_lex_repl;
2583 PL_lex_inpat = 0;
2584 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2585 PL_bufend += SvCUR(PL_linestr);
bd61b366 2586 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2587 PL_lex_dojoin = FALSE;
2588 PL_lex_brackets = 0;
78cdf107
Z
2589 PL_lex_allbrackets = 0;
2590 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2591 PL_lex_casemods = 0;
2592 *PL_lex_casestack = '\0';
2593 PL_lex_starts = 0;
25da4f38 2594 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2595 PL_lex_state = LEX_INTERPNORMAL;
2596 PL_lex_starts++;
e9fa98b2
HS
2597 /* we don't clear PL_lex_repl here, so that we can check later
2598 whether this is an evalled subst; that means we rely on the
2599 logic to ensure sublex_done() is called again only via the
2600 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2601 }
e9fa98b2 2602 else {
3280af22 2603 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2604 PL_lex_repl = NULL;
e9fa98b2 2605 }
ffdb8b16
FC
2606 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2607 CopLINE(PL_curcop) +=
9420b268 2608 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
851b527a
FC
2609 + PL_parser->herelines;
2610 PL_parser->herelines = 0;
ffdb8b16 2611 }
77b0379f 2612 return PERLY_SLASH;
ffed7fef
LW
2613 }
2614 else {
b42366d4 2615 const line_t l = CopLINE(PL_curcop);
f46d017c 2616 LEAVE;
bb4e4c38 2617 if (PL_parser->sub_error_count != PL_error_count) {
bb4e4c38 2618 if (PL_parser->sub_no_recover) {
ad1ecdf7 2619 yyquit();
bb4e4c38
TC
2620 NOT_REACHED;
2621 }
2622 }
b42366d4 2623 if (PL_multi_close == '<')
851b527a 2624 PL_parser->herelines += l - PL_multi_end;
3280af22
NIS
2625 PL_bufend = SvPVX(PL_linestr);
2626 PL_bufend += SvCUR(PL_linestr);
2627 PL_expect = XOPERATOR;
69afcc21 2628 return SUBLEXEND;
ffed7fef
LW
2629 }
2630}
2631
4e8ee35f
KW
2632HV *
2633Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2634 const STRLEN context_len, const char ** error_msg)
2635{
2636 /* Load the official _charnames module if not already there. The
2637 * parameters are just to give info for any error messages generated:
2638 * char_name a name to look up which is the reason for loading this
2639 * context 'char_name' in the context in the input in which it appears
2640 * context_len how many bytes 'context' occupies
2641 * error_msg *error_msg will be set to any error
2642 *
2643 * Returns the ^H table if success; otherwise NULL */
2644
2645 unsigned int i;
2646 HV * table;
2647 SV **cvp;
2648 SV * res;
2649
2650 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2651
2652 /* This loop is executed 1 1/2 times. On the first time through, if it
2653 * isn't already loaded, try loading it, and iterate just once to see if it
2654 * worked. */
2655 for (i = 0; i < 2; i++) {
2656 table = GvHV(PL_hintgv); /* ^H */
2657
2658 if ( table
2659 && (PL_hints & HINT_LOCALIZE_HH)
2660 && (cvp = hv_fetchs(table, "charnames", FALSE))
2661 && SvOK(*cvp))
2662 {
2663 return table; /* Quit if already loaded */
2664 }
2665
2666 if (i == 0) {
2667 Perl_load_module(aTHX_
2668 0,
2669 newSVpvs("_charnames"),
2670
2671 /* version parameter; no need to specify it, as if we get too early
2672 * a version, will fail anyway, not being able to find 'charnames'
2673 * */
2674 NULL,
2675 newSVpvs(":full"),
2676 newSVpvs(":short"),
2677 NULL);
2678 }
2679 }
2680
2681 /* Here, it failed; new_constant will give appropriate error messages */
2682 *error_msg = NULL;
2683 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2684 context, context_len, error_msg);
2685 SvREFCNT_dec(res);
2686
2687 return NULL;
2688}
2689
7a7d14f3 2690STATIC SV*
2c43c309
KW
2691S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2692{
2693 /* This justs wraps get_and_check_backslash_N_name() to output any error
2694 * message it returns. */
2695
2696 const char * error_msg = NULL;
25c7fb78 2697 SV * result;
2c43c309
KW
2698
2699 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2700
25c7fb78
KW
2701 /* charnames doesn't work well if there have been errors found */
2702 if (PL_error_count > 0) {
2703 return NULL;
2704 }
2705
2706 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2707
2c43c309
KW
2708 if (error_msg) {
2709 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2710 }
2711
2712 return result;
2713}
2714
25c7fb78
KW
2715SV*
2716Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2717 const char* const e,
2718 const bool is_utf8,
2719 const char ** error_msg)
6f613c73 2720{
140b12ad
KW
2721 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2722 * interior, hence to the "}". Finds what the name resolves to, returning
25c7fb78
KW
2723 * an SV* containing it; NULL if no valid one found.
2724 *
2725 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2726 * doesn't have to be. */
6f613c73 2727
7303cc1f 2728 SV* char_name;
25c7fb78 2729 SV* res;
0c415a79
KW
2730 HV * table;
2731 SV **cvp;
2732 SV *cv;
2733 SV *rv;
2734 HV *stash;
8b0cce63
KW
2735
2736 /* Points to the beginning of the \N{... so that any messages include the
2737 * context of what's failing*/
2738 const char* context = s - 3;
2739 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2740
0c415a79 2741
6f613c73
KW
2742 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2743
25c7fb78
KW
2744 assert(e >= s);
2745 assert(s > (char *) 3);
2746
7303cc1f 2747 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
25c7fb78 2748
7303cc1f
KW
2749 if (!SvCUR(char_name)) {
2750 SvREFCNT_dec_NN(char_name);
be332ba0 2751 /* diag_listed_as: Unknown charname '%s' */
2c43c309 2752 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
be332ba0 2753 return NULL;
d8d26cac 2754 }
b7e6151c 2755
7303cc1f 2756 /* Autoload the charnames module */
7303cc1f 2757
4e8ee35f
KW
2758 table = load_charnames(char_name, context, context_len, error_msg);
2759 if (table == NULL) {
2760 return NULL;
7303cc1f
KW
2761 }
2762
2763 *error_msg = NULL;
8b0cce63
KW
2764 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2765 context, context_len, error_msg);
7303cc1f 2766 if (*error_msg) {
4e8ee35f 2767 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
7303cc1f
KW
2768
2769 SvREFCNT_dec(res);
6f613c73
KW
2770 return NULL;
2771 }
2772
0c415a79
KW
2773 /* See if the charnames handler is the Perl core's, and if so, we can skip
2774 * the validation needed for a user-supplied one, as Perl's does its own
2775 * validation. */
0c415a79 2776 cvp = hv_fetchs(table, "charnames", FALSE);
5882ddb3
FC
2777 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2778 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2779 {
2780 const char * const name = HvNAME(stash);
b59bf0b2 2781 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
0c415a79
KW
2782 return res;
2783 }
2784 }
2785
bde9e88d
KW
2786 /* Here, it isn't Perl's charname handler. We can't rely on a
2787 * user-supplied handler to validate the input name. For non-ut8 input,
2788 * look to see that the first character is legal. Then loop through the
2789 * rest checking that each is a continuation */
6f613c73 2790
36897d64
KW
2791 /* This code makes the reasonable assumption that the only Latin1-range
2792 * characters that begin a character name alias are alphabetic, otherwise
2793 * would have to create a isCHARNAME_BEGIN macro */
b6ba1137 2794
25c7fb78 2795 if (! is_utf8) {
bde9e88d 2796 if (! isALPHAU(*s)) {
b6ba1137
KW
2797 goto bad_charname;
2798 }
bde9e88d
KW
2799 s++;
2800 while (s < e) {
2801 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2802 goto bad_charname;
2803 }
2d8eb851
KW
2804 if (*s == ' ' && *(s-1) == ' ') {
2805 goto multi_spaces;
bd299e29 2806 }
bde9e88d 2807 s++;
b6ba1137
KW
2808 }
2809 }
2810 else {
bde9e88d 2811 /* Similarly for utf8. For invariants can check directly; for other
58aa6738
KW
2812 * Latin1, can calculate their code point and check; otherwise use an
2813 * inversion list */
bde9e88d
KW
2814 if (UTF8_IS_INVARIANT(*s)) {
2815 if (! isALPHAU(*s)) {
140b12ad
KW
2816 goto bad_charname;
2817 }
bde9e88d
KW
2818 s++;
2819 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2820 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
b6ba1137 2821 goto bad_charname;
6f613c73 2822 }
bde9e88d 2823 s += 2;
6f613c73 2824 }
bde9e88d 2825 else {
f1bcae08
KW
2826 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2827 utf8_to_uvchr_buf((U8 *) s,
2828 (U8 *) e,
2829 NULL)))
2830 {
bde9e88d
KW
2831 goto bad_charname;
2832 }
2833 s += UTF8SKIP(s);
2834 }
2835
2836 while (s < e) {
2837 if (UTF8_IS_INVARIANT(*s)) {
2838 if (! isCHARNAME_CONT(*s)) {
2839 goto bad_charname;
2840 }
2d8eb851
KW
2841 if (*s == ' ' && *(s-1) == ' ') {
2842 goto multi_spaces;
bd299e29 2843 }
bde9e88d
KW
2844 s++;
2845 }
2846 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
a62b247b 2847 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
bde9e88d
KW
2848 {
2849 goto bad_charname;
2850 }
2851 s += 2;
2852 }
2853 else {
f1bcae08
KW
2854 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2855 utf8_to_uvchr_buf((U8 *) s,
2856 (U8 *) e,
2857 NULL)))
2858 {
bde9e88d
KW
2859 goto bad_charname;
2860 }
2861 s += UTF8SKIP(s);
6f613c73
KW
2862 }
2863 }
2d8eb851
KW
2864 }
2865 if (*(s-1) == ' ') {
8d9d0498
FC
2866 /* diag_listed_as: charnames alias definitions may not contain
2867 trailing white-space; marked by <-- HERE in %s
2868 */
2c43c309 2869 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2870 "charnames alias definitions may not contain trailing "
2871 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2872 (int)(s - context + 1), context,
2c43c309 2873 (int)(e - s + 1), s + 1);
2d8eb851 2874 return NULL;
6f613c73
KW
2875 }
2876
870fe793 2877 if (SvUTF8(res)) { /* Don't accept malformed charname value */
bde9e88d
KW
2878 const U8* first_bad_char_loc;
2879 STRLEN len;
2880 const char* const str = SvPV_const(res, len);
0aab20f2
KW
2881 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
2882 &first_bad_char_loc)))
2883 {
75219bac
KW
2884 _force_out_malformed_utf8_message(first_bad_char_loc,
2885 (U8 *) PL_parser->bufend,
2886 0,
2887 0 /* 0 means don't die */ );
8d9d0498
FC
2888 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
2889 immediately after '%s' */
2c43c309 2890 *error_msg = Perl_form(aTHX_
bde9e88d 2891 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
8b0cce63 2892 (int) context_len, context,
2c43c309 2893 (int) ((char *) first_bad_char_loc - str), str);
bde9e88d
KW
2894 return NULL;
2895 }
2896 }
140b12ad 2897
bde9e88d 2898 return res;
140b12ad 2899
bde9e88d 2900 bad_charname: {
bde9e88d
KW
2901
2902 /* The final %.*s makes sure that should the trailing NUL be missing
2903 * that this print won't run off the end of the string */
8d9d0498
FC
2904 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
2905 in \N{%s} */
2c43c309 2906 *error_msg = Perl_form(aTHX_
bde9e88d 2907 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2908 (int)(s - context + 1), context,
2c43c309 2909 (int)(e - s + 1), s + 1);
bde9e88d
KW
2910 return NULL;
2911 }
2d8eb851
KW
2912
2913 multi_spaces:
8d9d0498
FC
2914 /* diag_listed_as: charnames alias definitions may not contain a
2915 sequence of multiple spaces; marked by <-- HERE
2916 in %s */
2c43c309 2917 *error_msg = Perl_form(aTHX_
2d8eb851
KW
2918 "charnames alias definitions may not contain a sequence of "
2919 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
8b0cce63 2920 (int)(s - context + 1), context,
2c43c309 2921 (int)(e - s + 1), s + 1);
2d8eb851 2922 return NULL;
6f613c73
KW
2923}
2924
02aa26ce
NT
2925/*
2926 scan_const
2927
9da1dd8f
DM
2928 Extracts the next constant part of a pattern, double-quoted string,
2929 or transliteration. This is terrifying code.
2930
2931 For example, in parsing the double-quoted string "ab\x63$d", it would
2932 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2933
94def140 2934 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2935 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2936 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2937
94def140
ST
2938 Returns a pointer to the character scanned up to. If this is
2939 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2940 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2941 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2942 by looking at the next characters herself.
2943
02aa26ce 2944 In patterns:
9da1dd8f 2945 expand:
537124e4
KW
2946 \N{FOO} => \N{U+hex_for_character_FOO}
2947 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2948
2949 pass through:
2950 all other \-char, including \N and \N{ apart from \N{ABC}
2951
2952 stops on:
2953 @ and $ where it appears to be a var, but not for $ as tail anchor
2954 \l \L \u \U \Q \E
2955 (?{ or (??{
2956
02aa26ce
NT
2957 In transliterations:
2958 characters are VERY literal, except for - not at the start or end
66b09263
KW
2959 of the string, which indicates a range. However some backslash sequences
2960 are recognized: \r, \n, and the like
2961 \007 \o{}, \x{}, \N{}
2962 If all elements in the transliteration are below 256,
94def140
ST
2963 scan_const expands the range to the full set of intermediate
2964 characters. If the range is in utf8, the hyphen is replaced with
2965 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2966
2967 In double-quoted strings:
2968 backslashes:
66b09263 2969 all those recognized in transliterations
94def140 2970 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2971 case and quoting: \U \Q \E
2972 stops on @ and $
2973
2974 scan_const does *not* construct ops to handle interpolated strings.
2975 It stops processing as soon as it finds an embedded $ or @ variable
2976 and leaves it to the caller to work out what's going on.
2977
94def140
ST
2978 embedded arrays (whether in pattern or not) could be:
2979 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2980
2981 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2982
2983 $ in pattern could be $foo or could be tail anchor. Assumption:
2984 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2985 followed by one of "()| \r\n\t"
02aa26ce 2986
9da1dd8f 2987 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2988
2989 The structure of the code is
2990 while (there's a character to process) {
94def140
ST
2991 handle transliteration ranges
2992 skip regexp comments /(?#comment)/ and codes /(?{code})/
2993 skip #-initiated comments in //x patterns
2994 check for embedded arrays
02aa26ce
NT
2995 check for embedded scalars
2996 if (backslash) {
94def140 2997 deprecate \1 in substitution replacements
02aa26ce
NT
2998 handle string-changing backslashes \l \U \Q \E, etc.
2999 switch (what was escaped) {
94def140 3000 handle \- in a transliteration (becomes a literal -)
ff3f963a 3001 if a pattern and not \N{, go treat as regular character
94def140
ST
3002 handle \132 (octal characters)
3003 handle \x15 and \x{1234} (hex characters)
ff3f963a 3004 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
3005 handle \cV (control characters)
3006 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 3007 } (end switch)
77a135fe 3008 continue
02aa26ce 3009 } (end if backslash)
77a135fe 3010 handle regular character
02aa26ce 3011 } (end while character to read)
02cd137d 3012
02aa26ce
NT
3013*/
3014
76e3520e 3015STATIC char *
cea2e8a9 3016S_scan_const(pTHX_ char *start)
79072805 3017{
eb578fdb 3018 char *send = PL_bufend; /* end of the constant */
dc023dbb
KW
3019 SV *sv = newSV(send - start); /* sv for the constant. See note below
3020 on sizing. */
eb578fdb
KW
3021 char *s = start; /* start of the constant */
3022 char *d = SvPVX(sv); /* destination for copies */
dc023dbb
KW
3023 bool dorange = FALSE; /* are we in a translit range? */
3024 bool didrange = FALSE; /* did we just finish a range? */
3025 bool in_charclass = FALSE; /* within /[...]/ */
ed2893cb 3026 bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
dc023dbb
KW
3027 UTF8? But, this can show as true
3028 when the source isn't utf8, as for
3029 example when it is entirely composed
3030 of hex constants */
f34acfec 3031 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
af9be36c
KW
3032 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3033 number of characters found so far
3034 that will expand (into 2 bytes)
3035 should we have to convert to
3036 UTF-8) */
6f613c73 3037 SV *res; /* result from charnames */
c89db733
JH
3038 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3039 high-end character is temporarily placed */
77a135fe 3040
fe2ba0a2
KW
3041 /* Does something require special handling in tr/// ? This avoids extra
3042 * work in a less likely case. As such, khw didn't feel it was worth
3043 * adding any branches to the more mainline code to handle this, which
3044 * means that this doesn't get set in some circumstances when things like
3045 * \x{100} get expanded out. As a result there needs to be extra testing
3046 * done in the tr code */
3047 bool has_above_latin1 = FALSE;
3048
77a135fe
KW
3049 /* Note on sizing: The scanned constant is placed into sv, which is
3050 * initialized by newSV() assuming one byte of output for every byte of
3051 * input. This routine expects newSV() to allocate an extra byte for a
3052 * trailing NUL, which this routine will append if it gets to the end of
3053 * the input. There may be more bytes of input than output (eg., \N{LATIN
3054 * CAPITAL LETTER A}), or more output than input if the constant ends up
3055 * recoded to utf8, but each time a construct is found that might increase
3056 * the needed size, SvGROW() is called. Its size parameter each time is
3057 * based on the best guess estimate at the time, namely the length used so
3058 * far, plus the length the current construct will occupy, plus room for
caae0700 3059 * the trailing NUL, plus one byte for every input byte still unscanned */
77a135fe 3060
c3320c2a
KW
3061 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3062 before set */
4c3a8340 3063#ifdef EBCDIC
f4240379
KW
3064 int backslash_N = 0; /* ? was the character from \N{} */
3065 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3066 platform-specific like \x65 */
4c3a8340 3067#endif
012bcf8d 3068
7918f24d
NC
3069 PERL_ARGS_ASSERT_SCAN_CONST;
3070
bb16bae8 3071 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0 3072
b899e89d
FC
3073 /* Protect sv from errors and fatal warnings. */
3074 ENTER_with_name("scan_const");
3075 SAVEFREESV(sv);
2b9d42f0 3076
1759517a
KW
3077 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3078 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3079 * valid */
3080 assert(*send == '\0');
3081
f4240379
KW
3082 while (s < send
3083 || dorange /* Handle tr/// range at right edge of input */
3084 ) {
ff3f963a 3085
02aa26ce 3086 /* get transliterations out of the way (they're most literal) */
3280af22 3087 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3088
f4240379
KW
3089 /* But there isn't any special handling necessary unless there is a
3090 * range, so for most cases we just drop down and handle the value
3091 * as any other. There are two exceptions.
3092 *
02cd137d
KW
3093 * 1. A hyphen indicates that we are actually going to have a
3094 * range. In this case, skip the '-', set a flag, then drop
f4240379
KW
3095 * down to handle what should be the end range value.
3096 * 2. After we've handled that value, the next time through, that
3097 * flag is set and we fix up the range.
3098 *
3099 * Ranges entirely within Latin1 are expanded out entirely, in
188d22cf
KW
3100 * order to make the transliteration a simple table look-up.
3101 * Ranges that extend above Latin1 have to be done differently, so
3102 * there is no advantage to expanding them here, so they are
dc8faf6b
KW
3103 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3104 * a byte that can't occur in legal UTF-8, and hence can signify a
3105 * hyphen without any possible ambiguity. On EBCDIC machines, if
3106 * the range is expressed as Unicode, the Latin1 portion is
3107 * expanded out even if the range extends above Latin1. This is
3108 * because each code point in it has to be processed here
3109 * individually to get its native translation */
f4240379
KW
3110
3111 if (! dorange) {
3112
02cd137d
KW
3113 /* Here, we don't think we're in a range. If the new character
3114 * is not a hyphen; or if it is a hyphen, but it's too close to
e8d55f27
TC
3115 * either edge to indicate a range, or if we haven't output any
3116 * characters yet then it's a regular character. */
04863ba1
KW
3117 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3118 {
f4240379
KW
3119
3120 /* A regular character. Process like any other, but first
3121 * clear any flags */
3122 didrange = FALSE;
3123 dorange = FALSE;
e294cc5d 3124#ifdef EBCDIC
f4240379
KW
3125 non_portable_endpoint = 0;
3126 backslash_N = 0;
e294cc5d 3127#endif
02cd137d
KW
3128 /* The tests here for being above Latin1 and similar ones
3129 * in the following 'else' suffice to find all such
3130 * occurences in the constant, except those added by a
fe2ba0a2 3131 * backslash escape sequence, like \x{100}. Mostly, those
02cd137d 3132 * set 'has_above_latin1' as appropriate */
ed2893cb 3133 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3134 has_above_latin1 = TRUE;
3135 }
3136
f4240379
KW
3137 /* Drops down to generic code to process current byte */
3138 }
02cd137d 3139 else { /* Is a '-' in the context where it means a range */
f4240379 3140 if (didrange) { /* Something like y/A-C-Z// */
02cd137d
KW
3141 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3142 " operator");
f4240379 3143 }
e294cc5d 3144
f4240379 3145 dorange = TRUE;
2b9d42f0 3146
02cd137d 3147 s++; /* Skip past the hyphen */
f4240379
KW
3148
3149 /* d now points to where the end-range character will be
d7f7b0e3
KW
3150 * placed. Drop down to get that character. We'll finish
3151 * processing the range the next time through the loop */
188d22cf 3152
ed2893cb 3153 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
188d22cf
KW
3154 has_above_latin1 = TRUE;
3155 }
02cd137d
KW
3156
3157 /* Drops down to generic code to process current byte */
f4240379
KW
3158 }
3159 } /* End of not a range */
3160 else {
3161 /* Here we have parsed a range. Now must handle it. At this
3162 * point:
3163 * 'sv' is a SV* that contains the output string we are
3164 * constructing. The final two characters in that string
3165 * are the range start and range end, in order.
3166 * 'd' points to just beyond the range end in the 'sv' string,
3167 * where we would next place something
f4240379 3168 */
0c311b7c 3169 char * max_ptr;
5ca37d54 3170 char * min_ptr;
f4240379
KW
3171 IV range_min;
3172 IV range_max; /* last character in range */
f4240379 3173 STRLEN grow;
5ca37d54
KW
3174 Size_t offset_to_min = 0;
3175 Size_t extras = 0;
11327fa1 3176#ifdef EBCDIC
f4240379
KW
3177 bool convert_unicode;
3178 IV real_range_max = 0;
e294cc5d 3179#endif
02cd137d 3180 /* Get the code point values of the range ends. */
0c311b7c
KW
3181 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3182 offset_to_max = max_ptr - SvPVX_const(sv);
ed2893cb 3183 if (d_is_utf8) {
f4240379
KW
3184 /* We know the utf8 is valid, because we just constructed
3185 * it ourselves in previous loop iterations */
3186 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3187 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3188 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
fe2ba0a2
KW
3189
3190 /* This compensates for not all code setting
3191 * 'has_above_latin1', so that we don't skip stuff that
3192 * should be executed */
3193 if (range_max > 255) {
3194 has_above_latin1 = TRUE;
3195 }
e294cc5d 3196 }
f4240379
KW
3197 else {
3198 min_ptr = max_ptr - 1;
3199 range_min = * (U8*) min_ptr;
3200 range_max = * (U8*) max_ptr;
3201 }
3202
8efef67c
KW
3203 /* If the range is just a single code point, like tr/a-a/.../,
3204 * that code point is already in the output, twice. We can
3205 * just back up over the second instance and avoid all the rest
3206 * of the work. But if it is a variant character, it's been
218304f9
KW
3207 * counted twice, so decrement. (This unlikely scenario is
3208 * special cased, like the one for a range of 2 code points
3209 * below, only because the main-line code below needs a range
3210 * of 3 or more to work without special casing. Might as well
3211 * get it out of the way now.) */
8efef67c
KW
3212 if (UNLIKELY(range_max == range_min)) {
3213 d = max_ptr;
ed2893cb 3214 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
8efef67c
KW
3215 utf8_variant_count--;
3216 }
3217 goto range_done;
3218 }
3219
e294cc5d 3220#ifdef EBCDIC
f4240379
KW
3221 /* On EBCDIC platforms, we may have to deal with portable
3222 * ranges. These happen if at least one range endpoint is a
3223 * Unicode value (\N{...}), or if the range is a subset of
3224 * [A-Z] or [a-z], and both ends are literal characters,
3225 * like 'A', and not like \x{C1} */
87dd6ea7 3226 convert_unicode =
02cd137d
KW
3227 cBOOL(backslash_N) /* \N{} forces Unicode,
3228 hence portable range */
3229 || ( ! non_portable_endpoint
3230 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3231 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
87dd6ea7 3232 if (convert_unicode) {
f4240379
KW
3233
3234 /* Special handling is needed for these portable ranges.
02cd137d
KW
3235 * They are defined to be in Unicode terms, which includes
3236 * all the Unicode code points between the end points.
f4240379
KW
3237 * Convert to Unicode to get the Unicode range. Later we
3238 * will convert each code point in the range back to
3239 * native. */
3240 range_min = NATIVE_TO_UNI(range_min);
3241 range_max = NATIVE_TO_UNI(range_max);
3242 }
e294cc5d 3243#endif
8ada0baa 3244
f4240379 3245 if (range_min > range_max) {
11327fa1 3246#ifdef EBCDIC
f4240379
KW
3247 if (convert_unicode) {
3248 /* Need to convert back to native for meaningful
3249 * messages for this platform */
3250 range_min = UNI_TO_NATIVE(range_min);
3251 range_max = UNI_TO_NATIVE(range_max);
3252 }
11327fa1 3253#endif
f4240379
KW
3254 /* Use the characters themselves for the error message if
3255 * ASCII printables; otherwise some visible representation
3256 * of them */
3257 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3258 Perl_croak(aTHX_
3259 "Invalid range \"%c-%c\" in transliteration operator",
3260 (char)range_min, (char)range_max);
3261 }
11327fa1 3262#ifdef EBCDIC
f4240379 3263 else if (convert_unicode) {
02cd137d 3264 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3265 Perl_croak(aTHX_
02cd137d
KW
3266 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3267 UVXf "}\" in transliteration operator",
3268 range_min, range_max);
f4240379 3269 }
11327fa1 3270#endif
f4240379 3271 else {
02cd137d 3272 /* diag_listed_as: Invalid range "%s" in transliteration operator */
f4240379 3273 Perl_croak(aTHX_
02cd137d
KW
3274 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3275 " in transliteration operator",
3276 range_min, range_max);
f4240379 3277 }
c2e66d9e
GS
3278 }
3279
aca41667
KW
3280 /* If the range is exactly two code points long, they are
3281 * already both in the output */
3282 if (UNLIKELY(range_min + 1 == range_max)) {
3283 goto range_done;
3284 }
3285
3286 /* Here the range contains at least 3 code points */
3287
ed2893cb 3288 if (d_is_utf8) {
f4240379 3289
188d22cf
KW
3290 /* If everything in the transliteration is below 256, we
3291 * can avoid special handling later. A translation table
02cd137d
KW
3292 * for each of those bytes is created by op.c. So we
3293 * expand out all ranges to their constituent code points.
3294 * But if we've encountered something above 255, the
3295 * expanding won't help, so skip doing that. But if it's
3296 * EBCDIC, we may have to look at each character below 256
3297 * if we have to convert to/from Unicode values */
188d22cf 3298 if ( has_above_latin1
c7f1f016 3299#ifdef EBCDIC
f4240379 3300 && (range_min > 255 || ! convert_unicode)
8ada0baa 3301#endif
f4240379 3302 ) {
3fdfceb3
KW
3303 const STRLEN off = d - SvPVX(sv);
3304 const STRLEN extra = 1 + (send - s) + 1;
3305 char *e;
3306
f4240379
KW
3307 /* Move the high character one byte to the right; then
3308 * insert between it and the range begin, an illegal
3309 * byte which serves to indicate this is a range (using
02cd137d 3310 * a '-' would be ambiguous). */
3fdfceb3
KW
3311
3312 if (off + extra > SvLEN(sv)) {
3313 d = off + SvGROW(sv, off + extra);
3314 max_ptr = d - off + offset_to_max;
3315 }
3316
3317 e = d++;
f4240379
KW
3318 while (e-- > max_ptr) {
3319 *(e + 1) = *e;
e294cc5d 3320 }
dc8faf6b 3321 *(e + 1) = (char) RANGE_INDICATOR;
f4240379
KW
3322 goto range_done;
3323 }
3324
3325 /* Here, we're going to expand out the range. For EBCDIC
3326 * the range can extend above 255 (not so in ASCII), so
3327 * for EBCDIC, split it into the parts above and below
3328 * 255/256 */
e294cc5d 3329#ifdef EBCDIC
f4240379
KW
3330 if (range_max > 255) {
3331 real_range_max = range_max;
3332 range_max = 255;
3333 }
e294cc5d 3334#endif
f4240379 3335 }
02aa26ce 3336
f4240379 3337 /* Here we need to expand out the string to contain each
5ca37d54
KW
3338 * character in the range. Grow the output to handle this.
3339 * For non-UTF8, we need a byte for each code point in the
3340 * range, minus the three that we've already allocated for: the
3341 * hyphen, the min, and the max. For UTF-8, we need this
3342 * plus an extra byte for each code point that occupies two
3343 * bytes (is variant) when in UTF-8 (except we've already
3344 * allocated for the end points, including if they are
3345 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3346 * platforms, it's easy to calculate a precise number. To
3347 * start, we count the variants in the range, which we need
3348 * elsewhere in this function anyway. (For the case where it
3349 * isn't easy to calculate, 'extras' has been initialized to 0,
3350 * and the calculation is done in a loop further down.) */
3351#ifdef EBCDIC
3352 if (convert_unicode)
3353#endif
3354 {
3355 /* This is executed unconditionally on ASCII, and for
3356 * Unicode ranges on EBCDIC. Under these conditions, all
3357 * code points above a certain value are variant; and none
3358 * under that value are. We just need to find out how much
3359 * of the range is above that value. We don't count the
3360 * end points here, as they will already have been counted
3361 * as they were parsed. */
3362 if (range_min >= UTF_CONTINUATION_MARK) {
3363
3364 /* The whole range is made up of variants */
3365 extras = (range_max - 1) - (range_min + 1) + 1;
3366 }
3367 else if (range_max >= UTF_CONTINUATION_MARK) {
f4240379 3368
5ca37d54
KW
3369 /* Only the higher portion of the range is variants */
3370 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3371 }
f4240379 3372
5ca37d54
KW
3373 utf8_variant_count += extras;
3374 }
3375
3376 /* The base growth is the number of code points in the range,
3377 * not including the endpoints, which have already been sized
3378 * for (and output). We don't subtract for the hyphen, as it
3379 * has been parsed but not output, and the SvGROW below is
3380 * based only on what's been output plus what's left to parse.
3381 * */
3382 grow = (range_max - 1) - (range_min + 1) + 1;
f4240379 3383
ed2893cb 3384 if (d_is_utf8) {
4c3a8340 3385#ifdef EBCDIC
5ca37d54
KW
3386 /* In some cases in EBCDIC, we haven't yet calculated a
3387 * precise amount needed for the UTF-8 variants. Just
3388 * assume the worst case, that everything will expand by a
3389 * byte */
3390 if (! convert_unicode) {
f4240379
KW
3391 grow *= 2;
3392 }
5ca37d54 3393 else
4c3a8340 3394#endif
5ca37d54
KW
3395 {
3396 /* Otherwise we know exactly how many variants there
3397 * are in the range. */
3398 grow += extras;
3399 }
f4240379
KW
3400 }
3401
5ca37d54
KW
3402 /* Grow, but position the output to overwrite the range min end
3403 * point, because in some cases we overwrite that */
3404 SvCUR_set(sv, d - SvPVX_const(sv));
3405 offset_to_min = min_ptr - SvPVX_const(sv);
3406
3407 /* See Note on sizing above. */
3408 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3409 + (send - s)
3410 + grow
3411 + 1 /* Trailing NUL */ );
f4240379 3412
5ca37d54 3413 /* Now, we can expand out the range. */
11327fa1 3414#ifdef EBCDIC
f4240379 3415 if (convert_unicode) {
5ca37d54 3416 SSize_t i;
02aa26ce 3417
f4240379
KW
3418 /* Recall that the min and max are now in Unicode terms, so
3419 * we have to convert each character to its native
3420 * equivalent */
ed2893cb 3421 if (d_is_utf8) {
f4240379 3422 for (i = range_min; i <= range_max; i++) {
02cd137d
KW
3423 append_utf8_from_native_byte(
3424 LATIN1_TO_NATIVE((U8) i),
3425 (U8 **) &d);
f4240379
KW
3426 }
3427 }
3428 else {
3429 for (i = range_min; i <= range_max; i++) {
81324705 3430 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
f4240379
KW
3431 }
3432 }
01ec43d0 3433 }
11327fa1
AL
3434 else
3435#endif
3436 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3437 {
f4240379
KW
3438 /* Here, no conversions are necessary, which means that the
3439 * first character in the range is already in 'd' and
3440 * valid, so we can skip overwriting it */
ed2893cb 3441 if (d_is_utf8) {
19742f39 3442 SSize_t i;
f4240379
KW
3443 d += UTF8SKIP(d);
3444 for (i = range_min + 1; i <= range_max; i++) {
81324705 3445 append_utf8_from_native_byte((U8) i, (U8 **) &d);
f4240379
KW
3446 }
3447 }
3448 else {
19742f39 3449 SSize_t i;
f4240379 3450 d++;
5ca37d54
KW
3451 assert(range_min + 1 <= range_max);
3452 for (i = range_min + 1; i < range_max; i++) {
3453#ifdef EBCDIC
3454 /* In this case on EBCDIC, we haven't calculated
3455 * the variants. Do it here, as we go along */
3456 if (! UVCHR_IS_INVARIANT(i)) {
3457 utf8_variant_count++;
3458 }
3459#endif
f4240379
KW
3460 *d++ = (char)i;
3461 }
5ca37d54
KW
3462
3463 /* The range_max is done outside the loop so as to
3464 * avoid having to special case not incrementing
3465 * 'utf8_variant_count' on EBCDIC (it's already been
3466 * counted when originally parsed) */
3467 *d++ = (char) range_max;
f4240379 3468 }
a0ed51b3 3469 }
02aa26ce 3470
11327fa1 3471#ifdef EBCDIC
02cd137d
KW
3472 /* If the original range extended above 255, add in that
3473 * portion. */
f4240379
KW
3474 if (real_range_max) {
3475 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3476 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
043af6f6
KW
3477 if (real_range_max > 0x100) {
3478 if (real_range_max > 0x101) {
dc8faf6b 3479 *d++ = (char) RANGE_INDICATOR;
043af6f6 3480 }
f4240379 3481 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
043af6f6 3482 }
f4240379 3483 }
11327fa1 3484#endif
02aa26ce 3485
f4240379
KW
3486 range_done:
3487 /* mark the range as done, and continue */
3488 didrange = TRUE;
3489 dorange = FALSE;
3490#ifdef EBCDIC
3491 non_portable_endpoint = 0;
3492 backslash_N = 0;
3493#endif
3494 continue;
3495 } /* End of is a range */
3496 } /* End of transliteration. Joins main code after these else's */
e4a2df84
DM
3497 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3498 char *s1 = s-1;
3499 int esc = 0;
3500 while (s1 >= start && *s1-- == '\\')
3501 esc = !esc;
3502 if (!esc)
3503 in_charclass = TRUE;
3504 }
1e02a175 3505 else if (*s == ']' && PL_lex_inpat && in_charclass) {
e4a2df84
DM
3506 char *s1 = s-1;
3507 int esc = 0;
3508 while (s1 >= start && *s1-- == '\\')
3509 esc = !esc;
3510 if (!esc)
3511 in_charclass = FALSE;
3512 }
02cd137d
KW
3513 /* skip for regexp comments /(?#comment)/, except for the last
3514 * char, which will be done separately. Stop on (?{..}) and
3515 * friends */
c30fc27b 3516 else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
cc6b7395 3517 if (s[2] == '#') {
f339d50e
KW
3518 if (s_is_utf8) {
3519 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3520
3521 while (s + len < send && *s != ')') {
3522 Copy(s, d, len, U8);
3523 d += len;
3524 s += len;
3525 len = UTF8_SAFE_SKIP(s, send);