This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perldelta: document locale changes
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
199e78b7
DM
69#define PL_preambled (PL_parser->preambled)
70#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 71#define PL_linestr (PL_parser->linestr)
c2598295
DM
72#define PL_expect (PL_parser->expect)
73#define PL_copline (PL_parser->copline)
f06b5848
DM
74#define PL_bufptr (PL_parser->bufptr)
75#define PL_oldbufptr (PL_parser->oldbufptr)
76#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
77#define PL_linestart (PL_parser->linestart)
78#define PL_bufend (PL_parser->bufend)
79#define PL_last_uni (PL_parser->last_uni)
80#define PL_last_lop (PL_parser->last_lop)
81#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 82#define PL_lex_state (PL_parser->lex_state)
2f9285f8 83#define PL_rsfp (PL_parser->rsfp)
5486870f 84#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
85#define PL_in_my (PL_parser->in_my)
86#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 87#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 88#define PL_multi_end (PL_parser->multi_end)
13765c85 89#define PL_error_count (PL_parser->error_count)
199e78b7
DM
90
91#ifdef PERL_MAD
92# define PL_endwhite (PL_parser->endwhite)
93# define PL_faketokens (PL_parser->faketokens)
94# define PL_lasttoke (PL_parser->lasttoke)
95# define PL_nextwhite (PL_parser->nextwhite)
96# define PL_realtokenstart (PL_parser->realtokenstart)
97# define PL_skipwhite (PL_parser->skipwhite)
98# define PL_thisclose (PL_parser->thisclose)
99# define PL_thismad (PL_parser->thismad)
100# define PL_thisopen (PL_parser->thisopen)
101# define PL_thisstuff (PL_parser->thisstuff)
102# define PL_thistoken (PL_parser->thistoken)
103# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
104# define PL_thiswhite (PL_parser->thiswhite)
105# define PL_nexttoke (PL_parser->nexttoke)
106# define PL_curforce (PL_parser->curforce)
107#else
108# define PL_nexttoke (PL_parser->nexttoke)
109# define PL_nexttype (PL_parser->nexttype)
110# define PL_nextval (PL_parser->nextval)
199e78b7
DM
111#endif
112
a1894d81 113static const char* const ident_too_long = "Identifier too long";
8903cb82 114
29595ff2 115#ifdef PERL_MAD
29595ff2 116# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 117# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 118#else
5db06880 119# define CURMAD(slot,sv)
9ded7720 120# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
121#endif
122
a7aaec61
Z
123#define XENUMMASK 0x3f
124#define XFAKEEOF 0x40
125#define XFAKEBRACK 0x80
9059aa12 126
39e02b42
JH
127#ifdef USE_UTF8_SCRIPTS
128# define UTF (!IN_BYTES)
2b9d42f0 129#else
802a15e9 130# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 131#endif
a0ed51b3 132
b1fc3636
CJ
133/* The maximum number of characters preceding the unrecognized one to display */
134#define UNRECOGNIZED_PRECEDE_COUNT 10
135
61f0cdd9 136/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
137 * 1999-02-27 mjd-perl-patch@plover.com */
138#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
139
14bd96d0 140#define SPACE_OR_TAB(c) isBLANK_A(c)
bf4acbe4 141
ffb4593c
NT
142/* LEX_* are values for PL_lex_state, the state of the lexer.
143 * They are arranged oddly so that the guard on the switch statement
79072805 144 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
145 *
146 * These values refer to the various states within a sublex parse,
147 * i.e. within a double quotish string
79072805
LW
148 */
149
fb73857a 150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805
LW
185#ifdef ff_next
186#undef ff_next
d48672a2
LW
187#endif
188
79072805 189#include "keywords.h"
fe14fcc3 190
ffb4593c
NT
191/* CLINE is a macro that ensures PL_copline has a sane value */
192
ae986130
LW
193#ifdef CLINE
194#undef CLINE
195#endif
57843af0 196#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 197
5db06880 198#ifdef PERL_MAD
29595ff2
NC
199# define SKIPSPACE0(s) skipspace0(s)
200# define SKIPSPACE1(s) skipspace1(s)
201# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
202# define PEEKSPACE(s) skipspace2(s,0)
203#else
204# define SKIPSPACE0(s) skipspace(s)
205# define SKIPSPACE1(s) skipspace(s)
206# define SKIPSPACE2(s,tsv) skipspace(s)
207# define PEEKSPACE(s) skipspace(s)
208#endif
209
ffb4593c
NT
210/*
211 * Convenience functions to return different tokens and prime the
9cbb5ea2 212 * lexer for the next token. They all take an argument.
ffb4593c
NT
213 *
214 * TOKEN : generic token (used for '(', DOLSHARP, etc)
215 * OPERATOR : generic operator
216 * AOPERATOR : assignment operator
217 * PREBLOCK : beginning the block after an if, while, foreach, ...
218 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
219 * PREREF : *EXPR where EXPR is not a simple identifier
220 * TERM : expression term
221 * LOOPX : loop exiting command (goto, last, dump, etc)
222 * FTST : file test operator
223 * FUN0 : zero-argument function
7eb971ee 224 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 225 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
226 * BOop : bitwise or or xor
227 * BAop : bitwise and
228 * SHop : shift operator
229 * PWop : power operator
9cbb5ea2 230 * PMop : pattern-matching operator
ffb4593c
NT
231 * Aop : addition-level operator
232 * Mop : multiplication-level operator
233 * Eop : equality-testing operator
e5edeb50 234 * Rop : relational operator <= != gt
ffb4593c
NT
235 *
236 * Also see LOP and lop() below.
237 */
238
998054bd 239#ifdef DEBUGGING /* Serve -DT. */
704d4215 240# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 241#else
bbf60fe6 242# define REPORT(retval) (retval)
998054bd
SC
243#endif
244
bbf60fe6
DM
245#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
246#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
247#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
248#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
249#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
250#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
251#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
252#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
253#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
254#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 255#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
256#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
257#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
258#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
259#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
260#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
261#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
262#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
263#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
264#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
265#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 266
a687059c
LW
267/* This bit of chicanery makes a unary function followed by
268 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
269 * The UNIDOR macro is for unary functions that can be followed by the //
270 * operator (such as C<shift // 0>).
a687059c 271 */
d68ce4ac 272#define UNI3(f,x,have_x) { \
6154021b 273 pl_yylval.ival = f; \
d68ce4ac 274 if (have_x) PL_expect = x; \
376fcdbf
AL
275 PL_bufptr = s; \
276 PL_last_uni = PL_oldbufptr; \
277 PL_last_lop_op = f; \
278 if (*s == '(') \
279 return REPORT( (int)FUNC1 ); \
29595ff2 280 s = PEEKSPACE(s); \
376fcdbf
AL
281 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
282 }
d68ce4ac
FC
283#define UNI(f) UNI3(f,XTERM,1)
284#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
285#define UNIPROTO(f,optional) { \
286 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
287 OPERATOR(f); \
288 }
a687059c 289
d68ce4ac 290#define UNIBRACK(f) UNI3(f,0,0)
79072805 291
9f68db38 292/* grandfather return to old style */
78cdf107
Z
293#define OLDLOP(f) \
294 do { \
295 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
296 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
297 pl_yylval.ival = (f); \
298 PL_expect = XTERM; \
299 PL_bufptr = s; \
300 return (int)LSTOP; \
301 } while(0)
79072805 302
83944c01
FC
303#define COPLINE_INC_WITH_HERELINES \
304 STMT_START { \
305 CopLINE_inc(PL_curcop); \
d794b522
FC
306 if (PL_parser->lex_shared->herelines) \
307 CopLINE(PL_curcop) += PL_parser->lex_shared->herelines, \
308 PL_parser->lex_shared->herelines = 0; \
83944c01
FC
309 } STMT_END
310
311
8fa7f367
JH
312#ifdef DEBUGGING
313
6154021b 314/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
315enum token_type {
316 TOKENTYPE_NONE,
317 TOKENTYPE_IVAL,
6154021b 318 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 319 TOKENTYPE_PVAL,
aeaef349 320 TOKENTYPE_OPVAL
bbf60fe6
DM
321};
322
6d4a66ac
NC
323static struct debug_tokens {
324 const int token;
325 enum token_type type;
326 const char *name;
327} const debug_tokens[] =
9041c2e3 328{
bbf60fe6
DM
329 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
330 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
331 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
332 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
333 { ARROW, TOKENTYPE_NONE, "ARROW" },
334 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
335 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
336 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
337 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
338 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 339 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
340 { DO, TOKENTYPE_NONE, "DO" },
341 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
342 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
343 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
344 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
345 { ELSE, TOKENTYPE_NONE, "ELSE" },
346 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
347 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
348 { FOR, TOKENTYPE_IVAL, "FOR" },
349 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
705fe0e5
FC
350 { FORMLBRACK, TOKENTYPE_NONE, "FORMLBRACK" },
351 { FORMRBRACK, TOKENTYPE_NONE, "FORMRBRACK" },
bbf60fe6
DM
352 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
353 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 354 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
355 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
356 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
357 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 358 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
359 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
360 { IF, TOKENTYPE_IVAL, "IF" },
5504e6cf 361 { LABEL, TOKENTYPE_PVAL, "LABEL" },
bbf60fe6
DM
362 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
363 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
364 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
365 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
366 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
367 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
368 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
369 { MY, TOKENTYPE_IVAL, "MY" },
bbf60fe6
DM
370 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
371 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
372 { OROP, TOKENTYPE_IVAL, "OROP" },
373 { OROR, TOKENTYPE_NONE, "OROR" },
374 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 375 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
376 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
377 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
378 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
379 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
380 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
381 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
382 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
383 { PREINC, TOKENTYPE_NONE, "PREINC" },
384 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 385 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
386 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
387 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 388 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
389 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
390 { SUB, TOKENTYPE_NONE, "SUB" },
391 { THING, TOKENTYPE_OPVAL, "THING" },
392 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
393 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
394 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
395 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
396 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
397 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 398 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
399 { WHILE, TOKENTYPE_IVAL, "WHILE" },
400 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 401 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 402 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
403};
404
6154021b 405/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 406
bbf60fe6 407STATIC int
704d4215 408S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 409{
97aff369 410 dVAR;
7918f24d
NC
411
412 PERL_ARGS_ASSERT_TOKEREPORT;
413
bbf60fe6 414 if (DEBUG_T_TEST) {
bd61b366 415 const char *name = NULL;
bbf60fe6 416 enum token_type type = TOKENTYPE_NONE;
f54cb97a 417 const struct debug_tokens *p;
396482e1 418 SV* const report = newSVpvs("<== ");
bbf60fe6 419
f54cb97a 420 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
421 if (p->token == (int)rv) {
422 name = p->name;
423 type = p->type;
424 break;
425 }
426 }
427 if (name)
54667de8 428 Perl_sv_catpv(aTHX_ report, name);
74736ae6 429 else if ((char)rv > ' ' && (char)rv <= '~')
4ebc7986 430 {
bbf60fe6 431 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
4ebc7986
FC
432 if ((char)rv == 'p')
433 sv_catpvs(report, " (pending identifier)");
434 }
bbf60fe6 435 else if (!rv)
396482e1 436 sv_catpvs(report, "EOF");
bbf60fe6
DM
437 else
438 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
439 switch (type) {
440 case TOKENTYPE_NONE:
bbf60fe6
DM
441 break;
442 case TOKENTYPE_IVAL:
704d4215 443 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
444 break;
445 case TOKENTYPE_OPNUM:
446 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 447 PL_op_name[lvalp->ival]);
bbf60fe6
DM
448 break;
449 case TOKENTYPE_PVAL:
704d4215 450 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
451 break;
452 case TOKENTYPE_OPVAL:
704d4215 453 if (lvalp->opval) {
401441c0 454 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
455 PL_op_name[lvalp->opval->op_type]);
456 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 457 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 458 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
459 }
460
461 }
401441c0 462 else
396482e1 463 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
464 break;
465 }
b6007c36 466 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
467 };
468 return (int)rv;
998054bd
SC
469}
470
b6007c36
DM
471
472/* print the buffer with suitable escapes */
473
474STATIC void
15f169a1 475S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 476{
396482e1 477 SV* const tmp = newSVpvs("");
7918f24d
NC
478
479 PERL_ARGS_ASSERT_PRINTBUF;
480
b6007c36
DM
481 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
482 SvREFCNT_dec(tmp);
483}
484
8fa7f367
JH
485#endif
486
8290c323
NC
487static int
488S_deprecate_commaless_var_list(pTHX) {
489 PL_expect = XTERM;
490 deprecate("comma-less variable list");
491 return REPORT(','); /* grandfather non-comma-format format */
492}
493
ffb4593c
NT
494/*
495 * S_ao
496 *
c963b151
BD
497 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
498 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
499 */
500
76e3520e 501STATIC int
cea2e8a9 502S_ao(pTHX_ int toketype)
a0d0e21e 503{
97aff369 504 dVAR;
3280af22
NIS
505 if (*PL_bufptr == '=') {
506 PL_bufptr++;
a0d0e21e 507 if (toketype == ANDAND)
6154021b 508 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 509 else if (toketype == OROR)
6154021b 510 pl_yylval.ival = OP_ORASSIGN;
c963b151 511 else if (toketype == DORDOR)
6154021b 512 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
513 toketype = ASSIGNOP;
514 }
515 return toketype;
516}
517
ffb4593c
NT
518/*
519 * S_no_op
520 * When Perl expects an operator and finds something else, no_op
521 * prints the warning. It always prints "<something> found where
522 * operator expected. It prints "Missing semicolon on previous line?"
523 * if the surprise occurs at the start of the line. "do you need to
524 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
525 * where the compiler doesn't know if foo is a method call or a function.
526 * It prints "Missing operator before end of line" if there's nothing
527 * after the missing operator, or "... before <...>" if there is something
528 * after the missing operator.
529 */
530
76e3520e 531STATIC void
15f169a1 532S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 533{
97aff369 534 dVAR;
9d4ba2ae
AL
535 char * const oldbp = PL_bufptr;
536 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 537
7918f24d
NC
538 PERL_ARGS_ASSERT_NO_OP;
539
1189a94a
GS
540 if (!s)
541 s = oldbp;
07c798fb 542 else
1189a94a 543 PL_bufptr = s;
734ab321 544 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
545 if (ckWARN_d(WARN_SYNTAX)) {
546 if (is_first)
547 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
548 "\t(Missing semicolon on previous line?)\n");
549 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 550 const char *t;
8a2bca12 551 for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
734ab321 552 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 553 NOOP;
56da5a46
RGS
554 if (t < PL_bufptr && isSPACE(*t))
555 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
556 "\t(Do you need to predeclare %"UTF8f"?)\n",
557 UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr));
56da5a46
RGS
558 }
559 else {
560 assert(s >= oldbp);
561 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
b17a0679
FC
562 "\t(Missing operator before %"UTF8f"?)\n",
563 UTF8fARG(UTF, s - oldbp, oldbp));
56da5a46 564 }
07c798fb 565 }
3280af22 566 PL_bufptr = oldbp;
8990e307
LW
567}
568
ffb4593c
NT
569/*
570 * S_missingterm
571 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 572 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
573 * If we're in a delimited string and the delimiter is a control
574 * character, it's reformatted into a two-char sequence like ^C.
575 * This is fatal.
576 */
577
76e3520e 578STATIC void
cea2e8a9 579S_missingterm(pTHX_ char *s)
8990e307 580{
97aff369 581 dVAR;
8990e307
LW
582 char tmpbuf[3];
583 char q;
584 if (s) {
9d4ba2ae 585 char * const nl = strrchr(s,'\n');
d2719217 586 if (nl)
8990e307
LW
587 *nl = '\0';
588 }
463559e7 589 else if (isCNTRL(PL_multi_close)) {
8990e307 590 *tmpbuf = '^';
585ec06d 591 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
592 tmpbuf[2] = '\0';
593 s = tmpbuf;
594 }
595 else {
eb160463 596 *tmpbuf = (char)PL_multi_close;
8990e307
LW
597 tmpbuf[1] = '\0';
598 s = tmpbuf;
599 }
600 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 601 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 602}
79072805 603
dd0ac2b9
FC
604#include "feature.h"
605
0d863452 606/*
0d863452
RH
607 * Check whether the named feature is enabled.
608 */
26ea9e12 609bool
3fff3427 610Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 611{
97aff369 612 dVAR;
4a731d7b 613 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
614
615 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
616
617 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 618
26ea9e12
NC
619 if (namelen > MAX_FEATURE_LEN)
620 return FALSE;
3fff3427 621 memcpy(&he_name[8], name, namelen);
7d69d4a6 622
c8ca97b0
NC
623 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
624 REFCOUNTED_HE_EXISTS));
0d863452
RH
625}
626
ffb4593c 627/*
9cbb5ea2
GS
628 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
629 * utf16-to-utf8-reversed.
ffb4593c
NT
630 */
631
c39cd008
GS
632#ifdef PERL_CR_FILTER
633static void
634strip_return(SV *sv)
635{
eb578fdb
KW
636 const char *s = SvPVX_const(sv);
637 const char * const e = s + SvCUR(sv);
7918f24d
NC
638
639 PERL_ARGS_ASSERT_STRIP_RETURN;
640
c39cd008
GS
641 /* outer loop optimized to do nothing if there are no CR-LFs */
642 while (s < e) {
643 if (*s++ == '\r' && *s == '\n') {
644 /* hit a CR-LF, need to copy the rest */
eb578fdb 645 char *d = s - 1;
c39cd008
GS
646 *d++ = *s++;
647 while (s < e) {
648 if (*s == '\r' && s[1] == '\n')
649 s++;
650 *d++ = *s++;
651 }
652 SvCUR(sv) -= s - d;
653 return;
654 }
655 }
656}
a868473f 657
76e3520e 658STATIC I32
c39cd008 659S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 660{
f54cb97a 661 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
662 if (count > 0 && !maxlen)
663 strip_return(sv);
664 return count;
a868473f
NIS
665}
666#endif
667
ffb4593c 668/*
8eaa0acf
Z
669=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
670
671Creates and initialises a new lexer/parser state object, supplying
672a context in which to lex and parse from a new source of Perl code.
673A pointer to the new state object is placed in L</PL_parser>. An entry
674is made on the save stack so that upon unwinding the new state object
675will be destroyed and the former value of L</PL_parser> will be restored.
676Nothing else need be done to clean up the parsing context.
677
678The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
679non-null, provides a string (in SV form) containing code to be parsed.
680A copy of the string is made, so subsequent modification of I<line>
681does not affect parsing. I<rsfp>, if non-null, provides an input stream
682from which code will be read to be parsed. If both are non-null, the
683code in I<line> comes first and must consist of complete lines of input,
684and I<rsfp> supplies the remainder of the source.
685
e368b3bd
FC
686The I<flags> parameter is reserved for future use. Currently it is only
687used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
688
689=cut
690*/
ffb4593c 691
27fcb6ee 692/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
693 can share filters with the current parser.
694 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
695 caller, hence isn't owned by the parser, so shouldn't be closed on parser
696 destruction. This is used to handle the case of defaulting to reading the
697 script from the standard input because no filename was given on the command
698 line (without getting confused by situation where STDIN has been closed, so
699 the script handle is opened on fd 0) */
27fcb6ee 700
a0d0e21e 701void
8eaa0acf 702Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 703{
97aff369 704 dVAR;
6ef55633 705 const char *s = NULL;
5486870f 706 yy_parser *parser, *oparser;
60d63348 707 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 708 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
709
710 /* create and initialise a parser */
711
199e78b7 712 Newxz(parser, 1, yy_parser);
5486870f 713 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
714 PL_parser = parser;
715
28ac2b49
Z
716 parser->stack = NULL;
717 parser->ps = NULL;
718 parser->stack_size = 0;
acdf0a21 719
e3abe207
DM
720 /* on scope exit, free this parser and restore any outer one */
721 SAVEPARSER(parser);
7c4baf47 722 parser->saved_curcop = PL_curcop;
e3abe207 723
acdf0a21 724 /* initialise lexer state */
8990e307 725
fb205e7a
DM
726#ifdef PERL_MAD
727 parser->curforce = -1;
728#else
729 parser->nexttoke = 0;
730#endif
ca4cfd28 731 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 732 parser->copline = NOLINE;
5afb0a62 733 parser->lex_state = LEX_NORMAL;
c2598295 734 parser->expect = XSTATE;
2f9285f8 735 parser->rsfp = rsfp;
27fcb6ee
FC
736 parser->rsfp_filters =
737 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
738 ? NULL
739 : MUTABLE_AV(SvREFCNT_inc(
740 oparser->rsfp_filters
741 ? oparser->rsfp_filters
742 : (oparser->rsfp_filters = newAV())
743 ));
2f9285f8 744
199e78b7
DM
745 Newx(parser->lex_brackstack, 120, char);
746 Newx(parser->lex_casestack, 12, char);
747 *parser->lex_casestack = '\0';
d794b522 748 Newxz(parser->lex_shared, 1, LEXSHARED);
02b34bbe 749
10efb74f 750 if (line) {
0528fd32 751 STRLEN len;
10efb74f 752 s = SvPV_const(line, len);
0abcdfa4
FC
753 parser->linestr = flags & LEX_START_COPIED
754 ? SvREFCNT_inc_simple_NN(line)
755 : newSVpvn_flags(s, len, SvUTF8(line));
11076590 756 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
757 } else {
758 parser->linestr = newSVpvs("\n;");
8990e307 759 }
f06b5848
DM
760 parser->oldoldbufptr =
761 parser->oldbufptr =
762 parser->bufptr =
763 parser->linestart = SvPVX(parser->linestr);
764 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
765 parser->last_lop = parser->last_uni = NULL;
87606032
NC
766 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
767 |LEX_DONT_CLOSE_RSFP);
737c24fc 768
60d63348 769 parser->in_pod = parser->filtered = 0;
79072805 770}
a687059c 771
e3abe207
DM
772
773/* delete a parser object */
774
775void
776Perl_parser_free(pTHX_ const yy_parser *parser)
777{
7918f24d
NC
778 PERL_ARGS_ASSERT_PARSER_FREE;
779
7c4baf47 780 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
781 SvREFCNT_dec(parser->linestr);
782
87606032 783 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 784 PerlIO_clearerr(parser->rsfp);
799361c3
SH
785 else if (parser->rsfp && (!parser->old_parser ||
786 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 787 PerlIO_close(parser->rsfp);
5486870f 788 SvREFCNT_dec(parser->rsfp_filters);
10002bc1
FC
789 SvREFCNT_dec(parser->lex_stuff);
790 SvREFCNT_dec(parser->sublex_info.repl);
3ac7ff8f
FC
791
792 Safefree(parser->lex_brackstack);
793 Safefree(parser->lex_casestack);
794 Safefree(parser->lex_shared);
795 PL_parser = parser->old_parser;
796 Safefree(parser);
797}
798
799void
800Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
801{
802#ifdef PERL_MAD
803 I32 nexttoke = parser->lasttoke;
804#else
805 I32 nexttoke = parser->nexttoke;
806#endif
807 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
3ce3dcd9
FC
808 while (nexttoke--) {
809#ifdef PERL_MAD
810 if (S_is_opval_token(parser->nexttoke[nexttoke].next_type
3ac7ff8f
FC
811 & 0xffff)
812 && parser->nexttoke[nexttoke].next_val.opval
813 && parser->nexttoke[nexttoke].next_val.opval->op_slabbed
814 && OpSLAB(parser->nexttoke[nexttoke].next_val.opval) == slab) {
815 op_free(parser->nexttoke[nexttoke].next_val.opval);
816 parser->nexttoke[nexttoke].next_val.opval = NULL;
817 }
3ce3dcd9 818#else
3ac7ff8f
FC
819 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
820 && parser->nextval[nexttoke].opval
821 && parser->nextval[nexttoke].opval->op_slabbed
822 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
3ce3dcd9 823 op_free(parser->nextval[nexttoke].opval);
3ac7ff8f
FC
824 parser->nextval[nexttoke].opval = NULL;
825 }
3ce3dcd9
FC
826#endif
827 }
e3abe207
DM
828}
829
830
ffb4593c 831/*
f0e67a1d
Z
832=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
833
834Buffer scalar containing the chunk currently under consideration of the
835text currently being lexed. This is always a plain string scalar (for
836which C<SvPOK> is true). It is not intended to be used as a scalar by
837normal scalar means; instead refer to the buffer directly by the pointer
838variables described below.
839
840The lexer maintains various C<char*> pointers to things in the
841C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
842reallocated, all of these pointers must be updated. Don't attempt to
843do this manually, but rather use L</lex_grow_linestr> if you need to
844reallocate the buffer.
845
846The content of the text chunk in the buffer is commonly exactly one
847complete line of input, up to and including a newline terminator,
848but there are situations where it is otherwise. The octets of the
849buffer may be intended to be interpreted as either UTF-8 or Latin-1.
850The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
851flag on this scalar, which may disagree with it.
852
853For direct examination of the buffer, the variable
854L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
855lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
856of these pointers is usually preferable to examination of the scalar
857through normal scalar means.
858
859=for apidoc AmxU|char *|PL_parser-E<gt>bufend
860
861Direct pointer to the end of the chunk of text currently being lexed, the
862end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
863+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
864always located at the end of the buffer, and does not count as part of
865the buffer's contents.
866
867=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
868
869Points to the current position of lexing inside the lexer buffer.
870Characters around this point may be freely examined, within
871the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
872L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
873interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
874
875Lexing code (whether in the Perl core or not) moves this pointer past
876the characters that it consumes. It is also expected to perform some
877bookkeeping whenever a newline character is consumed. This movement
878can be more conveniently performed by the function L</lex_read_to>,
879which handles newlines appropriately.
880
881Interpretation of the buffer's octets can be abstracted out by
882using the slightly higher-level functions L</lex_peek_unichar> and
883L</lex_read_unichar>.
884
885=for apidoc AmxU|char *|PL_parser-E<gt>linestart
886
887Points to the start of the current line inside the lexer buffer.
888This is useful for indicating at which column an error occurred, and
889not much else. This must be updated by any lexing code that consumes
890a newline; the function L</lex_read_to> handles this detail.
891
892=cut
893*/
894
895/*
896=for apidoc Amx|bool|lex_bufutf8
897
898Indicates whether the octets in the lexer buffer
899(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
900of Unicode characters. If not, they should be interpreted as Latin-1
901characters. This is analogous to the C<SvUTF8> flag for scalars.
902
903In UTF-8 mode, it is not guaranteed that the lexer buffer actually
904contains valid UTF-8. Lexing code must be robust in the face of invalid
905encoding.
906
907The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
908is significant, but not the whole story regarding the input character
909encoding. Normally, when a file is being read, the scalar contains octets
910and its C<SvUTF8> flag is off, but the octets should be interpreted as
911UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
912however, the scalar may have the C<SvUTF8> flag on, and in this case its
913octets should be interpreted as UTF-8 unless the C<use bytes> pragma
914is in effect. This logic may change in the future; use this function
915instead of implementing the logic yourself.
916
917=cut
918*/
919
920bool
921Perl_lex_bufutf8(pTHX)
922{
923 return UTF;
924}
925
926/*
927=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
928
929Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
930at least I<len> octets (including terminating NUL). Returns a
931pointer to the reallocated buffer. This is necessary before making
932any direct modification of the buffer that would increase its length.
933L</lex_stuff_pvn> provides a more convenient way to insert text into
934the buffer.
935
936Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
937this function updates all of the lexer's variables that point directly
938into the buffer.
939
940=cut
941*/
942
943char *
944Perl_lex_grow_linestr(pTHX_ STRLEN len)
945{
946 SV *linestr;
947 char *buf;
948 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 949 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
950 linestr = PL_parser->linestr;
951 buf = SvPVX(linestr);
952 if (len <= SvLEN(linestr))
953 return buf;
954 bufend_pos = PL_parser->bufend - buf;
955 bufptr_pos = PL_parser->bufptr - buf;
956 oldbufptr_pos = PL_parser->oldbufptr - buf;
957 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
958 linestart_pos = PL_parser->linestart - buf;
959 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
960 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
3328ab5a
FC
961 re_eval_start_pos = PL_parser->lex_shared->re_eval_start ?
962 PL_parser->lex_shared->re_eval_start - buf : 0;
c7641931 963
f0e67a1d 964 buf = sv_grow(linestr, len);
c7641931 965
f0e67a1d
Z
966 PL_parser->bufend = buf + bufend_pos;
967 PL_parser->bufptr = buf + bufptr_pos;
968 PL_parser->oldbufptr = buf + oldbufptr_pos;
969 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
970 PL_parser->linestart = buf + linestart_pos;
971 if (PL_parser->last_uni)
972 PL_parser->last_uni = buf + last_uni_pos;
973 if (PL_parser->last_lop)
974 PL_parser->last_lop = buf + last_lop_pos;
3328ab5a
FC
975 if (PL_parser->lex_shared->re_eval_start)
976 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
977 return buf;
978}
979
980/*
83aa740e 981=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
982
983Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
984immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
985reallocating the buffer if necessary. This means that lexing code that
986runs later will see the characters as if they had appeared in the input.
987It is not recommended to do this as part of normal parsing, and most
988uses of this facility run the risk of the inserted characters being
989interpreted in an unintended manner.
990
991The string to be inserted is represented by I<len> octets starting
992at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
993according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
994The characters are recoded for the lexer buffer, according to how the
995buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 996to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
997function is more convenient.
998
999=cut
1000*/
1001
1002void
83aa740e 1003Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 1004{
749123ff 1005 dVAR;
f0e67a1d
Z
1006 char *bufptr;
1007 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1008 if (flags & ~(LEX_STUFF_UTF8))
1009 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1010 if (UTF) {
1011 if (flags & LEX_STUFF_UTF8) {
1012 goto plain_copy;
1013 } else {
54d004e8 1014 STRLEN highhalf = 0; /* Count of variants */
83aa740e 1015 const char *p, *e = pv+len;
54d004e8
KW
1016 for (p = pv; p != e; p++) {
1017 if (! UTF8_IS_INVARIANT(*p)) {
1018 highhalf++;
1019 }
1020 }
f0e67a1d
Z
1021 if (!highhalf)
1022 goto plain_copy;
1023 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1024 bufptr = PL_parser->bufptr;
1025 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1026 SvCUR_set(PL_parser->linestr,
1027 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
1028 PL_parser->bufend += len+highhalf;
1029 for (p = pv; p != e; p++) {
1030 U8 c = (U8)*p;
54d004e8
KW
1031 if (! UTF8_IS_INVARIANT(c)) {
1032 *bufptr++ = UTF8_TWO_BYTE_HI(c);
1033 *bufptr++ = UTF8_TWO_BYTE_LO(c);
f0e67a1d
Z
1034 } else {
1035 *bufptr++ = (char)c;
1036 }
1037 }
1038 }
1039 } else {
1040 if (flags & LEX_STUFF_UTF8) {
1041 STRLEN highhalf = 0;
83aa740e 1042 const char *p, *e = pv+len;
f0e67a1d
Z
1043 for (p = pv; p != e; p++) {
1044 U8 c = (U8)*p;
54d004e8 1045 if (UTF8_IS_ABOVE_LATIN1(c)) {
f0e67a1d
Z
1046 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1047 "non-Latin-1 character into Latin-1 input");
54d004e8 1048 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
f0e67a1d
Z
1049 p++;
1050 highhalf++;
54d004e8 1051 } else if (! UTF8_IS_INVARIANT(c)) {
f0e67a1d
Z
1052 /* malformed UTF-8 */
1053 ENTER;
1054 SAVESPTR(PL_warnhook);
1055 PL_warnhook = PERL_WARNHOOK_FATAL;
1056 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1057 LEAVE;
1058 }
1059 }
1060 if (!highhalf)
1061 goto plain_copy;
1062 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1063 bufptr = PL_parser->bufptr;
1064 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1065 SvCUR_set(PL_parser->linestr,
1066 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d 1067 PL_parser->bufend += len-highhalf;
54d004e8
KW
1068 p = pv;
1069 while (p < e) {
1070 if (UTF8_IS_INVARIANT(*p)) {
1071 *bufptr++ = *p;
1072 p++;
f0e67a1d 1073 }
54d004e8
KW
1074 else {
1075 assert(p < e -1 );
1076 *bufptr++ = TWO_BYTE_UTF8_TO_UNI(*p, *(p+1));
1077 p += 2;
1078 }
f0e67a1d
Z
1079 }
1080 } else {
54d004e8 1081 plain_copy:
f0e67a1d
Z
1082 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1083 bufptr = PL_parser->bufptr;
1084 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1085 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1086 PL_parser->bufend += len;
1087 Copy(pv, bufptr, len, char);
1088 }
1089 }
1090}
1091
1092/*
9dcc53ea
Z
1093=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1094
1095Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1096immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1097reallocating the buffer if necessary. This means that lexing code that
1098runs later will see the characters as if they had appeared in the input.
1099It is not recommended to do this as part of normal parsing, and most
1100uses of this facility run the risk of the inserted characters being
1101interpreted in an unintended manner.
1102
1103The string to be inserted is represented by octets starting at I<pv>
1104and continuing to the first nul. These octets are interpreted as either
1105UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1106in I<flags>. The characters are recoded for the lexer buffer, according
1107to how the buffer is currently being interpreted (L</lex_bufutf8>).
1108If it is not convenient to nul-terminate a string to be inserted, the
1109L</lex_stuff_pvn> function is more appropriate.
1110
1111=cut
1112*/
1113
1114void
1115Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1116{
1117 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1118 lex_stuff_pvn(pv, strlen(pv), flags);
1119}
1120
1121/*
f0e67a1d
Z
1122=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1123
1124Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1125immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1126reallocating the buffer if necessary. This means that lexing code that
1127runs later will see the characters as if they had appeared in the input.
1128It is not recommended to do this as part of normal parsing, and most
1129uses of this facility run the risk of the inserted characters being
1130interpreted in an unintended manner.
1131
1132The string to be inserted is the string value of I<sv>. The characters
1133are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1134being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1135not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1136need to construct a scalar.
1137
1138=cut
1139*/
1140
1141void
1142Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1143{
1144 char *pv;
1145 STRLEN len;
1146 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1147 if (flags)
1148 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1149 pv = SvPV(sv, len);
1150 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1151}
1152
1153/*
1154=for apidoc Amx|void|lex_unstuff|char *ptr
1155
1156Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1157I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1158This hides the discarded text from any lexing code that runs later,
1159as if the text had never appeared.
1160
1161This is not the normal way to consume lexed text. For that, use
1162L</lex_read_to>.
1163
1164=cut
1165*/
1166
1167void
1168Perl_lex_unstuff(pTHX_ char *ptr)
1169{
1170 char *buf, *bufend;
1171 STRLEN unstuff_len;
1172 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1173 buf = PL_parser->bufptr;
1174 if (ptr < buf)
1175 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1176 if (ptr == buf)
1177 return;
1178 bufend = PL_parser->bufend;
1179 if (ptr > bufend)
1180 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1181 unstuff_len = ptr - buf;
1182 Move(ptr, buf, bufend+1-ptr, char);
1183 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1184 PL_parser->bufend = bufend - unstuff_len;
1185}
1186
1187/*
1188=for apidoc Amx|void|lex_read_to|char *ptr
1189
1190Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1191to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1192performing the correct bookkeeping whenever a newline character is passed.
1193This is the normal way to consume lexed text.
1194
1195Interpretation of the buffer's octets can be abstracted out by
1196using the slightly higher-level functions L</lex_peek_unichar> and
1197L</lex_read_unichar>.
1198
1199=cut
1200*/
1201
1202void
1203Perl_lex_read_to(pTHX_ char *ptr)
1204{
1205 char *s;
1206 PERL_ARGS_ASSERT_LEX_READ_TO;
1207 s = PL_parser->bufptr;
1208 if (ptr < s || ptr > PL_parser->bufend)
1209 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1210 for (; s != ptr; s++)
1211 if (*s == '\n') {
83944c01 1212 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1213 PL_parser->linestart = s+1;
1214 }
1215 PL_parser->bufptr = ptr;
1216}
1217
1218/*
1219=for apidoc Amx|void|lex_discard_to|char *ptr
1220
1221Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1222up to I<ptr>. The remaining content of the buffer will be moved, and
1223all pointers into the buffer updated appropriately. I<ptr> must not
1224be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1225it is not permitted to discard text that has yet to be lexed.
1226
1227Normally it is not necessarily to do this directly, because it suffices to
1228use the implicit discarding behaviour of L</lex_next_chunk> and things
1229based on it. However, if a token stretches across multiple lines,
1f317c95 1230and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1231that purpose, then after completion of the token it would be wise to
1232explicitly discard the now-unneeded earlier lines, to avoid future
1233multi-line tokens growing the buffer without bound.
1234
1235=cut
1236*/
1237
1238void
1239Perl_lex_discard_to(pTHX_ char *ptr)
1240{
1241 char *buf;
1242 STRLEN discard_len;
1243 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1244 buf = SvPVX(PL_parser->linestr);
1245 if (ptr < buf)
1246 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1247 if (ptr == buf)
1248 return;
1249 if (ptr > PL_parser->bufptr)
1250 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1251 discard_len = ptr - buf;
1252 if (PL_parser->oldbufptr < ptr)
1253 PL_parser->oldbufptr = ptr;
1254 if (PL_parser->oldoldbufptr < ptr)
1255 PL_parser->oldoldbufptr = ptr;
1256 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1257 PL_parser->last_uni = NULL;
1258 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1259 PL_parser->last_lop = NULL;
1260 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1261 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1262 PL_parser->bufend -= discard_len;
1263 PL_parser->bufptr -= discard_len;
1264 PL_parser->oldbufptr -= discard_len;
1265 PL_parser->oldoldbufptr -= discard_len;
1266 if (PL_parser->last_uni)
1267 PL_parser->last_uni -= discard_len;
1268 if (PL_parser->last_lop)
1269 PL_parser->last_lop -= discard_len;
1270}
1271
1272/*
1273=for apidoc Amx|bool|lex_next_chunk|U32 flags
1274
1275Reads in the next chunk of text to be lexed, appending it to
1276L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1277looked to the end of the current chunk and wants to know more. It is
1278usual, but not necessary, for lexing to have consumed the entirety of
1279the current chunk at this time.
1280
1281If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1282chunk (i.e., the current chunk has been entirely consumed), normally the
1283current chunk will be discarded at the same time that the new chunk is
1284read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1285will not be discarded. If the current chunk has not been entirely
1286consumed, then it will not be discarded regardless of the flag.
1287
1288Returns true if some new text was added to the buffer, or false if the
1289buffer has reached the end of the input text.
1290
1291=cut
1292*/
1293
1294#define LEX_FAKE_EOF 0x80000000
112d1284 1295#define LEX_NO_TERM 0x40000000
f0e67a1d
Z
1296
1297bool
1298Perl_lex_next_chunk(pTHX_ U32 flags)
1299{
1300 SV *linestr;
1301 char *buf;
1302 STRLEN old_bufend_pos, new_bufend_pos;
1303 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1304 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1305 bool got_some_for_debugger = 0;
f0e67a1d 1306 bool got_some;
112d1284 1307 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
f0e67a1d 1308 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1309 linestr = PL_parser->linestr;
1310 buf = SvPVX(linestr);
1311 if (!(flags & LEX_KEEP_PREVIOUS) &&
1312 PL_parser->bufptr == PL_parser->bufend) {
1313 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1314 linestart_pos = 0;
1315 if (PL_parser->last_uni != PL_parser->bufend)
1316 PL_parser->last_uni = NULL;
1317 if (PL_parser->last_lop != PL_parser->bufend)
1318 PL_parser->last_lop = NULL;
1319 last_uni_pos = last_lop_pos = 0;
1320 *buf = 0;
1321 SvCUR(linestr) = 0;
1322 } else {
1323 old_bufend_pos = PL_parser->bufend - buf;
1324 bufptr_pos = PL_parser->bufptr - buf;
1325 oldbufptr_pos = PL_parser->oldbufptr - buf;
1326 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1327 linestart_pos = PL_parser->linestart - buf;
1328 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1329 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1330 }
1331 if (flags & LEX_FAKE_EOF) {
1332 goto eof;
60d63348 1333 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1334 got_some = 0;
1335 } else if (filter_gets(linestr, old_bufend_pos)) {
1336 got_some = 1;
17cc9359 1337 got_some_for_debugger = 1;
112d1284
FC
1338 } else if (flags & LEX_NO_TERM) {
1339 got_some = 0;
f0e67a1d 1340 } else {
580561a3
Z
1341 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1342 sv_setpvs(linestr, "");
f0e67a1d
Z
1343 eof:
1344 /* End of real input. Close filehandle (unless it was STDIN),
1345 * then add implicit termination.
1346 */
87606032 1347 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1348 PerlIO_clearerr(PL_parser->rsfp);
1349 else if (PL_parser->rsfp)
1350 (void)PerlIO_close(PL_parser->rsfp);
1351 PL_parser->rsfp = NULL;
60d63348 1352 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1353#ifdef PERL_MAD
1354 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1355 PL_faketokens = 1;
1356#endif
1357 if (!PL_in_eval && PL_minus_p) {
1358 sv_catpvs(linestr,
1359 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1360 PL_minus_n = PL_minus_p = 0;
1361 } else if (!PL_in_eval && PL_minus_n) {
1362 sv_catpvs(linestr, /*{*/";}");
1363 PL_minus_n = 0;
1364 } else
1365 sv_catpvs(linestr, ";");
1366 got_some = 1;
1367 }
1368 buf = SvPVX(linestr);
1369 new_bufend_pos = SvCUR(linestr);
1370 PL_parser->bufend = buf + new_bufend_pos;
1371 PL_parser->bufptr = buf + bufptr_pos;
1372 PL_parser->oldbufptr = buf + oldbufptr_pos;
1373 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1374 PL_parser->linestart = buf + linestart_pos;
1375 if (PL_parser->last_uni)
1376 PL_parser->last_uni = buf + last_uni_pos;
1377 if (PL_parser->last_lop)
1378 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1379 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1380 PL_curstash != PL_debstash) {
1381 /* debugger active and we're not compiling the debugger code,
1382 * so store the line into the debugger's array of lines
1383 */
1384 update_debugger_info(NULL, buf+old_bufend_pos,
1385 new_bufend_pos-old_bufend_pos);
1386 }
1387 return got_some;
1388}
1389
1390/*
1391=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1392
1393Looks ahead one (Unicode) character in the text currently being lexed.
1394Returns the codepoint (unsigned integer value) of the next character,
1395or -1 if lexing has reached the end of the input text. To consume the
1396peeked character, use L</lex_read_unichar>.
1397
1398If the next character is in (or extends into) the next chunk of input
1399text, the next chunk will be read in. Normally the current chunk will be
1400discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1401then the current chunk will not be discarded.
1402
1403If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1404is encountered, an exception is generated.
1405
1406=cut
1407*/
1408
1409I32
1410Perl_lex_peek_unichar(pTHX_ U32 flags)
1411{
749123ff 1412 dVAR;
f0e67a1d
Z
1413 char *s, *bufend;
1414 if (flags & ~(LEX_KEEP_PREVIOUS))
1415 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1416 s = PL_parser->bufptr;
1417 bufend = PL_parser->bufend;
1418 if (UTF) {
1419 U8 head;
1420 I32 unichar;
1421 STRLEN len, retlen;
1422 if (s == bufend) {
1423 if (!lex_next_chunk(flags))
1424 return -1;
1425 s = PL_parser->bufptr;
1426 bufend = PL_parser->bufend;
1427 }
1428 head = (U8)*s;
54d004e8 1429 if (UTF8_IS_INVARIANT(head))
f0e67a1d 1430 return head;
54d004e8
KW
1431 if (UTF8_IS_START(head)) {
1432 len = UTF8SKIP(&head);
f0e67a1d
Z
1433 while ((STRLEN)(bufend-s) < len) {
1434 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1435 break;
1436 s = PL_parser->bufptr;
1437 bufend = PL_parser->bufend;
1438 }
1439 }
1440 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1441 if (retlen == (STRLEN)-1) {
1442 /* malformed UTF-8 */
1443 ENTER;
1444 SAVESPTR(PL_warnhook);
1445 PL_warnhook = PERL_WARNHOOK_FATAL;
1446 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1447 LEAVE;
1448 }
1449 return unichar;
1450 } else {
1451 if (s == bufend) {
1452 if (!lex_next_chunk(flags))
1453 return -1;
1454 s = PL_parser->bufptr;
1455 }
1456 return (U8)*s;
1457 }
1458}
1459
1460/*
1461=for apidoc Amx|I32|lex_read_unichar|U32 flags
1462
1463Reads the next (Unicode) character in the text currently being lexed.
1464Returns the codepoint (unsigned integer value) of the character read,
1465and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1466if lexing has reached the end of the input text. To non-destructively
1467examine the next character, use L</lex_peek_unichar> instead.
1468
1469If the next character is in (or extends into) the next chunk of input
1470text, the next chunk will be read in. Normally the current chunk will be
1471discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1472then the current chunk will not be discarded.
1473
1474If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1475is encountered, an exception is generated.
1476
1477=cut
1478*/
1479
1480I32
1481Perl_lex_read_unichar(pTHX_ U32 flags)
1482{
1483 I32 c;
1484 if (flags & ~(LEX_KEEP_PREVIOUS))
1485 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1486 c = lex_peek_unichar(flags);
1487 if (c != -1) {
1488 if (c == '\n')
83944c01 1489 COPLINE_INC_WITH_HERELINES;
d9018cbe
EB
1490 if (UTF)
1491 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1492 else
1493 ++(PL_parser->bufptr);
f0e67a1d
Z
1494 }
1495 return c;
1496}
1497
1498/*
1499=for apidoc Amx|void|lex_read_space|U32 flags
1500
1501Reads optional spaces, in Perl style, in the text currently being
1502lexed. The spaces may include ordinary whitespace characters and
1503Perl-style comments. C<#line> directives are processed if encountered.
1504L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1505at a non-space character (or the end of the input text).
1506
1507If spaces extend into the next chunk of input text, the next chunk will
1508be read in. Normally the current chunk will be discarded at the same
1509time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1510chunk will not be discarded.
1511
1512=cut
1513*/
1514
f0998909
Z
1515#define LEX_NO_NEXT_CHUNK 0x80000000
1516
f0e67a1d
Z
1517void
1518Perl_lex_read_space(pTHX_ U32 flags)
1519{
1520 char *s, *bufend;
1521 bool need_incline = 0;
f0998909 1522 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1523 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1524#ifdef PERL_MAD
1525 if (PL_skipwhite) {
1526 sv_free(PL_skipwhite);
1527 PL_skipwhite = NULL;
1528 }
1529 if (PL_madskills)
1530 PL_skipwhite = newSVpvs("");
1531#endif /* PERL_MAD */
1532 s = PL_parser->bufptr;
1533 bufend = PL_parser->bufend;
1534 while (1) {
1535 char c = *s;
1536 if (c == '#') {
1537 do {
1538 c = *++s;
1539 } while (!(c == '\n' || (c == 0 && s == bufend)));
1540 } else if (c == '\n') {
1541 s++;
1542 PL_parser->linestart = s;
1543 if (s == bufend)
1544 need_incline = 1;
1545 else
1546 incline(s);
1547 } else if (isSPACE(c)) {
1548 s++;
1549 } else if (c == 0 && s == bufend) {
1550 bool got_more;
1551#ifdef PERL_MAD
1552 if (PL_madskills)
1553 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1554#endif /* PERL_MAD */
f0998909
Z
1555 if (flags & LEX_NO_NEXT_CHUNK)
1556 break;
f0e67a1d 1557 PL_parser->bufptr = s;
83944c01 1558 COPLINE_INC_WITH_HERELINES;
f0e67a1d
Z
1559 got_more = lex_next_chunk(flags);
1560 CopLINE_dec(PL_curcop);
1561 s = PL_parser->bufptr;
1562 bufend = PL_parser->bufend;
1563 if (!got_more)
1564 break;
1565 if (need_incline && PL_parser->rsfp) {
1566 incline(s);
1567 need_incline = 0;
1568 }
1569 } else {
1570 break;
1571 }
1572 }
1573#ifdef PERL_MAD
1574 if (PL_madskills)
1575 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1576#endif /* PERL_MAD */
1577 PL_parser->bufptr = s;
1578}
1579
1580/*
fe788d6b
PM
1581
1582=for apidoc EXMp|bool|validate_proto|SV *name|SV *proto|bool warn
1583
1584This function performs syntax checking on a prototype, C<proto>.
1585If C<warn> is true, any illegal characters or mismatched brackets
1586will trigger illegalproto warnings, declaring that they were
1587detected in the prototype for C<name>.
1588
1589The return value is C<true> if this is a valid prototype, and
1590C<false> if it is not, regardless of whether C<warn> was C<true> or
1591C<false>.
1592
1593Note that C<NULL> is a valid C<proto> and will always return C<true>.
1594
1595=cut
1596
1597 */
1598
1599bool
1600Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
1601{
1602 STRLEN len, origlen;
1603 char *p = proto ? SvPV(proto, len) : NULL;
1604 bool bad_proto = FALSE;
1605 bool in_brackets = FALSE;
1606 bool after_slash = FALSE;
1607 char greedy_proto = ' ';
1608 bool proto_after_greedy_proto = FALSE;
1609 bool must_be_last = FALSE;
1610 bool underscore = FALSE;
1611 bool seen_underscore = FALSE;
1612
1613 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1614
1615 if (!proto)
1616 return TRUE;
1617
1618 origlen = len;
1619 for (; len--; p++) {
1620 if (!isSPACE(*p)) {
1621 if (must_be_last)
1622 proto_after_greedy_proto = TRUE;
1623 if (!strchr("$@%*;[]&\\_+", *p) || *p == '\0') {
1624 bad_proto = TRUE;
1625 }
1626 else {
1627 if (underscore) {
1628 if(!strchr(";@%", *p))
1629 bad_proto = TRUE;
1630 underscore = FALSE;
1631 }
1632
1633 if (*p == '[')
1634 in_brackets = TRUE;
1635 else if (*p == ']')
1636 in_brackets = FALSE;
1637 else if ((*p == '@' || *p == '%') &&
1638 !after_slash &&
1639 !in_brackets ) {
1640 must_be_last = TRUE;
1641 greedy_proto = *p;
1642 }
1643 else if (*p == '_')
1644 underscore = seen_underscore = TRUE;
1645 }
1646 if (*p == '\\')
1647 after_slash = TRUE;
1648 else
1649 after_slash = FALSE;
1650 }
1651 }
1652
1653 if (warn) {
b54d603d 1654 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
fe788d6b 1655 p -= origlen;
b54d603d
PM
1656 p = SvUTF8(proto)
1657 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1658 origlen, UNI_DISPLAY_ISPRINT)
1659 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1660
fe788d6b
PM
1661 if (proto_after_greedy_proto)
1662 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1663 "Prototype after '%c' for %"SVf" : %s",
1664 greedy_proto, SVfARG(name), p);
b54d603d 1665 if (bad_proto)
fe788d6b
PM
1666 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1667 "Illegal character %sin prototype for %"SVf" : %s",
b54d603d 1668 seen_underscore ? "after '_' " : "", SVfARG(name), p);
fe788d6b
PM
1669 }
1670
1671 return (! (proto_after_greedy_proto || bad_proto) );
1672}
1673
1674/*
ffb4593c
NT
1675 * S_incline
1676 * This subroutine has nothing to do with tilting, whether at windmills
1677 * or pinball tables. Its name is short for "increment line". It
57843af0 1678 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1679 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1680 * # line 500 "foo.pm"
1681 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1682 */
1683
76e3520e 1684STATIC void
d9095cec 1685S_incline(pTHX_ const char *s)
463ee0b2 1686{
97aff369 1687 dVAR;
d9095cec
NC
1688 const char *t;
1689 const char *n;
1690 const char *e;
8818d409 1691 line_t line_num;
463ee0b2 1692
7918f24d
NC
1693 PERL_ARGS_ASSERT_INCLINE;
1694
83944c01 1695 COPLINE_INC_WITH_HERELINES;
451f421f
FC
1696 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1697 && s+1 == PL_bufend && *s == ';') {
1698 /* fake newline in string eval */
1699 CopLINE_dec(PL_curcop);
1700 return;
1701 }
463ee0b2
LW
1702 if (*s++ != '#')
1703 return;
d4c19fe8
AL
1704 while (SPACE_OR_TAB(*s))
1705 s++;
73659bf1
GS
1706 if (strnEQ(s, "line", 4))
1707 s += 4;
1708 else
1709 return;
084592ab 1710 if (SPACE_OR_TAB(*s))
73659bf1 1711 s++;
4e553d73 1712 else
73659bf1 1713 return;
d4c19fe8
AL
1714 while (SPACE_OR_TAB(*s))
1715 s++;
463ee0b2
LW
1716 if (!isDIGIT(*s))
1717 return;
d4c19fe8 1718
463ee0b2
LW
1719 n = s;
1720 while (isDIGIT(*s))
1721 s++;
07714eb4 1722 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1723 return;
bf4acbe4 1724 while (SPACE_OR_TAB(*s))
463ee0b2 1725 s++;
73659bf1 1726 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1727 s++;
73659bf1
GS
1728 e = t + 1;
1729 }
463ee0b2 1730 else {
c35e046a
AL
1731 t = s;
1732 while (!isSPACE(*t))
1733 t++;
73659bf1 1734 e = t;
463ee0b2 1735 }
bf4acbe4 1736 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1737 e++;
1738 if (*e != '\n' && *e != '\0')
1739 return; /* false alarm */
1740
8818d409
FC
1741 line_num = atoi(n)-1;
1742
f4dd75d9 1743 if (t - s > 0) {
d9095cec 1744 const STRLEN len = t - s;
19bad673
NC
1745 SV *const temp_sv = CopFILESV(PL_curcop);
1746 const char *cf;
1747 STRLEN tmplen;
1748
1749 if (temp_sv) {
1750 cf = SvPVX(temp_sv);
1751 tmplen = SvCUR(temp_sv);
1752 } else {
1753 cf = NULL;
1754 tmplen = 0;
1755 }
1756
d1299d44 1757 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1758 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1759 * to *{"::_<newfilename"} */
44867030
NC
1760 /* However, the long form of evals is only turned on by the
1761 debugger - usually they're "(eval %lu)" */
1762 char smallbuf[128];
1763 char *tmpbuf;
1764 GV **gvp;
d9095cec 1765 STRLEN tmplen2 = len;
798b63bc 1766 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1767 tmpbuf = smallbuf;
1768 else
2ae0db35 1769 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1770 tmpbuf[0] = '_';
1771 tmpbuf[1] = '<';
2ae0db35 1772 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1773 tmplen += 2;
8a5ee598
RGS
1774 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1775 if (gvp) {
44867030
NC
1776 char *tmpbuf2;
1777 GV *gv2;
1778
1779 if (tmplen2 + 2 <= sizeof smallbuf)
1780 tmpbuf2 = smallbuf;
1781 else
1782 Newx(tmpbuf2, tmplen2 + 2, char);
1783
1784 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1785 /* Either they malloc'd it, or we malloc'd it,
1786 so no prefix is present in ours. */
1787 tmpbuf2[0] = '_';
1788 tmpbuf2[1] = '<';
1789 }
1790
1791 memcpy(tmpbuf2 + 2, s, tmplen2);
1792 tmplen2 += 2;
1793
8a5ee598 1794 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1795 if (!isGV(gv2)) {
8a5ee598 1796 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1797 /* adjust ${"::_<newfilename"} to store the new file name */
1798 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1799 /* The line number may differ. If that is the case,
1800 alias the saved lines that are in the array.
1801 Otherwise alias the whole array. */
1802 if (CopLINE(PL_curcop) == line_num) {
1803 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1804 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1805 }
1806 else if (GvAV(*gvp)) {
1807 AV * const av = GvAV(*gvp);
1808 const I32 start = CopLINE(PL_curcop)+1;
1809 I32 items = AvFILLp(av) - start;
1810 if (items > 0) {
1811 AV * const av2 = GvAVn(gv2);
1812 SV **svp = AvARRAY(av) + start;
1813 I32 l = (I32)line_num+1;
1814 while (items--)
1815 av_store(av2, l++, SvREFCNT_inc(*svp++));
1816 }
1817 }
e5527e4b 1818 }
44867030
NC
1819
1820 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1821 }
e66cf94c 1822 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1823 }
05ec9bb3 1824 CopFILE_free(PL_curcop);
d9095cec 1825 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1826 }
8818d409 1827 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1828}
1829
29595ff2 1830#ifdef PERL_MAD
cd81e915 1831/* skip space before PL_thistoken */
29595ff2
NC
1832
1833STATIC char *
5aaab254 1834S_skipspace0(pTHX_ char *s)
29595ff2 1835{
7918f24d
NC
1836 PERL_ARGS_ASSERT_SKIPSPACE0;
1837
29595ff2
NC
1838 s = skipspace(s);
1839 if (!PL_madskills)
1840 return s;
cd81e915
NC
1841 if (PL_skipwhite) {
1842 if (!PL_thiswhite)
6b29d1f5 1843 PL_thiswhite = newSVpvs("");
cd81e915
NC
1844 sv_catsv(PL_thiswhite, PL_skipwhite);
1845 sv_free(PL_skipwhite);
1846 PL_skipwhite = 0;
1847 }
1848 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1849 return s;
1850}
1851
cd81e915 1852/* skip space after PL_thistoken */
29595ff2
NC
1853
1854STATIC char *
5aaab254 1855S_skipspace1(pTHX_ char *s)
29595ff2 1856{
d4c19fe8 1857 const char *start = s;
29595ff2
NC
1858 I32 startoff = start - SvPVX(PL_linestr);
1859
7918f24d
NC
1860 PERL_ARGS_ASSERT_SKIPSPACE1;
1861
29595ff2
NC
1862 s = skipspace(s);
1863 if (!PL_madskills)
1864 return s;
1865 start = SvPVX(PL_linestr) + startoff;
cd81e915 1866 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1867 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1868 PL_thistoken = newSVpvn(tstart, start - tstart);
1869 }
1870 PL_realtokenstart = -1;
1871 if (PL_skipwhite) {
1872 if (!PL_nextwhite)
6b29d1f5 1873 PL_nextwhite = newSVpvs("");
cd81e915
NC
1874 sv_catsv(PL_nextwhite, PL_skipwhite);
1875 sv_free(PL_skipwhite);
1876 PL_skipwhite = 0;
29595ff2
NC
1877 }
1878 return s;
1879}
1880
1881STATIC char *
5aaab254 1882S_skipspace2(pTHX_ char *s, SV **svp)
29595ff2 1883{
c35e046a
AL
1884 char *start;
1885 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1886 const I32 startoff = s - SvPVX(PL_linestr);
1887
7918f24d
NC
1888 PERL_ARGS_ASSERT_SKIPSPACE2;
1889
29595ff2
NC
1890 s = skipspace(s);
1891 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1892 if (!PL_madskills || !svp)
1893 return s;
1894 start = SvPVX(PL_linestr) + startoff;
cd81e915 1895 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1896 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1897 PL_thistoken = newSVpvn(tstart, start - tstart);
1898 PL_realtokenstart = -1;
29595ff2 1899 }
cd81e915 1900 if (PL_skipwhite) {
29595ff2 1901 if (!*svp)
6b29d1f5 1902 *svp = newSVpvs("");
cd81e915
NC
1903 sv_setsv(*svp, PL_skipwhite);
1904 sv_free(PL_skipwhite);
1905 PL_skipwhite = 0;
29595ff2
NC
1906 }
1907
1908 return s;
1909}
1910#endif
1911
80a702cd 1912STATIC void
15f169a1 1913S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1914{
1915 AV *av = CopFILEAVx(PL_curcop);
1916 if (av) {
b9f83d2f 1917 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1918 if (orig_sv)
1919 sv_setsv(sv, orig_sv);
1920 else
1921 sv_setpvn(sv, buf, len);
80a702cd
RGS
1922 (void)SvIOK_on(sv);
1923 SvIV_set(sv, 0);
1924 av_store(av, (I32)CopLINE(PL_curcop), sv);
1925 }
1926}
1927
ffb4593c
NT
1928/*
1929 * S_skipspace
1930 * Called to gobble the appropriate amount and type of whitespace.
1931 * Skips comments as well.
1932 */
1933
76e3520e 1934STATIC char *
5aaab254 1935S_skipspace(pTHX_ char *s)
a687059c 1936{
5db06880 1937#ifdef PERL_MAD
f0e67a1d
Z
1938 char *start = s;
1939#endif /* PERL_MAD */
7918f24d 1940 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1941#ifdef PERL_MAD
cd81e915
NC
1942 if (PL_skipwhite) {
1943 sv_free(PL_skipwhite);
f0e67a1d 1944 PL_skipwhite = NULL;
5db06880 1945 }
f0e67a1d 1946#endif /* PERL_MAD */
3280af22 1947 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1948 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1949 s++;
f0e67a1d
Z
1950 } else {
1951 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1952 PL_bufptr = s;
f0998909
Z
1953 lex_read_space(LEX_KEEP_PREVIOUS |
1954 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1955 LEX_NO_NEXT_CHUNK : 0));
3280af22 1956 s = PL_bufptr;
f0e67a1d
Z
1957 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1958 if (PL_linestart > PL_bufptr)
1959 PL_bufptr = PL_linestart;
1960 return s;
463ee0b2 1961 }
5db06880 1962#ifdef PERL_MAD
f0e67a1d
Z
1963 if (PL_madskills)
1964 PL_skipwhite = newSVpvn(start, s-start);
1965#endif /* PERL_MAD */
5db06880 1966 return s;
a687059c 1967}
378cc40b 1968
ffb4593c
NT
1969/*
1970 * S_check_uni
1971 * Check the unary operators to ensure there's no ambiguity in how they're
1972 * used. An ambiguous piece of code would be:
1973 * rand + 5
1974 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1975 * the +5 is its argument.
1976 */
1977
76e3520e 1978STATIC void
cea2e8a9 1979S_check_uni(pTHX)
ba106d47 1980{
97aff369 1981 dVAR;
d4c19fe8
AL
1982 const char *s;
1983 const char *t;
2f3197b3 1984
3280af22 1985 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1986 return;
3280af22
NIS
1987 while (isSPACE(*PL_last_uni))
1988 PL_last_uni++;
c35e046a 1989 s = PL_last_uni;
8a2bca12 1990 while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
c35e046a 1991 s++;
3280af22 1992 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1993 return;
6136c704 1994
9b387841
NC
1995 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1996 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1997 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1998}
1999
ffb4593c
NT
2000/*
2001 * LOP : macro to build a list operator. Its behaviour has been replaced
2002 * with a subroutine, S_lop() for which LOP is just another name.
2003 */
2004
a0d0e21e
LW
2005#define LOP(f,x) return lop(f,x,s)
2006
ffb4593c
NT
2007/*
2008 * S_lop
2009 * Build a list operator (or something that might be one). The rules:
2010 * - if we have a next token, then it's a list operator [why?]
2011 * - if the next thing is an opening paren, then it's a function
2012 * - else it's a list operator
2013 */
2014
76e3520e 2015STATIC I32
a0be28da 2016S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 2017{
97aff369 2018 dVAR;
7918f24d
NC
2019
2020 PERL_ARGS_ASSERT_LOP;
2021
6154021b 2022 pl_yylval.ival = f;
35c8bce7 2023 CLINE;
3280af22
NIS
2024 PL_expect = x;
2025 PL_bufptr = s;
2026 PL_last_lop = PL_oldbufptr;
eb160463 2027 PL_last_lop_op = (OPCODE)f;
5db06880
NC
2028#ifdef PERL_MAD
2029 if (PL_lasttoke)
78cdf107 2030 goto lstop;
5db06880 2031#else
3280af22 2032 if (PL_nexttoke)
78cdf107 2033 goto lstop;
5db06880 2034#endif
79072805 2035 if (*s == '(')
bbf60fe6 2036 return REPORT(FUNC);
29595ff2 2037 s = PEEKSPACE(s);
79072805 2038 if (*s == '(')
bbf60fe6 2039 return REPORT(FUNC);
78cdf107
Z
2040 else {
2041 lstop:
2042 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2043 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 2044 return REPORT(LSTOP);
78cdf107 2045 }
79072805
LW
2046}
2047
5db06880
NC
2048#ifdef PERL_MAD
2049 /*
2050 * S_start_force
2051 * Sets up for an eventual force_next(). start_force(0) basically does
2052 * an unshift, while start_force(-1) does a push. yylex removes items
2053 * on the "pop" end.
2054 */
2055
2056STATIC void
2057S_start_force(pTHX_ int where)
2058{
2059 int i;
2060
cd81e915 2061 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 2062 where = PL_lasttoke;
cd81e915
NC
2063 assert(PL_curforce < 0 || PL_curforce == where);
2064 if (PL_curforce != where) {
5db06880
NC
2065 for (i = PL_lasttoke; i > where; --i) {
2066 PL_nexttoke[i] = PL_nexttoke[i-1];
2067 }
2068 PL_lasttoke++;
2069 }
cd81e915 2070 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 2071 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
2072 PL_curforce = where;
2073 if (PL_nextwhite) {
5db06880 2074 if (PL_madskills)
6b29d1f5 2075 curmad('^', newSVpvs(""));
cd81e915 2076 CURMAD('_', PL_nextwhite);
5db06880
NC
2077 }
2078}
2079
2080STATIC void
2081S_curmad(pTHX_ char slot, SV *sv)
2082{
2083 MADPROP **where;
2084
2085 if (!sv)
2086 return;
cd81e915
NC
2087 if (PL_curforce < 0)
2088 where = &PL_thismad;
5db06880 2089 else
cd81e915 2090 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 2091
cd81e915 2092 if (PL_faketokens)
76f68e9b 2093 sv_setpvs(sv, "");
5db06880
NC
2094 else {
2095 if (!IN_BYTES) {
2096 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
2097 SvUTF8_on(sv);
2098 else if (PL_encoding) {
2099 sv_recode_to_utf8(sv, PL_encoding);
2100 }
2101 }
2102 }
2103
2104 /* keep a slot open for the head of the list? */
2105 if (slot != '_' && *where && (*where)->mad_key == '^') {
2106 (*where)->mad_key = slot;
daba3364 2107 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
2108 (*where)->mad_val = (void*)sv;
2109 }
2110 else
2111 addmad(newMADsv(slot, sv), where, 0);
2112}
2113#else
b3f24c00
MHM
2114# define start_force(where) NOOP
2115# define curmad(slot, sv) NOOP
5db06880
NC
2116#endif
2117
ffb4593c
NT
2118/*
2119 * S_force_next
9cbb5ea2 2120 * When the lexer realizes it knows the next token (for instance,
ffb4593c 2121 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 2122 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
2123 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
2124 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
2125 */
2126
4e553d73 2127STATIC void
cea2e8a9 2128S_force_next(pTHX_ I32 type)
79072805 2129{
97aff369 2130 dVAR;
704d4215
GG
2131#ifdef DEBUGGING
2132 if (DEBUG_T_TEST) {
2133 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 2134 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
2135 }
2136#endif
5db06880 2137#ifdef PERL_MAD
cd81e915 2138 if (PL_curforce < 0)
5db06880 2139 start_force(PL_lasttoke);
cd81e915 2140 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
2141 if (PL_lex_state != LEX_KNOWNEXT)
2142 PL_lex_defer = PL_lex_state;
2143 PL_lex_state = LEX_KNOWNEXT;
2144 PL_lex_expect = PL_expect;
cd81e915 2145 PL_curforce = -1;
5db06880 2146#else
3280af22
NIS
2147 PL_nexttype[PL_nexttoke] = type;
2148 PL_nexttoke++;
2149 if (PL_lex_state != LEX_KNOWNEXT) {
2150 PL_lex_defer = PL_lex_state;
2151 PL_lex_expect = PL_expect;
2152 PL_lex_state = LEX_KNOWNEXT;
79072805 2153 }
5db06880 2154#endif
79072805
LW
2155}
2156
28ac2b49
Z
2157void
2158Perl_yyunlex(pTHX)
2159{
a7aaec61
Z
2160 int yyc = PL_parser->yychar;
2161 if (yyc != YYEMPTY) {
2162 if (yyc) {
2163 start_force(-1);
2164 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2165 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2166 PL_lex_allbrackets--;
a7aaec61 2167 PL_lex_brackets--;
78cdf107
Z
2168 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2169 } else if (yyc == '('/*)*/) {
2170 PL_lex_allbrackets--;
2171 yyc |= (2<<24);
a7aaec61
Z
2172 }
2173 force_next(yyc);
2174 }
28ac2b49
Z
2175 PL_parser->yychar = YYEMPTY;
2176 }
2177}
2178
d0a148a6 2179STATIC SV *
15f169a1 2180S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2181{
97aff369 2182 dVAR;
740cce10 2183 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2184 !IN_BYTES
2185 && UTF
2186 && !is_ascii_string((const U8*)start, len)
740cce10 2187 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2188 return sv;
2189}
2190
ffb4593c
NT
2191/*
2192 * S_force_word
2193 * When the lexer knows the next thing is a word (for instance, it has
2194 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2195 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2196 * lookahead.
ffb4593c
NT
2197 *
2198 * Arguments:
b1b65b59 2199 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2200 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2201 * int check_keyword : if true, Perl checks to make sure the word isn't
2202 * a keyword (do this if the word is a label, e.g. goto FOO)
2203 * int allow_pack : if true, : characters will also be allowed (require,
2204 * use, etc. do this)
9cbb5ea2 2205 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2206 */
2207
76e3520e 2208STATIC char *
345b3785 2209S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
79072805 2210{
97aff369 2211 dVAR;
eb578fdb 2212 char *s;
463ee0b2 2213 STRLEN len;
4e553d73 2214
7918f24d
NC
2215 PERL_ARGS_ASSERT_FORCE_WORD;
2216
29595ff2 2217 start = SKIPSPACE1(start);
463ee0b2 2218 s = start;
7e2040f0 2219 if (isIDFIRST_lazy_if(s,UTF) ||
345b3785 2220 (allow_pack && *s == ':') )
a0d0e21e 2221 {
3280af22 2222 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
01b5ef50
FC
2223 if (check_keyword) {
2224 char *s2 = PL_tokenbuf;
2225 if (allow_pack && len > 6 && strnEQ(s2, "CORE::", 6))
2226 s2 += 6, len -= 6;
2227 if (keyword(s2, len, 0))
463ee0b2 2228 return start;
01b5ef50 2229 }
cd81e915 2230 start_force(PL_curforce);
5db06880
NC
2231 if (PL_madskills)
2232 curmad('X', newSVpvn(start,s-start));
463ee0b2 2233 if (token == METHOD) {
29595ff2 2234 s = SKIPSPACE1(s);
463ee0b2 2235 if (*s == '(')
3280af22 2236 PL_expect = XTERM;
463ee0b2 2237 else {
3280af22 2238 PL_expect = XOPERATOR;
463ee0b2 2239 }
79072805 2240 }
e74e6b3d 2241 if (PL_madskills)
63575281 2242 curmad('g', newSVpvs( "forced" ));
9ded7720 2243 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2244 = (OP*)newSVOP(OP_CONST,0,
2245 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2246 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2247 force_next(token);
2248 }
2249 return s;
2250}
2251
ffb4593c
NT
2252/*
2253 * S_force_ident
9cbb5ea2 2254 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2255 * text only contains the "foo" portion. The first argument is a pointer
2256 * to the "foo", and the second argument is the type symbol to prefix.
2257 * Forces the next token to be a "WORD".
9cbb5ea2 2258 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2259 */
2260
76e3520e 2261STATIC void
5aaab254 2262S_force_ident(pTHX_ const char *s, int kind)
79072805 2263{
97aff369 2264 dVAR;
7918f24d
NC
2265
2266 PERL_ARGS_ASSERT_FORCE_IDENT;
2267
c9b48522
DD
2268 if (s[0]) {
2269 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
728847b1
BF
2270 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2271 UTF ? SVf_UTF8 : 0));
cd81e915 2272 start_force(PL_curforce);
9ded7720 2273 NEXTVAL_NEXTTOKE.opval = o;
79072805 2274 force_next(WORD);
748a9306 2275 if (kind) {
11343788 2276 o->op_private = OPpCONST_ENTERED;
55497cff 2277 /* XXX see note in pp_entereval() for why we forgo typo
2278 warnings if the symbol must be introduced in an eval.
2279 GSAR 96-10-12 */
90e5519e 2280 gv_fetchpvn_flags(s, len,
728847b1
BF
2281 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2282 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2283 kind == '$' ? SVt_PV :
2284 kind == '@' ? SVt_PVAV :
2285 kind == '%' ? SVt_PVHV :
a0d0e21e 2286 SVt_PVGV
90e5519e 2287 );
748a9306 2288 }
79072805
LW
2289 }
2290}
2291
3f33d153
FC
2292static void
2293S_force_ident_maybe_lex(pTHX_ char pit)
2294{
2295 start_force(PL_curforce);
2296 NEXTVAL_NEXTTOKE.ival = pit;
2297 force_next('p');
2298}
2299
1571675a
GS
2300NV
2301Perl_str_to_version(pTHX_ SV *sv)
2302{
2303 NV retval = 0.0;
2304 NV nshift = 1.0;
2305 STRLEN len;
cfd0369c 2306 const char *start = SvPV_const(sv,len);
9d4ba2ae 2307 const char * const end = start + len;
504618e9 2308 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2309
2310 PERL_ARGS_ASSERT_STR_TO_VERSION;
2311
1571675a 2312 while (start < end) {
ba210ebe 2313 STRLEN skip;
1571675a
GS
2314 UV n;
2315 if (utf)
9041c2e3 2316 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2317 else {
2318 n = *(U8*)start;
2319 skip = 1;
2320 }
2321 retval += ((NV)n)/nshift;
2322 start += skip;
2323 nshift *= 1000;
2324 }
2325 return retval;
2326}
2327
4e553d73 2328/*
ffb4593c
NT
2329 * S_force_version
2330 * Forces the next token to be a version number.
e759cc13
RGS
2331 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2332 * and if "guessing" is TRUE, then no new token is created (and the caller
2333 * must use an alternative parsing method).
ffb4593c
NT
2334 */
2335
76e3520e 2336STATIC char *
e759cc13 2337S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2338{
97aff369 2339 dVAR;
5f66b61c 2340 OP *version = NULL;
44dcb63b 2341 char *d;
5db06880
NC
2342#ifdef PERL_MAD
2343 I32 startoff = s - SvPVX(PL_linestr);
2344#endif
89bfa8cd 2345
7918f24d
NC
2346 PERL_ARGS_ASSERT_FORCE_VERSION;
2347
29595ff2 2348 s = SKIPSPACE1(s);
89bfa8cd 2349
44dcb63b 2350 d = s;
dd629d5b 2351 if (*d == 'v')
44dcb63b 2352 d++;
44dcb63b 2353 if (isDIGIT(*d)) {
e759cc13
RGS
2354 while (isDIGIT(*d) || *d == '_' || *d == '.')
2355 d++;
5db06880
NC
2356#ifdef PERL_MAD
2357 if (PL_madskills) {
cd81e915 2358 start_force(PL_curforce);
5db06880
NC
2359 curmad('X', newSVpvn(s,d-s));
2360 }
2361#endif
4e4da3ac 2362 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2363 SV *ver;
8d08d9ba 2364#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2365 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2366 setlocale(LC_NUMERIC, "C");
8d08d9ba 2367#endif
6154021b 2368 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2369#ifdef USE_LOCALE_NUMERIC
2370 setlocale(LC_NUMERIC, loc);
909d3787 2371 Safefree(loc);
8d08d9ba 2372#endif
6154021b 2373 version = pl_yylval.opval;
dd629d5b
GS
2374 ver = cSVOPx(version)->op_sv;
2375 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2376 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2377 SvNV_set(ver, str_to_version(ver));
1571675a 2378 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2379 }
89bfa8cd 2380 }
5db06880
NC
2381 else if (guessing) {
2382#ifdef PERL_MAD
2383 if (PL_madskills) {
cd81e915
NC
2384 sv_free(PL_nextwhite); /* let next token collect whitespace */
2385 PL_nextwhite = 0;
5db06880
NC
2386 s = SvPVX(PL_linestr) + startoff;
2387 }
2388#endif
e759cc13 2389 return s;
5db06880 2390 }
89bfa8cd 2391 }
2392
5db06880
NC
2393#ifdef PERL_MAD
2394 if (PL_madskills && !version) {
cd81e915
NC
2395 sv_free(PL_nextwhite); /* let next token collect whitespace */
2396 PL_nextwhite = 0;
5db06880
NC
2397 s = SvPVX(PL_linestr) + startoff;
2398 }
2399#endif
89bfa8cd 2400 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2401 start_force(PL_curforce);
9ded7720 2402 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2403 force_next(WORD);
89bfa8cd 2404
e759cc13 2405 return s;
89bfa8cd 2406}
2407
ffb4593c 2408/*
91152fc1
DG
2409 * S_force_strict_version
2410 * Forces the next token to be a version number using strict syntax rules.
2411 */
2412
2413STATIC char *
2414S_force_strict_version(pTHX_ char *s)
2415{
2416 dVAR;
2417 OP *version = NULL;
2418#ifdef PERL_MAD
2419 I32 startoff = s - SvPVX(PL_linestr);
2420#endif
2421 const char *errstr = NULL;
2422
2423 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2424
2425 while (isSPACE(*s)) /* leading whitespace */
2426 s++;
2427
2428 if (is_STRICT_VERSION(s,&errstr)) {
2429 SV *ver = newSV(0);
2430 s = (char *)scan_version(s, ver, 0);
2431 version = newSVOP(OP_CONST, 0, ver);
2432 }
4e4da3ac
Z
2433 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2434 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2435 {
91152fc1
DG
2436 PL_bufptr = s;
2437 if (errstr)
2438 yyerror(errstr); /* version required */
2439 return s;
2440 }
2441
2442#ifdef PERL_MAD
2443 if (PL_madskills && !version) {
2444 sv_free(PL_nextwhite); /* let next token collect whitespace */
2445 PL_nextwhite = 0;
2446 s = SvPVX(PL_linestr) + startoff;
2447 }
2448#endif
2449 /* NOTE: The parser sees the package name and the VERSION swapped */
2450 start_force(PL_curforce);
2451 NEXTVAL_NEXTTOKE.opval = version;
2452 force_next(WORD);
2453
2454 return s;
2455}
2456
2457/*
ffb4593c
NT
2458 * S_tokeq
2459 * Tokenize a quoted string passed in as an SV. It finds the next
2460 * chunk, up to end of string or a backslash. It may make a new
2461 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2462 * turns \\ into \.
2463 */
2464
76e3520e 2465STATIC SV *
cea2e8a9 2466S_tokeq(pTHX_ SV *sv)
79072805 2467{
97aff369 2468 dVAR;
eb578fdb
KW
2469 char *s;
2470 char *send;
2471 char *d;
b3ac6de7
IZ
2472 STRLEN len = 0;
2473 SV *pv = sv;
79072805 2474
7918f24d
NC
2475 PERL_ARGS_ASSERT_TOKEQ;
2476
79072805 2477 if (!SvLEN(sv))
b3ac6de7 2478 goto finish;
79072805 2479
a0d0e21e 2480 s = SvPV_force(sv, len);
21a311ee 2481 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2482 goto finish;
463ee0b2 2483 send = s + len;
dcb21ed6
NC
2484 /* This is relying on the SV being "well formed" with a trailing '\0' */
2485 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2486 s++;
2487 if (s == send)
b3ac6de7 2488 goto finish;
79072805 2489 d = s;
be4731d2 2490 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2491 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2492 }
79072805
LW
2493 while (s < send) {
2494 if (*s == '\\') {
a0d0e21e 2495 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2496 s++; /* all that, just for this */
2497 }
2498 *d++ = *s++;
2499 }
2500 *d = '\0';
95a20fc0 2501 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2502 finish:
3280af22 2503 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2504 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2505 return sv;
2506}
2507
ffb4593c
NT
2508/*
2509 * Now come three functions related to double-quote context,
2510 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2511 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2512 * interact with PL_lex_state, and create fake ( ... ) argument lists
2513 * to handle functions and concatenation.
ecd24171
DM
2514 * For example,
2515 * "foo\lbar"
2516 * is tokenised as
2517 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2518 */
2519
2520/*
2521 * S_sublex_start
6154021b 2522 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2523 *
2524 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2525 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2526 *
2527 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2528 *
2529 * Everything else becomes a FUNC.
2530 *
2531 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2532 * had an OP_CONST or OP_READLINE). This just sets us up for a
2533 * call to S_sublex_push().
2534 */
2535
76e3520e 2536STATIC I32
cea2e8a9 2537S_sublex_start(pTHX)
79072805 2538{
97aff369 2539 dVAR;
eb578fdb 2540 const I32 op_type = pl_yylval.ival;
79072805
LW
2541
2542 if (op_type == OP_NULL) {
6154021b 2543 pl_yylval.opval = PL_lex_op;
5f66b61c 2544 PL_lex_op = NULL;
79072805
LW
2545 return THING;
2546 }
2547 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2548 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2549
2550 if (SvTYPE(sv) == SVt_PVIV) {
2551 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2552 STRLEN len;
96a5add6 2553 const char * const p = SvPV_const(sv, len);
740cce10 2554 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2555 SvREFCNT_dec(sv);
2556 sv = nsv;
4e553d73 2557 }
6154021b 2558 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2559 PL_lex_stuff = NULL;
6f33ba73
RGS
2560 /* Allow <FH> // "foo" */
2561 if (op_type == OP_READLINE)
2562 PL_expect = XTERMORDORDOR;
79072805
LW
2563 return THING;
2564 }
e3f73d4e
RGS
2565 else if (op_type == OP_BACKTICK && PL_lex_op) {
2566 /* readpipe() vas overriden */
2567 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2568 pl_yylval.opval = PL_lex_op;
9b201d7d 2569 PL_lex_op = NULL;
e3f73d4e
RGS
2570 PL_lex_stuff = NULL;
2571 return THING;
2572 }
79072805 2573
3280af22 2574 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2575 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2576 PL_sublex_info.sub_op = PL_lex_op;
2577 PL_lex_state = LEX_INTERPPUSH;
55497cff 2578
3280af22
NIS
2579 PL_expect = XTERM;
2580 if (PL_lex_op) {
6154021b 2581 pl_yylval.opval = PL_lex_op;
5f66b61c 2582 PL_lex_op = NULL;
55497cff 2583 return PMFUNC;
2584 }
2585 else
2586 return FUNC;
2587}
2588
ffb4593c
NT
2589/*
2590 * S_sublex_push
2591 * Create a new scope to save the lexing state. The scope will be
2592 * ended in S_sublex_done. Returns a '(', starting the function arguments
2593 * to the uc, lc, etc. found before.
2594 * Sets PL_lex_state to LEX_INTERPCONCAT.
2595 */
2596
76e3520e 2597STATIC I32
cea2e8a9 2598S_sublex_push(pTHX)
55497cff 2599{
27da23d5 2600 dVAR;
78a635de 2601 LEXSHARED *shared;
f46d017c 2602 ENTER;
55497cff 2603
3280af22 2604 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2605 SAVEBOOL(PL_lex_dojoin);
3280af22 2606 SAVEI32(PL_lex_brackets);
78cdf107 2607 SAVEI32(PL_lex_allbrackets);
b27dce25 2608 SAVEI32(PL_lex_formbrack);
78cdf107 2609 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2610 SAVEI32(PL_lex_casemods);
2611 SAVEI32(PL_lex_starts);
651b5b28 2612 SAVEI8(PL_lex_state);
7cc34111 2613 SAVESPTR(PL_lex_repl);
7766f137 2614 SAVEVPTR(PL_lex_inpat);
98246f1e 2615 SAVEI16(PL_lex_inwhat);
57843af0 2616 SAVECOPLINE(PL_curcop);
3280af22 2617 SAVEPPTR(PL_bufptr);
8452ff4b 2618 SAVEPPTR(PL_bufend);
3280af22
NIS
2619 SAVEPPTR(PL_oldbufptr);
2620 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2621 SAVEPPTR(PL_last_lop);
2622 SAVEPPTR(PL_last_uni);
3280af22
NIS
2623 SAVEPPTR(PL_linestart);
2624 SAVESPTR(PL_linestr);
8edd5f42
RGS
2625 SAVEGENERICPV(PL_lex_brackstack);
2626 SAVEGENERICPV(PL_lex_casestack);
78a635de 2627 SAVEGENERICPV(PL_parser->lex_shared);
3a54fd60 2628 SAVEBOOL(PL_parser->lex_re_reparsing);
3280af22 2629
99bd9d90 2630 /* The here-doc parser needs to be able to peek into outer lexing
60f40a38
FC
2631 scopes to find the body of the here-doc. So we put PL_linestr and
2632 PL_bufptr into lex_shared, to ‘share’ those values.
99bd9d90 2633 */
60f40a38
FC
2634 PL_parser->lex_shared->ls_linestr = PL_linestr;
2635 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
99bd9d90 2636
3280af22 2637 PL_linestr = PL_lex_stuff;
7cc34111 2638 PL_lex_repl = PL_sublex_info.repl;
a0714e2c 2639 PL_lex_stuff = NULL;
7cc34111 2640 PL_sublex_info.repl = NULL;
3280af22 2641
9cbb5ea2
GS
2642 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2643 = SvPVX(PL_linestr);
3280af22 2644 PL_bufend += SvCUR(PL_linestr);
bd61b366 2645 PL_last_lop = PL_last_uni = NULL;
3280af22 2646 SAVEFREESV(PL_linestr);
4dc843bc 2647 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
3280af22
NIS
2648
2649 PL_lex_dojoin = FALSE;
b27dce25 2650 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2651 PL_lex_allbrackets = 0;
2652 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2653 Newx(PL_lex_brackstack, 120, char);
2654 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2655 PL_lex_casemods = 0;
2656 *PL_lex_casestack = '\0';
2657 PL_lex_starts = 0;
2658 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2659 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
78a635de
FC
2660
2661 Newxz(shared, 1, LEXSHARED);
2662 shared->ls_prev = PL_parser->lex_shared;
2663 PL_parser->lex_shared = shared;
3280af22
NIS
2664
2665 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2666 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2667 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2668 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2669 else
5f66b61c 2670 PL_lex_inpat = NULL;
79072805 2671
3a54fd60
DM
2672 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2673 PL_in_eval &= ~EVAL_RE_REPARSING;
2674
55497cff 2675 return '(';
79072805
LW
2676}
2677
ffb4593c
NT
2678/*
2679 * S_sublex_done
2680 * Restores lexer state after a S_sublex_push.
2681 */
2682
76e3520e 2683STATIC I32
cea2e8a9 2684S_sublex_done(pTHX)
79072805 2685{
27da23d5 2686 dVAR;
3280af22 2687 if (!PL_lex_starts++) {
396482e1 2688 SV * const sv = newSVpvs("");
9aa983d2
JH
2689 if (SvUTF8(PL_linestr))
2690 SvUTF8_on(sv);
3280af22 2691 PL_expect = XOPERATOR;
6154021b 2692 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2693 return THING;
2694 }
2695
3280af22
NIS
2696 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2697 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2698 return yylex();
79072805
LW
2699 }
2700
ffb4593c 2701 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2702 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2703 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2704 PL_linestr = PL_lex_repl;
2705 PL_lex_inpat = 0;
2706 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2707 PL_bufend += SvCUR(PL_linestr);
bd61b366 2708 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2709 PL_lex_dojoin = FALSE;
2710 PL_lex_brackets = 0;
78cdf107
Z
2711 PL_lex_allbrackets = 0;
2712 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2713 PL_lex_casemods = 0;
2714 *PL_lex_casestack = '\0';
2715 PL_lex_starts = 0;
25da4f38 2716 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2717 PL_lex_state = LEX_INTERPNORMAL;
2718 PL_lex_starts++;
e9fa98b2
HS
2719 /* we don't clear PL_lex_repl here, so that we can check later
2720 whether this is an evalled subst; that means we rely on the
2721 logic to ensure sublex_done() is called again only via the
2722 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2723 }
e9fa98b2 2724 else {
3280af22 2725 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2726 PL_lex_repl = NULL;
e9fa98b2 2727 }
79072805 2728 return ',';
ffed7fef
LW
2729 }
2730 else {
5db06880
NC
2731#ifdef PERL_MAD
2732 if (PL_madskills) {
cd81e915
NC
2733 if (PL_thiswhite) {
2734 if (!PL_endwhite)
6b29d1f5 2735 PL_endwhite = newSVpvs("");
cd81e915
NC
2736 sv_catsv(PL_endwhite, PL_thiswhite);
2737 PL_thiswhite = 0;
2738 }
2739 if (PL_thistoken)
76f68e9b 2740 sv_setpvs(PL_thistoken,"");
5db06880 2741 else
cd81e915 2742 PL_realtokenstart = -1;
5db06880
NC
2743 }
2744#endif
f46d017c 2745 LEAVE;
3280af22
NIS
2746 PL_bufend = SvPVX(PL_linestr);
2747 PL_bufend += SvCUR(PL_linestr);
2748 PL_expect = XOPERATOR;
09bef843 2749 PL_sublex_info.sub_inwhat = 0;
79072805 2750 return ')';
ffed7fef
LW
2751 }
2752}
2753
6f613c73
KW
2754PERL_STATIC_INLINE SV*
2755S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
2756{
140b12ad
KW
2757 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2758 * interior, hence to the "}". Finds what the name resolves to, returning
2759 * an SV* containing it; NULL if no valid one found */
2760
dd2b1b72 2761 SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0);
6f613c73 2762
0c415a79
KW
2763 HV * table;
2764 SV **cvp;
2765 SV *cv;
2766 SV *rv;
2767 HV *stash;
2768 const U8* first_bad_char_loc;
2769 const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
2770
6f613c73
KW
2771 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2772
107160e2
KW
2773 if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
2774 e - backslash_ptr,
2775 &first_bad_char_loc))
2776 {
2777 /* If warnings are on, this will print a more detailed analysis of what
2778 * is wrong than the error message below */
2779 utf8n_to_uvuni(first_bad_char_loc,
2780 e - ((char *) first_bad_char_loc),
2781 NULL, 0);
2782
2783 /* We deliberately don't try to print the malformed character, which
2784 * might not print very well; it also may be just the first of many
2785 * malformations, so don't print what comes after it */
2786 yyerror(Perl_form(aTHX_
2787 "Malformed UTF-8 character immediately after '%.*s'",
2788 (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
2789 return NULL;
2790 }
2791
2792 res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
2793 /* include the <}> */
2794 e - backslash_ptr + 1);
6f613c73 2795 if (! SvPOK(res)) {
b6407c49 2796 SvREFCNT_dec_NN(res);
6f613c73
KW
2797 return NULL;
2798 }
2799
0c415a79
KW
2800 /* See if the charnames handler is the Perl core's, and if so, we can skip
2801 * the validation needed for a user-supplied one, as Perl's does its own
2802 * validation. */
2803 table = GvHV(PL_hintgv); /* ^H */
2804 cvp = hv_fetchs(table, "charnames", FALSE);
67a057d6
FC
2805 if (cvp && (cv = *cvp) && SvROK(cv) && ((rv = SvRV(cv)) != NULL)
2806 && SvTYPE(rv) == SVt_PVCV && ((stash = CvSTASH(rv)) != NULL))
0c415a79
KW
2807 {
2808 const char * const name = HvNAME(stash);
2809 if strEQ(name, "_charnames") {
2810 return res;
2811 }
2812 }
2813
bde9e88d
KW
2814 /* Here, it isn't Perl's charname handler. We can't rely on a
2815 * user-supplied handler to validate the input name. For non-ut8 input,
2816 * look to see that the first character is legal. Then loop through the
2817 * rest checking that each is a continuation */
6f613c73 2818
b6ba1137
KW
2819 /* This code needs to be sync'ed with a regex in _charnames.pm which does
2820 * the same thing */
2821
b6ba1137 2822 if (! UTF) {
bde9e88d 2823 if (! isALPHAU(*s)) {
b6ba1137
KW
2824 goto bad_charname;
2825 }
bde9e88d
KW
2826 s++;
2827 while (s < e) {
2828 if (! isCHARNAME_CONT(*s)) {
b6ba1137
KW
2829 goto bad_charname;
2830 }
3036c853 2831 if (*s == ' ' && *(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2832 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2833 "A sequence of multiple spaces in a charnames "
2834 "alias definition is deprecated");
bd299e29 2835 }
bde9e88d 2836 s++;
b6ba1137 2837 }
3036c853 2838 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2839 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2840 "Trailing white-space in a charnames alias "
2841 "definition is deprecated");
bd299e29 2842 }
b6ba1137
KW
2843 }
2844 else {
bde9e88d
KW
2845 /* Similarly for utf8. For invariants can check directly; for other
2846 * Latin1, can calculate their code point and check; otherwise use a
2847 * swash */
2848 if (UTF8_IS_INVARIANT(*s)) {
2849 if (! isALPHAU(*s)) {
140b12ad
KW
2850 goto bad_charname;
2851 }
bde9e88d
KW
2852 s++;
2853 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2854 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1))))) {
b6ba1137 2855 goto bad_charname;
6f613c73 2856 }
bde9e88d 2857 s += 2;
6f613c73 2858 }
bde9e88d
KW
2859 else {
2860 if (! PL_utf8_charname_begin) {
2861 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2862 PL_utf8_charname_begin = _core_swash_init("utf8",
2863 "_Perl_Charname_Begin",
2864 &PL_sv_undef,
2865 1, 0, NULL, &flags);
2866 }
2867 if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) {
2868 goto bad_charname;
2869 }
2870 s += UTF8SKIP(s);
2871 }
2872
2873 while (s < e) {
2874 if (UTF8_IS_INVARIANT(*s)) {
2875 if (! isCHARNAME_CONT(*s)) {
2876 goto bad_charname;
2877 }
3036c853
FC
2878 if (*s == ' ' && *(s-1) == ' '
2879 && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2880 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2881 "A sequence of multiple spaces in a charnam"
2882 "es alias definition is deprecated");
bd299e29 2883 }
bde9e88d
KW
2884 s++;
2885 }
2886 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2887 if (! isCHARNAME_CONT(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s,
2888 *(s+1)))))
2889 {
2890 goto bad_charname;
2891 }
2892 s += 2;
2893 }
2894 else {
2895 if (! PL_utf8_charname_continue) {
2896 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2897 PL_utf8_charname_continue = _core_swash_init("utf8",
2898 "_Perl_Charname_Continue",
2899 &PL_sv_undef,
2900 1, 0, NULL, &flags);
2901 }
2902 if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) {
2903 goto bad_charname;
2904 }
2905 s += UTF8SKIP(s);
6f613c73
KW
2906 }
2907 }
3036c853 2908 if (*(s-1) == ' ' && ckWARN_d(WARN_DEPRECATED)) {
94ec3a20
FC
2909 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2910 "Trailing white-space in a charnames alias "
2911 "definition is deprecated");
bd299e29 2912 }
6f613c73
KW
2913 }
2914
94ca1619 2915 if (SvUTF8(res)) { /* Don't accept malformed input */
bde9e88d
KW
2916 const U8* first_bad_char_loc;
2917 STRLEN len;
2918 const char* const str = SvPV_const(res, len);
2919 if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
2920 /* If warnings are on, this will print a more detailed analysis of
2921 * what is wrong than the error message below */
2922 utf8n_to_uvuni(first_bad_char_loc,
2923 (char *) first_bad_char_loc - str,
2924 NULL, 0);
2925
2926 /* We deliberately don't try to print the malformed character,
2927 * which might not print very well; it also may be just the first
2928 * of many malformations, so don't print what comes after it */
2929 yyerror_pv(
2930 Perl_form(aTHX_
2931 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
2932 (int) (e - backslash_ptr + 1), backslash_ptr,
2933 (int) ((char *) first_bad_char_loc - str), str
2934 ),
2935 SVf_UTF8);
2936 return NULL;
2937 }
2938 }
140b12ad 2939
bde9e88d 2940 return res;
140b12ad 2941
bde9e88d
KW
2942 bad_charname: {
2943 int bad_char_size = ((UTF) ? UTF8SKIP(s) : 1);
2944
2945 /* The final %.*s makes sure that should the trailing NUL be missing
2946 * that this print won't run off the end of the string */
2947 yyerror_pv(
2948 Perl_form(aTHX_
2949 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
2950 (int)(s - backslash_ptr + bad_char_size), backslash_ptr,
2951 (int)(e - s + bad_char_size), s + bad_char_size
2952 ),
2953 UTF ? SVf_UTF8 : 0);
2954 return NULL;
2955 }
6f613c73
KW
2956}
2957
02aa26ce
NT
2958/*
2959 scan_const
2960
9da1dd8f
DM
2961 Extracts the next constant part of a pattern, double-quoted string,
2962 or transliteration. This is terrifying code.
2963
2964 For example, in parsing the double-quoted string "ab\x63$d", it would
2965 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2966
94def140 2967 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2968 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2969 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2970
94def140
TS
2971 Returns a pointer to the character scanned up to. If this is
2972 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2973 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2974 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2975 by looking at the next characters herself.
2976
02aa26ce 2977 In patterns:
9da1dd8f 2978 expand:
537124e4
KW
2979 \N{FOO} => \N{U+hex_for_character_FOO}
2980 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
9da1dd8f
DM
2981
2982 pass through:
2983 all other \-char, including \N and \N{ apart from \N{ABC}
2984
2985 stops on:
2986 @ and $ where it appears to be a var, but not for $ as tail anchor
2987 \l \L \u \U \Q \E
2988 (?{ or (??{
2989
02aa26ce
NT
2990
2991 In transliterations:
2992 characters are VERY literal, except for - not at the start or end
94def140
TS
2993 of the string, which indicates a range. If the range is in bytes,
2994 scan_const expands the range to the full set of intermediate
2995 characters. If the range is in utf8, the hyphen is replaced with
2996 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2997
2998 In double-quoted strings:
2999 backslashes:
3000 double-quoted style: \r and \n
ff3f963a 3001 constants: \x31, etc.
94def140 3002 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
3003 case and quoting: \U \Q \E
3004 stops on @ and $
3005
3006 scan_const does *not* construct ops to handle interpolated strings.
3007 It stops processing as soon as it finds an embedded $ or @ variable
3008 and leaves it to the caller to work out what's going on.
3009
94def140
TS
3010 embedded arrays (whether in pattern or not) could be:
3011 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
3012
3013 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
3014
3015 $ in pattern could be $foo or could be tail anchor. Assumption:
3016 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 3017 followed by one of "()| \r\n\t"
02aa26ce 3018
9da1dd8f 3019 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
3020
3021 The structure of the code is
3022 while (there's a character to process) {
94def140
TS
3023 handle transliteration ranges
3024 skip regexp comments /(?#comment)/ and codes /(?{code})/
3025 skip #-initiated comments in //x patterns
3026 check for embedded arrays
02aa26ce
NT
3027 check for embedded scalars
3028 if (backslash) {
94def140 3029 deprecate \1 in substitution replacements
02aa26ce
NT
3030 handle string-changing backslashes \l \U \Q \E, etc.
3031 switch (what was escaped) {
94def140 3032 handle \- in a transliteration (becomes a literal -)
ff3f963a 3033 if a pattern and not \N{, go treat as regular character
94def140
TS
3034 handle \132 (octal characters)
3035 handle \x15 and \x{1234} (hex characters)
ff3f963a 3036 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
3037 handle \cV (control characters)
3038 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 3039 } (end switch)
77a135fe 3040 continue
02aa26ce 3041 } (end if backslash)
77a135fe 3042 handle regular character
02aa26ce 3043 } (end while character to read)
4e553d73 3044
02aa26ce
NT
3045*/
3046
76e3520e 3047STATIC char *
cea2e8a9 3048S_scan_const(pTHX_ char *start)
79072805 3049{
97aff369 3050 dVAR;
eb578fdb 3051 char *send = PL_bufend; /* end of the constant */
77a135fe
KW
3052 SV *sv = newSV(send - start); /* sv for the constant. See
3053 note below on sizing. */
eb578fdb
KW
3054 char *s = start; /* start of the constant */
3055 char *d = SvPVX(sv); /* destination for copies */
02aa26ce 3056 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 3057 bool didrange = FALSE; /* did we just finish a range? */
2866decb 3058 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
3059 bool has_utf8 = FALSE; /* Output constant is UTF8 */
3060 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
3061 to be UTF8? But, this can
3062 show as true when the source
3063 isn't utf8, as for example
3064 when it is entirely composed
3065 of hex constants */
6f613c73 3066 SV *res; /* result from charnames */
77a135fe
KW
3067
3068 /* Note on sizing: The scanned constant is placed into sv, which is
3069 * initialized by newSV() assuming one byte of output for every byte of
3070 * input. This routine expects newSV() to allocate an extra byte for a
3071 * trailing NUL, which this routine will append if it gets to the end of
3072 * the input. There may be more bytes of input than output (eg., \N{LATIN
3073 * CAPITAL LETTER A}), or more output than input if the constant ends up
3074 * recoded to utf8, but each time a construct is found that might increase
3075 * the needed size, SvGROW() is called. Its size parameter each time is
3076 * based on the best guess estimate at the time, namely the length used so
3077 * far, plus the length the current construct will occupy, plus room for
3078 * the trailing NUL, plus one byte for every input byte still unscanned */
3079
c3320c2a
KW
3080 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3081 before set */
4c3a8340
TS
3082#ifdef EBCDIC
3083 UV literal_endpoint = 0;
e294cc5d 3084 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 3085#endif
012bcf8d 3086
7918f24d
NC
3087 PERL_ARGS_ASSERT_SCAN_CONST;
3088
bb16bae8 3089 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
3090 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
3091 /* If we are doing a trans and we know we want UTF8 set expectation */
3092 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
3093 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3094 }
3095
b899e89d
FC
3096 /* Protect sv from errors and fatal warnings. */
3097 ENTER_with_name("scan_const");
3098 SAVEFREESV(sv);
2b9d42f0 3099
79072805 3100 while (s < send || dorange) {
ff3f963a 3101
02aa26ce 3102 /* get transliterations out of the way (they're most literal) */
3280af22 3103 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 3104 /* expand a range A-Z to the full set of characters. AIE! */
79072805 3105 if (dorange) {
1ba5c669
JH
3106 I32 i; /* current expanded character */
3107 I32 min; /* first character in range */
3108 I32 max; /* last character in range */
02aa26ce 3109
e294cc5d
JH
3110#ifdef EBCDIC
3111 UV uvmax = 0;
3112#endif
3113
3114 if (has_utf8
3115#ifdef EBCDIC
3116 && !native_range
3117#endif
1953db30 3118 ) {
9d4ba2ae 3119 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
3120 char *e = d++;
3121 while (e-- > c)
3122 *(e + 1) = *e;
25716404 3123 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
3124 /* mark the range as done, and continue */
3125 dorange = FALSE;
3126 didrange = TRUE;
3127 continue;
3128 }
2b9d42f0 3129
95a20fc0 3130 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
3131#ifdef EBCDIC
3132 SvGROW(sv,
3133 SvLEN(sv) + (has_utf8 ?
3134 (512 - UTF_CONTINUATION_MARK +
3135 UNISKIP(0x100))
3136 : 256));
3137 /* How many two-byte within 0..255: 128 in UTF-8,
3138 * 96 in UTF-8-mod. */
3139#else
9cbb5ea2 3140 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 3141#endif
9cbb5ea2 3142 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
3143#ifdef EBCDIC
3144 if (has_utf8) {
3145 int j;
3146 for (j = 0; j <= 1; j++) {
3147 char * const c = (char*)utf8_hop((U8*)d, -1);
3148 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
3149 if (j)
3150 min = (U8)uv;
3151 else if (uv < 256)
3152 max = (U8)uv;
3153 else {
3154 max = (U8)0xff; /* only to \xff */
3155 uvmax = uv; /* \x{100} to uvmax */
3156 }
3157 d = c; /* eat endpoint chars */
3158 }
3159 }
3160 else {
3161#endif
3162 d -= 2; /* eat the first char and the - */
3163 min = (U8)*d; /* first char in range */
3164 max = (U8)d[1]; /* last char in range */
3165#ifdef EBCDIC
3166 }
3167#endif
8ada0baa 3168
c2e66d9e 3169 if (min > max) {
01ec43d0 3170 Perl_croak(aTHX_
d1573ac7 3171 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 3172 (char)min, (char)max);
c2e66d9e
GS
3173 }
3174
c7f1f016 3175#ifdef EBCDIC
4c3a8340
TS
3176 if (literal_endpoint == 2 &&
3177 ((isLOWER(min) && isLOWER(max)) ||
3178 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
3179 if (isLOWER(min)) {
3180 for (i = min; i <= max; i++)
3181 if (isLOWER(i))
db42d148 3182 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
3183 } else {
3184 for (i = min; i <= max; i++)
3185 if (isUPPER(i))
db42d148 3186 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
3187 }
3188 }
3189 else
3190#endif
3191 for (i = min; i <= max; i++)
e294cc5d
JH
3192#ifdef EBCDIC
3193 if (has_utf8) {
3194 const U8 ch = (U8)NATIVE_TO_UTF(i);
3195 if (UNI_IS_INVARIANT(ch))
3196 *d++ = (U8)i;
3197 else {
3198 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
3199 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
3200 }
3201 }
3202 else
3203#endif
3204 *d++ = (char)i;
3205
3206#ifdef EBCDIC
3207 if (uvmax) {
3208 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
3209 if (uvmax > 0x101)
3210 *d++ = (char)UTF_TO_NATIVE(0xff);
3211 if (uvmax > 0x100)
3212 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
3213 }
3214#endif
02aa26ce
NT
3215
3216 /* mark the range as done, and continue */
79072805 3217 dorange = FALSE;
01ec43d0 3218 didrange = TRUE;
4c3a8340
TS
3219#ifdef EBCDIC
3220 literal_endpoint = 0;
3221#endif
79072805 3222 continue;
4e553d73 3223 }
02aa26ce
NT
3224
3225 /* range begins (ignore - as first or last char) */
79072805 3226 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 3227 if (didrange) {
1fafa243 3228 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 3229 }
e294cc5d
JH
3230 if (has_utf8
3231#ifdef EBCDIC
3232 && !native_range
3233#endif
3234 ) {
25716404 3235 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
3236 s++;
3237 continue;
3238 }
79072805
LW
3239 dorange = TRUE;
3240 s++;
01ec43d0
GS
3241 }
3242 else {
3243 didrange = FALSE;
4c3a8340
TS
3244#ifdef EBCDIC
3245 literal_endpoint = 0;
e294cc5d 3246 native_range = TRUE;
4c3a8340 3247#endif
01ec43d0 3248 }
79072805 3249 }
02aa26ce
NT
3250
3251 /* if we get here, we're not doing a transliteration */
3252
e4a2df84
DM
3253 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3254 char *s1 = s-1;
3255 int esc = 0;
3256 while (s1 >= start && *s1-- == '\\')
3257 esc = !esc;
3258 if (!esc)
3259 in_charclass = TRUE;
3260 }
2866decb 3261
e4a2df84
DM
3262 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3263 char *s1 = s-1;
3264 int esc = 0;
3265 while (s1 >= start && *s1-- == '\\')
3266 esc = !esc;
3267 if (!esc)
3268 in_charclass = FALSE;
3269 }
2866decb 3270
9da1dd8f
DM
3271 /* skip for regexp comments /(?#comment)/, except for the last
3272 * char, which will be done separately.
3273 * Stop on (?{..}) and friends */
3274
3280af22 3275 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 3276 if (s[2] == '#') {
e994fd66 3277 while (s+1 < send && *s != ')')
db42d148 3278 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 3279 }
2866decb 3280 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
3281 ( s[2] == '{' /* This should match regcomp.c */
3282 || (s[2] == '?' && s[3] == '{')))
155aba94 3283 {
9da1dd8f 3284 break;
cc6b7395 3285 }
748a9306 3286 }
02aa26ce
NT
3287
3288 /* likewise skip #-initiated comments in //x patterns */
3280af22 3289 else if (*s == '#' && PL_lex_inpat &&
73134a2e 3290 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 3291 while (s+1 < send && *s != '\n')
db42d148 3292 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 3293 }
02aa26ce 3294
9da1dd8f
DM
3295 /* no further processing of single-quoted regex */
3296 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3297 goto default_action;
3298
5d1d4326 3299 /* check for embedded arrays
da6eedaa 3300 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 3301 */
1749ea0d 3302 else if (*s == '@' && s[1]) {
8a2bca12 3303 if (isWORDCHAR_lazy_if(s+1,UTF))
1749ea0d
TS
3304 break;
3305 if (strchr(":'{$", s[1]))
3306 break;
3307 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3308 break; /* in regexp, neither @+ nor @- are interpolated */
3309 }
02aa26ce
NT
3310
3311 /* check for embedded scalars. only stop if we're sure it's a
3312 variable.
3313 */
79072805 3314 else if (*s == '$') {
3280af22 3315 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 3316 break;
77772344 3317 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
3318 if (s[1] == '\\') {
3319 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3320 "Possible unintended interpolation of $\\ in regex");
77772344 3321 }
79072805 3322 break; /* in regexp, $ might be tail anchor */
77772344 3323 }
79072805 3324 }
02aa26ce 3325
2b9d42f0
NIS
3326 /* End of else if chain - OP_TRANS rejoin rest */
3327
02aa26ce 3328 /* backslashes */
79072805 3329 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
3330 char* e; /* Can be used for ending '}', etc. */
3331
79072805 3332 s++;
02aa26ce 3333
7d0fc23c
KW
3334 /* warn on \1 - \9 in substitution replacements, but note that \11
3335 * is an octal; and \19 is \1 followed by '9' */
3280af22 3336 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 3337 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 3338 {
a2a5de95 3339 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
3340 *--s = '$';
3341 break;
3342 }
02aa26ce
NT
3343
3344 /* string-change backslash escapes */
838f2281 3345 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
3346 --s;
3347 break;
3348 }
ff3f963a
KW
3349 /* In a pattern, process \N, but skip any other backslash escapes.
3350 * This is because we don't want to translate an escape sequence
3351 * into a meta symbol and have the regex compiler use the meta
3352 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3353 * in spite of this, we do have to process \N here while the proper
3354 * charnames handler is in scope. See bugs #56444 and #62056.
3355 * There is a complication because \N in a pattern may also stand
3356 * for 'match a non-nl', and not mean a charname, in which case its
3357 * processing should be deferred to the regex compiler. To be a
3358 * charname it must be followed immediately by a '{', and not look
3359 * like \N followed by a curly quantifier, i.e., not something like
3360 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3361 * quantifier */
3362 else if (PL_lex_inpat
3363 && (*s != 'N'
3364 || s[1] != '{'
4d68ffa0 3365 || regcurly(s + 1, FALSE)))
ff3f963a 3366 {
cc74c5bd
TS
3367 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3368 goto default_action;
3369 }
02aa26ce 3370
79072805 3371 switch (*s) {
02aa26ce
NT
3372
3373 /* quoted - in transliterations */
79072805 3374 case '-':
3280af22 3375 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
3376 *d++ = *s++;
3377 continue;
3378 }
3379 /* FALL THROUGH */
3380 default:
11b8faa4 3381 {
15861f94 3382 if ((isALPHANUMERIC(*s)))
a2a5de95
NC
3383 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3384 "Unrecognized escape \\%c passed through",
3385 *s);
11b8faa4 3386 /* default action is to copy the quoted character */
f9a63242 3387 goto default_action;
11b8faa4 3388 }
02aa26ce 3389
632403cc 3390 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3391 case '0': case '1': case '2': case '3':
3392 case '4': case '5': case '6': case '7':
ba210ebe 3393 {
5e0a247b 3394 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
53305cf1 3395 STRLEN len = 3;
77a135fe 3396 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe 3397 s += len;
5e0a247b
KW
3398 if (len < 3 && s < send && isDIGIT(*s)
3399 && ckWARN(WARN_MISC))
3400 {
3401 Perl_warner(aTHX_ packWARN(WARN_MISC),
3402 "%s", form_short_octal_warning(s, len));
3403 }
ba210ebe 3404 }
012bcf8d 3405 goto NUM_ESCAPE_INSERT;
02aa26ce 3406
f0a2b745
KW
3407 /* eg. \o{24} indicates the octal constant \024 */
3408 case 'o':
3409 {
454155d9 3410 const char* error;
f0a2b745 3411
00ce5563 3412 bool valid = grok_bslash_o(&s, &uv, &error,
80f4111b
KW
3413 TRUE, /* Output warning */
3414 FALSE, /* Not strict */
17896a85
KW
3415 TRUE, /* Output warnings for
3416 non-portables */
80f4111b 3417 UTF);
454155d9 3418 if (! valid) {
f0a2b745
KW
3419 yyerror(error);
3420 continue;
3421 }
3422 goto NUM_ESCAPE_INSERT;
3423 }
3424
77a135fe 3425 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3426 case 'x':
a0481293 3427 {
a0481293 3428 const char* error;
355860ce 3429
00ce5563 3430 bool valid = grok_bslash_x(&s, &uv, &error,
80f4111b
KW
3431 TRUE, /* Output warning */
3432 FALSE, /* Not strict */
17896a85
KW
3433 TRUE, /* Output warnings for
3434 non-portables */
80f4111b 3435 UTF);
a0481293
KW
3436 if (! valid) {
3437 yyerror(error);
355860ce 3438 continue;
ba210ebe 3439 }
012bcf8d
GS
3440 }
3441
3442 NUM_ESCAPE_INSERT:
ff3f963a
KW
3443 /* Insert oct or hex escaped character. There will always be
3444 * enough room in sv since such escapes will be longer than any
3445 * UTF-8 sequence they can end up as, except if they force us
3446 * to recode the rest of the string into utf8 */
ba7cea30 3447
77a135fe 3448 /* Here uv is the ordinal of the next character being added in
ff3f963a 3449 * unicode (converted from native). */
77a135fe 3450 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3451 if (!has_utf8 && uv > 255) {
77a135fe
KW
3452 /* Might need to recode whatever we have accumulated so
3453 * far if it contains any chars variant in utf8 or
3454 * utf-ebcdic. */
3455
3456 SvCUR_set(sv, d - SvPVX_const(sv));
3457 SvPOK_on(sv);
3458 *d = '\0';
77a135fe 3459 /* See Note on sizing above. */
7bf79863
KW
3460 sv_utf8_upgrade_flags_grow(sv,
3461 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3462 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3463 d = SvPVX(sv) + SvCUR(sv);
3464 has_utf8 = TRUE;
012bcf8d
GS
3465 }
3466
77a135fe
KW
3467 if (has_utf8) {
3468 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3469 if (PL_lex_inwhat == OP_TRANS &&
3470 PL_sublex_info.sub_op) {
3471 PL_sublex_info.sub_op->op_private |=
3472 (PL_lex_repl ? OPpTRANS_FROM_UTF
3473 : OPpTRANS_TO_UTF);
f9a63242 3474 }
e294cc5d
JH
3475#ifdef EBCDIC
3476 if (uv > 255 && !dorange)
3477 native_range = FALSE;
3478#endif
012bcf8d 3479 }
a0ed51b3 3480 else {
012bcf8d 3481 *d++ = (char)uv;
a0ed51b3 3482 }
012bcf8d
GS
3483 }
3484 else {
c4d5f83a 3485 *d++ = (char) uv;
a0ed51b3 3486 }
79072805 3487 continue;
02aa26ce 3488
4a2d328f 3489 case 'N':
ff3f963a
KW
3490 /* In a non-pattern \N must be a named character, like \N{LATIN
3491 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3492 * mean to match a non-newline. For non-patterns, named
3493 * characters are converted to their string equivalents. In
3494 * patterns, named characters are not converted to their
3495 * ultimate forms for the same reasons that other escapes
3496 * aren't. Instead, they are converted to the \N{U+...} form
3497 * to get the value from the charnames that is in effect right
3498 * now, while preserving the fact that it was a named character
3499 * so that the regex compiler knows this */
3500
3501 /* This section of code doesn't generally use the
3502 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3503 * a close examination of this macro and determined it is a
3504 * no-op except on utfebcdic variant characters. Every
3505 * character generated by this that would normally need to be
3506 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3507 * needed, and would complicate use of copy(). XXX There are
3508 * other parts of this file where the macro is used
3509 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3510
3511 /* The structure of this section of code (besides checking for
3512 * errors and upgrading to utf8) is:
3513 * Further disambiguate between the two meanings of \N, and if
3514 * not a charname, go process it elsewhere
0a96133f
KW
3515 * If of form \N{U+...}, pass it through if a pattern;
3516 * otherwise convert to utf8
3517 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3518 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3519
3520 /* Here, s points to the 'N'; the test below is guaranteed to
3521 * succeed if we are being called on a pattern as we already
3522 * know from a test above that the next character is a '{'.
3523 * On a non-pattern \N must mean 'named sequence, which
3524 * requires braces */
3525 s++;
3526 if (*s != '{') {
3527 yyerror("Missing braces on \\N{}");
3528 continue;
3529 }
3530 s++;
3531
0a96133f 3532 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3533 if (! (e = strchr(s, '}'))) {
3534 if (! PL_lex_inpat) {
5777a3f7 3535 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3536 } else {
3537 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3538 }
0a96133f 3539 continue;
ff3f963a 3540 }
cddc7ef4 3541
ff3f963a 3542 /* Here it looks like a named character */
cddc7ef4 3543
ff3f963a
KW
3544 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3545 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3546 | PERL_SCAN_DISALLOW_PREFIX;
3547 STRLEN len;
3548
3549 /* For \N{U+...}, the '...' is a unicode value even on
3550 * EBCDIC machines */
3551 s += 2; /* Skip to next char after the 'U+' */
3552 len = e - s;
3553 uv = grok_hex(s, &len, &flags, NULL);
3554 if (len == 0 || len != (STRLEN)(e - s)) {
3555 yyerror("Invalid hexadecimal number in \\N{U+...}");
3556 s = e + 1;
3557 continue;
3558 }
3559
3560 if (PL_lex_inpat) {
3561
e2a7e165
KW
3562 /* On non-EBCDIC platforms, pass through to the regex
3563 * compiler unchanged. The reason we evaluated the
3564 * number above is to make sure there wasn't a syntax
3565 * error. But on EBCDIC we convert to native so
3566 * downstream code can continue to assume it's native
3567 */
ff3f963a 3568 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3569#ifdef EBCDIC
3570 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3571 and the \0 */
3572 "\\N{U+%X}",
3573 (unsigned int) UNI_TO_NATIVE(uv));
3574#else
ff3f963a
KW
3575 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3576 d += e - s + 1;
e2a7e165 3577#endif
ff3f963a
KW
3578 }
3579 else { /* Not a pattern: convert the hex to string */
3580
3581 /* If destination is not in utf8, unconditionally
3582 * recode it to be so. This is because \N{} implies
3583 * Unicode semantics, and scalars have to be in utf8
3584 * to guarantee those semantics */
3585 if (! has_utf8) {
3586 SvCUR_set(sv, d - SvPVX_const(sv));
3587 SvPOK_on(sv);
3588 *d = '\0';
3589 /* See Note on sizing above. */
3590 sv_utf8_upgrade_flags_grow(
3591 sv,
3592 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3593 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3594 d = SvPVX(sv) + SvCUR(sv);
3595 has_utf8 = TRUE;
3596 }
3597
3598 /* Add the string to the output */
3599 if (UNI_IS_INVARIANT(uv)) {
3600 *d++ = (char) uv;
3601 }
3602 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3603 }
3604 }
6f613c73
KW
3605 else /* Here is \N{NAME} but not \N{U+...}. */
3606 if ((res = get_and_check_backslash_N_name(s, e)))
3607 {
3608 STRLEN len;
3609 const char *str = SvPV_const(res, len);
3610 if (PL_lex_inpat) {
ff3f963a
KW
3611
3612 if (! len) { /* The name resolved to an empty string */
3613 Copy("\\N{}", d, 4, char);
3614 d += 4;
3615 }
3616 else {
3617 /* In order to not lose information for the regex
3618 * compiler, pass the result in the specially made
3619 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3620 * the code points in hex of each character
3621 * returned by charnames */
3622
3623 const char *str_end = str + len;
3b721c4f 3624 const STRLEN off = d - SvPVX_const(sv);
94ca1619
KW
3625
3626 if (! SvUTF8(res)) {
3627 /* For the non-UTF-8 case, we can determine the
3628 * exact length needed without having to parse
3629 * through the string. Each character takes up
3630 * 2 hex digits plus either a trailing dot or
3631 * the "}" */
3632 d = off + SvGROW(sv, off
3633 + 3 * len
3634 + 6 /* For the "\N{U+", and
3635 trailing NUL */
3636 + (STRLEN)(send - e));
3637 Copy("\\N{U+", d, 5, char);
3638 d += 5;
3639 while (str < str_end) {
3640 char hex_string[4];
3641 my_snprintf(hex_string, sizeof(hex_string),
3642 "%02X.", (U8) *str);
3643 Copy(hex_string, d, 3, char);
3644 d += 3;
3645 str++;
3646 }
3647 d--; /* We will overwrite below the final
3648 dot with a right brace */
3649 }
3650 else {
1953db30
KW
3651 STRLEN char_length; /* cur char's byte length */
3652
3653 /* and the number of bytes after this is
3654 * translated into hex digits */
3655 STRLEN output_length;
3656
3657 /* 2 hex per byte; 2 chars for '\N'; 2 chars
3658 * for max('U+', '.'); and 1 for NUL */
3659 char hex_string[2 * UTF8_MAXBYTES + 5];
3660
3661 /* Get the first character of the result. */
3662 U32 uv = utf8n_to_uvuni((U8 *) str,
3663 len,
3664 &char_length,
3665 UTF8_ALLOW_ANYUV);
3666 /* Convert first code point to hex, including
3667 * the boiler plate before it. For all these,
3668 * we convert to native format so that
3669 * downstream code can continue to assume the
3670 * input is native */
3671 output_length =
3672 my_snprintf(hex_string, sizeof(hex_string),
3673 "\\N{U+%X",
3674 (unsigned int) UNI_TO_NATIVE(uv));
3675
3676 /* Make sure there is enough space to hold it */
3677 d = off + SvGROW(sv, off
3678 + output_length
3679 + (STRLEN)(send - e)
3680 + 2); /* '}' + NUL */
3681 /* And output it */
3682 Copy(hex_string, d, output_length, char);
3683 d += output_length;
3684
3685 /* For each subsequent character, append dot and
3686 * its ordinal in hex */
3687 while ((str += char_length) < str_end) {
3688 const STRLEN off = d - SvPVX_const(sv);
3689 U32 uv = utf8n_to_uvuni((U8 *) str,
3690 str_end - str,
3691 &char_length,
3692 UTF8_ALLOW_ANYUV);
3693 output_length =
3694 my_snprintf(hex_string,
3695 sizeof(hex_string),
3696 ".%X",
3697 (unsigned int) UNI_TO_NATIVE(uv));
3698
3699 d = off + SvGROW(sv, off
3700 + output_length
3701 + (STRLEN)(send - e)
3702 + 2); /* '}' + NUL */
3703 Copy(hex_string, d, output_length, char);
3704 d += output_length;
3705 }
94ca1619 3706 }
ff3f963a
KW
3707
3708 *d++ = '}'; /* Done. Add the trailing brace */
3709 }
3710 }
3711 else { /* Here, not in a pattern. Convert the name to a
3712 * string. */
3713
3714 /* If destination is not in utf8, unconditionally
3715 * recode it to be so. This is because \N{} implies
3716 * Unicode semantics, and scalars have to be in utf8
3717 * to guarantee those semantics */
3718 if (! has_utf8) {
3719 SvCUR_set(sv, d - SvPVX_const(sv));
3720 SvPOK_on(sv);
3721 *d = '\0';
3722 /* See Note on sizing above. */
3723 sv_utf8_upgrade_flags_grow(sv,
3724 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3725 len + (STRLEN)(send - s) + 1);
3726 d = SvPVX(sv) + SvCUR(sv);
3727 has_utf8 = TRUE;
3728 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3729
3730 /* See Note on sizing above. (NOTE: SvCUR() is not
3731 * set correctly here). */
3732 const STRLEN off = d - SvPVX_const(sv);
3733 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3734 }
3735 Copy(str, d, len, char);
3736 d +