This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
avoid side-effecting source held in scalar
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
51#define PL_lex_brackstack (PL_parser->lex_brackstack)
52#define PL_lex_casemods (PL_parser->lex_casemods)
53#define PL_lex_casestack (PL_parser->lex_casestack)
54#define PL_lex_defer (PL_parser->lex_defer)
55#define PL_lex_dojoin (PL_parser->lex_dojoin)
56#define PL_lex_expect (PL_parser->lex_expect)
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
67#define PL_pending_ident (PL_parser->pending_ident)
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7
DM
89
90#ifdef PERL_MAD
91# define PL_endwhite (PL_parser->endwhite)
92# define PL_faketokens (PL_parser->faketokens)
93# define PL_lasttoke (PL_parser->lasttoke)
94# define PL_nextwhite (PL_parser->nextwhite)
95# define PL_realtokenstart (PL_parser->realtokenstart)
96# define PL_skipwhite (PL_parser->skipwhite)
97# define PL_thisclose (PL_parser->thisclose)
98# define PL_thismad (PL_parser->thismad)
99# define PL_thisopen (PL_parser->thisopen)
100# define PL_thisstuff (PL_parser->thisstuff)
101# define PL_thistoken (PL_parser->thistoken)
102# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
103# define PL_thiswhite (PL_parser->thiswhite)
104# define PL_nexttoke (PL_parser->nexttoke)
105# define PL_curforce (PL_parser->curforce)
106#else
107# define PL_nexttoke (PL_parser->nexttoke)
108# define PL_nexttype (PL_parser->nexttype)
109# define PL_nextval (PL_parser->nextval)
199e78b7
DM
110#endif
111
16173588
NC
112/* This can't be done with embed.fnc, because struct yy_parser contains a
113 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
114static int
115S_pending_ident(pTHX);
199e78b7 116
0bd48802 117static const char ident_too_long[] = "Identifier too long";
8903cb82 118
29595ff2 119#ifdef PERL_MAD
29595ff2 120# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 121# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 122#else
5db06880 123# define CURMAD(slot,sv)
9ded7720 124# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
125#endif
126
9059aa12
LW
127#define XFAKEBRACK 128
128#define XENUMMASK 127
129
39e02b42
JH
130#ifdef USE_UTF8_SCRIPTS
131# define UTF (!IN_BYTES)
2b9d42f0 132#else
746b446a 133# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 134#endif
a0ed51b3 135
b1fc3636
CJ
136/* The maximum number of characters preceding the unrecognized one to display */
137#define UNRECOGNIZED_PRECEDE_COUNT 10
138
61f0cdd9 139/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
140 * 1999-02-27 mjd-perl-patch@plover.com */
141#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
142
bf4acbe4 143#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 144
ffb4593c
NT
145/* LEX_* are values for PL_lex_state, the state of the lexer.
146 * They are arranged oddly so that the guard on the switch statement
79072805
LW
147 * can get by with a single comparison (if the compiler is smart enough).
148 */
149
fb73857a
PP
150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805
LW
185#ifdef ff_next
186#undef ff_next
d48672a2
LW
187#endif
188
79072805 189#include "keywords.h"
fe14fcc3 190
ffb4593c
NT
191/* CLINE is a macro that ensures PL_copline has a sane value */
192
ae986130
LW
193#ifdef CLINE
194#undef CLINE
195#endif
57843af0 196#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 197
5db06880 198#ifdef PERL_MAD
29595ff2
NC
199# define SKIPSPACE0(s) skipspace0(s)
200# define SKIPSPACE1(s) skipspace1(s)
201# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
202# define PEEKSPACE(s) skipspace2(s,0)
203#else
204# define SKIPSPACE0(s) skipspace(s)
205# define SKIPSPACE1(s) skipspace(s)
206# define SKIPSPACE2(s,tsv) skipspace(s)
207# define PEEKSPACE(s) skipspace(s)
208#endif
209
ffb4593c
NT
210/*
211 * Convenience functions to return different tokens and prime the
9cbb5ea2 212 * lexer for the next token. They all take an argument.
ffb4593c
NT
213 *
214 * TOKEN : generic token (used for '(', DOLSHARP, etc)
215 * OPERATOR : generic operator
216 * AOPERATOR : assignment operator
217 * PREBLOCK : beginning the block after an if, while, foreach, ...
218 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
219 * PREREF : *EXPR where EXPR is not a simple identifier
220 * TERM : expression term
221 * LOOPX : loop exiting command (goto, last, dump, etc)
222 * FTST : file test operator
223 * FUN0 : zero-argument function
2d2e263d 224 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
225 * BOop : bitwise or or xor
226 * BAop : bitwise and
227 * SHop : shift operator
228 * PWop : power operator
9cbb5ea2 229 * PMop : pattern-matching operator
ffb4593c
NT
230 * Aop : addition-level operator
231 * Mop : multiplication-level operator
232 * Eop : equality-testing operator
e5edeb50 233 * Rop : relational operator <= != gt
ffb4593c
NT
234 *
235 * Also see LOP and lop() below.
236 */
237
998054bd 238#ifdef DEBUGGING /* Serve -DT. */
704d4215 239# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 240#else
bbf60fe6 241# define REPORT(retval) (retval)
998054bd
SC
242#endif
243
bbf60fe6
DM
244#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
245#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
246#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
247#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
248#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
249#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
250#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
251#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
252#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
253#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
254#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
255#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
256#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
257#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
258#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
259#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
260#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
261#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
262#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
263#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 264
a687059c
LW
265/* This bit of chicanery makes a unary function followed by
266 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
267 * The UNIDOR macro is for unary functions that can be followed by the //
268 * operator (such as C<shift // 0>).
a687059c 269 */
376fcdbf 270#define UNI2(f,x) { \
6154021b 271 pl_yylval.ival = f; \
376fcdbf
AL
272 PL_expect = x; \
273 PL_bufptr = s; \
274 PL_last_uni = PL_oldbufptr; \
275 PL_last_lop_op = f; \
276 if (*s == '(') \
277 return REPORT( (int)FUNC1 ); \
29595ff2 278 s = PEEKSPACE(s); \
376fcdbf
AL
279 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
280 }
6f33ba73
RGS
281#define UNI(f) UNI2(f,XTERM)
282#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 283
376fcdbf 284#define UNIBRACK(f) { \
6154021b 285 pl_yylval.ival = f; \
376fcdbf
AL
286 PL_bufptr = s; \
287 PL_last_uni = PL_oldbufptr; \
288 if (*s == '(') \
289 return REPORT( (int)FUNC1 ); \
29595ff2 290 s = PEEKSPACE(s); \
376fcdbf
AL
291 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
292 }
79072805 293
9f68db38 294/* grandfather return to old style */
6154021b 295#define OLDLOP(f) return(pl_yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP)
79072805 296
8fa7f367
JH
297#ifdef DEBUGGING
298
6154021b 299/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
300enum token_type {
301 TOKENTYPE_NONE,
302 TOKENTYPE_IVAL,
6154021b 303 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
304 TOKENTYPE_PVAL,
305 TOKENTYPE_OPVAL,
306 TOKENTYPE_GVVAL
307};
308
6d4a66ac
NC
309static struct debug_tokens {
310 const int token;
311 enum token_type type;
312 const char *name;
313} const debug_tokens[] =
9041c2e3 314{
bbf60fe6
DM
315 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
316 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
317 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
318 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
319 { ARROW, TOKENTYPE_NONE, "ARROW" },
320 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
321 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
322 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
323 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
324 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 325 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
326 { DO, TOKENTYPE_NONE, "DO" },
327 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
328 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
329 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
330 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
331 { ELSE, TOKENTYPE_NONE, "ELSE" },
332 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
333 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
334 { FOR, TOKENTYPE_IVAL, "FOR" },
335 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
336 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
337 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
338 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
339 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
340 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 341 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
342 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
343 { IF, TOKENTYPE_IVAL, "IF" },
344 { LABEL, TOKENTYPE_PVAL, "LABEL" },
345 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
346 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
347 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
348 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
349 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
350 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
351 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
352 { MY, TOKENTYPE_IVAL, "MY" },
353 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
354 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
355 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
356 { OROP, TOKENTYPE_IVAL, "OROP" },
357 { OROR, TOKENTYPE_NONE, "OROR" },
358 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
359 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
360 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
361 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
362 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
363 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
364 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
365 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
366 { PREINC, TOKENTYPE_NONE, "PREINC" },
367 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
368 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
369 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
370 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
371 { SUB, TOKENTYPE_NONE, "SUB" },
372 { THING, TOKENTYPE_OPVAL, "THING" },
373 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
374 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
375 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
376 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
377 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
378 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 379 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
380 { WHILE, TOKENTYPE_IVAL, "WHILE" },
381 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 382 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 383 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
384};
385
6154021b 386/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 387
bbf60fe6 388STATIC int
704d4215 389S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 390{
97aff369 391 dVAR;
7918f24d
NC
392
393 PERL_ARGS_ASSERT_TOKEREPORT;
394
bbf60fe6 395 if (DEBUG_T_TEST) {
bd61b366 396 const char *name = NULL;
bbf60fe6 397 enum token_type type = TOKENTYPE_NONE;
f54cb97a 398 const struct debug_tokens *p;
396482e1 399 SV* const report = newSVpvs("<== ");
bbf60fe6 400
f54cb97a 401 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
402 if (p->token == (int)rv) {
403 name = p->name;
404 type = p->type;
405 break;
406 }
407 }
408 if (name)
54667de8 409 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
410 else if ((char)rv > ' ' && (char)rv < '~')
411 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
412 else if (!rv)
396482e1 413 sv_catpvs(report, "EOF");
bbf60fe6
DM
414 else
415 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
416 switch (type) {
417 case TOKENTYPE_NONE:
418 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
419 break;
420 case TOKENTYPE_IVAL:
704d4215 421 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 425 PL_op_name[lvalp->ival]);
bbf60fe6
DM
426 break;
427 case TOKENTYPE_PVAL:
704d4215 428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
429 break;
430 case TOKENTYPE_OPVAL:
704d4215 431 if (lvalp->opval) {
401441c0 432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 435 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 436 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
437 }
438
439 }
401441c0 440 else
396482e1 441 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
442 break;
443 }
b6007c36 444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
445 };
446 return (int)rv;
998054bd
SC
447}
448
b6007c36
DM
449
450/* print the buffer with suitable escapes */
451
452STATIC void
15f169a1 453S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 454{
396482e1 455 SV* const tmp = newSVpvs("");
7918f24d
NC
456
457 PERL_ARGS_ASSERT_PRINTBUF;
458
b6007c36
DM
459 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
460 SvREFCNT_dec(tmp);
461}
462
8fa7f367
JH
463#endif
464
8290c323
NC
465static int
466S_deprecate_commaless_var_list(pTHX) {
467 PL_expect = XTERM;
468 deprecate("comma-less variable list");
469 return REPORT(','); /* grandfather non-comma-format format */
470}
471
ffb4593c
NT
472/*
473 * S_ao
474 *
c963b151
BD
475 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
476 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
477 */
478
76e3520e 479STATIC int
cea2e8a9 480S_ao(pTHX_ int toketype)
a0d0e21e 481{
97aff369 482 dVAR;
3280af22
NIS
483 if (*PL_bufptr == '=') {
484 PL_bufptr++;
a0d0e21e 485 if (toketype == ANDAND)
6154021b 486 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 487 else if (toketype == OROR)
6154021b 488 pl_yylval.ival = OP_ORASSIGN;
c963b151 489 else if (toketype == DORDOR)
6154021b 490 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
491 toketype = ASSIGNOP;
492 }
493 return toketype;
494}
495
ffb4593c
NT
496/*
497 * S_no_op
498 * When Perl expects an operator and finds something else, no_op
499 * prints the warning. It always prints "<something> found where
500 * operator expected. It prints "Missing semicolon on previous line?"
501 * if the surprise occurs at the start of the line. "do you need to
502 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
503 * where the compiler doesn't know if foo is a method call or a function.
504 * It prints "Missing operator before end of line" if there's nothing
505 * after the missing operator, or "... before <...>" if there is something
506 * after the missing operator.
507 */
508
76e3520e 509STATIC void
15f169a1 510S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 511{
97aff369 512 dVAR;
9d4ba2ae
AL
513 char * const oldbp = PL_bufptr;
514 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 515
7918f24d
NC
516 PERL_ARGS_ASSERT_NO_OP;
517
1189a94a
GS
518 if (!s)
519 s = oldbp;
07c798fb 520 else
1189a94a 521 PL_bufptr = s;
cea2e8a9 522 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
523 if (ckWARN_d(WARN_SYNTAX)) {
524 if (is_first)
525 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
526 "\t(Missing semicolon on previous line?)\n");
527 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 528 const char *t;
c35e046a
AL
529 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
530 NOOP;
56da5a46
RGS
531 if (t < PL_bufptr && isSPACE(*t))
532 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
533 "\t(Do you need to predeclare %.*s?)\n",
551405c4 534 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
535 }
536 else {
537 assert(s >= oldbp);
538 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 539 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 540 }
07c798fb 541 }
3280af22 542 PL_bufptr = oldbp;
8990e307
LW
543}
544
ffb4593c
NT
545/*
546 * S_missingterm
547 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 548 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
549 * If we're in a delimited string and the delimiter is a control
550 * character, it's reformatted into a two-char sequence like ^C.
551 * This is fatal.
552 */
553
76e3520e 554STATIC void
cea2e8a9 555S_missingterm(pTHX_ char *s)
8990e307 556{
97aff369 557 dVAR;
8990e307
LW
558 char tmpbuf[3];
559 char q;
560 if (s) {
9d4ba2ae 561 char * const nl = strrchr(s,'\n');
d2719217 562 if (nl)
8990e307
LW
563 *nl = '\0';
564 }
463559e7 565 else if (isCNTRL(PL_multi_close)) {
8990e307 566 *tmpbuf = '^';
585ec06d 567 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
568 tmpbuf[2] = '\0';
569 s = tmpbuf;
570 }
571 else {
eb160463 572 *tmpbuf = (char)PL_multi_close;
8990e307
LW
573 tmpbuf[1] = '\0';
574 s = tmpbuf;
575 }
576 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 577 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 578}
79072805 579
ef89dcc3 580#define FEATURE_IS_ENABLED(name) \
0d863452 581 ((0 != (PL_hints & HINT_LOCALIZE_HH)) \
89529cee 582 && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name)))
4a731d7b 583/* The longest string we pass in. */
1863b879 584#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1)
4a731d7b 585
0d863452
RH
586/*
587 * S_feature_is_enabled
588 * Check whether the named feature is enabled.
589 */
590STATIC bool
15f169a1 591S_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 592{
97aff369 593 dVAR;
0d863452 594 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 595 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
596
597 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
598
4a731d7b
NC
599 assert(namelen <= MAX_FEATURE_LEN);
600 memcpy(&he_name[8], name, namelen);
d4c19fe8 601
7b9ef140 602 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
603}
604
ffb4593c 605/*
9cbb5ea2
GS
606 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
607 * utf16-to-utf8-reversed.
ffb4593c
NT
608 */
609
c39cd008
GS
610#ifdef PERL_CR_FILTER
611static void
612strip_return(SV *sv)
613{
95a20fc0 614 register const char *s = SvPVX_const(sv);
9d4ba2ae 615 register const char * const e = s + SvCUR(sv);
7918f24d
NC
616
617 PERL_ARGS_ASSERT_STRIP_RETURN;
618
c39cd008
GS
619 /* outer loop optimized to do nothing if there are no CR-LFs */
620 while (s < e) {
621 if (*s++ == '\r' && *s == '\n') {
622 /* hit a CR-LF, need to copy the rest */
623 register char *d = s - 1;
624 *d++ = *s++;
625 while (s < e) {
626 if (*s == '\r' && s[1] == '\n')
627 s++;
628 *d++ = *s++;
629 }
630 SvCUR(sv) -= s - d;
631 return;
632 }
633 }
634}
a868473f 635
76e3520e 636STATIC I32
c39cd008 637S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 638{
f54cb97a 639 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
640 if (count > 0 && !maxlen)
641 strip_return(sv);
642 return count;
a868473f
NIS
643}
644#endif
645
199e78b7
DM
646
647
ffb4593c
NT
648/*
649 * Perl_lex_start
5486870f 650 *
e3abe207 651 * Create a parser object and initialise its parser and lexer fields
5486870f
DM
652 *
653 * rsfp is the opened file handle to read from (if any),
654 *
655 * line holds any initial content already read from the file (or in
656 * the case of no file, such as an eval, the whole contents);
ffb4593c
NT
657 */
658
a0d0e21e 659void
f07ec6dd 660Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp)
79072805 661{
97aff369 662 dVAR;
6ef55633 663 const char *s = NULL;
8990e307 664 STRLEN len;
5486870f 665 yy_parser *parser, *oparser;
acdf0a21
DM
666
667 /* create and initialise a parser */
668
199e78b7 669 Newxz(parser, 1, yy_parser);
5486870f 670 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
671 PL_parser = parser;
672
28ac2b49
Z
673 parser->stack = NULL;
674 parser->ps = NULL;
675 parser->stack_size = 0;
acdf0a21 676
e3abe207
DM
677 /* on scope exit, free this parser and restore any outer one */
678 SAVEPARSER(parser);
7c4baf47 679 parser->saved_curcop = PL_curcop;
e3abe207 680
acdf0a21 681 /* initialise lexer state */
8990e307 682
fb205e7a
DM
683#ifdef PERL_MAD
684 parser->curforce = -1;
685#else
686 parser->nexttoke = 0;
687#endif
ca4cfd28 688 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 689 parser->copline = NOLINE;
5afb0a62 690 parser->lex_state = LEX_NORMAL;
c2598295 691 parser->expect = XSTATE;
2f9285f8 692 parser->rsfp = rsfp;
f07ec6dd 693 parser->rsfp_filters = newAV();
2f9285f8 694
199e78b7
DM
695 Newx(parser->lex_brackstack, 120, char);
696 Newx(parser->lex_casestack, 12, char);
697 *parser->lex_casestack = '\0';
02b34bbe 698
10efb74f
NC
699 if (line) {
700 s = SvPV_const(line, len);
701 } else {
702 len = 0;
703 }
bdc0bf6f 704
10efb74f 705 if (!len) {
bdc0bf6f 706 parser->linestr = newSVpvs("\n;");
805700c1 707 } else {
719a9bb0 708 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 709 if (s[len-1] != ';')
bdc0bf6f 710 sv_catpvs(parser->linestr, "\n;");
8990e307 711 }
f06b5848
DM
712 parser->oldoldbufptr =
713 parser->oldbufptr =
714 parser->bufptr =
715 parser->linestart = SvPVX(parser->linestr);
716 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
717 parser->last_lop = parser->last_uni = NULL;
737c24fc
Z
718
719 parser->in_pod = 0;
79072805 720}
a687059c 721
e3abe207
DM
722
723/* delete a parser object */
724
725void
726Perl_parser_free(pTHX_ const yy_parser *parser)
727{
7918f24d
NC
728 PERL_ARGS_ASSERT_PARSER_FREE;
729
7c4baf47 730 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
731 SvREFCNT_dec(parser->linestr);
732
2f9285f8
DM
733 if (parser->rsfp == PerlIO_stdin())
734 PerlIO_clearerr(parser->rsfp);
799361c3
SH
735 else if (parser->rsfp && (!parser->old_parser ||
736 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 737 PerlIO_close(parser->rsfp);
5486870f 738 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 739
e3abe207
DM
740 Safefree(parser->lex_brackstack);
741 Safefree(parser->lex_casestack);
742 PL_parser = parser->old_parser;
743 Safefree(parser);
744}
745
746
ffb4593c 747/*
f0e67a1d
Z
748=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
749
750Buffer scalar containing the chunk currently under consideration of the
751text currently being lexed. This is always a plain string scalar (for
752which C<SvPOK> is true). It is not intended to be used as a scalar by
753normal scalar means; instead refer to the buffer directly by the pointer
754variables described below.
755
756The lexer maintains various C<char*> pointers to things in the
757C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
758reallocated, all of these pointers must be updated. Don't attempt to
759do this manually, but rather use L</lex_grow_linestr> if you need to
760reallocate the buffer.
761
762The content of the text chunk in the buffer is commonly exactly one
763complete line of input, up to and including a newline terminator,
764but there are situations where it is otherwise. The octets of the
765buffer may be intended to be interpreted as either UTF-8 or Latin-1.
766The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
767flag on this scalar, which may disagree with it.
768
769For direct examination of the buffer, the variable
770L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
771lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
772of these pointers is usually preferable to examination of the scalar
773through normal scalar means.
774
775=for apidoc AmxU|char *|PL_parser-E<gt>bufend
776
777Direct pointer to the end of the chunk of text currently being lexed, the
778end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
779+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
780always located at the end of the buffer, and does not count as part of
781the buffer's contents.
782
783=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
784
785Points to the current position of lexing inside the lexer buffer.
786Characters around this point may be freely examined, within
787the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
788L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
789interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
790
791Lexing code (whether in the Perl core or not) moves this pointer past
792the characters that it consumes. It is also expected to perform some
793bookkeeping whenever a newline character is consumed. This movement
794can be more conveniently performed by the function L</lex_read_to>,
795which handles newlines appropriately.
796
797Interpretation of the buffer's octets can be abstracted out by
798using the slightly higher-level functions L</lex_peek_unichar> and
799L</lex_read_unichar>.
800
801=for apidoc AmxU|char *|PL_parser-E<gt>linestart
802
803Points to the start of the current line inside the lexer buffer.
804This is useful for indicating at which column an error occurred, and
805not much else. This must be updated by any lexing code that consumes
806a newline; the function L</lex_read_to> handles this detail.
807
808=cut
809*/
810
811/*
812=for apidoc Amx|bool|lex_bufutf8
813
814Indicates whether the octets in the lexer buffer
815(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
816of Unicode characters. If not, they should be interpreted as Latin-1
817characters. This is analogous to the C<SvUTF8> flag for scalars.
818
819In UTF-8 mode, it is not guaranteed that the lexer buffer actually
820contains valid UTF-8. Lexing code must be robust in the face of invalid
821encoding.
822
823The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
824is significant, but not the whole story regarding the input character
825encoding. Normally, when a file is being read, the scalar contains octets
826and its C<SvUTF8> flag is off, but the octets should be interpreted as
827UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
828however, the scalar may have the C<SvUTF8> flag on, and in this case its
829octets should be interpreted as UTF-8 unless the C<use bytes> pragma
830is in effect. This logic may change in the future; use this function
831instead of implementing the logic yourself.
832
833=cut
834*/
835
836bool
837Perl_lex_bufutf8(pTHX)
838{
839 return UTF;
840}
841
842/*
843=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
844
845Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
846at least I<len> octets (including terminating NUL). Returns a
847pointer to the reallocated buffer. This is necessary before making
848any direct modification of the buffer that would increase its length.
849L</lex_stuff_pvn> provides a more convenient way to insert text into
850the buffer.
851
852Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
853this function updates all of the lexer's variables that point directly
854into the buffer.
855
856=cut
857*/
858
859char *
860Perl_lex_grow_linestr(pTHX_ STRLEN len)
861{
862 SV *linestr;
863 char *buf;
864 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
865 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
866 linestr = PL_parser->linestr;
867 buf = SvPVX(linestr);
868 if (len <= SvLEN(linestr))
869 return buf;
870 bufend_pos = PL_parser->bufend - buf;
871 bufptr_pos = PL_parser->bufptr - buf;
872 oldbufptr_pos = PL_parser->oldbufptr - buf;
873 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
874 linestart_pos = PL_parser->linestart - buf;
875 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
876 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
877 buf = sv_grow(linestr, len);
878 PL_parser->bufend = buf + bufend_pos;
879 PL_parser->bufptr = buf + bufptr_pos;
880 PL_parser->oldbufptr = buf + oldbufptr_pos;
881 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
882 PL_parser->linestart = buf + linestart_pos;
883 if (PL_parser->last_uni)
884 PL_parser->last_uni = buf + last_uni_pos;
885 if (PL_parser->last_lop)
886 PL_parser->last_lop = buf + last_lop_pos;
887 return buf;
888}
889
890/*
83aa740e 891=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
892
893Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
894immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
895reallocating the buffer if necessary. This means that lexing code that
896runs later will see the characters as if they had appeared in the input.
897It is not recommended to do this as part of normal parsing, and most
898uses of this facility run the risk of the inserted characters being
899interpreted in an unintended manner.
900
901The string to be inserted is represented by I<len> octets starting
902at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
903according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
904The characters are recoded for the lexer buffer, according to how the
905buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 906to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
907function is more convenient.
908
909=cut
910*/
911
912void
83aa740e 913Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 914{
749123ff 915 dVAR;
f0e67a1d
Z
916 char *bufptr;
917 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
918 if (flags & ~(LEX_STUFF_UTF8))
919 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
920 if (UTF) {
921 if (flags & LEX_STUFF_UTF8) {
922 goto plain_copy;
923 } else {
924 STRLEN highhalf = 0;
83aa740e 925 const char *p, *e = pv+len;
f0e67a1d
Z
926 for (p = pv; p != e; p++)
927 highhalf += !!(((U8)*p) & 0x80);
928 if (!highhalf)
929 goto plain_copy;
930 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
931 bufptr = PL_parser->bufptr;
932 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
933 SvCUR_set(PL_parser->linestr,
934 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
935 PL_parser->bufend += len+highhalf;
936 for (p = pv; p != e; p++) {
937 U8 c = (U8)*p;
938 if (c & 0x80) {
939 *bufptr++ = (char)(0xc0 | (c >> 6));
940 *bufptr++ = (char)(0x80 | (c & 0x3f));
941 } else {
942 *bufptr++ = (char)c;
943 }
944 }
945 }
946 } else {
947 if (flags & LEX_STUFF_UTF8) {
948 STRLEN highhalf = 0;
83aa740e 949 const char *p, *e = pv+len;
f0e67a1d
Z
950 for (p = pv; p != e; p++) {
951 U8 c = (U8)*p;
952 if (c >= 0xc4) {
953 Perl_croak(aTHX_ "Lexing code attempted to stuff "
954 "non-Latin-1 character into Latin-1 input");
955 } else if (c >= 0xc2 && p+1 != e &&
956 (((U8)p[1]) & 0xc0) == 0x80) {
957 p++;
958 highhalf++;
959 } else if (c >= 0x80) {
960 /* malformed UTF-8 */
961 ENTER;
962 SAVESPTR(PL_warnhook);
963 PL_warnhook = PERL_WARNHOOK_FATAL;
964 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
965 LEAVE;
966 }
967 }
968 if (!highhalf)
969 goto plain_copy;
970 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
971 bufptr = PL_parser->bufptr;
972 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
973 SvCUR_set(PL_parser->linestr,
974 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
975 PL_parser->bufend += len-highhalf;
976 for (p = pv; p != e; p++) {
977 U8 c = (U8)*p;
978 if (c & 0x80) {
979 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
980 p++;
981 } else {
982 *bufptr++ = (char)c;
983 }
984 }
985 } else {
986 plain_copy:
987 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
988 bufptr = PL_parser->bufptr;
989 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 990 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
991 PL_parser->bufend += len;
992 Copy(pv, bufptr, len, char);
993 }
994 }
995}
996
997/*
9dcc53ea
Z
998=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
999
1000Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1001immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1002reallocating the buffer if necessary. This means that lexing code that
1003runs later will see the characters as if they had appeared in the input.
1004It is not recommended to do this as part of normal parsing, and most
1005uses of this facility run the risk of the inserted characters being
1006interpreted in an unintended manner.
1007
1008The string to be inserted is represented by octets starting at I<pv>
1009and continuing to the first nul. These octets are interpreted as either
1010UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1011in I<flags>. The characters are recoded for the lexer buffer, according
1012to how the buffer is currently being interpreted (L</lex_bufutf8>).
1013If it is not convenient to nul-terminate a string to be inserted, the
1014L</lex_stuff_pvn> function is more appropriate.
1015
1016=cut
1017*/
1018
1019void
1020Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1021{
1022 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1023 lex_stuff_pvn(pv, strlen(pv), flags);
1024}
1025
1026/*
f0e67a1d
Z
1027=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1028
1029Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1030immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1031reallocating the buffer if necessary. This means that lexing code that
1032runs later will see the characters as if they had appeared in the input.
1033It is not recommended to do this as part of normal parsing, and most
1034uses of this facility run the risk of the inserted characters being
1035interpreted in an unintended manner.
1036
1037The string to be inserted is the string value of I<sv>. The characters
1038are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1039being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1040not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1041need to construct a scalar.
1042
1043=cut
1044*/
1045
1046void
1047Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1048{
1049 char *pv;
1050 STRLEN len;
1051 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1052 if (flags)
1053 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1054 pv = SvPV(sv, len);
1055 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1056}
1057
1058/*
1059=for apidoc Amx|void|lex_unstuff|char *ptr
1060
1061Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1062I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1063This hides the discarded text from any lexing code that runs later,
1064as if the text had never appeared.
1065
1066This is not the normal way to consume lexed text. For that, use
1067L</lex_read_to>.
1068
1069=cut
1070*/
1071
1072void
1073Perl_lex_unstuff(pTHX_ char *ptr)
1074{
1075 char *buf, *bufend;
1076 STRLEN unstuff_len;
1077 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1078 buf = PL_parser->bufptr;
1079 if (ptr < buf)
1080 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1081 if (ptr == buf)
1082 return;
1083 bufend = PL_parser->bufend;
1084 if (ptr > bufend)
1085 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1086 unstuff_len = ptr - buf;
1087 Move(ptr, buf, bufend+1-ptr, char);
1088 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1089 PL_parser->bufend = bufend - unstuff_len;
1090}
1091
1092/*
1093=for apidoc Amx|void|lex_read_to|char *ptr
1094
1095Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1096to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1097performing the correct bookkeeping whenever a newline character is passed.
1098This is the normal way to consume lexed text.
1099
1100Interpretation of the buffer's octets can be abstracted out by
1101using the slightly higher-level functions L</lex_peek_unichar> and
1102L</lex_read_unichar>.
1103
1104=cut
1105*/
1106
1107void
1108Perl_lex_read_to(pTHX_ char *ptr)
1109{
1110 char *s;
1111 PERL_ARGS_ASSERT_LEX_READ_TO;
1112 s = PL_parser->bufptr;
1113 if (ptr < s || ptr > PL_parser->bufend)
1114 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1115 for (; s != ptr; s++)
1116 if (*s == '\n') {
1117 CopLINE_inc(PL_curcop);
1118 PL_parser->linestart = s+1;
1119 }
1120 PL_parser->bufptr = ptr;
1121}
1122
1123/*
1124=for apidoc Amx|void|lex_discard_to|char *ptr
1125
1126Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1127up to I<ptr>. The remaining content of the buffer will be moved, and
1128all pointers into the buffer updated appropriately. I<ptr> must not
1129be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1130it is not permitted to discard text that has yet to be lexed.
1131
1132Normally it is not necessarily to do this directly, because it suffices to
1133use the implicit discarding behaviour of L</lex_next_chunk> and things
1134based on it. However, if a token stretches across multiple lines,
1f317c95 1135and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1136that purpose, then after completion of the token it would be wise to
1137explicitly discard the now-unneeded earlier lines, to avoid future
1138multi-line tokens growing the buffer without bound.
1139
1140=cut
1141*/
1142
1143void
1144Perl_lex_discard_to(pTHX_ char *ptr)
1145{
1146 char *buf;
1147 STRLEN discard_len;
1148 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1149 buf = SvPVX(PL_parser->linestr);
1150 if (ptr < buf)
1151 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1152 if (ptr == buf)
1153 return;
1154 if (ptr > PL_parser->bufptr)
1155 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1156 discard_len = ptr - buf;
1157 if (PL_parser->oldbufptr < ptr)
1158 PL_parser->oldbufptr = ptr;
1159 if (PL_parser->oldoldbufptr < ptr)
1160 PL_parser->oldoldbufptr = ptr;
1161 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1162 PL_parser->last_uni = NULL;
1163 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1164 PL_parser->last_lop = NULL;
1165 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1166 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1167 PL_parser->bufend -= discard_len;
1168 PL_parser->bufptr -= discard_len;
1169 PL_parser->oldbufptr -= discard_len;
1170 PL_parser->oldoldbufptr -= discard_len;
1171 if (PL_parser->last_uni)
1172 PL_parser->last_uni -= discard_len;
1173 if (PL_parser->last_lop)
1174 PL_parser->last_lop -= discard_len;
1175}
1176
1177/*
1178=for apidoc Amx|bool|lex_next_chunk|U32 flags
1179
1180Reads in the next chunk of text to be lexed, appending it to
1181L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1182looked to the end of the current chunk and wants to know more. It is
1183usual, but not necessary, for lexing to have consumed the entirety of
1184the current chunk at this time.
1185
1186If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1187chunk (i.e., the current chunk has been entirely consumed), normally the
1188current chunk will be discarded at the same time that the new chunk is
1189read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1190will not be discarded. If the current chunk has not been entirely
1191consumed, then it will not be discarded regardless of the flag.
1192
1193Returns true if some new text was added to the buffer, or false if the
1194buffer has reached the end of the input text.
1195
1196=cut
1197*/
1198
1199#define LEX_FAKE_EOF 0x80000000
1200
1201bool
1202Perl_lex_next_chunk(pTHX_ U32 flags)
1203{
1204 SV *linestr;
1205 char *buf;
1206 STRLEN old_bufend_pos, new_bufend_pos;
1207 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1208 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1209 bool got_some_for_debugger = 0;
f0e67a1d
Z
1210 bool got_some;
1211 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1212 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1213 linestr = PL_parser->linestr;
1214 buf = SvPVX(linestr);
1215 if (!(flags & LEX_KEEP_PREVIOUS) &&
1216 PL_parser->bufptr == PL_parser->bufend) {
1217 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1218 linestart_pos = 0;
1219 if (PL_parser->last_uni != PL_parser->bufend)
1220 PL_parser->last_uni = NULL;
1221 if (PL_parser->last_lop != PL_parser->bufend)
1222 PL_parser->last_lop = NULL;
1223 last_uni_pos = last_lop_pos = 0;
1224 *buf = 0;
1225 SvCUR(linestr) = 0;
1226 } else {
1227 old_bufend_pos = PL_parser->bufend - buf;
1228 bufptr_pos = PL_parser->bufptr - buf;
1229 oldbufptr_pos = PL_parser->oldbufptr - buf;
1230 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1231 linestart_pos = PL_parser->linestart - buf;
1232 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1233 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1234 }
1235 if (flags & LEX_FAKE_EOF) {
1236 goto eof;
1237 } else if (!PL_parser->rsfp) {
1238 got_some = 0;
1239 } else if (filter_gets(linestr, old_bufend_pos)) {
1240 got_some = 1;
17cc9359 1241 got_some_for_debugger = 1;
f0e67a1d 1242 } else {
580561a3
Z
1243 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1244 sv_setpvs(linestr, "");
f0e67a1d
Z
1245 eof:
1246 /* End of real input. Close filehandle (unless it was STDIN),
1247 * then add implicit termination.
1248 */
1249 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1250 PerlIO_clearerr(PL_parser->rsfp);
1251 else if (PL_parser->rsfp)
1252 (void)PerlIO_close(PL_parser->rsfp);
1253 PL_parser->rsfp = NULL;
737c24fc 1254 PL_parser->in_pod = 0;
f0e67a1d
Z
1255#ifdef PERL_MAD
1256 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1257 PL_faketokens = 1;
1258#endif
1259 if (!PL_in_eval && PL_minus_p) {
1260 sv_catpvs(linestr,
1261 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1262 PL_minus_n = PL_minus_p = 0;
1263 } else if (!PL_in_eval && PL_minus_n) {
1264 sv_catpvs(linestr, /*{*/";}");
1265 PL_minus_n = 0;
1266 } else
1267 sv_catpvs(linestr, ";");
1268 got_some = 1;
1269 }
1270 buf = SvPVX(linestr);
1271 new_bufend_pos = SvCUR(linestr);
1272 PL_parser->bufend = buf + new_bufend_pos;
1273 PL_parser->bufptr = buf + bufptr_pos;
1274 PL_parser->oldbufptr = buf + oldbufptr_pos;
1275 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1276 PL_parser->linestart = buf + linestart_pos;
1277 if (PL_parser->last_uni)
1278 PL_parser->last_uni = buf + last_uni_pos;
1279 if (PL_parser->last_lop)
1280 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1281 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1282 PL_curstash != PL_debstash) {
1283 /* debugger active and we're not compiling the debugger code,
1284 * so store the line into the debugger's array of lines
1285 */
1286 update_debugger_info(NULL, buf+old_bufend_pos,
1287 new_bufend_pos-old_bufend_pos);
1288 }
1289 return got_some;
1290}
1291
1292/*
1293=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1294
1295Looks ahead one (Unicode) character in the text currently being lexed.
1296Returns the codepoint (unsigned integer value) of the next character,
1297or -1 if lexing has reached the end of the input text. To consume the
1298peeked character, use L</lex_read_unichar>.
1299
1300If the next character is in (or extends into) the next chunk of input
1301text, the next chunk will be read in. Normally the current chunk will be
1302discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1303then the current chunk will not be discarded.
1304
1305If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1306is encountered, an exception is generated.
1307
1308=cut
1309*/
1310
1311I32
1312Perl_lex_peek_unichar(pTHX_ U32 flags)
1313{
749123ff 1314 dVAR;
f0e67a1d
Z
1315 char *s, *bufend;
1316 if (flags & ~(LEX_KEEP_PREVIOUS))
1317 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1318 s = PL_parser->bufptr;
1319 bufend = PL_parser->bufend;
1320 if (UTF) {
1321 U8 head;
1322 I32 unichar;
1323 STRLEN len, retlen;
1324 if (s == bufend) {
1325 if (!lex_next_chunk(flags))
1326 return -1;
1327 s = PL_parser->bufptr;
1328 bufend = PL_parser->bufend;
1329 }
1330 head = (U8)*s;
1331 if (!(head & 0x80))
1332 return head;
1333 if (head & 0x40) {
1334 len = PL_utf8skip[head];
1335 while ((STRLEN)(bufend-s) < len) {
1336 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1337 break;
1338 s = PL_parser->bufptr;
1339 bufend = PL_parser->bufend;
1340 }
1341 }
1342 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1343 if (retlen == (STRLEN)-1) {
1344 /* malformed UTF-8 */
1345 ENTER;
1346 SAVESPTR(PL_warnhook);
1347 PL_warnhook = PERL_WARNHOOK_FATAL;
1348 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1349 LEAVE;
1350 }
1351 return unichar;
1352 } else {
1353 if (s == bufend) {
1354 if (!lex_next_chunk(flags))
1355 return -1;
1356 s = PL_parser->bufptr;
1357 }
1358 return (U8)*s;
1359 }
1360}
1361
1362/*
1363=for apidoc Amx|I32|lex_read_unichar|U32 flags
1364
1365Reads the next (Unicode) character in the text currently being lexed.
1366Returns the codepoint (unsigned integer value) of the character read,
1367and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1368if lexing has reached the end of the input text. To non-destructively
1369examine the next character, use L</lex_peek_unichar> instead.
1370
1371If the next character is in (or extends into) the next chunk of input
1372text, the next chunk will be read in. Normally the current chunk will be
1373discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1374then the current chunk will not be discarded.
1375
1376If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1377is encountered, an exception is generated.
1378
1379=cut
1380*/
1381
1382I32
1383Perl_lex_read_unichar(pTHX_ U32 flags)
1384{
1385 I32 c;
1386 if (flags & ~(LEX_KEEP_PREVIOUS))
1387 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1388 c = lex_peek_unichar(flags);
1389 if (c != -1) {
1390 if (c == '\n')
1391 CopLINE_inc(PL_curcop);
1392 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1393 }
1394 return c;
1395}
1396
1397/*
1398=for apidoc Amx|void|lex_read_space|U32 flags
1399
1400Reads optional spaces, in Perl style, in the text currently being
1401lexed. The spaces may include ordinary whitespace characters and
1402Perl-style comments. C<#line> directives are processed if encountered.
1403L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1404at a non-space character (or the end of the input text).
1405
1406If spaces extend into the next chunk of input text, the next chunk will
1407be read in. Normally the current chunk will be discarded at the same
1408time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1409chunk will not be discarded.
1410
1411=cut
1412*/
1413
f0998909
Z
1414#define LEX_NO_NEXT_CHUNK 0x80000000
1415
f0e67a1d
Z
1416void
1417Perl_lex_read_space(pTHX_ U32 flags)
1418{
1419 char *s, *bufend;
1420 bool need_incline = 0;
f0998909 1421 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1422 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1423#ifdef PERL_MAD
1424 if (PL_skipwhite) {
1425 sv_free(PL_skipwhite);
1426 PL_skipwhite = NULL;
1427 }
1428 if (PL_madskills)
1429 PL_skipwhite = newSVpvs("");
1430#endif /* PERL_MAD */
1431 s = PL_parser->bufptr;
1432 bufend = PL_parser->bufend;
1433 while (1) {
1434 char c = *s;
1435 if (c == '#') {
1436 do {
1437 c = *++s;
1438 } while (!(c == '\n' || (c == 0 && s == bufend)));
1439 } else if (c == '\n') {
1440 s++;
1441 PL_parser->linestart = s;
1442 if (s == bufend)
1443 need_incline = 1;
1444 else
1445 incline(s);
1446 } else if (isSPACE(c)) {
1447 s++;
1448 } else if (c == 0 && s == bufend) {
1449 bool got_more;
1450#ifdef PERL_MAD
1451 if (PL_madskills)
1452 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1453#endif /* PERL_MAD */
f0998909
Z
1454 if (flags & LEX_NO_NEXT_CHUNK)
1455 break;
f0e67a1d
Z
1456 PL_parser->bufptr = s;
1457 CopLINE_inc(PL_curcop);
1458 got_more = lex_next_chunk(flags);
1459 CopLINE_dec(PL_curcop);
1460 s = PL_parser->bufptr;
1461 bufend = PL_parser->bufend;
1462 if (!got_more)
1463 break;
1464 if (need_incline && PL_parser->rsfp) {
1465 incline(s);
1466 need_incline = 0;
1467 }
1468 } else {
1469 break;
1470 }
1471 }
1472#ifdef PERL_MAD
1473 if (PL_madskills)
1474 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1475#endif /* PERL_MAD */
1476 PL_parser->bufptr = s;
1477}
1478
1479/*
ffb4593c
NT
1480 * S_incline
1481 * This subroutine has nothing to do with tilting, whether at windmills
1482 * or pinball tables. Its name is short for "increment line". It
57843af0 1483 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1484 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1485 * # line 500 "foo.pm"
1486 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1487 */
1488
76e3520e 1489STATIC void
d9095cec 1490S_incline(pTHX_ const char *s)
463ee0b2 1491{
97aff369 1492 dVAR;
d9095cec
NC
1493 const char *t;
1494 const char *n;
1495 const char *e;
463ee0b2 1496
7918f24d
NC
1497 PERL_ARGS_ASSERT_INCLINE;
1498
57843af0 1499 CopLINE_inc(PL_curcop);
463ee0b2
LW
1500 if (*s++ != '#')
1501 return;
d4c19fe8
AL
1502 while (SPACE_OR_TAB(*s))
1503 s++;
73659bf1
GS
1504 if (strnEQ(s, "line", 4))
1505 s += 4;
1506 else
1507 return;
084592ab 1508 if (SPACE_OR_TAB(*s))
73659bf1 1509 s++;
4e553d73 1510 else
73659bf1 1511 return;
d4c19fe8
AL
1512 while (SPACE_OR_TAB(*s))
1513 s++;
463ee0b2
LW
1514 if (!isDIGIT(*s))
1515 return;
d4c19fe8 1516
463ee0b2
LW
1517 n = s;
1518 while (isDIGIT(*s))
1519 s++;
07714eb4 1520 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1521 return;
bf4acbe4 1522 while (SPACE_OR_TAB(*s))
463ee0b2 1523 s++;
73659bf1 1524 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1525 s++;
73659bf1
GS
1526 e = t + 1;
1527 }
463ee0b2 1528 else {
c35e046a
AL
1529 t = s;
1530 while (!isSPACE(*t))
1531 t++;
73659bf1 1532 e = t;
463ee0b2 1533 }
bf4acbe4 1534 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1535 e++;
1536 if (*e != '\n' && *e != '\0')
1537 return; /* false alarm */
1538
f4dd75d9 1539 if (t - s > 0) {
d9095cec 1540 const STRLEN len = t - s;
8a5ee598 1541#ifndef USE_ITHREADS
19bad673
NC
1542 SV *const temp_sv = CopFILESV(PL_curcop);
1543 const char *cf;
1544 STRLEN tmplen;
1545
1546 if (temp_sv) {
1547 cf = SvPVX(temp_sv);
1548 tmplen = SvCUR(temp_sv);
1549 } else {
1550 cf = NULL;
1551 tmplen = 0;
1552 }
1553
42d9b98d 1554 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1555 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1556 * to *{"::_<newfilename"} */
44867030
NC
1557 /* However, the long form of evals is only turned on by the
1558 debugger - usually they're "(eval %lu)" */
1559 char smallbuf[128];
1560 char *tmpbuf;
1561 GV **gvp;
d9095cec 1562 STRLEN tmplen2 = len;
798b63bc 1563 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1564 tmpbuf = smallbuf;
1565 else
2ae0db35 1566 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1567 tmpbuf[0] = '_';
1568 tmpbuf[1] = '<';
2ae0db35 1569 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1570 tmplen += 2;
8a5ee598
RGS
1571 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1572 if (gvp) {
44867030
NC
1573 char *tmpbuf2;
1574 GV *gv2;
1575
1576 if (tmplen2 + 2 <= sizeof smallbuf)
1577 tmpbuf2 = smallbuf;
1578 else
1579 Newx(tmpbuf2, tmplen2 + 2, char);
1580
1581 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1582 /* Either they malloc'd it, or we malloc'd it,
1583 so no prefix is present in ours. */
1584 tmpbuf2[0] = '_';
1585 tmpbuf2[1] = '<';
1586 }
1587
1588 memcpy(tmpbuf2 + 2, s, tmplen2);
1589 tmplen2 += 2;
1590
8a5ee598 1591 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1592 if (!isGV(gv2)) {
8a5ee598 1593 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1594 /* adjust ${"::_<newfilename"} to store the new file name */
1595 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
3cb1dbc6
NC
1596 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1597 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
e5527e4b 1598 }
44867030
NC
1599
1600 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1601 }
e66cf94c 1602 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1603 }
8a5ee598 1604#endif
05ec9bb3 1605 CopFILE_free(PL_curcop);
d9095cec 1606 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1607 }
57843af0 1608 CopLINE_set(PL_curcop, atoi(n)-1);
463ee0b2
LW
1609}
1610
29595ff2 1611#ifdef PERL_MAD
cd81e915 1612/* skip space before PL_thistoken */
29595ff2
NC
1613
1614STATIC char *
1615S_skipspace0(pTHX_ register char *s)
1616{
7918f24d
NC
1617 PERL_ARGS_ASSERT_SKIPSPACE0;
1618
29595ff2
NC
1619 s = skipspace(s);
1620 if (!PL_madskills)
1621 return s;
cd81e915
NC
1622 if (PL_skipwhite) {
1623 if (!PL_thiswhite)
6b29d1f5 1624 PL_thiswhite = newSVpvs("");
cd81e915
NC
1625 sv_catsv(PL_thiswhite, PL_skipwhite);
1626 sv_free(PL_skipwhite);
1627 PL_skipwhite = 0;
1628 }
1629 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1630 return s;
1631}
1632
cd81e915 1633/* skip space after PL_thistoken */
29595ff2
NC
1634
1635STATIC char *
1636S_skipspace1(pTHX_ register char *s)
1637{
d4c19fe8 1638 const char *start = s;
29595ff2
NC
1639 I32 startoff = start - SvPVX(PL_linestr);
1640
7918f24d
NC
1641 PERL_ARGS_ASSERT_SKIPSPACE1;
1642
29595ff2
NC
1643 s = skipspace(s);
1644 if (!PL_madskills)
1645 return s;
1646 start = SvPVX(PL_linestr) + startoff;
cd81e915 1647 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1648 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1649 PL_thistoken = newSVpvn(tstart, start - tstart);
1650 }
1651 PL_realtokenstart = -1;
1652 if (PL_skipwhite) {
1653 if (!PL_nextwhite)
6b29d1f5 1654 PL_nextwhite = newSVpvs("");
cd81e915
NC
1655 sv_catsv(PL_nextwhite, PL_skipwhite);
1656 sv_free(PL_skipwhite);
1657 PL_skipwhite = 0;
29595ff2
NC
1658 }
1659 return s;
1660}
1661
1662STATIC char *
1663S_skipspace2(pTHX_ register char *s, SV **svp)
1664{
c35e046a
AL
1665 char *start;
1666 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1667 const I32 startoff = s - SvPVX(PL_linestr);
1668
7918f24d
NC
1669 PERL_ARGS_ASSERT_SKIPSPACE2;
1670
29595ff2
NC
1671 s = skipspace(s);
1672 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1673 if (!PL_madskills || !svp)
1674 return s;
1675 start = SvPVX(PL_linestr) + startoff;
cd81e915 1676 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1677 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1678 PL_thistoken = newSVpvn(tstart, start - tstart);
1679 PL_realtokenstart = -1;
29595ff2 1680 }
cd81e915 1681 if (PL_skipwhite) {
29595ff2 1682 if (!*svp)
6b29d1f5 1683 *svp = newSVpvs("");
cd81e915
NC
1684 sv_setsv(*svp, PL_skipwhite);
1685 sv_free(PL_skipwhite);
1686 PL_skipwhite = 0;
29595ff2
NC
1687 }
1688
1689 return s;
1690}
1691#endif
1692
80a702cd 1693STATIC void
15f169a1 1694S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1695{
1696 AV *av = CopFILEAVx(PL_curcop);
1697 if (av) {
b9f83d2f 1698 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1699 if (orig_sv)
1700 sv_setsv(sv, orig_sv);
1701 else
1702 sv_setpvn(sv, buf, len);
80a702cd
RGS
1703 (void)SvIOK_on(sv);
1704 SvIV_set(sv, 0);
1705 av_store(av, (I32)CopLINE(PL_curcop), sv);
1706 }
1707}
1708
ffb4593c
NT
1709/*
1710 * S_skipspace
1711 * Called to gobble the appropriate amount and type of whitespace.
1712 * Skips comments as well.
1713 */
1714
76e3520e 1715STATIC char *
cea2e8a9 1716S_skipspace(pTHX_ register char *s)
a687059c 1717{
5db06880 1718#ifdef PERL_MAD
f0e67a1d
Z
1719 char *start = s;
1720#endif /* PERL_MAD */
7918f24d 1721 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1722#ifdef PERL_MAD
cd81e915
NC
1723 if (PL_skipwhite) {
1724 sv_free(PL_skipwhite);
f0e67a1d 1725 PL_skipwhite = NULL;
5db06880 1726 }
f0e67a1d 1727#endif /* PERL_MAD */
3280af22 1728 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1729 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1730 s++;
f0e67a1d
Z
1731 } else {
1732 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1733 PL_bufptr = s;
f0998909
Z
1734 lex_read_space(LEX_KEEP_PREVIOUS |
1735 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1736 LEX_NO_NEXT_CHUNK : 0));
3280af22 1737 s = PL_bufptr;
f0e67a1d
Z
1738 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1739 if (PL_linestart > PL_bufptr)
1740 PL_bufptr = PL_linestart;
1741 return s;
463ee0b2 1742 }
5db06880 1743#ifdef PERL_MAD
f0e67a1d
Z
1744 if (PL_madskills)
1745 PL_skipwhite = newSVpvn(start, s-start);
1746#endif /* PERL_MAD */
5db06880 1747 return s;
a687059c 1748}
378cc40b 1749
ffb4593c
NT
1750/*
1751 * S_check_uni
1752 * Check the unary operators to ensure there's no ambiguity in how they're
1753 * used. An ambiguous piece of code would be:
1754 * rand + 5
1755 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1756 * the +5 is its argument.
1757 */
1758
76e3520e 1759STATIC void
cea2e8a9 1760S_check_uni(pTHX)
ba106d47 1761{
97aff369 1762 dVAR;
d4c19fe8
AL
1763 const char *s;
1764 const char *t;
2f3197b3 1765
3280af22 1766 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1767 return;
3280af22
NIS
1768 while (isSPACE(*PL_last_uni))
1769 PL_last_uni++;
c35e046a
AL
1770 s = PL_last_uni;
1771 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1772 s++;
3280af22 1773 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1774 return;
6136c704 1775
9b387841
NC
1776 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1777 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1778 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1779}
1780
ffb4593c
NT
1781/*
1782 * LOP : macro to build a list operator. Its behaviour has been replaced
1783 * with a subroutine, S_lop() for which LOP is just another name.
1784 */
1785
a0d0e21e
LW
1786#define LOP(f,x) return lop(f,x,s)
1787
ffb4593c
NT
1788/*
1789 * S_lop
1790 * Build a list operator (or something that might be one). The rules:
1791 * - if we have a next token, then it's a list operator [why?]
1792 * - if the next thing is an opening paren, then it's a function
1793 * - else it's a list operator
1794 */
1795
76e3520e 1796STATIC I32
a0be28da 1797S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1798{
97aff369 1799 dVAR;
7918f24d
NC
1800
1801 PERL_ARGS_ASSERT_LOP;
1802
6154021b 1803 pl_yylval.ival = f;
35c8bce7 1804 CLINE;
3280af22
NIS
1805 PL_expect = x;
1806 PL_bufptr = s;
1807 PL_last_lop = PL_oldbufptr;
eb160463 1808 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1809#ifdef PERL_MAD
1810 if (PL_lasttoke)
1811 return REPORT(LSTOP);
1812#else
3280af22 1813 if (PL_nexttoke)
bbf60fe6 1814 return REPORT(LSTOP);
5db06880 1815#endif
79072805 1816 if (*s == '(')
bbf60fe6 1817 return REPORT(FUNC);
29595ff2 1818 s = PEEKSPACE(s);
79072805 1819 if (*s == '(')
bbf60fe6 1820 return REPORT(FUNC);
79072805 1821 else
bbf60fe6 1822 return REPORT(LSTOP);
79072805
LW
1823}
1824
5db06880
NC
1825#ifdef PERL_MAD
1826 /*
1827 * S_start_force
1828 * Sets up for an eventual force_next(). start_force(0) basically does
1829 * an unshift, while start_force(-1) does a push. yylex removes items
1830 * on the "pop" end.
1831 */
1832
1833STATIC void
1834S_start_force(pTHX_ int where)
1835{
1836 int i;
1837
cd81e915 1838 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1839 where = PL_lasttoke;
cd81e915
NC
1840 assert(PL_curforce < 0 || PL_curforce == where);
1841 if (PL_curforce != where) {
5db06880
NC
1842 for (i = PL_lasttoke; i > where; --i) {
1843 PL_nexttoke[i] = PL_nexttoke[i-1];
1844 }
1845 PL_lasttoke++;
1846 }
cd81e915 1847 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1848 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1849 PL_curforce = where;
1850 if (PL_nextwhite) {
5db06880 1851 if (PL_madskills)
6b29d1f5 1852 curmad('^', newSVpvs(""));
cd81e915 1853 CURMAD('_', PL_nextwhite);
5db06880
NC
1854 }
1855}
1856
1857STATIC void
1858S_curmad(pTHX_ char slot, SV *sv)
1859{
1860 MADPROP **where;
1861
1862 if (!sv)
1863 return;
cd81e915
NC
1864 if (PL_curforce < 0)
1865 where = &PL_thismad;
5db06880 1866 else
cd81e915 1867 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1868
cd81e915 1869 if (PL_faketokens)
76f68e9b 1870 sv_setpvs(sv, "");
5db06880
NC
1871 else {
1872 if (!IN_BYTES) {
1873 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1874 SvUTF8_on(sv);
1875 else if (PL_encoding) {
1876 sv_recode_to_utf8(sv, PL_encoding);
1877 }
1878 }
1879 }
1880
1881 /* keep a slot open for the head of the list? */
1882 if (slot != '_' && *where && (*where)->mad_key == '^') {
1883 (*where)->mad_key = slot;
daba3364 1884 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1885 (*where)->mad_val = (void*)sv;
1886 }
1887 else
1888 addmad(newMADsv(slot, sv), where, 0);
1889}
1890#else
b3f24c00
MHM
1891# define start_force(where) NOOP
1892# define curmad(slot, sv) NOOP
5db06880
NC
1893#endif
1894
ffb4593c
NT
1895/*
1896 * S_force_next
9cbb5ea2 1897 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1898 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1899 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1900 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1901 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1902 */
1903
4e553d73 1904STATIC void
cea2e8a9 1905S_force_next(pTHX_ I32 type)
79072805 1906{
97aff369 1907 dVAR;
704d4215
GG
1908#ifdef DEBUGGING
1909 if (DEBUG_T_TEST) {
1910 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1911 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1912 }
1913#endif
5db06880 1914#ifdef PERL_MAD
cd81e915 1915 if (PL_curforce < 0)
5db06880 1916 start_force(PL_lasttoke);
cd81e915 1917 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1918 if (PL_lex_state != LEX_KNOWNEXT)
1919 PL_lex_defer = PL_lex_state;
1920 PL_lex_state = LEX_KNOWNEXT;
1921 PL_lex_expect = PL_expect;
cd81e915 1922 PL_curforce = -1;
5db06880 1923#else
3280af22
NIS
1924 PL_nexttype[PL_nexttoke] = type;
1925 PL_nexttoke++;
1926 if (PL_lex_state != LEX_KNOWNEXT) {
1927 PL_lex_defer = PL_lex_state;
1928 PL_lex_expect = PL_expect;
1929 PL_lex_state = LEX_KNOWNEXT;
79072805 1930 }
5db06880 1931#endif
79072805
LW
1932}
1933
28ac2b49
Z
1934void
1935Perl_yyunlex(pTHX)
1936{
1937 if (PL_parser->yychar != YYEMPTY) {
1938 start_force(-1);
1939 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1940 force_next(PL_parser->yychar);
1941 PL_parser->yychar = YYEMPTY;
1942 }
1943}
1944
d0a148a6 1945STATIC SV *
15f169a1 1946S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1947{
97aff369 1948 dVAR;
740cce10 1949 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1950 !IN_BYTES
1951 && UTF
1952 && !is_ascii_string((const U8*)start, len)
740cce10 1953 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1954 return sv;
1955}
1956
ffb4593c
NT
1957/*
1958 * S_force_word
1959 * When the lexer knows the next thing is a word (for instance, it has
1960 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1961 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1962 * lookahead.
ffb4593c
NT
1963 *
1964 * Arguments:
b1b65b59 1965 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1966 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1967 * int check_keyword : if true, Perl checks to make sure the word isn't
1968 * a keyword (do this if the word is a label, e.g. goto FOO)
1969 * int allow_pack : if true, : characters will also be allowed (require,
1970 * use, etc. do this)
9cbb5ea2 1971 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1972 */
1973
76e3520e 1974STATIC char *
cea2e8a9 1975S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 1976{
97aff369 1977 dVAR;
463ee0b2
LW
1978 register char *s;
1979 STRLEN len;
4e553d73 1980
7918f24d
NC
1981 PERL_ARGS_ASSERT_FORCE_WORD;
1982
29595ff2 1983 start = SKIPSPACE1(start);
463ee0b2 1984 s = start;
7e2040f0 1985 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 1986 (allow_pack && *s == ':') ||
15f0808c 1987 (allow_initial_tick && *s == '\'') )
a0d0e21e 1988 {
3280af22 1989 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 1990 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 1991 return start;
cd81e915 1992 start_force(PL_curforce);
5db06880
NC
1993 if (PL_madskills)
1994 curmad('X', newSVpvn(start,s-start));
463ee0b2 1995 if (token == METHOD) {
29595ff2 1996 s = SKIPSPACE1(s);
463ee0b2 1997 if (*s == '(')
3280af22 1998 PL_expect = XTERM;
463ee0b2 1999 else {
3280af22 2000 PL_expect = XOPERATOR;
463ee0b2 2001 }
79072805 2002 }
e74e6b3d 2003 if (PL_madskills)
63575281 2004 curmad('g', newSVpvs( "forced" ));
9ded7720 2005 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2006 = (OP*)newSVOP(OP_CONST,0,
2007 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2008 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2009 force_next(token);
2010 }
2011 return s;
2012}
2013
ffb4593c
NT
2014/*
2015 * S_force_ident
9cbb5ea2 2016 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2017 * text only contains the "foo" portion. The first argument is a pointer
2018 * to the "foo", and the second argument is the type symbol to prefix.
2019 * Forces the next token to be a "WORD".
9cbb5ea2 2020 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2021 */
2022
76e3520e 2023STATIC void
bfed75c6 2024S_force_ident(pTHX_ register const char *s, int kind)
79072805 2025{
97aff369 2026 dVAR;
7918f24d
NC
2027
2028 PERL_ARGS_ASSERT_FORCE_IDENT;
2029
c35e046a 2030 if (*s) {
90e5519e
NC
2031 const STRLEN len = strlen(s);
2032 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2033 start_force(PL_curforce);
9ded7720 2034 NEXTVAL_NEXTTOKE.opval = o;
79072805 2035 force_next(WORD);
748a9306 2036 if (kind) {
11343788 2037 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2038 /* XXX see note in pp_entereval() for why we forgo typo
2039 warnings if the symbol must be introduced in an eval.
2040 GSAR 96-10-12 */
90e5519e
NC
2041 gv_fetchpvn_flags(s, len,
2042 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2043 : GV_ADD,
2044 kind == '$' ? SVt_PV :
2045 kind == '@' ? SVt_PVAV :
2046 kind == '%' ? SVt_PVHV :
a0d0e21e 2047 SVt_PVGV
90e5519e 2048 );
748a9306 2049 }
79072805
LW
2050 }
2051}
2052
1571675a
GS
2053NV
2054Perl_str_to_version(pTHX_ SV *sv)
2055{
2056 NV retval = 0.0;
2057 NV nshift = 1.0;
2058 STRLEN len;
cfd0369c 2059 const char *start = SvPV_const(sv,len);
9d4ba2ae 2060 const char * const end = start + len;
504618e9 2061 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2062
2063 PERL_ARGS_ASSERT_STR_TO_VERSION;
2064
1571675a 2065 while (start < end) {
ba210ebe 2066 STRLEN skip;
1571675a
GS
2067 UV n;
2068 if (utf)
9041c2e3 2069 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2070 else {
2071 n = *(U8*)start;
2072 skip = 1;
2073 }
2074 retval += ((NV)n)/nshift;
2075 start += skip;
2076 nshift *= 1000;
2077 }
2078 return retval;
2079}
2080
4e553d73 2081/*
ffb4593c
NT
2082 * S_force_version
2083 * Forces the next token to be a version number.
e759cc13
RGS
2084 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2085 * and if "guessing" is TRUE, then no new token is created (and the caller
2086 * must use an alternative parsing method).
ffb4593c
NT
2087 */
2088
76e3520e 2089STATIC char *
e759cc13 2090S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2091{
97aff369 2092 dVAR;
5f66b61c 2093 OP *version = NULL;
44dcb63b 2094 char *d;
5db06880
NC
2095#ifdef PERL_MAD
2096 I32 startoff = s - SvPVX(PL_linestr);
2097#endif
89bfa8cd 2098
7918f24d
NC
2099 PERL_ARGS_ASSERT_FORCE_VERSION;
2100
29595ff2 2101 s = SKIPSPACE1(s);
89bfa8cd 2102
44dcb63b 2103 d = s;
dd629d5b 2104 if (*d == 'v')
44dcb63b 2105 d++;
44dcb63b 2106 if (isDIGIT(*d)) {
e759cc13
RGS
2107 while (isDIGIT(*d) || *d == '_' || *d == '.')
2108 d++;
5db06880
NC
2109#ifdef PERL_MAD
2110 if (PL_madskills) {
cd81e915 2111 start_force(PL_curforce);
5db06880
NC
2112 curmad('X', newSVpvn(s,d-s));
2113 }
2114#endif
4e4da3ac 2115 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2116 SV *ver;
8d08d9ba
DG
2117#ifdef USE_LOCALE_NUMERIC
2118 char *loc = setlocale(LC_NUMERIC, "C");
2119#endif
6154021b 2120 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2121#ifdef USE_LOCALE_NUMERIC
2122 setlocale(LC_NUMERIC, loc);
2123#endif
6154021b 2124 version = pl_yylval.opval;
dd629d5b
GS
2125 ver = cSVOPx(version)->op_sv;
2126 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2127 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2128 SvNV_set(ver, str_to_version(ver));
1571675a 2129 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2130 }
89bfa8cd 2131 }
5db06880
NC
2132 else if (guessing) {
2133#ifdef PERL_MAD
2134 if (PL_madskills) {
cd81e915
NC
2135 sv_free(PL_nextwhite); /* let next token collect whitespace */
2136 PL_nextwhite = 0;
5db06880
NC
2137 s = SvPVX(PL_linestr) + startoff;
2138 }
2139#endif
e759cc13 2140 return s;
5db06880 2141 }
89bfa8cd
PP
2142 }
2143
5db06880
NC
2144#ifdef PERL_MAD
2145 if (PL_madskills && !version) {
cd81e915
NC
2146 sv_free(PL_nextwhite); /* let next token collect whitespace */
2147 PL_nextwhite = 0;
5db06880
NC
2148 s = SvPVX(PL_linestr) + startoff;
2149 }
2150#endif
89bfa8cd 2151 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2152 start_force(PL_curforce);
9ded7720 2153 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2154 force_next(WORD);
89bfa8cd 2155
e759cc13 2156 return s;
89bfa8cd
PP
2157}
2158
ffb4593c 2159/*
91152fc1
DG
2160 * S_force_strict_version
2161 * Forces the next token to be a version number using strict syntax rules.
2162 */
2163
2164STATIC char *
2165S_force_strict_version(pTHX_ char *s)
2166{
2167 dVAR;
2168 OP *version = NULL;
2169#ifdef PERL_MAD
2170 I32 startoff = s - SvPVX(PL_linestr);
2171#endif
2172 const char *errstr = NULL;
2173
2174 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2175
2176 while (isSPACE(*s)) /* leading whitespace */
2177 s++;
2178
2179 if (is_STRICT_VERSION(s,&errstr)) {
2180 SV *ver = newSV(0);
2181 s = (char *)scan_version(s, ver, 0);
2182 version = newSVOP(OP_CONST, 0, ver);
2183 }
4e4da3ac
Z
2184 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2185 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2186 {
91152fc1
DG
2187 PL_bufptr = s;
2188 if (errstr)
2189 yyerror(errstr); /* version required */
2190 return s;
2191 }
2192
2193#ifdef PERL_MAD
2194 if (PL_madskills && !version) {
2195 sv_free(PL_nextwhite); /* let next token collect whitespace */
2196 PL_nextwhite = 0;
2197 s = SvPVX(PL_linestr) + startoff;
2198 }
2199#endif
2200 /* NOTE: The parser sees the package name and the VERSION swapped */
2201 start_force(PL_curforce);
2202 NEXTVAL_NEXTTOKE.opval = version;
2203 force_next(WORD);
2204
2205 return s;
2206}
2207
2208/*
ffb4593c
NT
2209 * S_tokeq
2210 * Tokenize a quoted string passed in as an SV. It finds the next
2211 * chunk, up to end of string or a backslash. It may make a new
2212 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2213 * turns \\ into \.
2214 */
2215
76e3520e 2216STATIC SV *
cea2e8a9 2217S_tokeq(pTHX_ SV *sv)
79072805 2218{
97aff369 2219 dVAR;
79072805
LW
2220 register char *s;
2221 register char *send;
2222 register char *d;
b3ac6de7
IZ
2223 STRLEN len = 0;
2224 SV *pv = sv;
79072805 2225
7918f24d
NC
2226 PERL_ARGS_ASSERT_TOKEQ;
2227
79072805 2228 if (!SvLEN(sv))
b3ac6de7 2229 goto finish;
79072805 2230
a0d0e21e 2231 s = SvPV_force(sv, len);
21a311ee 2232 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2233 goto finish;
463ee0b2 2234 send = s + len;
79072805
LW
2235 while (s < send && *s != '\\')
2236 s++;
2237 if (s == send)
b3ac6de7 2238 goto finish;
79072805 2239 d = s;
be4731d2 2240 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2241 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2242 }
79072805
LW
2243 while (s < send) {
2244 if (*s == '\\') {
a0d0e21e 2245 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2246 s++; /* all that, just for this */
2247 }
2248 *d++ = *s++;
2249 }
2250 *d = '\0';
95a20fc0 2251 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2252 finish:
3280af22 2253 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2254 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2255 return sv;
2256}
2257
ffb4593c
NT
2258/*
2259 * Now come three functions related to double-quote context,
2260 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2261 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2262 * interact with PL_lex_state, and create fake ( ... ) argument lists
2263 * to handle functions and concatenation.
2264 * They assume that whoever calls them will be setting up a fake
2265 * join call, because each subthing puts a ',' after it. This lets
2266 * "lower \luPpEr"
2267 * become
2268 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2269 *
2270 * (I'm not sure whether the spurious commas at the end of lcfirst's
2271 * arguments and join's arguments are created or not).
2272 */
2273
2274/*
2275 * S_sublex_start
6154021b 2276 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2277 *
2278 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2279 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2280 *
2281 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2282 *
2283 * Everything else becomes a FUNC.
2284 *
2285 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2286 * had an OP_CONST or OP_READLINE). This just sets us up for a
2287 * call to S_sublex_push().
2288 */
2289
76e3520e 2290STATIC I32
cea2e8a9 2291S_sublex_start(pTHX)
79072805 2292{
97aff369 2293 dVAR;
6154021b 2294 register const I32 op_type = pl_yylval.ival;
79072805
LW
2295
2296 if (op_type == OP_NULL) {
6154021b 2297 pl_yylval.opval = PL_lex_op;
5f66b61c 2298 PL_lex_op = NULL;
79072805
LW
2299 return THING;
2300 }
2301 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2302 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2303
2304 if (SvTYPE(sv) == SVt_PVIV) {
2305 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2306 STRLEN len;
96a5add6 2307 const char * const p = SvPV_const(sv, len);
740cce10 2308 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2309 SvREFCNT_dec(sv);
2310 sv = nsv;
4e553d73 2311 }
6154021b 2312 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2313 PL_lex_stuff = NULL;
6f33ba73
RGS
2314 /* Allow <FH> // "foo" */
2315 if (op_type == OP_READLINE)
2316 PL_expect = XTERMORDORDOR;
79072805
LW
2317 return THING;
2318 }
e3f73d4e
RGS
2319 else if (op_type == OP_BACKTICK && PL_lex_op) {
2320 /* readpipe() vas overriden */
2321 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2322 pl_yylval.opval = PL_lex_op;
9b201d7d 2323 PL_lex_op = NULL;
e3f73d4e
RGS
2324 PL_lex_stuff = NULL;
2325 return THING;
2326 }
79072805 2327
3280af22 2328 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2329 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2330 PL_sublex_info.sub_op = PL_lex_op;
2331 PL_lex_state = LEX_INTERPPUSH;
55497cff 2332
3280af22
NIS
2333 PL_expect = XTERM;
2334 if (PL_lex_op) {
6154021b 2335 pl_yylval.opval = PL_lex_op;
5f66b61c 2336 PL_lex_op = NULL;
55497cff
PP
2337 return PMFUNC;
2338 }
2339 else
2340 return FUNC;
2341}
2342
ffb4593c
NT
2343/*
2344 * S_sublex_push
2345 * Create a new scope to save the lexing state. The scope will be
2346 * ended in S_sublex_done. Returns a '(', starting the function arguments
2347 * to the uc, lc, etc. found before.
2348 * Sets PL_lex_state to LEX_INTERPCONCAT.
2349 */
2350
76e3520e 2351STATIC I32
cea2e8a9 2352S_sublex_push(pTHX)
55497cff 2353{
27da23d5 2354 dVAR;
f46d017c 2355 ENTER;
55497cff 2356
3280af22 2357 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2358 SAVEBOOL(PL_lex_dojoin);
3280af22 2359 SAVEI32(PL_lex_brackets);
3280af22
NIS
2360 SAVEI32(PL_lex_casemods);
2361 SAVEI32(PL_lex_starts);
651b5b28 2362 SAVEI8(PL_lex_state);
7766f137 2363 SAVEVPTR(PL_lex_inpat);
98246f1e 2364 SAVEI16(PL_lex_inwhat);
57843af0 2365 SAVECOPLINE(PL_curcop);
3280af22 2366 SAVEPPTR(PL_bufptr);
8452ff4b 2367 SAVEPPTR(PL_bufend);
3280af22
NIS
2368 SAVEPPTR(PL_oldbufptr);
2369 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2370 SAVEPPTR(PL_last_lop);
2371 SAVEPPTR(PL_last_uni);
3280af22
NIS
2372 SAVEPPTR(PL_linestart);
2373 SAVESPTR(PL_linestr);
8edd5f42
RGS
2374 SAVEGENERICPV(PL_lex_brackstack);
2375 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2376
2377 PL_linestr = PL_lex_stuff;
a0714e2c 2378 PL_lex_stuff = NULL;
3280af22 2379
9cbb5ea2
GS
2380 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2381 = SvPVX(PL_linestr);
3280af22 2382 PL_bufend += SvCUR(PL_linestr);
bd61b366 2383 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2384 SAVEFREESV(PL_linestr);
2385
2386 PL_lex_dojoin = FALSE;
2387 PL_lex_brackets = 0;
a02a5408
JC
2388 Newx(PL_lex_brackstack, 120, char);
2389 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2390 PL_lex_casemods = 0;
2391 *PL_lex_casestack = '\0';
2392 PL_lex_starts = 0;
2393 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2394 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2395
2396 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
2397 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2398 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2399 else
5f66b61c 2400 PL_lex_inpat = NULL;
79072805 2401
55497cff 2402 return '(';
79072805
LW
2403}
2404
ffb4593c
NT
2405/*
2406 * S_sublex_done
2407 * Restores lexer state after a S_sublex_push.
2408 */
2409
76e3520e 2410STATIC I32
cea2e8a9 2411S_sublex_done(pTHX)
79072805 2412{
27da23d5 2413 dVAR;
3280af22 2414 if (!PL_lex_starts++) {
396482e1 2415 SV * const sv = newSVpvs("");
9aa983d2
JH
2416 if (SvUTF8(PL_linestr))
2417 SvUTF8_on(sv);
3280af22 2418 PL_expect = XOPERATOR;
6154021b 2419 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2420 return THING;
2421 }
2422
3280af22
NIS
2423 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2424 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2425 return yylex();
79072805
LW
2426 }
2427
ffb4593c 2428 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
3280af22
NIS
2429 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2430 PL_linestr = PL_lex_repl;
2431 PL_lex_inpat = 0;
2432 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2433 PL_bufend += SvCUR(PL_linestr);
bd61b366 2434 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2435 SAVEFREESV(PL_linestr);
2436 PL_lex_dojoin = FALSE;
2437 PL_lex_brackets = 0;
3280af22
NIS
2438 PL_lex_casemods = 0;
2439 *PL_lex_casestack = '\0';
2440 PL_lex_starts = 0;
25da4f38 2441 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2442 PL_lex_state = LEX_INTERPNORMAL;
2443 PL_lex_starts++;
e9fa98b2
HS
2444 /* we don't clear PL_lex_repl here, so that we can check later
2445 whether this is an evalled subst; that means we rely on the
2446 logic to ensure sublex_done() is called again only via the
2447 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2448 }
e9fa98b2 2449 else {
3280af22 2450 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2451 PL_lex_repl = NULL;
e9fa98b2 2452 }
79072805 2453 return ',';
ffed7fef
LW
2454 }
2455 else {
5db06880
NC
2456#ifdef PERL_MAD
2457 if (PL_madskills) {
cd81e915
NC
2458 if (PL_thiswhite) {
2459 if (!PL_endwhite)
6b29d1f5 2460 PL_endwhite = newSVpvs("");
cd81e915
NC
2461 sv_catsv(PL_endwhite, PL_thiswhite);
2462 PL_thiswhite = 0;
2463 }
2464 if (PL_thistoken)
76f68e9b 2465 sv_setpvs(PL_thistoken,"");
5db06880 2466 else
cd81e915 2467 PL_realtokenstart = -1;
5db06880
NC
2468 }
2469#endif
f46d017c 2470 LEAVE;
3280af22
NIS
2471 PL_bufend = SvPVX(PL_linestr);
2472 PL_bufend += SvCUR(PL_linestr);
2473 PL_expect = XOPERATOR;
09bef843 2474 PL_sublex_info.sub_inwhat = 0;
79072805 2475 return ')';
ffed7fef
LW
2476 }
2477}
2478
02aa26ce
NT
2479/*
2480 scan_const
2481
2482 Extracts a pattern, double-quoted string, or transliteration. This
2483 is terrifying code.
2484
94def140 2485 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2486 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2487 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2488
94def140
ST
2489 Returns a pointer to the character scanned up to. If this is
2490 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2491 successfully parsed), will leave an OP for the substring scanned
6154021b 2492 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2493 by looking at the next characters herself.
2494
02aa26ce
NT
2495 In patterns:
2496 backslashes:
ff3f963a 2497 constants: \N{NAME} only
02aa26ce
NT
2498 case and quoting: \U \Q \E
2499 stops on @ and $, but not for $ as tail anchor
2500
2501 In transliterations:
2502 characters are VERY literal, except for - not at the start or end
94def140
ST
2503 of the string, which indicates a range. If the range is in bytes,
2504 scan_const expands the range to the full set of intermediate
2505 characters. If the range is in utf8, the hyphen is replaced with
2506 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2507
2508 In double-quoted strings:
2509 backslashes:
2510 double-quoted style: \r and \n
ff3f963a 2511 constants: \x31, etc.
94def140 2512 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2513 case and quoting: \U \Q \E
2514 stops on @ and $
2515
2516 scan_const does *not* construct ops to handle interpolated strings.
2517 It stops processing as soon as it finds an embedded $ or @ variable
2518 and leaves it to the caller to work out what's going on.
2519
94def140
ST
2520 embedded arrays (whether in pattern or not) could be:
2521 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2522
2523 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2524
2525 $ in pattern could be $foo or could be tail anchor. Assumption:
2526 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2527 followed by one of "()| \r\n\t"
02aa26ce
NT
2528
2529 \1 (backreferences) are turned into $1
2530
2531 The structure of the code is
2532 while (there's a character to process) {
94def140
ST
2533 handle transliteration ranges
2534 skip regexp comments /(?#comment)/ and codes /(?{code})/
2535 skip #-initiated comments in //x patterns
2536 check for embedded arrays
02aa26ce
NT
2537 check for embedded scalars
2538 if (backslash) {
94def140 2539 deprecate \1 in substitution replacements
02aa26ce
NT
2540 handle string-changing backslashes \l \U \Q \E, etc.
2541 switch (what was escaped) {
94def140 2542 handle \- in a transliteration (becomes a literal -)
ff3f963a 2543 if a pattern and not \N{, go treat as regular character
94def140
ST
2544 handle \132 (octal characters)
2545 handle \x15 and \x{1234} (hex characters)
ff3f963a 2546 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2547 handle \cV (control characters)
2548 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2549 } (end switch)
77a135fe 2550 continue
02aa26ce 2551 } (end if backslash)
77a135fe 2552 handle regular character
02aa26ce 2553 } (end while character to read)
4e553d73 2554
02aa26ce
NT
2555*/
2556
76e3520e 2557STATIC char *
cea2e8a9 2558S_scan_const(pTHX_ char *start)
79072805 2559{
97aff369 2560 dVAR;
3280af22 2561 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2562 SV *sv = newSV(send - start); /* sv for the constant. See
2563 note below on sizing. */
02aa26ce
NT
2564 register char *s = start; /* start of the constant */
2565 register char *d = SvPVX(sv); /* destination for copies */
2566 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2567 bool didrange = FALSE; /* did we just finish a range? */
2b9d42f0 2568 I32 has_utf8 = FALSE; /* Output constant is UTF8 */
77a135fe
KW
2569 I32 this_utf8 = UTF; /* Is the source string assumed
2570 to be UTF8? But, this can
2571 show as true when the source
2572 isn't utf8, as for example
2573 when it is entirely composed
2574 of hex constants */
2575
2576 /* Note on sizing: The scanned constant is placed into sv, which is
2577 * initialized by newSV() assuming one byte of output for every byte of
2578 * input. This routine expects newSV() to allocate an extra byte for a
2579 * trailing NUL, which this routine will append if it gets to the end of
2580 * the input. There may be more bytes of input than output (eg., \N{LATIN
2581 * CAPITAL LETTER A}), or more output than input if the constant ends up
2582 * recoded to utf8, but each time a construct is found that might increase
2583 * the needed size, SvGROW() is called. Its size parameter each time is
2584 * based on the best guess estimate at the time, namely the length used so
2585 * far, plus the length the current construct will occupy, plus room for
2586 * the trailing NUL, plus one byte for every input byte still unscanned */
2587
012bcf8d 2588 UV uv;
4c3a8340
ST
2589#ifdef EBCDIC
2590 UV literal_endpoint = 0;
e294cc5d 2591 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2592#endif
012bcf8d 2593
7918f24d
NC
2594 PERL_ARGS_ASSERT_SCAN_CONST;
2595
2b9d42f0
NIS
2596 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2597 /* If we are doing a trans and we know we want UTF8 set expectation */
2598 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2599 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2600 }
2601
2602
79072805 2603 while (s < send || dorange) {
ff3f963a 2604
02aa26ce 2605 /* get transliterations out of the way (they're most literal) */
3280af22 2606 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2607 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2608 if (dorange) {
1ba5c669
JH
2609 I32 i; /* current expanded character */
2610 I32 min; /* first character in range */
2611 I32 max; /* last character in range */
02aa26ce 2612
e294cc5d
JH
2613#ifdef EBCDIC
2614 UV uvmax = 0;
2615#endif
2616
2617 if (has_utf8
2618#ifdef EBCDIC
2619 && !native_range
2620#endif
2621 ) {
9d4ba2ae 2622 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2623 char *e = d++;
2624 while (e-- > c)
2625 *(e + 1) = *e;
25716404 2626 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2627 /* mark the range as done, and continue */
2628 dorange = FALSE;
2629 didrange = TRUE;
2630 continue;
2631 }
2b9d42f0 2632
95a20fc0 2633 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2634#ifdef EBCDIC
2635 SvGROW(sv,
2636 SvLEN(sv) + (has_utf8 ?
2637 (512 - UTF_CONTINUATION_MARK +
2638 UNISKIP(0x100))
2639 : 256));
2640 /* How many two-byte within 0..255: 128 in UTF-8,
2641 * 96 in UTF-8-mod. */
2642#else
9cbb5ea2 2643 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2644#endif
9cbb5ea2 2645 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2646#ifdef EBCDIC
2647 if (has_utf8) {
2648 int j;
2649 for (j = 0; j <= 1; j++) {
2650 char * const c = (char*)utf8_hop((U8*)d, -1);
2651 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2652 if (j)
2653 min = (U8)uv;
2654 else if (uv < 256)
2655 max = (U8)uv;
2656 else {
2657 max = (U8)0xff; /* only to \xff */
2658 uvmax = uv; /* \x{100} to uvmax */
2659 }
2660 d = c; /* eat endpoint chars */
2661 }
2662 }
2663 else {
2664#endif
2665 d -= 2; /* eat the first char and the - */
2666 min = (U8)*d; /* first char in range */
2667 max = (U8)d[1]; /* last char in range */
2668#ifdef EBCDIC
2669 }
2670#endif
8ada0baa 2671
c2e66d9e 2672 if (min > max) {
01ec43d0 2673 Perl_croak(aTHX_
d1573ac7 2674 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2675 (char)min, (char)max);
c2e66d9e
GS
2676 }
2677
c7f1f016 2678#ifdef EBCDIC
4c3a8340
ST
2679 if (literal_endpoint == 2 &&
2680 ((isLOWER(min) && isLOWER(max)) ||
2681 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2682 if (isLOWER(min)) {
2683 for (i = min; i <= max; i++)
2684 if (isLOWER(i))
db42d148 2685 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2686 } else {
2687 for (i = min; i <= max; i++)
2688 if (isUPPER(i))
db42d148 2689 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2690 }
2691 }
2692 else
2693#endif
2694 for (i = min; i <= max; i++)
e294cc5d
JH
2695#ifdef EBCDIC
2696 if (has_utf8) {
2697 const U8 ch = (U8)NATIVE_TO_UTF(i);
2698 if (UNI_IS_INVARIANT(ch))
2699 *d++ = (U8)i;
2700 else {
2701 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2702 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2703 }
2704 }
2705 else
2706#endif
2707 *d++ = (char)i;
2708
2709#ifdef EBCDIC
2710 if (uvmax) {
2711 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2712 if (uvmax > 0x101)
2713 *d++ = (char)UTF_TO_NATIVE(0xff);
2714 if (uvmax > 0x100)
2715 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2716 }
2717#endif
02aa26ce
NT
2718
2719 /* mark the range as done, and continue */
79072805 2720 dorange = FALSE;
01ec43d0 2721 didrange = TRUE;
4c3a8340
ST
2722#ifdef EBCDIC
2723 literal_endpoint = 0;
2724#endif
79072805 2725 continue;
4e553d73 2726 }
02aa26ce
NT
2727
2728 /* range begins (ignore - as first or last char) */
79072805 2729 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2730 if (didrange) {
1fafa243 2731 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2732 }
e294cc5d
JH
2733 if (has_utf8
2734#ifdef EBCDIC
2735 && !native_range
2736#endif
2737 ) {
25716404 2738 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2739 s++;
2740 continue;
2741 }
79072805
LW
2742 dorange = TRUE;
2743 s++;
01ec43d0
GS
2744 }
2745 else {
2746 didrange = FALSE;
4c3a8340
ST
2747#ifdef EBCDIC
2748 literal_endpoint = 0;
e294cc5d 2749 native_range = TRUE;
4c3a8340 2750#endif
01ec43d0 2751 }
79072805 2752 }
02aa26ce
NT
2753
2754 /* if we get here, we're not doing a transliteration */
2755
0f5d15d6
IZ
2756 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2757 except for the last char, which will be done separately. */
3280af22 2758 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2759 if (s[2] == '#') {
e994fd66 2760 while (s+1 < send && *s != ')')
db42d148 2761 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2762 }
2763 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2764 || (s[2] == '?' && s[3] == '{'))
155aba94 2765 {
cc6b7395 2766 I32 count = 1;
0f5d15d6 2767 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2768 char c;
2769
d9f97599
GS
2770 while (count && (c = *regparse)) {
2771 if (c == '\\' && regparse[1])
2772 regparse++;
4e553d73 2773 else if (c == '{')
cc6b7395 2774 count++;
4e553d73 2775 else if (c == '}')
cc6b7395 2776 count--;
d9f97599 2777 regparse++;
cc6b7395 2778 }
e994fd66 2779 if (*regparse != ')')
5bdf89e7 2780 regparse--; /* Leave one char for continuation. */
0f5d15d6 2781 while (s < regparse)
db42d148 2782 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2783 }
748a9306 2784 }
02aa26ce
NT
2785
2786 /* likewise skip #-initiated comments in //x patterns */
3280af22
NIS
2787 else if (*s == '#' && PL_lex_inpat &&
2788 ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
748a9306 2789 while (s+1 < send && *s != '\n')
db42d148 2790 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2791 }
02aa26ce 2792
5d1d4326 2793 /* check for embedded arrays
da6eedaa 2794 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2795 */
1749ea0d
ST
2796 else if (*s == '@' && s[1]) {
2797 if (isALNUM_lazy_if(s+1,UTF))
2798 break;
2799 if (strchr(":'{$", s[1]))
2800 break;
2801 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2802 break; /* in regexp, neither @+ nor @- are interpolated */
2803 }
02aa26ce
NT
2804
2805 /* check for embedded scalars. only stop if we're sure it's a
2806 variable.
2807 */
79072805 2808 else if (*s == '$') {
3280af22 2809 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2810 break;
77772344 2811 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2812 if (s[1] == '\\') {
2813 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2814 "Possible unintended interpolation of $\\ in regex");
77772344 2815 }
79072805 2816 break; /* in regexp, $ might be tail anchor */
77772344 2817 }
79072805 2818 }
02aa26ce 2819
2b9d42f0
NIS
2820 /* End of else if chain - OP_TRANS rejoin rest */
2821
02aa26ce 2822 /* backslashes */
79072805 2823 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2824 char* e; /* Can be used for ending '}', etc. */
2825
79072805 2826 s++;
02aa26ce 2827
7d0fc23c
KW
2828 /* warn on \1 - \9 in substitution replacements, but note that \11
2829 * is an octal; and \19 is \1 followed by '9' */
3280af22 2830 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2831 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2832 {
a2a5de95 2833 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2834 *--s = '$';
2835 break;
2836 }
02aa26ce
NT
2837
2838 /* string-change backslash escapes */
3280af22 2839 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2840 --s;
2841 break;
2842 }
ff3f963a
KW
2843 /* In a pattern, process \N, but skip any other backslash escapes.
2844 * This is because we don't want to translate an escape sequence
2845 * into a meta symbol and have the regex compiler use the meta
2846 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2847 * in spite of this, we do have to process \N here while the proper
2848 * charnames handler is in scope. See bugs #56444 and #62056.
2849 * There is a complication because \N in a pattern may also stand
2850 * for 'match a non-nl', and not mean a charname, in which case its
2851 * processing should be deferred to the regex compiler. To be a
2852 * charname it must be followed immediately by a '{', and not look
2853 * like \N followed by a curly quantifier, i.e., not something like
2854 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2855 * quantifier */
2856 else if (PL_lex_inpat
2857 && (*s != 'N'
2858 || s[1] != '{'
2859 || regcurly(s + 1)))
2860 {
cc74c5bd
ST
2861 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2862 goto default_action;
2863 }
02aa26ce 2864
79072805 2865 switch (*s) {
02aa26ce
NT
2866
2867 /* quoted - in transliterations */
79072805 2868 case '-':
3280af22 2869 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2870 *d++ = *s++;
2871 continue;
2872 }
2873 /* FALL THROUGH */
2874 default:
11b8faa4 2875 {
a2a5de95
NC
2876 if ((isALPHA(*s) || isDIGIT(*s)))
2877 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2878 "Unrecognized escape \\%c passed through",
2879 *s);
11b8faa4 2880 /* default action is to copy the quoted character */
f9a63242 2881 goto default_action;
11b8faa4 2882 }
02aa26ce 2883
632403cc 2884 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2885 case '0': case '1': case '2': case '3':
2886 case '4': case '5': case '6': case '7':
ba210ebe 2887 {
53305cf1
NC
2888 I32 flags = 0;
2889 STRLEN len = 3;
77a135fe 2890 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2891 s += len;
2892 }
012bcf8d 2893 goto NUM_ESCAPE_INSERT;
02aa26ce 2894
f0a2b745
KW
2895 /* eg. \o{24} indicates the octal constant \024 */
2896 case 'o':
2897 {
2898 STRLEN len;
454155d9 2899 const char* error;
f0a2b745 2900
454155d9 2901 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2902 s += len;
454155d9 2903 if (! valid) {
f0a2b745
KW
2904 yyerror(error);
2905 continue;
2906 }
2907 goto NUM_ESCAPE_INSERT;
2908 }
2909
77a135fe 2910 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2911 case 'x':
a0ed51b3
LW
2912 ++s;
2913 if (*s == '{') {
9d4ba2ae 2914 char* const e = strchr(s, '}');
a4c04bdc
NC
2915 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2916 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2917 STRLEN len;
355860ce 2918
53305cf1 2919 ++s;
adaeee49 2920 if (!e) {
a0ed51b3 2921 yyerror("Missing right brace on \\x{}");
355860ce 2922 continue;
ba210ebe 2923 }
53305cf1 2924 len = e - s;
77a135fe 2925 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2926 s = e + 1;
a0ed51b3
LW
2927 }
2928 else {
ba210ebe 2929 {
53305cf1 2930 STRLEN len = 2;
a4c04bdc 2931 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 2932 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
2933 s += len;
2934 }
012bcf8d
GS
2935 }
2936
2937 NUM_ESCAPE_INSERT:
ff3f963a
KW
2938 /* Insert oct or hex escaped character. There will always be
2939 * enough room in sv since such escapes will be longer than any
2940 * UTF-8 sequence they can end up as, except if they force us
2941 * to recode the rest of the string into utf8 */
ba7cea30 2942
77a135fe 2943 /* Here uv is the ordinal of the next character being added in
ff3f963a 2944 * unicode (converted from native). */
77a135fe 2945 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 2946 if (!has_utf8 && uv > 255) {
77a135fe
KW
2947 /* Might need to recode whatever we have accumulated so
2948 * far if it contains any chars variant in utf8 or
2949 * utf-ebcdic. */
2950
2951 SvCUR_set(sv, d - SvPVX_const(sv));
2952 SvPOK_on(sv);
2953 *d = '\0';
77a135fe 2954 /* See Note on sizing above. */
7bf79863
KW
2955 sv_utf8_upgrade_flags_grow(sv,
2956 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
2957 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
2958 d = SvPVX(sv) + SvCUR(sv);
2959 has_utf8 = TRUE;
012bcf8d
GS
2960 }
2961
77a135fe
KW
2962 if (has_utf8) {
2963 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
2964 if (PL_lex_inwhat == OP_TRANS &&
2965 PL_sublex_info.sub_op) {
2966 PL_sublex_info.sub_op->op_private |=
2967 (PL_lex_repl ? OPpTRANS_FROM_UTF
2968 : OPpTRANS_TO_UTF);
f9a63242 2969 }
e294cc5d
JH
2970#ifdef EBCDIC
2971 if (uv > 255 && !dorange)
2972 native_range = FALSE;
2973#endif
012bcf8d 2974 }
a0ed51b3 2975 else {
012bcf8d 2976 *d++ = (char)uv;
a0ed51b3 2977 }
012bcf8d
GS
2978 }
2979 else {
c4d5f83a 2980 *d++ = (char) uv;
a0ed51b3 2981 }
79072805 2982 continue;
02aa26ce 2983
4a2d328f 2984 case 'N':
ff3f963a
KW
2985 /* In a non-pattern \N must be a named character, like \N{LATIN
2986 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
2987 * mean to match a non-newline. For non-patterns, named
2988 * characters are converted to their string equivalents. In
2989 * patterns, named characters are not converted to their
2990 * ultimate forms for the same reasons that other escapes
2991 * aren't. Instead, they are converted to the \N{U+...} form
2992 * to get the value from the charnames that is in effect right
2993 * now, while preserving the fact that it was a named character
2994 * so that the regex compiler knows this */
2995
2996 /* This section of code doesn't generally use the
2997 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
2998 * a close examination of this macro and determined it is a
2999 * no-op except on utfebcdic variant characters. Every
3000 * character generated by this that would normally need to be
3001 * enclosed by this macro is invariant, so the macro is not
3002 * needed, and would complicate use of copy(). There are other
3003 * parts of this file where the macro is used inconsistently,
3004 * but are saved by it being a no-op */
3005
3006 /* The structure of this section of code (besides checking for
3007 * errors and upgrading to utf8) is:
3008 * Further disambiguate between the two meanings of \N, and if
3009 * not a charname, go process it elsewhere
0a96133f
KW
3010 * If of form \N{U+...}, pass it through if a pattern;
3011 * otherwise convert to utf8
3012 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3013 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3014
3015 /* Here, s points to the 'N'; the test below is guaranteed to
3016 * succeed if we are being called on a pattern as we already
3017 * know from a test above that the next character is a '{'.
3018 * On a non-pattern \N must mean 'named sequence, which
3019 * requires braces */
3020 s++;
3021 if (*s != '{') {
3022 yyerror("Missing braces on \\N{}");
3023 continue;
3024 }
3025 s++;
3026
0a96133f 3027 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3028 if (! (e = strchr(s, '}'))) {
3029 if (! PL_lex_inpat) {
5777a3f7 3030 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3031 } else {
3032 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3033 }
0a96133f 3034 continue;
ff3f963a 3035 }
cddc7ef4 3036
ff3f963a 3037 /* Here it looks like a named character */
cddc7ef4 3038
ff3f963a
KW
3039 if (PL_lex_inpat) {
3040
3041 /* XXX This block is temporary code. \N{} implies that the
3042 * pattern is to have Unicode semantics, and therefore
3043 * currently has to be encoded in utf8. By putting it in
3044 * utf8 now, we save a whole pass in the regular expression
3045 * compiler. Once that code is changed so Unicode
3046 * semantics doesn't necessarily have to be in utf8, this
3047 * block should be removed */
3048 if (!has_utf8) {
77a135fe 3049 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3050 SvPOK_on(sv);
e4f3eed8 3051 *d = '\0';
77a135fe 3052 /* See Note on sizing above. */
7bf79863 3053 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3054 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3055 /* 5 = '\N{' + cur char + NUL */
3056 (STRLEN)(send - s) + 5);
f08d6ad9 3057 d = SvPVX(sv) + SvCUR(sv);
89491803 3058 has_utf8 = TRUE;
ff3f963a
KW
3059 }
3060 }
423cee85 3061
ff3f963a
KW
3062 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3063 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3064 | PERL_SCAN_DISALLOW_PREFIX;
3065 STRLEN len;
3066
3067 /* For \N{U+...}, the '...' is a unicode value even on
3068 * EBCDIC machines */
3069 s += 2; /* Skip to next char after the 'U+' */
3070 len = e - s;
3071 uv = grok_hex(s, &len, &flags, NULL);
3072 if (len == 0 || len != (STRLEN)(e - s)) {
3073 yyerror("Invalid hexadecimal number in \\N{U+...}");
3074 s = e + 1;
3075 continue;
3076 }
3077
3078 if (PL_lex_inpat) {
3079
3080 /* Pass through to the regex compiler unchanged. The
3081 * reason we evaluated the number above is to make sure
0a96133f 3082 * there wasn't a syntax error. */
ff3f963a
KW
3083 s -= 5; /* Include the '\N{U+' */
3084 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3085 d += e - s + 1;
3086 }
3087 else { /* Not a pattern: convert the hex to string */
3088
3089 /* If destination is not in utf8, unconditionally
3090 * recode it to be so. This is because \N{} implies
3091 * Unicode semantics, and scalars have to be in utf8
3092 * to guarantee those semantics */
3093 if (! has_utf8) {
3094 SvCUR_set(sv, d - SvPVX_const(sv));
3095 SvPOK_on(sv);
3096 *d = '\0';
3097 /* See Note on sizing above. */
3098 sv_utf8_upgrade_flags_grow(
3099 sv,
3100 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3101 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3102 d = SvPVX(sv) + SvCUR(sv);
3103 has_utf8 = TRUE;
3104 }
3105
3106 /* Add the string to the output */
3107 if (UNI_IS_INVARIANT(uv)) {
3108 *d++ = (char) uv;
3109 }
3110 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3111 }
3112 }
3113 else { /* Here is \N{NAME} but not \N{U+...}. */
3114
3115 SV *res; /* result from charnames */
3116 const char *str; /* the string in 'res' */
3117 STRLEN len; /* its length */
3118
3119 /* Get the value for NAME */
3120 res = newSVpvn(s, e - s);
3121 res = new_constant( NULL, 0, "charnames",
3122 /* includes all of: \N{...} */
3123 res, NULL, s - 3, e - s + 4 );
3124
3125 /* Most likely res will be in utf8 already since the
3126 * standard charnames uses pack U, but a custom translator
3127 * can leave it otherwise, so make sure. XXX This can be
3128 * revisited to not have charnames use utf8 for characters
3129 * that don't need it when regexes don't have to be in utf8
3130 * for Unicode semantics. If doing so, remember EBCDIC */
3131 sv_utf8_upgrade(res);
3132 str = SvPV_const(res, len);
3133
3134 /* Don't accept malformed input */
3135 if (! is_utf8_string((U8 *) str, len)) {
3136 yyerror("Malformed UTF-8 returned by \\N");
3137 }
3138 else if (PL_lex_inpat) {
3139
3140 if (! len) { /* The name resolved to an empty string */
3141 Copy("\\N{}", d, 4, char);
3142 d += 4;
3143 }
3144 else {
3145 /* In order to not lose information for the regex
3146 * compiler, pass the result in the specially made
3147 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3148 * the code points in hex of each character
3149 * returned by charnames */
3150
3151 const char *str_end = str + len;
3152 STRLEN char_length; /* cur char's byte length */
3153 STRLEN output_length; /* and the number of bytes
3154 after this is translated
3155 into hex digits */
3156 const STRLEN off = d - SvPVX_const(sv);
3157
3158 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3159 * max('U+', '.'); and 1 for NUL */
3160 char hex_string[2 * UTF8_MAXBYTES + 5];
3161
3162 /* Get the first character of the result. */
3163 U32 uv = utf8n_to_uvuni((U8 *) str,
3164 len,
3165 &char_length,
3166 UTF8_ALLOW_ANYUV);
3167
3168 /* The call to is_utf8_string() above hopefully
3169 * guarantees that there won't be an error. But
3170 * it's easy here to make sure. The function just
3171 * above warns and returns 0 if invalid utf8, but
3172 * it can also return 0 if the input is validly a
3173 * NUL. Disambiguate */
3174 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3175 uv = UNICODE_REPLACEMENT;
3176 }
3177
3178 /* Convert first code point to hex, including the
3179 * boiler plate before it */
3180 sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
3181 output_length = strlen(hex_string);
3182
3183 /* Make sure there is enough space to hold it */
3184 d = off + SvGROW(sv, off
3185 + output_length
3186 + (STRLEN)(send - e)
3187 + 2); /* '}' + NUL */
3188 /* And output it */
3189 Copy(hex_string, d, output_length, char);
3190 d += output_length;
3191
3192 /* For each subsequent character, append dot and
3193 * its ordinal in hex */
3194 while ((str += char_length) < str_end) {
3195 const STRLEN off = d - SvPVX_const(sv);
3196 U32 uv = utf8n_to_uvuni((U8 *) str,
3197 str_end - str,
3198 &char_length,
3199 UTF8_ALLOW_ANYUV);
3200 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3201 uv = UNICODE_REPLACEMENT;
3202 }
3203
3204 sprintf(hex_string, ".%X", (unsigned int) uv);
3205 output_length = strlen(hex_string);
3206
3207 d = off + SvGROW(sv, off
3208 + output_length
3209 + (STRLEN)(send - e)
3210 + 2); /* '}' + NUL */
3211 Copy(hex_string, d, output_length, char);
3212 d += output_length;
3213 }
3214
3215 *d++ = '}'; /* Done. Add the trailing brace */
3216 }
3217 }
3218 else { /* Here, not in a pattern. Convert the name to a
3219 * string. */
3220
3221 /* If destination is not in utf8, unconditionally
3222 * recode it to be so. This is because \N{} implies
3223 * Unicode semantics, and scalars have to be in utf8
3224 * to guarantee those semantics */
3225 if (! has_utf8) {
3226 SvCUR_set(sv, d - SvPVX_const(sv));
3227 SvPOK_on(sv);
3228 *d = '\0';
3229 /* See Note on sizing above. */
3230 sv_utf8_upgrade_flags_grow(sv,
3231 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3232 len + (STRLEN)(send - s) + 1);
3233 d = SvPVX(sv) + SvCUR(sv);
3234 has_utf8 = TRUE;
3235 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3236
3237 /* See Note on sizing above. (NOTE: SvCUR() is not
3238 * set correctly here). */
3239 const STRLEN off = d - SvPVX_const(sv);
3240 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3241 }
3242 Copy(str, d, len, char);
3243 d += len;
423cee85 3244 }
423cee85 3245 SvREFCNT_dec(res);
cb233ae3
KW
3246
3247 /* Deprecate non-approved name syntax */
3248 if (ckWARN_d(WARN_DEPRECATED)) {
3249 bool problematic = FALSE;
3250 char* i = s;
3251
3252 /* For non-ut8 input, look to see that the first
3253 * character is an alpha, then loop through the rest
3254 * checking that each is a continuation */
3255 if (! this_utf8) {
3256 if (! isALPHAU(*i)) problematic = TRUE;
3257 else for (i = s + 1; i < e; i++) {
3258 if (isCHARNAME_CONT(*i)) continue;
3259 problematic = TRUE;
3260 break;
3261 }
3262 }
3263 else {
3264 /* Similarly for utf8. For invariants can check
3265 * directly. We accept anything above the latin1
3266 * range because it is immaterial to Perl if it is
3267 * correct or not, and is expensive to check. But
3268 * it is fairly easy in the latin1 range to convert
3269 * the variants into a single character and check
3270 * those */
3271 if (UTF8_IS_INVARIANT(*i)) {
3272 if (! isALPHAU(*i)) problematic = TRUE;
3273 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
3274 if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i,
3275 *(i+1)))))
3276 {
3277 problematic = TRUE;
3278 }
3279 }
3280 if (! problematic) for (i = s + UTF8SKIP(s);
3281 i < e;
3282 i+= UTF8SKIP(i))
3283 {
3284 if (UTF8_IS_INVARIANT(*i)) {
3285 if (isCHARNAME_CONT(*i)) continue;
3286 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3287 continue;
3288 } else if (isCHARNAME_CONT(
3289 UNI_TO_NATIVE(
3290 UTF8_ACCUMULATE(*i, *(i+1)))))
3291 {
3292 continue;
3293 }
3294 problematic = TRUE;
3295 break;
3296 }
3297 }
3298 if (problematic) {
6e1bad6c
KW
3299 /* The e-i passed to the final %.*s makes sure that
3300 * should the trailing NUL be missing that this
3301 * print won't run off the end of the string */
cb233ae3 3302 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3303 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3304 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3305 }
3306 }
3307 } /* End \N{NAME} */
ff3f963a
KW
3308#ifdef EBCDIC
3309 if (!dorange)
3310 native_range = FALSE; /* \N{} is defined to be Unicode */
3311#endif
3312 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3313 continue;
3314
02aa26ce 3315 /* \c is a control character */
79072805
LW
3316 case 'c':
3317 s++;
961ce445 3318 if (s < send) {
f9d13529 3319 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3320 }
961ce445
RGS
3321 else {
3322 yyerror("Missing control char name in \\c");
3323 }
79072805 3324 continue;
02aa26ce
NT
3325
3326 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3327 case 'b':
db42d148 3328 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3329 break;
3330 case 'n':
db42d148 3331 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3332 break;
3333 case 'r':
db42d148 3334 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3335 break;
3336 case 'f':
db42d148 3337 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3338 break;
3339 case 't':
db42d148 3340 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3341 break;
34a3fe2a 3342 case 'e':
db42d148 3343 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3344 break;
3345 case 'a':
db42d148 3346 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3347 break;
02aa26ce
NT
3348 } /* end switch */
3349
79072805
LW
3350 s++;
3351 continue;
02aa26ce 3352 } /* end if (backslash) */
4c3a8340
ST
3353#ifdef EBCDIC
3354 else
3355 literal_endpoint++;
3356#endif
02aa26ce 3357
f9a63242 3358 default_action:
77a135fe
KW
3359 /* If we started with encoded form, or already know we want it,
3360 then encode the next character */
3361 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3362 STRLEN len = 1;
77a135fe
KW
3363
3364
3365 /* One might think that it is wasted effort in the case of the
3366 * source being utf8 (this_utf8 == TRUE) to take the next character
3367 * in the source, convert it to an unsigned value, and then convert
3368 * it back again. But the source has not been validated here. The
3369 * routine that does the conversion checks for errors like
3370 * malformed utf8 */
3371
5f66b61c
AL
3372 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3373 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3374 if (!has_utf8) {
3375 SvCUR_set(sv, d - SvPVX_const(sv));
3376 SvPOK_on(sv);
3377 *d = '\0';
77a135fe 3378 /* See Note on sizing above. */
7bf79863
KW
3379 sv_utf8_upgrade_flags_grow(sv,
3380 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3381 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3382 d = SvPVX(sv) + SvCUR(sv);
3383 has_utf8 = TRUE;
3384 } else if (need > len) {
3385 /* encoded value larger than old, may need extra space (NOTE:
3386 * SvCUR() is not set correctly here). See Note on sizing
3387 * above. */
9d4ba2ae 3388 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3389 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3390 }
77a135fe
KW
3391 s += len;
3392
5f66b61c 3393 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3394#ifdef EBCDIC
3395 if (uv > 255 && !dorange)
3396 native_range = FALSE;
3397#endif
2b9d42f0
NIS
3398 }
3399 else {
3400 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3401 }
02aa26ce
NT
3402 } /* while loop to process each character */
3403
3404 /* terminate the string and set up the sv */
79072805 3405 *d = '\0';
95a20fc0 3406 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3407 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3408 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3409
79072805 3410 SvPOK_on(sv);
9f4817db 3411 if (PL_encoding && !has_utf8) {
d0063567
DK
3412 sv_recode_to_utf8(sv, PL_encoding);
3413 if (SvUTF8(sv))
3414 has_utf8 = TRUE;
9f4817db 3415 }
2b9d42f0 3416 if (has_utf8) {
7e2040f0 3417 SvUTF8_on(sv);
2b9d42f0 3418 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3419 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3420 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3421 }
3422 }
79072805 3423
02aa26ce 3424 /* shrink the sv if we allocated more than we used */
79072805 3425 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3426 SvPV_shrink_to_cur(sv);
79072805 3427 }
02aa26ce 3428
6154021b 3429 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3430 if (s > PL_bufptr) {
eb0d8d16
NC
3431 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3432 const char *const key = PL_lex_inpat ? "qr" : "q";
3433 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3434 const char *type;
3435 STRLEN typelen;
3436
3437 if (PL_lex_inwhat == OP_TRANS) {
3438 type = "tr";
3439 typelen = 2;
3440 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3441 type = "s";
3442 typelen = 1;
3443 } else {
3444 type = "qq";
3445 typelen = 2;
3446 }
3447
3448 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3449 type, typelen);
3450 }
6154021b 3451 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3452 } else
8990e307 3453 SvREFCNT_dec(sv);
79072805
LW
3454 return s;
3455}
3456
ffb4593c
NT
3457/* S_intuit_more
3458 * Returns TRUE if there's more to the expression (e.g., a subscript),
3459 * FALSE otherwise.
ffb4593c
NT
3460 *
3461 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3462 *
3463 * ->[ and ->{ return TRUE
3464 * { and [ outside a pattern are always subscripts, so return TRUE
3465 * if we're outside a pattern and it's not { or [, then return FALSE
3466 * if we're in a pattern and the first char is a {
3467 * {4,5} (any digits around the comma) returns FALSE
3468 * if we're in a pattern and the first char is a [
3469 * [] returns FALSE
3470 * [SOMETHING] has a funky algorithm to decide whether it's a
3471 * character class or not. It has to deal with things like
3472 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3473 * anything else returns TRUE
3474 */
3475
9cbb5ea2
GS
3476/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3477
76e3520e 3478STATIC int
cea2e8a9 3479S_intuit_more(pTHX_ register char *s)
79072805 3480{
97aff369 3481 dVAR;
7918f24d
NC
3482
3483 PERL_ARGS_ASSERT_INTUIT_MORE;
3484
3280af22 3485 if (PL_lex_brackets)
79072805
LW
3486 return TRUE;
3487 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3488 return TRUE;
3489 if (*s != '{' && *s != '[')
3490 return FALSE;
3280af22 3491 if (!PL_lex_inpat)
79072805
LW
3492 return TRUE;
3493
3494 /* In a pattern, so maybe we have {n,m}. */
3495 if (*s == '{') {
b3155d95 3496 if (regcurly(s)) {
79072805 3497 return FALSE;
b3155d95 3498 }
79072805 3499 return TRUE;
79072805
LW
3500 }
3501
3502 /* On the other hand, maybe we have a character class */
3503
3504 s++;
3505 if (*s == ']' || *s == '^')
3506 return FALSE;
3507 else {
ffb4593c 3508 /* this is terrifying, and it works */
79072805
LW
3509 int weight = 2; /* let's weigh the evidence */
3510 char seen[256];
f27ffc4a 3511 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3512 const char * const send = strchr(s,']');
3280af22 3513 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3514
3515 if (!send) /* has to be an expression */
3516 return TRUE;
3517
3518 Zero(seen,256,char);
3519 if (*s == '$')
3520 weight -= 3;
3521 else if (isDIGIT(*s)) {
3522 if (s[1] != ']') {
3523 if (isDIGIT(s[1]) && s[2] == ']')
3524 weight -= 10;
3525 }
3526 else
3527 weight -= 100;
3528 }
3529 for (; s < send; s++) {
3530 last_un_char = un_char;
3531 un_char = (unsigned char)*s;
3532 switch (*s) {
3533 case '@':
3534 case '&':
3535 case '$':
3536 weight -= seen[un_char] * 10;
7e2040f0 3537 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3538 int len;
8903cb82 3539 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3540 len = (int)strlen(tmpbuf);
3541 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3542 weight -= 100;
3543 else
3544 weight -= 10;
3545 }
3546 else if (*s == '$' && s[1] &&
93a17b20
LW
3547 strchr("[#!%*<>()-=",s[1])) {
3548 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3549 weight -= 10;
3550 else
3551 weight -= 1;
3552 }
3553 break;
3554 case '\\':
3555 un_char = 254;
3556 if (s[1]) {
93a17b20 3557 if (strchr("wds]",s[1]))
79072805 3558 weight += 100;
10edeb5d 3559 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3560 weight += 1;
93a17b20 3561 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3562 weight += 40;
3563 else if (isDIGIT(s[1])) {
3564 weight += 40;
3565 while (s[1] && isDIGIT(s[1]))
3566 s++;
3567 }
3568 }
3569 else
3570 weight += 100;
3571 break;
3572 case '-':
3573 if (s[1] == '\\')
3574 weight += 50;
93a17b20 3575 if (strchr("aA01! ",last_un_char))
79072805 3576 weight += 30;
93a17b20 3577 if (strchr("zZ79~",s[1]))
79072805 3578 weight += 30;
f27ffc4a
GS
3579 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3580 weight -= 5; /* cope with negative subscript */
79072805
LW
3581 break;
3582 default:
3792a11b
NC
3583 if (!isALNUM(last_un_char)
3584 && !(last_un_char == '$' || last_un_char == '@'
3585 || last_un_char == '&')
3586 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3587 char *d = tmpbuf;
3588 while (isALPHA(*s))
3589 *d++ = *s++;
3590 *d = '\0';
5458a98a 3591 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3592 weight -= 150;
3593 }
3594 if (un_char == last_un_char + 1)
3595 weight += 5;
3596 weight -= seen[un_char];
3597 break;
3598 }
3599 seen[un_char]++;
3600 }
3601 if (weight >= 0) /* probably a character class */
3602 return FALSE;
3603 }
3604
3605 return TRUE;
3606}
ffed7fef 3607
ffb4593c
NT
3608/*
3609 * S_intuit_method
3610 *
3611 * Does all the checking to disambiguate
3612 * foo bar
3613 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3614 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3615 *
3616 * First argument is the stuff after the first token, e.g. "bar".
3617 *
3618 * Not a method if bar is a filehandle.
3619 * Not a method if foo is a subroutine prototyped to take a filehandle.
3620 * Not a method if it's really "Foo $bar"
3621 * Method if it's "foo $bar"
3622 * Not a method if it's really "print foo $bar"
3623 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3624 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3625 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3626 * =>
3627 */
3628
76e3520e 3629STATIC int
62d55b22 3630S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3631{
97aff369 3632 dVAR;
a0d0e21e 3633 char *s = start + (*start == '$');
3280af22 3634 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3635 STRLEN len;
3636 GV* indirgv;
5db06880
NC
3637#ifdef PERL_MAD
3638 int soff;
3639#endif
a0d0e21e 3640
7918f24d
NC
3641 PERL_ARGS_ASSERT_INTUIT_METHOD;
3642
a0d0e21e 3643 if (gv) {
62d55b22 3644 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3645 return 0;
62d55b22
NC
3646 if (cv) {
3647 if (SvPOK(cv)) {
3648 const char *proto = SvPVX_const(cv);
3649 if (proto) {
3650 if (*proto == ';')
3651 proto++;
3652 if (*proto == '*')
3653 return 0;
3654 }
b6c543e3
IZ
3655 }
3656 } else
c35e046a 3657 gv = NULL;
a0d0e21e 3658 }
8903cb82 3659 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3660 /* start is the beginning of the possible filehandle/object,
3661 * and s is the end of it
3662 * tmpbuf is a copy of it
3663 */
3664
a0d0e21e 3665 if (*start == '$') {
3ef1310e
RGS
3666 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3667 isUPPER(*PL_tokenbuf))
a0d0e21e 3668 return 0;
5db06880
NC
3669#ifdef PERL_MAD
3670 len = start - SvPVX(PL_linestr);
3671#endif
29595ff2 3672 s = PEEKSPACE(s);
f0092767 3673#ifdef PERL_MAD
5db06880
NC
3674 start = SvPVX(PL_linestr) + len;
3675#endif
3280af22
NIS
3676 PL_bufptr = start;
3677 PL_expect = XREF;
a0d0e21e
LW
3678 return *s == '(' ? FUNCMETH : METHOD;
3679 }
5458a98a 3680 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3681 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3682 len -= 2;
3683 tmpbuf[len] = '\0';
5db06880
NC
3684#ifdef PERL_MAD
3685 soff = s - SvPVX(PL_linestr);
3686#endif
c3e0f903
GS
3687 goto bare_package;
3688 }
90e5519e 3689 indirgv = gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PVCV);
8ebc5c01 3690 if (indirgv && GvCVu(indirgv))
a0d0e21e
LW
3691 return 0;
3692 /* filehandle or package name makes it a method */
da51bb9b 3693 if (!gv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, 0)) {
5db06880
NC
3694#ifdef PERL_MAD
3695 soff = s - SvPVX(PL_linestr);
3696#endif
29595ff2 3697 s = PEEKSPACE(s);
3280af22 3698 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
55497cff 3699 return 0; /* no assumptions -- "=>" quotes bearword */
c3e0f903 3700 bare_package:
cd81e915 3701 start_force(PL_curforce);
9ded7720 3702 NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
64142370 3703 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
9ded7720 3704 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
5db06880
NC
3705 if (PL_madskills)
3706 curmad('X', newSVpvn(start,SvPVX(PL_linestr) + soff - start));
3280af22 3707 PL_expect = XTERM;
a0d0e21e 3708 force_next(WORD);
3280af22 3709 PL_bufptr = s;
5db06880
NC
3710#ifdef PERL_MAD
3711 PL_bufptr = SvPVX(PL_linestr) + soff; /* restart before space */
3712#endif
a0d0e21e
LW
3713 return *s == '(' ? FUNCMETH : METHOD;
3714 }
3715 }
3716 return 0;
3717}
3718
16d20bd9 3719/* Encoded script support. filter_add() effectively inserts a
4e553d73 3720 * 'pre-processing' function into the current source input stream.
16d20bd9
AD
3721 * Note that the filter function only applies to the current source file
3722 * (e.g., it will not affect files 'require'd or 'use'd by this one).
3723 *
3724 * The datasv parameter (which may be NULL) can be used to pass
3725 * private data to this instance of the filter. The filter function
3726 * can recover the SV using the FILTER_DATA macro and use it to
3727 * store private buffers and state information.
3728 *
3729 * The supplied datasv parameter is upgraded to a PVIO type
4755096e 3730 * and the IoDIRP/IoANY field is used to store the function pointer,
e0c19803 3731 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
16d20bd9
AD
3732 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
3733 * private use must be set using malloc'd pointers.
3734 */
16d20bd9
AD
3735
3736SV *
864dbfa3 3737Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
16d20bd9 3738{
97aff369 3739 dVAR;
f4c556ac 3740 if (!funcp)
a0714e2c 3741 return NULL;
f4c556ac 3742
5486870f
DM
3743 if (!PL_parser)
3744 return NULL;
3745
3280af22
NIS
3746 if (!PL_rsfp_filters)
3747 PL_rsfp_filters = newAV();
16d20bd9 3748 if (!datasv)
561b68a9 3749 datasv = newSV(0);
862a34c6 3750 SvUPGRADE(datasv, SVt_PVIO);
8141890a 3751 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
e0c19803 3752 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
f4c556ac 3753 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
55662e27
JH
3754 FPTR2DPTR(void *, IoANY(datasv)),
3755 SvPV_nolen(datasv)));
3280af22
NIS
3756 av_unshift(PL_rsfp_filters, 1);
3757 av_store(PL_rsfp_filters, 0, datasv) ;
16d20bd9
AD
3758 return(datasv);
3759}
4e553d73 3760
16d20bd9
AD
3761
3762/* Delete most recently added instance of this filter function. */
a0d0e21e 3763void
864dbfa3 3764Perl_filter_del(pTHX_ filter_t funcp)
16d20bd9 3765{
97aff369 3766 dVAR;
e0c19803 3767 SV *datasv;
24801a4b 3768
7918f24d
NC
3769 PERL_ARGS_ASSERT_FILTER_DEL;
3770
33073adb 3771#ifdef DEBUGGING
55662e27
JH
3772 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",