This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #33752] B::Deparse fails with utf8 and our
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
51#define PL_lex_brackstack (PL_parser->lex_brackstack)
52#define PL_lex_casemods (PL_parser->lex_casemods)
53#define PL_lex_casestack (PL_parser->lex_casestack)
54#define PL_lex_defer (PL_parser->lex_defer)
55#define PL_lex_dojoin (PL_parser->lex_dojoin)
56#define PL_lex_expect (PL_parser->lex_expect)
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
67#define PL_pending_ident (PL_parser->pending_ident)
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7
DM
89
90#ifdef PERL_MAD
91# define PL_endwhite (PL_parser->endwhite)
92# define PL_faketokens (PL_parser->faketokens)
93# define PL_lasttoke (PL_parser->lasttoke)
94# define PL_nextwhite (PL_parser->nextwhite)
95# define PL_realtokenstart (PL_parser->realtokenstart)
96# define PL_skipwhite (PL_parser->skipwhite)
97# define PL_thisclose (PL_parser->thisclose)
98# define PL_thismad (PL_parser->thismad)
99# define PL_thisopen (PL_parser->thisopen)
100# define PL_thisstuff (PL_parser->thisstuff)
101# define PL_thistoken (PL_parser->thistoken)
102# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
103# define PL_thiswhite (PL_parser->thiswhite)
104# define PL_nexttoke (PL_parser->nexttoke)
105# define PL_curforce (PL_parser->curforce)
106#else
107# define PL_nexttoke (PL_parser->nexttoke)
108# define PL_nexttype (PL_parser->nexttype)
109# define PL_nextval (PL_parser->nextval)
199e78b7
DM
110#endif
111
16173588
NC
112/* This can't be done with embed.fnc, because struct yy_parser contains a
113 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
114static int
115S_pending_ident(pTHX);
199e78b7 116
0bd48802 117static const char ident_too_long[] = "Identifier too long";
8903cb82 118
29595ff2 119#ifdef PERL_MAD
29595ff2 120# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 121# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 122#else
5db06880 123# define CURMAD(slot,sv)
9ded7720 124# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
125#endif
126
9059aa12
LW
127#define XFAKEBRACK 128
128#define XENUMMASK 127
129
39e02b42
JH
130#ifdef USE_UTF8_SCRIPTS
131# define UTF (!IN_BYTES)
2b9d42f0 132#else
746b446a 133# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 134#endif
a0ed51b3 135
b1fc3636
CJ
136/* The maximum number of characters preceding the unrecognized one to display */
137#define UNRECOGNIZED_PRECEDE_COUNT 10
138
61f0cdd9 139/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
140 * 1999-02-27 mjd-perl-patch@plover.com */
141#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
142
bf4acbe4 143#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 144
ffb4593c
NT
145/* LEX_* are values for PL_lex_state, the state of the lexer.
146 * They are arranged oddly so that the guard on the switch statement
79072805
LW
147 * can get by with a single comparison (if the compiler is smart enough).
148 */
149
fb73857a 150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805
LW
185#ifdef ff_next
186#undef ff_next
d48672a2
LW
187#endif
188
79072805 189#include "keywords.h"
fe14fcc3 190
ffb4593c
NT
191/* CLINE is a macro that ensures PL_copline has a sane value */
192
ae986130
LW
193#ifdef CLINE
194#undef CLINE
195#endif
57843af0 196#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 197
5db06880 198#ifdef PERL_MAD
29595ff2
NC
199# define SKIPSPACE0(s) skipspace0(s)
200# define SKIPSPACE1(s) skipspace1(s)
201# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
202# define PEEKSPACE(s) skipspace2(s,0)
203#else
204# define SKIPSPACE0(s) skipspace(s)
205# define SKIPSPACE1(s) skipspace(s)
206# define SKIPSPACE2(s,tsv) skipspace(s)
207# define PEEKSPACE(s) skipspace(s)
208#endif
209
ffb4593c
NT
210/*
211 * Convenience functions to return different tokens and prime the
9cbb5ea2 212 * lexer for the next token. They all take an argument.
ffb4593c
NT
213 *
214 * TOKEN : generic token (used for '(', DOLSHARP, etc)
215 * OPERATOR : generic operator
216 * AOPERATOR : assignment operator
217 * PREBLOCK : beginning the block after an if, while, foreach, ...
218 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
219 * PREREF : *EXPR where EXPR is not a simple identifier
220 * TERM : expression term
221 * LOOPX : loop exiting command (goto, last, dump, etc)
222 * FTST : file test operator
223 * FUN0 : zero-argument function
2d2e263d 224 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
225 * BOop : bitwise or or xor
226 * BAop : bitwise and
227 * SHop : shift operator
228 * PWop : power operator
9cbb5ea2 229 * PMop : pattern-matching operator
ffb4593c
NT
230 * Aop : addition-level operator
231 * Mop : multiplication-level operator
232 * Eop : equality-testing operator
e5edeb50 233 * Rop : relational operator <= != gt
ffb4593c
NT
234 *
235 * Also see LOP and lop() below.
236 */
237
998054bd 238#ifdef DEBUGGING /* Serve -DT. */
704d4215 239# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 240#else
bbf60fe6 241# define REPORT(retval) (retval)
998054bd
SC
242#endif
243
bbf60fe6
DM
244#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
245#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
246#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
247#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
248#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
249#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
250#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
251#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
252#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
253#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
254#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
255#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
256#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
257#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
258#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
259#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
260#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
261#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
262#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
263#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 264
a687059c
LW
265/* This bit of chicanery makes a unary function followed by
266 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
267 * The UNIDOR macro is for unary functions that can be followed by the //
268 * operator (such as C<shift // 0>).
a687059c 269 */
376fcdbf 270#define UNI2(f,x) { \
6154021b 271 pl_yylval.ival = f; \
376fcdbf
AL
272 PL_expect = x; \
273 PL_bufptr = s; \
274 PL_last_uni = PL_oldbufptr; \
275 PL_last_lop_op = f; \
276 if (*s == '(') \
277 return REPORT( (int)FUNC1 ); \
29595ff2 278 s = PEEKSPACE(s); \
376fcdbf
AL
279 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
280 }
6f33ba73
RGS
281#define UNI(f) UNI2(f,XTERM)
282#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 283
376fcdbf 284#define UNIBRACK(f) { \
6154021b 285 pl_yylval.ival = f; \
376fcdbf
AL
286 PL_bufptr = s; \
287 PL_last_uni = PL_oldbufptr; \
288 if (*s == '(') \
289 return REPORT( (int)FUNC1 ); \
29595ff2 290 s = PEEKSPACE(s); \
376fcdbf
AL
291 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
292 }
79072805 293
9f68db38 294/* grandfather return to old style */
6154021b 295#define OLDLOP(f) return(pl_yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP)
79072805 296
8fa7f367
JH
297#ifdef DEBUGGING
298
6154021b 299/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
300enum token_type {
301 TOKENTYPE_NONE,
302 TOKENTYPE_IVAL,
6154021b 303 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
304 TOKENTYPE_PVAL,
305 TOKENTYPE_OPVAL,
306 TOKENTYPE_GVVAL
307};
308
6d4a66ac
NC
309static struct debug_tokens {
310 const int token;
311 enum token_type type;
312 const char *name;
313} const debug_tokens[] =
9041c2e3 314{
bbf60fe6
DM
315 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
316 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
317 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
318 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
319 { ARROW, TOKENTYPE_NONE, "ARROW" },
320 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
321 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
322 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
323 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
324 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 325 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
326 { DO, TOKENTYPE_NONE, "DO" },
327 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
328 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
329 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
330 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
331 { ELSE, TOKENTYPE_NONE, "ELSE" },
332 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
333 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
334 { FOR, TOKENTYPE_IVAL, "FOR" },
335 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
336 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
337 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
338 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
339 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
340 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 341 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
342 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
343 { IF, TOKENTYPE_IVAL, "IF" },
344 { LABEL, TOKENTYPE_PVAL, "LABEL" },
345 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
346 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
347 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
348 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
349 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
350 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
351 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
352 { MY, TOKENTYPE_IVAL, "MY" },
353 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
354 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
355 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
356 { OROP, TOKENTYPE_IVAL, "OROP" },
357 { OROR, TOKENTYPE_NONE, "OROR" },
358 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
359 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
360 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
361 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
362 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
363 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
364 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
365 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
366 { PREINC, TOKENTYPE_NONE, "PREINC" },
367 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
368 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
369 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
370 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
371 { SUB, TOKENTYPE_NONE, "SUB" },
372 { THING, TOKENTYPE_OPVAL, "THING" },
373 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
374 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
375 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
376 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
377 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
378 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 379 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
380 { WHILE, TOKENTYPE_IVAL, "WHILE" },
381 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 382 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 383 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
384};
385
6154021b 386/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 387
bbf60fe6 388STATIC int
704d4215 389S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 390{
97aff369 391 dVAR;
7918f24d
NC
392
393 PERL_ARGS_ASSERT_TOKEREPORT;
394
bbf60fe6 395 if (DEBUG_T_TEST) {
bd61b366 396 const char *name = NULL;
bbf60fe6 397 enum token_type type = TOKENTYPE_NONE;
f54cb97a 398 const struct debug_tokens *p;
396482e1 399 SV* const report = newSVpvs("<== ");
bbf60fe6 400
f54cb97a 401 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
402 if (p->token == (int)rv) {
403 name = p->name;
404 type = p->type;
405 break;
406 }
407 }
408 if (name)
54667de8 409 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
410 else if ((char)rv > ' ' && (char)rv < '~')
411 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
412 else if (!rv)
396482e1 413 sv_catpvs(report, "EOF");
bbf60fe6
DM
414 else
415 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
416 switch (type) {
417 case TOKENTYPE_NONE:
418 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
419 break;
420 case TOKENTYPE_IVAL:
704d4215 421 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 425 PL_op_name[lvalp->ival]);
bbf60fe6
DM
426 break;
427 case TOKENTYPE_PVAL:
704d4215 428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
429 break;
430 case TOKENTYPE_OPVAL:
704d4215 431 if (lvalp->opval) {
401441c0 432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 435 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 436 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
437 }
438
439 }
401441c0 440 else
396482e1 441 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
442 break;
443 }
b6007c36 444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
445 };
446 return (int)rv;
998054bd
SC
447}
448
b6007c36
DM
449
450/* print the buffer with suitable escapes */
451
452STATIC void
15f169a1 453S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 454{
396482e1 455 SV* const tmp = newSVpvs("");
7918f24d
NC
456
457 PERL_ARGS_ASSERT_PRINTBUF;
458
b6007c36
DM
459 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
460 SvREFCNT_dec(tmp);
461}
462
8fa7f367
JH
463#endif
464
8290c323
NC
465static int
466S_deprecate_commaless_var_list(pTHX) {
467 PL_expect = XTERM;
468 deprecate("comma-less variable list");
469 return REPORT(','); /* grandfather non-comma-format format */
470}
471
ffb4593c
NT
472/*
473 * S_ao
474 *
c963b151
BD
475 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
476 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
477 */
478
76e3520e 479STATIC int
cea2e8a9 480S_ao(pTHX_ int toketype)
a0d0e21e 481{
97aff369 482 dVAR;
3280af22
NIS
483 if (*PL_bufptr == '=') {
484 PL_bufptr++;
a0d0e21e 485 if (toketype == ANDAND)
6154021b 486 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 487 else if (toketype == OROR)
6154021b 488 pl_yylval.ival = OP_ORASSIGN;
c963b151 489 else if (toketype == DORDOR)
6154021b 490 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
491 toketype = ASSIGNOP;
492 }
493 return toketype;
494}
495
ffb4593c
NT
496/*
497 * S_no_op
498 * When Perl expects an operator and finds something else, no_op
499 * prints the warning. It always prints "<something> found where
500 * operator expected. It prints "Missing semicolon on previous line?"
501 * if the surprise occurs at the start of the line. "do you need to
502 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
503 * where the compiler doesn't know if foo is a method call or a function.
504 * It prints "Missing operator before end of line" if there's nothing
505 * after the missing operator, or "... before <...>" if there is something
506 * after the missing operator.
507 */
508
76e3520e 509STATIC void
15f169a1 510S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 511{
97aff369 512 dVAR;
9d4ba2ae
AL
513 char * const oldbp = PL_bufptr;
514 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 515
7918f24d
NC
516 PERL_ARGS_ASSERT_NO_OP;
517
1189a94a
GS
518 if (!s)
519 s = oldbp;
07c798fb 520 else
1189a94a 521 PL_bufptr = s;
cea2e8a9 522 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
523 if (ckWARN_d(WARN_SYNTAX)) {
524 if (is_first)
525 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
526 "\t(Missing semicolon on previous line?)\n");
527 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 528 const char *t;
c35e046a
AL
529 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
530 NOOP;
56da5a46
RGS
531 if (t < PL_bufptr && isSPACE(*t))
532 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
533 "\t(Do you need to predeclare %.*s?)\n",
551405c4 534 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
535 }
536 else {
537 assert(s >= oldbp);
538 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 539 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 540 }
07c798fb 541 }
3280af22 542 PL_bufptr = oldbp;
8990e307
LW
543}
544
ffb4593c
NT
545/*
546 * S_missingterm
547 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 548 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
549 * If we're in a delimited string and the delimiter is a control
550 * character, it's reformatted into a two-char sequence like ^C.
551 * This is fatal.
552 */
553
76e3520e 554STATIC void
cea2e8a9 555S_missingterm(pTHX_ char *s)
8990e307 556{
97aff369 557 dVAR;
8990e307
LW
558 char tmpbuf[3];
559 char q;
560 if (s) {
9d4ba2ae 561 char * const nl = strrchr(s,'\n');
d2719217 562 if (nl)
8990e307
LW
563 *nl = '\0';
564 }
463559e7 565 else if (isCNTRL(PL_multi_close)) {
8990e307 566 *tmpbuf = '^';
585ec06d 567 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
568 tmpbuf[2] = '\0';
569 s = tmpbuf;
570 }
571 else {
eb160463 572 *tmpbuf = (char)PL_multi_close;
8990e307
LW
573 tmpbuf[1] = '\0';
574 s = tmpbuf;
575 }
576 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 577 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 578}
79072805 579
ef89dcc3 580#define FEATURE_IS_ENABLED(name) \
0d863452 581 ((0 != (PL_hints & HINT_LOCALIZE_HH)) \
89529cee 582 && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name)))
4a731d7b 583/* The longest string we pass in. */
1863b879 584#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1)
4a731d7b 585
0d863452
RH
586/*
587 * S_feature_is_enabled
588 * Check whether the named feature is enabled.
589 */
590STATIC bool
15f169a1 591S_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 592{
97aff369 593 dVAR;
0d863452 594 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 595 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
596
597 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
598
4a731d7b
NC
599 assert(namelen <= MAX_FEATURE_LEN);
600 memcpy(&he_name[8], name, namelen);
d4c19fe8 601
7b9ef140 602 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
603}
604
ffb4593c 605/*
9cbb5ea2
GS
606 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
607 * utf16-to-utf8-reversed.
ffb4593c
NT
608 */
609
c39cd008
GS
610#ifdef PERL_CR_FILTER
611static void
612strip_return(SV *sv)
613{
95a20fc0 614 register const char *s = SvPVX_const(sv);
9d4ba2ae 615 register const char * const e = s + SvCUR(sv);
7918f24d
NC
616
617 PERL_ARGS_ASSERT_STRIP_RETURN;
618
c39cd008
GS
619 /* outer loop optimized to do nothing if there are no CR-LFs */
620 while (s < e) {
621 if (*s++ == '\r' && *s == '\n') {
622 /* hit a CR-LF, need to copy the rest */
623 register char *d = s - 1;
624 *d++ = *s++;
625 while (s < e) {
626 if (*s == '\r' && s[1] == '\n')
627 s++;
628 *d++ = *s++;
629 }
630 SvCUR(sv) -= s - d;
631 return;
632 }
633 }
634}
a868473f 635
76e3520e 636STATIC I32
c39cd008 637S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 638{
f54cb97a 639 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
640 if (count > 0 && !maxlen)
641 strip_return(sv);
642 return count;
a868473f
NIS
643}
644#endif
645
ffb4593c 646/*
8eaa0acf
Z
647=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
648
649Creates and initialises a new lexer/parser state object, supplying
650a context in which to lex and parse from a new source of Perl code.
651A pointer to the new state object is placed in L</PL_parser>. An entry
652is made on the save stack so that upon unwinding the new state object
653will be destroyed and the former value of L</PL_parser> will be restored.
654Nothing else need be done to clean up the parsing context.
655
656The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
657non-null, provides a string (in SV form) containing code to be parsed.
658A copy of the string is made, so subsequent modification of I<line>
659does not affect parsing. I<rsfp>, if non-null, provides an input stream
660from which code will be read to be parsed. If both are non-null, the
661code in I<line> comes first and must consist of complete lines of input,
662and I<rsfp> supplies the remainder of the source.
663
664The I<flags> parameter is reserved for future use, and must always
665be zero.
666
667=cut
668*/
ffb4593c 669
a0d0e21e 670void
8eaa0acf 671Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 672{
97aff369 673 dVAR;
6ef55633 674 const char *s = NULL;
8990e307 675 STRLEN len;
5486870f 676 yy_parser *parser, *oparser;
8eaa0acf
Z
677 if (flags)
678 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
679
680 /* create and initialise a parser */
681
199e78b7 682 Newxz(parser, 1, yy_parser);
5486870f 683 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
684 PL_parser = parser;
685
28ac2b49
Z
686 parser->stack = NULL;
687 parser->ps = NULL;
688 parser->stack_size = 0;
acdf0a21 689
e3abe207
DM
690 /* on scope exit, free this parser and restore any outer one */
691 SAVEPARSER(parser);
7c4baf47 692 parser->saved_curcop = PL_curcop;
e3abe207 693
acdf0a21 694 /* initialise lexer state */
8990e307 695
fb205e7a
DM
696#ifdef PERL_MAD
697 parser->curforce = -1;
698#else
699 parser->nexttoke = 0;
700#endif
ca4cfd28 701 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 702 parser->copline = NOLINE;
5afb0a62 703 parser->lex_state = LEX_NORMAL;
c2598295 704 parser->expect = XSTATE;
2f9285f8 705 parser->rsfp = rsfp;
f07ec6dd 706 parser->rsfp_filters = newAV();
2f9285f8 707
199e78b7
DM
708 Newx(parser->lex_brackstack, 120, char);
709 Newx(parser->lex_casestack, 12, char);
710 *parser->lex_casestack = '\0';
02b34bbe 711
10efb74f
NC
712 if (line) {
713 s = SvPV_const(line, len);
714 } else {
715 len = 0;
716 }
bdc0bf6f 717
10efb74f 718 if (!len) {
bdc0bf6f 719 parser->linestr = newSVpvs("\n;");
805700c1 720 } else {
719a9bb0 721 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 722 if (s[len-1] != ';')
bdc0bf6f 723 sv_catpvs(parser->linestr, "\n;");
8990e307 724 }
f06b5848
DM
725 parser->oldoldbufptr =
726 parser->oldbufptr =
727 parser->bufptr =
728 parser->linestart = SvPVX(parser->linestr);
729 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
730 parser->last_lop = parser->last_uni = NULL;
737c24fc
Z
731
732 parser->in_pod = 0;
79072805 733}
a687059c 734
e3abe207
DM
735
736/* delete a parser object */
737
738void
739Perl_parser_free(pTHX_ const yy_parser *parser)
740{
7918f24d
NC
741 PERL_ARGS_ASSERT_PARSER_FREE;
742
7c4baf47 743 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
744 SvREFCNT_dec(parser->linestr);
745
2f9285f8
DM
746 if (parser->rsfp == PerlIO_stdin())
747 PerlIO_clearerr(parser->rsfp);
799361c3
SH
748 else if (parser->rsfp && (!parser->old_parser ||
749 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 750 PerlIO_close(parser->rsfp);
5486870f 751 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 752
e3abe207
DM
753 Safefree(parser->lex_brackstack);
754 Safefree(parser->lex_casestack);
755 PL_parser = parser->old_parser;
756 Safefree(parser);
757}
758
759
ffb4593c 760/*
f0e67a1d
Z
761=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
762
763Buffer scalar containing the chunk currently under consideration of the
764text currently being lexed. This is always a plain string scalar (for
765which C<SvPOK> is true). It is not intended to be used as a scalar by
766normal scalar means; instead refer to the buffer directly by the pointer
767variables described below.
768
769The lexer maintains various C<char*> pointers to things in the
770C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
771reallocated, all of these pointers must be updated. Don't attempt to
772do this manually, but rather use L</lex_grow_linestr> if you need to
773reallocate the buffer.
774
775The content of the text chunk in the buffer is commonly exactly one
776complete line of input, up to and including a newline terminator,
777but there are situations where it is otherwise. The octets of the
778buffer may be intended to be interpreted as either UTF-8 or Latin-1.
779The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
780flag on this scalar, which may disagree with it.
781
782For direct examination of the buffer, the variable
783L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
784lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
785of these pointers is usually preferable to examination of the scalar
786through normal scalar means.
787
788=for apidoc AmxU|char *|PL_parser-E<gt>bufend
789
790Direct pointer to the end of the chunk of text currently being lexed, the
791end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
792+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
793always located at the end of the buffer, and does not count as part of
794the buffer's contents.
795
796=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
797
798Points to the current position of lexing inside the lexer buffer.
799Characters around this point may be freely examined, within
800the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
801L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
802interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
803
804Lexing code (whether in the Perl core or not) moves this pointer past
805the characters that it consumes. It is also expected to perform some
806bookkeeping whenever a newline character is consumed. This movement
807can be more conveniently performed by the function L</lex_read_to>,
808which handles newlines appropriately.
809
810Interpretation of the buffer's octets can be abstracted out by
811using the slightly higher-level functions L</lex_peek_unichar> and
812L</lex_read_unichar>.
813
814=for apidoc AmxU|char *|PL_parser-E<gt>linestart
815
816Points to the start of the current line inside the lexer buffer.
817This is useful for indicating at which column an error occurred, and
818not much else. This must be updated by any lexing code that consumes
819a newline; the function L</lex_read_to> handles this detail.
820
821=cut
822*/
823
824/*
825=for apidoc Amx|bool|lex_bufutf8
826
827Indicates whether the octets in the lexer buffer
828(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
829of Unicode characters. If not, they should be interpreted as Latin-1
830characters. This is analogous to the C<SvUTF8> flag for scalars.
831
832In UTF-8 mode, it is not guaranteed that the lexer buffer actually
833contains valid UTF-8. Lexing code must be robust in the face of invalid
834encoding.
835
836The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
837is significant, but not the whole story regarding the input character
838encoding. Normally, when a file is being read, the scalar contains octets
839and its C<SvUTF8> flag is off, but the octets should be interpreted as
840UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
841however, the scalar may have the C<SvUTF8> flag on, and in this case its
842octets should be interpreted as UTF-8 unless the C<use bytes> pragma
843is in effect. This logic may change in the future; use this function
844instead of implementing the logic yourself.
845
846=cut
847*/
848
849bool
850Perl_lex_bufutf8(pTHX)
851{
852 return UTF;
853}
854
855/*
856=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
857
858Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
859at least I<len> octets (including terminating NUL). Returns a
860pointer to the reallocated buffer. This is necessary before making
861any direct modification of the buffer that would increase its length.
862L</lex_stuff_pvn> provides a more convenient way to insert text into
863the buffer.
864
865Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
866this function updates all of the lexer's variables that point directly
867into the buffer.
868
869=cut
870*/
871
872char *
873Perl_lex_grow_linestr(pTHX_ STRLEN len)
874{
875 SV *linestr;
876 char *buf;
877 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
878 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
879 linestr = PL_parser->linestr;
880 buf = SvPVX(linestr);
881 if (len <= SvLEN(linestr))
882 return buf;
883 bufend_pos = PL_parser->bufend - buf;
884 bufptr_pos = PL_parser->bufptr - buf;
885 oldbufptr_pos = PL_parser->oldbufptr - buf;
886 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
887 linestart_pos = PL_parser->linestart - buf;
888 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
889 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
890 buf = sv_grow(linestr, len);
891 PL_parser->bufend = buf + bufend_pos;
892 PL_parser->bufptr = buf + bufptr_pos;
893 PL_parser->oldbufptr = buf + oldbufptr_pos;
894 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
895 PL_parser->linestart = buf + linestart_pos;
896 if (PL_parser->last_uni)
897 PL_parser->last_uni = buf + last_uni_pos;
898 if (PL_parser->last_lop)
899 PL_parser->last_lop = buf + last_lop_pos;
900 return buf;
901}
902
903/*
83aa740e 904=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
905
906Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
907immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
908reallocating the buffer if necessary. This means that lexing code that
909runs later will see the characters as if they had appeared in the input.
910It is not recommended to do this as part of normal parsing, and most
911uses of this facility run the risk of the inserted characters being
912interpreted in an unintended manner.
913
914The string to be inserted is represented by I<len> octets starting
915at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
916according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
917The characters are recoded for the lexer buffer, according to how the
918buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 919to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
920function is more convenient.
921
922=cut
923*/
924
925void
83aa740e 926Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 927{
749123ff 928 dVAR;
f0e67a1d
Z
929 char *bufptr;
930 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
931 if (flags & ~(LEX_STUFF_UTF8))
932 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
933 if (UTF) {
934 if (flags & LEX_STUFF_UTF8) {
935 goto plain_copy;
936 } else {
937 STRLEN highhalf = 0;
83aa740e 938 const char *p, *e = pv+len;
f0e67a1d
Z
939 for (p = pv; p != e; p++)
940 highhalf += !!(((U8)*p) & 0x80);
941 if (!highhalf)
942 goto plain_copy;
943 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
944 bufptr = PL_parser->bufptr;
945 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
946 SvCUR_set(PL_parser->linestr,
947 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
948 PL_parser->bufend += len+highhalf;
949 for (p = pv; p != e; p++) {
950 U8 c = (U8)*p;
951 if (c & 0x80) {
952 *bufptr++ = (char)(0xc0 | (c >> 6));
953 *bufptr++ = (char)(0x80 | (c & 0x3f));
954 } else {
955 *bufptr++ = (char)c;
956 }
957 }
958 }
959 } else {
960 if (flags & LEX_STUFF_UTF8) {
961 STRLEN highhalf = 0;
83aa740e 962 const char *p, *e = pv+len;
f0e67a1d
Z
963 for (p = pv; p != e; p++) {
964 U8 c = (U8)*p;
965 if (c >= 0xc4) {
966 Perl_croak(aTHX_ "Lexing code attempted to stuff "
967 "non-Latin-1 character into Latin-1 input");
968 } else if (c >= 0xc2 && p+1 != e &&
969 (((U8)p[1]) & 0xc0) == 0x80) {
970 p++;
971 highhalf++;
972 } else if (c >= 0x80) {
973 /* malformed UTF-8 */
974 ENTER;
975 SAVESPTR(PL_warnhook);
976 PL_warnhook = PERL_WARNHOOK_FATAL;
977 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
978 LEAVE;
979 }
980 }
981 if (!highhalf)
982 goto plain_copy;
983 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
984 bufptr = PL_parser->bufptr;
985 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
986 SvCUR_set(PL_parser->linestr,
987 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
988 PL_parser->bufend += len-highhalf;
989 for (p = pv; p != e; p++) {
990 U8 c = (U8)*p;
991 if (c & 0x80) {
992 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
993 p++;
994 } else {
995 *bufptr++ = (char)c;
996 }
997 }
998 } else {
999 plain_copy:
1000 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1001 bufptr = PL_parser->bufptr;
1002 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1003 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1004 PL_parser->bufend += len;
1005 Copy(pv, bufptr, len, char);
1006 }
1007 }
1008}
1009
1010/*
9dcc53ea
Z
1011=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1012
1013Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1014immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1015reallocating the buffer if necessary. This means that lexing code that
1016runs later will see the characters as if they had appeared in the input.
1017It is not recommended to do this as part of normal parsing, and most
1018uses of this facility run the risk of the inserted characters being
1019interpreted in an unintended manner.
1020
1021The string to be inserted is represented by octets starting at I<pv>
1022and continuing to the first nul. These octets are interpreted as either
1023UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1024in I<flags>. The characters are recoded for the lexer buffer, according
1025to how the buffer is currently being interpreted (L</lex_bufutf8>).
1026If it is not convenient to nul-terminate a string to be inserted, the
1027L</lex_stuff_pvn> function is more appropriate.
1028
1029=cut
1030*/
1031
1032void
1033Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1034{
1035 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1036 lex_stuff_pvn(pv, strlen(pv), flags);
1037}
1038
1039/*
f0e67a1d
Z
1040=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1041
1042Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1043immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1044reallocating the buffer if necessary. This means that lexing code that
1045runs later will see the characters as if they had appeared in the input.
1046It is not recommended to do this as part of normal parsing, and most
1047uses of this facility run the risk of the inserted characters being
1048interpreted in an unintended manner.
1049
1050The string to be inserted is the string value of I<sv>. The characters
1051are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1052being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1053not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1054need to construct a scalar.
1055
1056=cut
1057*/
1058
1059void
1060Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1061{
1062 char *pv;
1063 STRLEN len;
1064 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1065 if (flags)
1066 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1067 pv = SvPV(sv, len);
1068 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1069}
1070
1071/*
1072=for apidoc Amx|void|lex_unstuff|char *ptr
1073
1074Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1075I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1076This hides the discarded text from any lexing code that runs later,
1077as if the text had never appeared.
1078
1079This is not the normal way to consume lexed text. For that, use
1080L</lex_read_to>.
1081
1082=cut
1083*/
1084
1085void
1086Perl_lex_unstuff(pTHX_ char *ptr)
1087{
1088 char *buf, *bufend;
1089 STRLEN unstuff_len;
1090 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1091 buf = PL_parser->bufptr;
1092 if (ptr < buf)
1093 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1094 if (ptr == buf)
1095 return;
1096 bufend = PL_parser->bufend;
1097 if (ptr > bufend)
1098 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1099 unstuff_len = ptr - buf;
1100 Move(ptr, buf, bufend+1-ptr, char);
1101 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1102 PL_parser->bufend = bufend - unstuff_len;
1103}
1104
1105/*
1106=for apidoc Amx|void|lex_read_to|char *ptr
1107
1108Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1109to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1110performing the correct bookkeeping whenever a newline character is passed.
1111This is the normal way to consume lexed text.
1112
1113Interpretation of the buffer's octets can be abstracted out by
1114using the slightly higher-level functions L</lex_peek_unichar> and
1115L</lex_read_unichar>.
1116
1117=cut
1118*/
1119
1120void
1121Perl_lex_read_to(pTHX_ char *ptr)
1122{
1123 char *s;
1124 PERL_ARGS_ASSERT_LEX_READ_TO;
1125 s = PL_parser->bufptr;
1126 if (ptr < s || ptr > PL_parser->bufend)
1127 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1128 for (; s != ptr; s++)
1129 if (*s == '\n') {
1130 CopLINE_inc(PL_curcop);
1131 PL_parser->linestart = s+1;
1132 }
1133 PL_parser->bufptr = ptr;
1134}
1135
1136/*
1137=for apidoc Amx|void|lex_discard_to|char *ptr
1138
1139Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1140up to I<ptr>. The remaining content of the buffer will be moved, and
1141all pointers into the buffer updated appropriately. I<ptr> must not
1142be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1143it is not permitted to discard text that has yet to be lexed.
1144
1145Normally it is not necessarily to do this directly, because it suffices to
1146use the implicit discarding behaviour of L</lex_next_chunk> and things
1147based on it. However, if a token stretches across multiple lines,
1f317c95 1148and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1149that purpose, then after completion of the token it would be wise to
1150explicitly discard the now-unneeded earlier lines, to avoid future
1151multi-line tokens growing the buffer without bound.
1152
1153=cut
1154*/
1155
1156void
1157Perl_lex_discard_to(pTHX_ char *ptr)
1158{
1159 char *buf;
1160 STRLEN discard_len;
1161 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1162 buf = SvPVX(PL_parser->linestr);
1163 if (ptr < buf)
1164 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1165 if (ptr == buf)
1166 return;
1167 if (ptr > PL_parser->bufptr)
1168 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1169 discard_len = ptr - buf;
1170 if (PL_parser->oldbufptr < ptr)
1171 PL_parser->oldbufptr = ptr;
1172 if (PL_parser->oldoldbufptr < ptr)
1173 PL_parser->oldoldbufptr = ptr;
1174 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1175 PL_parser->last_uni = NULL;
1176 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1177 PL_parser->last_lop = NULL;
1178 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1179 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1180 PL_parser->bufend -= discard_len;
1181 PL_parser->bufptr -= discard_len;
1182 PL_parser->oldbufptr -= discard_len;
1183 PL_parser->oldoldbufptr -= discard_len;
1184 if (PL_parser->last_uni)
1185 PL_parser->last_uni -= discard_len;
1186 if (PL_parser->last_lop)
1187 PL_parser->last_lop -= discard_len;
1188}
1189
1190/*
1191=for apidoc Amx|bool|lex_next_chunk|U32 flags
1192
1193Reads in the next chunk of text to be lexed, appending it to
1194L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1195looked to the end of the current chunk and wants to know more. It is
1196usual, but not necessary, for lexing to have consumed the entirety of
1197the current chunk at this time.
1198
1199If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1200chunk (i.e., the current chunk has been entirely consumed), normally the
1201current chunk will be discarded at the same time that the new chunk is
1202read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1203will not be discarded. If the current chunk has not been entirely
1204consumed, then it will not be discarded regardless of the flag.
1205
1206Returns true if some new text was added to the buffer, or false if the
1207buffer has reached the end of the input text.
1208
1209=cut
1210*/
1211
1212#define LEX_FAKE_EOF 0x80000000
1213
1214bool
1215Perl_lex_next_chunk(pTHX_ U32 flags)
1216{
1217 SV *linestr;
1218 char *buf;
1219 STRLEN old_bufend_pos, new_bufend_pos;
1220 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1221 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1222 bool got_some_for_debugger = 0;
f0e67a1d
Z
1223 bool got_some;
1224 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1225 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1226 linestr = PL_parser->linestr;
1227 buf = SvPVX(linestr);
1228 if (!(flags & LEX_KEEP_PREVIOUS) &&
1229 PL_parser->bufptr == PL_parser->bufend) {
1230 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1231 linestart_pos = 0;
1232 if (PL_parser->last_uni != PL_parser->bufend)
1233 PL_parser->last_uni = NULL;
1234 if (PL_parser->last_lop != PL_parser->bufend)
1235 PL_parser->last_lop = NULL;
1236 last_uni_pos = last_lop_pos = 0;
1237 *buf = 0;
1238 SvCUR(linestr) = 0;
1239 } else {
1240 old_bufend_pos = PL_parser->bufend - buf;
1241 bufptr_pos = PL_parser->bufptr - buf;
1242 oldbufptr_pos = PL_parser->oldbufptr - buf;
1243 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1244 linestart_pos = PL_parser->linestart - buf;
1245 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1246 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1247 }
1248 if (flags & LEX_FAKE_EOF) {
1249 goto eof;
1250 } else if (!PL_parser->rsfp) {
1251 got_some = 0;
1252 } else if (filter_gets(linestr, old_bufend_pos)) {
1253 got_some = 1;
17cc9359 1254 got_some_for_debugger = 1;
f0e67a1d 1255 } else {
580561a3
Z
1256 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1257 sv_setpvs(linestr, "");
f0e67a1d
Z
1258 eof:
1259 /* End of real input. Close filehandle (unless it was STDIN),
1260 * then add implicit termination.
1261 */
1262 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1263 PerlIO_clearerr(PL_parser->rsfp);
1264 else if (PL_parser->rsfp)
1265 (void)PerlIO_close(PL_parser->rsfp);
1266 PL_parser->rsfp = NULL;
737c24fc 1267 PL_parser->in_pod = 0;
f0e67a1d
Z
1268#ifdef PERL_MAD
1269 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1270 PL_faketokens = 1;
1271#endif
1272 if (!PL_in_eval && PL_minus_p) {
1273 sv_catpvs(linestr,
1274 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1275 PL_minus_n = PL_minus_p = 0;
1276 } else if (!PL_in_eval && PL_minus_n) {
1277 sv_catpvs(linestr, /*{*/";}");
1278 PL_minus_n = 0;
1279 } else
1280 sv_catpvs(linestr, ";");
1281 got_some = 1;
1282 }
1283 buf = SvPVX(linestr);
1284 new_bufend_pos = SvCUR(linestr);
1285 PL_parser->bufend = buf + new_bufend_pos;
1286 PL_parser->bufptr = buf + bufptr_pos;
1287 PL_parser->oldbufptr = buf + oldbufptr_pos;
1288 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1289 PL_parser->linestart = buf + linestart_pos;
1290 if (PL_parser->last_uni)
1291 PL_parser->last_uni = buf + last_uni_pos;
1292 if (PL_parser->last_lop)
1293 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1294 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1295 PL_curstash != PL_debstash) {
1296 /* debugger active and we're not compiling the debugger code,
1297 * so store the line into the debugger's array of lines
1298 */
1299 update_debugger_info(NULL, buf+old_bufend_pos,
1300 new_bufend_pos-old_bufend_pos);
1301 }
1302 return got_some;
1303}
1304
1305/*
1306=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1307
1308Looks ahead one (Unicode) character in the text currently being lexed.
1309Returns the codepoint (unsigned integer value) of the next character,
1310or -1 if lexing has reached the end of the input text. To consume the
1311peeked character, use L</lex_read_unichar>.
1312
1313If the next character is in (or extends into) the next chunk of input
1314text, the next chunk will be read in. Normally the current chunk will be
1315discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1316then the current chunk will not be discarded.
1317
1318If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1319is encountered, an exception is generated.
1320
1321=cut
1322*/
1323
1324I32
1325Perl_lex_peek_unichar(pTHX_ U32 flags)
1326{
749123ff 1327 dVAR;
f0e67a1d
Z
1328 char *s, *bufend;
1329 if (flags & ~(LEX_KEEP_PREVIOUS))
1330 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1331 s = PL_parser->bufptr;
1332 bufend = PL_parser->bufend;
1333 if (UTF) {
1334 U8 head;
1335 I32 unichar;
1336 STRLEN len, retlen;
1337 if (s == bufend) {
1338 if (!lex_next_chunk(flags))
1339 return -1;
1340 s = PL_parser->bufptr;
1341 bufend = PL_parser->bufend;
1342 }
1343 head = (U8)*s;
1344 if (!(head & 0x80))
1345 return head;
1346 if (head & 0x40) {
1347 len = PL_utf8skip[head];
1348 while ((STRLEN)(bufend-s) < len) {
1349 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1350 break;
1351 s = PL_parser->bufptr;
1352 bufend = PL_parser->bufend;
1353 }
1354 }
1355 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1356 if (retlen == (STRLEN)-1) {
1357 /* malformed UTF-8 */
1358 ENTER;
1359 SAVESPTR(PL_warnhook);
1360 PL_warnhook = PERL_WARNHOOK_FATAL;
1361 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1362 LEAVE;
1363 }
1364 return unichar;
1365 } else {
1366 if (s == bufend) {
1367 if (!lex_next_chunk(flags))
1368 return -1;
1369 s = PL_parser->bufptr;
1370 }
1371 return (U8)*s;
1372 }
1373}
1374
1375/*
1376=for apidoc Amx|I32|lex_read_unichar|U32 flags
1377
1378Reads the next (Unicode) character in the text currently being lexed.
1379Returns the codepoint (unsigned integer value) of the character read,
1380and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1381if lexing has reached the end of the input text. To non-destructively
1382examine the next character, use L</lex_peek_unichar> instead.
1383
1384If the next character is in (or extends into) the next chunk of input
1385text, the next chunk will be read in. Normally the current chunk will be
1386discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1387then the current chunk will not be discarded.
1388
1389If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1390is encountered, an exception is generated.
1391
1392=cut
1393*/
1394
1395I32
1396Perl_lex_read_unichar(pTHX_ U32 flags)
1397{
1398 I32 c;
1399 if (flags & ~(LEX_KEEP_PREVIOUS))
1400 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1401 c = lex_peek_unichar(flags);
1402 if (c != -1) {
1403 if (c == '\n')
1404 CopLINE_inc(PL_curcop);
1405 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1406 }
1407 return c;
1408}
1409
1410/*
1411=for apidoc Amx|void|lex_read_space|U32 flags
1412
1413Reads optional spaces, in Perl style, in the text currently being
1414lexed. The spaces may include ordinary whitespace characters and
1415Perl-style comments. C<#line> directives are processed if encountered.
1416L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1417at a non-space character (or the end of the input text).
1418
1419If spaces extend into the next chunk of input text, the next chunk will
1420be read in. Normally the current chunk will be discarded at the same
1421time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1422chunk will not be discarded.
1423
1424=cut
1425*/
1426
f0998909
Z
1427#define LEX_NO_NEXT_CHUNK 0x80000000
1428
f0e67a1d
Z
1429void
1430Perl_lex_read_space(pTHX_ U32 flags)
1431{
1432 char *s, *bufend;
1433 bool need_incline = 0;
f0998909 1434 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1435 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1436#ifdef PERL_MAD
1437 if (PL_skipwhite) {
1438 sv_free(PL_skipwhite);
1439 PL_skipwhite = NULL;
1440 }
1441 if (PL_madskills)
1442 PL_skipwhite = newSVpvs("");
1443#endif /* PERL_MAD */
1444 s = PL_parser->bufptr;
1445 bufend = PL_parser->bufend;
1446 while (1) {
1447 char c = *s;
1448 if (c == '#') {
1449 do {
1450 c = *++s;
1451 } while (!(c == '\n' || (c == 0 && s == bufend)));
1452 } else if (c == '\n') {
1453 s++;
1454 PL_parser->linestart = s;
1455 if (s == bufend)
1456 need_incline = 1;
1457 else
1458 incline(s);
1459 } else if (isSPACE(c)) {
1460 s++;
1461 } else if (c == 0 && s == bufend) {
1462 bool got_more;
1463#ifdef PERL_MAD
1464 if (PL_madskills)
1465 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1466#endif /* PERL_MAD */
f0998909
Z
1467 if (flags & LEX_NO_NEXT_CHUNK)
1468 break;
f0e67a1d
Z
1469 PL_parser->bufptr = s;
1470 CopLINE_inc(PL_curcop);
1471 got_more = lex_next_chunk(flags);
1472 CopLINE_dec(PL_curcop);
1473 s = PL_parser->bufptr;
1474 bufend = PL_parser->bufend;
1475 if (!got_more)
1476 break;
1477 if (need_incline && PL_parser->rsfp) {
1478 incline(s);
1479 need_incline = 0;
1480 }
1481 } else {
1482 break;
1483 }
1484 }
1485#ifdef PERL_MAD
1486 if (PL_madskills)
1487 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1488#endif /* PERL_MAD */
1489 PL_parser->bufptr = s;
1490}
1491
1492/*
ffb4593c
NT
1493 * S_incline
1494 * This subroutine has nothing to do with tilting, whether at windmills
1495 * or pinball tables. Its name is short for "increment line". It
57843af0 1496 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1497 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1498 * # line 500 "foo.pm"
1499 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1500 */
1501
76e3520e 1502STATIC void
d9095cec 1503S_incline(pTHX_ const char *s)
463ee0b2 1504{
97aff369 1505 dVAR;
d9095cec
NC
1506 const char *t;
1507 const char *n;
1508 const char *e;
463ee0b2 1509
7918f24d
NC
1510 PERL_ARGS_ASSERT_INCLINE;
1511
57843af0 1512 CopLINE_inc(PL_curcop);
463ee0b2
LW
1513 if (*s++ != '#')
1514 return;
d4c19fe8
AL
1515 while (SPACE_OR_TAB(*s))
1516 s++;
73659bf1
GS
1517 if (strnEQ(s, "line", 4))
1518 s += 4;
1519 else
1520 return;
084592ab 1521 if (SPACE_OR_TAB(*s))
73659bf1 1522 s++;
4e553d73 1523 else
73659bf1 1524 return;
d4c19fe8
AL
1525 while (SPACE_OR_TAB(*s))
1526 s++;
463ee0b2
LW
1527 if (!isDIGIT(*s))
1528 return;
d4c19fe8 1529
463ee0b2
LW
1530 n = s;
1531 while (isDIGIT(*s))
1532 s++;
07714eb4 1533 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1534 return;
bf4acbe4 1535 while (SPACE_OR_TAB(*s))
463ee0b2 1536 s++;
73659bf1 1537 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1538 s++;
73659bf1
GS
1539 e = t + 1;
1540 }
463ee0b2 1541 else {
c35e046a
AL
1542 t = s;
1543 while (!isSPACE(*t))
1544 t++;
73659bf1 1545 e = t;
463ee0b2 1546 }
bf4acbe4 1547 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1548 e++;
1549 if (*e != '\n' && *e != '\0')
1550 return; /* false alarm */
1551
f4dd75d9 1552 if (t - s > 0) {
d9095cec 1553 const STRLEN len = t - s;
8a5ee598 1554#ifndef USE_ITHREADS
19bad673
NC
1555 SV *const temp_sv = CopFILESV(PL_curcop);
1556 const char *cf;
1557 STRLEN tmplen;
1558
1559 if (temp_sv) {
1560 cf = SvPVX(temp_sv);
1561 tmplen = SvCUR(temp_sv);
1562 } else {
1563 cf = NULL;
1564 tmplen = 0;
1565 }
1566
42d9b98d 1567 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1568 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1569 * to *{"::_<newfilename"} */
44867030
NC
1570 /* However, the long form of evals is only turned on by the
1571 debugger - usually they're "(eval %lu)" */
1572 char smallbuf[128];
1573 char *tmpbuf;
1574 GV **gvp;
d9095cec 1575 STRLEN tmplen2 = len;
798b63bc 1576 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1577 tmpbuf = smallbuf;
1578 else
2ae0db35 1579 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1580 tmpbuf[0] = '_';
1581 tmpbuf[1] = '<';
2ae0db35 1582 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1583 tmplen += 2;
8a5ee598
RGS
1584 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1585 if (gvp) {
44867030
NC
1586 char *tmpbuf2;
1587 GV *gv2;
1588
1589 if (tmplen2 + 2 <= sizeof smallbuf)
1590 tmpbuf2 = smallbuf;
1591 else
1592 Newx(tmpbuf2, tmplen2 + 2, char);
1593
1594 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1595 /* Either they malloc'd it, or we malloc'd it,
1596 so no prefix is present in ours. */
1597 tmpbuf2[0] = '_';
1598 tmpbuf2[1] = '<';
1599 }
1600
1601 memcpy(tmpbuf2 + 2, s, tmplen2);
1602 tmplen2 += 2;
1603
8a5ee598 1604 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1605 if (!isGV(gv2)) {
8a5ee598 1606 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1607 /* adjust ${"::_<newfilename"} to store the new file name */
1608 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
3cb1dbc6
NC
1609 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1610 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
e5527e4b 1611 }
44867030
NC
1612
1613 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1614 }
e66cf94c 1615 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1616 }
8a5ee598 1617#endif
05ec9bb3 1618 CopFILE_free(PL_curcop);
d9095cec 1619 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1620 }
57843af0 1621 CopLINE_set(PL_curcop, atoi(n)-1);
463ee0b2
LW
1622}
1623
29595ff2 1624#ifdef PERL_MAD
cd81e915 1625/* skip space before PL_thistoken */
29595ff2
NC
1626
1627STATIC char *
1628S_skipspace0(pTHX_ register char *s)
1629{
7918f24d
NC
1630 PERL_ARGS_ASSERT_SKIPSPACE0;
1631
29595ff2
NC
1632 s = skipspace(s);
1633 if (!PL_madskills)
1634 return s;
cd81e915
NC
1635 if (PL_skipwhite) {
1636 if (!PL_thiswhite)
6b29d1f5 1637 PL_thiswhite = newSVpvs("");
cd81e915
NC
1638 sv_catsv(PL_thiswhite, PL_skipwhite);
1639 sv_free(PL_skipwhite);
1640 PL_skipwhite = 0;
1641 }
1642 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1643 return s;
1644}
1645
cd81e915 1646/* skip space after PL_thistoken */
29595ff2
NC
1647
1648STATIC char *
1649S_skipspace1(pTHX_ register char *s)
1650{
d4c19fe8 1651 const char *start = s;
29595ff2
NC
1652 I32 startoff = start - SvPVX(PL_linestr);
1653
7918f24d
NC
1654 PERL_ARGS_ASSERT_SKIPSPACE1;
1655
29595ff2
NC
1656 s = skipspace(s);
1657 if (!PL_madskills)
1658 return s;
1659 start = SvPVX(PL_linestr) + startoff;
cd81e915 1660 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1661 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1662 PL_thistoken = newSVpvn(tstart, start - tstart);
1663 }
1664 PL_realtokenstart = -1;
1665 if (PL_skipwhite) {
1666 if (!PL_nextwhite)
6b29d1f5 1667 PL_nextwhite = newSVpvs("");
cd81e915
NC
1668 sv_catsv(PL_nextwhite, PL_skipwhite);
1669 sv_free(PL_skipwhite);
1670 PL_skipwhite = 0;
29595ff2
NC
1671 }
1672 return s;
1673}
1674
1675STATIC char *
1676S_skipspace2(pTHX_ register char *s, SV **svp)
1677{
c35e046a
AL
1678 char *start;
1679 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1680 const I32 startoff = s - SvPVX(PL_linestr);
1681
7918f24d
NC
1682 PERL_ARGS_ASSERT_SKIPSPACE2;
1683
29595ff2
NC
1684 s = skipspace(s);
1685 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1686 if (!PL_madskills || !svp)
1687 return s;
1688 start = SvPVX(PL_linestr) + startoff;
cd81e915 1689 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1690 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1691 PL_thistoken = newSVpvn(tstart, start - tstart);
1692 PL_realtokenstart = -1;
29595ff2 1693 }
cd81e915 1694 if (PL_skipwhite) {
29595ff2 1695 if (!*svp)
6b29d1f5 1696 *svp = newSVpvs("");
cd81e915
NC
1697 sv_setsv(*svp, PL_skipwhite);
1698 sv_free(PL_skipwhite);
1699 PL_skipwhite = 0;
29595ff2
NC
1700 }
1701
1702 return s;
1703}
1704#endif
1705
80a702cd 1706STATIC void
15f169a1 1707S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1708{
1709 AV *av = CopFILEAVx(PL_curcop);
1710 if (av) {
b9f83d2f 1711 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1712 if (orig_sv)
1713 sv_setsv(sv, orig_sv);
1714 else
1715 sv_setpvn(sv, buf, len);
80a702cd
RGS
1716 (void)SvIOK_on(sv);
1717 SvIV_set(sv, 0);
1718 av_store(av, (I32)CopLINE(PL_curcop), sv);
1719 }
1720}
1721
ffb4593c
NT
1722/*
1723 * S_skipspace
1724 * Called to gobble the appropriate amount and type of whitespace.
1725 * Skips comments as well.
1726 */
1727
76e3520e 1728STATIC char *
cea2e8a9 1729S_skipspace(pTHX_ register char *s)
a687059c 1730{
5db06880 1731#ifdef PERL_MAD
f0e67a1d
Z
1732 char *start = s;
1733#endif /* PERL_MAD */
7918f24d 1734 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1735#ifdef PERL_MAD
cd81e915
NC
1736 if (PL_skipwhite) {
1737 sv_free(PL_skipwhite);
f0e67a1d 1738 PL_skipwhite = NULL;
5db06880 1739 }
f0e67a1d 1740#endif /* PERL_MAD */
3280af22 1741 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1742 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1743 s++;
f0e67a1d
Z
1744 } else {
1745 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1746 PL_bufptr = s;
f0998909
Z
1747 lex_read_space(LEX_KEEP_PREVIOUS |
1748 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1749 LEX_NO_NEXT_CHUNK : 0));
3280af22 1750 s = PL_bufptr;
f0e67a1d
Z
1751 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1752 if (PL_linestart > PL_bufptr)
1753 PL_bufptr = PL_linestart;
1754 return s;
463ee0b2 1755 }
5db06880 1756#ifdef PERL_MAD
f0e67a1d
Z
1757 if (PL_madskills)
1758 PL_skipwhite = newSVpvn(start, s-start);
1759#endif /* PERL_MAD */
5db06880 1760 return s;
a687059c 1761}
378cc40b 1762
ffb4593c
NT
1763/*
1764 * S_check_uni
1765 * Check the unary operators to ensure there's no ambiguity in how they're
1766 * used. An ambiguous piece of code would be:
1767 * rand + 5
1768 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1769 * the +5 is its argument.
1770 */
1771
76e3520e 1772STATIC void
cea2e8a9 1773S_check_uni(pTHX)
ba106d47 1774{
97aff369 1775 dVAR;
d4c19fe8
AL
1776 const char *s;
1777 const char *t;
2f3197b3 1778
3280af22 1779 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1780 return;
3280af22
NIS
1781 while (isSPACE(*PL_last_uni))
1782 PL_last_uni++;
c35e046a
AL
1783 s = PL_last_uni;
1784 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1785 s++;
3280af22 1786 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1787 return;
6136c704 1788
9b387841
NC
1789 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1790 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1791 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1792}
1793
ffb4593c
NT
1794/*
1795 * LOP : macro to build a list operator. Its behaviour has been replaced
1796 * with a subroutine, S_lop() for which LOP is just another name.
1797 */
1798
a0d0e21e
LW
1799#define LOP(f,x) return lop(f,x,s)
1800
ffb4593c
NT
1801/*
1802 * S_lop
1803 * Build a list operator (or something that might be one). The rules:
1804 * - if we have a next token, then it's a list operator [why?]
1805 * - if the next thing is an opening paren, then it's a function
1806 * - else it's a list operator
1807 */
1808
76e3520e 1809STATIC I32
a0be28da 1810S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1811{
97aff369 1812 dVAR;
7918f24d
NC
1813
1814 PERL_ARGS_ASSERT_LOP;
1815
6154021b 1816 pl_yylval.ival = f;
35c8bce7 1817 CLINE;
3280af22
NIS
1818 PL_expect = x;
1819 PL_bufptr = s;
1820 PL_last_lop = PL_oldbufptr;
eb160463 1821 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1822#ifdef PERL_MAD
1823 if (PL_lasttoke)
1824 return REPORT(LSTOP);
1825#else
3280af22 1826 if (PL_nexttoke)
bbf60fe6 1827 return REPORT(LSTOP);
5db06880 1828#endif
79072805 1829 if (*s == '(')
bbf60fe6 1830 return REPORT(FUNC);
29595ff2 1831 s = PEEKSPACE(s);
79072805 1832 if (*s == '(')
bbf60fe6 1833 return REPORT(FUNC);
79072805 1834 else
bbf60fe6 1835 return REPORT(LSTOP);
79072805
LW
1836}
1837
5db06880
NC
1838#ifdef PERL_MAD
1839 /*
1840 * S_start_force
1841 * Sets up for an eventual force_next(). start_force(0) basically does
1842 * an unshift, while start_force(-1) does a push. yylex removes items
1843 * on the "pop" end.
1844 */
1845
1846STATIC void
1847S_start_force(pTHX_ int where)
1848{
1849 int i;
1850
cd81e915 1851 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1852 where = PL_lasttoke;
cd81e915
NC
1853 assert(PL_curforce < 0 || PL_curforce == where);
1854 if (PL_curforce != where) {
5db06880
NC
1855 for (i = PL_lasttoke; i > where; --i) {
1856 PL_nexttoke[i] = PL_nexttoke[i-1];
1857 }
1858 PL_lasttoke++;
1859 }
cd81e915 1860 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1861 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1862 PL_curforce = where;
1863 if (PL_nextwhite) {
5db06880 1864 if (PL_madskills)
6b29d1f5 1865 curmad('^', newSVpvs(""));
cd81e915 1866 CURMAD('_', PL_nextwhite);
5db06880
NC
1867 }
1868}
1869
1870STATIC void
1871S_curmad(pTHX_ char slot, SV *sv)
1872{
1873 MADPROP **where;
1874
1875 if (!sv)
1876 return;
cd81e915
NC
1877 if (PL_curforce < 0)
1878 where = &PL_thismad;
5db06880 1879 else
cd81e915 1880 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1881
cd81e915 1882 if (PL_faketokens)
76f68e9b 1883 sv_setpvs(sv, "");
5db06880
NC
1884 else {
1885 if (!IN_BYTES) {
1886 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1887 SvUTF8_on(sv);
1888 else if (PL_encoding) {
1889 sv_recode_to_utf8(sv, PL_encoding);
1890 }
1891 }
1892 }
1893
1894 /* keep a slot open for the head of the list? */
1895 if (slot != '_' && *where && (*where)->mad_key == '^') {
1896 (*where)->mad_key = slot;
daba3364 1897 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1898 (*where)->mad_val = (void*)sv;
1899 }
1900 else
1901 addmad(newMADsv(slot, sv), where, 0);
1902}
1903#else
b3f24c00
MHM
1904# define start_force(where) NOOP
1905# define curmad(slot, sv) NOOP
5db06880
NC
1906#endif
1907
ffb4593c
NT
1908/*
1909 * S_force_next
9cbb5ea2 1910 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1911 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1912 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1913 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1914 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1915 */
1916
4e553d73 1917STATIC void
cea2e8a9 1918S_force_next(pTHX_ I32 type)
79072805 1919{
97aff369 1920 dVAR;
704d4215
GG
1921#ifdef DEBUGGING
1922 if (DEBUG_T_TEST) {
1923 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1924 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1925 }
1926#endif
5db06880 1927#ifdef PERL_MAD
cd81e915 1928 if (PL_curforce < 0)
5db06880 1929 start_force(PL_lasttoke);
cd81e915 1930 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1931 if (PL_lex_state != LEX_KNOWNEXT)
1932 PL_lex_defer = PL_lex_state;
1933 PL_lex_state = LEX_KNOWNEXT;
1934 PL_lex_expect = PL_expect;
cd81e915 1935 PL_curforce = -1;
5db06880 1936#else
3280af22
NIS
1937 PL_nexttype[PL_nexttoke] = type;
1938 PL_nexttoke++;
1939 if (PL_lex_state != LEX_KNOWNEXT) {
1940 PL_lex_defer = PL_lex_state;
1941 PL_lex_expect = PL_expect;
1942 PL_lex_state = LEX_KNOWNEXT;
79072805 1943 }
5db06880 1944#endif
79072805
LW
1945}
1946
28ac2b49
Z
1947void
1948Perl_yyunlex(pTHX)
1949{
1950 if (PL_parser->yychar != YYEMPTY) {
1951 start_force(-1);
1952 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1953 force_next(PL_parser->yychar);
1954 PL_parser->yychar = YYEMPTY;
1955 }
1956}
1957
d0a148a6 1958STATIC SV *
15f169a1 1959S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1960{
97aff369 1961 dVAR;
740cce10 1962 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1963 !IN_BYTES
1964 && UTF
1965 && !is_ascii_string((const U8*)start, len)
740cce10 1966 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1967 return sv;
1968}
1969
ffb4593c
NT
1970/*
1971 * S_force_word
1972 * When the lexer knows the next thing is a word (for instance, it has
1973 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1974 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1975 * lookahead.
ffb4593c
NT
1976 *
1977 * Arguments:
b1b65b59 1978 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1979 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1980 * int check_keyword : if true, Perl checks to make sure the word isn't
1981 * a keyword (do this if the word is a label, e.g. goto FOO)
1982 * int allow_pack : if true, : characters will also be allowed (require,
1983 * use, etc. do this)
9cbb5ea2 1984 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1985 */
1986
76e3520e 1987STATIC char *
cea2e8a9 1988S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 1989{
97aff369 1990 dVAR;
463ee0b2
LW
1991 register char *s;
1992 STRLEN len;
4e553d73 1993
7918f24d
NC
1994 PERL_ARGS_ASSERT_FORCE_WORD;
1995
29595ff2 1996 start = SKIPSPACE1(start);
463ee0b2 1997 s = start;
7e2040f0 1998 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 1999 (allow_pack && *s == ':') ||
15f0808c 2000 (allow_initial_tick && *s == '\'') )
a0d0e21e 2001 {
3280af22 2002 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2003 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2004 return start;
cd81e915 2005 start_force(PL_curforce);
5db06880
NC
2006 if (PL_madskills)
2007 curmad('X', newSVpvn(start,s-start));
463ee0b2 2008 if (token == METHOD) {
29595ff2 2009 s = SKIPSPACE1(s);
463ee0b2 2010 if (*s == '(')
3280af22 2011 PL_expect = XTERM;
463ee0b2 2012 else {
3280af22 2013 PL_expect = XOPERATOR;
463ee0b2 2014 }
79072805 2015 }
e74e6b3d 2016 if (PL_madskills)
63575281 2017 curmad('g', newSVpvs( "forced" ));
9ded7720 2018 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2019 = (OP*)newSVOP(OP_CONST,0,
2020 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2021 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2022 force_next(token);
2023 }
2024 return s;
2025}
2026
ffb4593c
NT
2027/*
2028 * S_force_ident
9cbb5ea2 2029 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2030 * text only contains the "foo" portion. The first argument is a pointer
2031 * to the "foo", and the second argument is the type symbol to prefix.
2032 * Forces the next token to be a "WORD".
9cbb5ea2 2033 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2034 */
2035
76e3520e 2036STATIC void
bfed75c6 2037S_force_ident(pTHX_ register const char *s, int kind)
79072805 2038{
97aff369 2039 dVAR;
7918f24d
NC
2040
2041 PERL_ARGS_ASSERT_FORCE_IDENT;
2042
c35e046a 2043 if (*s) {
90e5519e
NC
2044 const STRLEN len = strlen(s);
2045 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2046 start_force(PL_curforce);
9ded7720 2047 NEXTVAL_NEXTTOKE.opval = o;
79072805 2048 force_next(WORD);
748a9306 2049 if (kind) {
11343788 2050 o->op_private = OPpCONST_ENTERED;
55497cff 2051 /* XXX see note in pp_entereval() for why we forgo typo
2052 warnings if the symbol must be introduced in an eval.
2053 GSAR 96-10-12 */
90e5519e
NC
2054 gv_fetchpvn_flags(s, len,
2055 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2056 : GV_ADD,
2057 kind == '$' ? SVt_PV :
2058 kind == '@' ? SVt_PVAV :
2059 kind == '%' ? SVt_PVHV :
a0d0e21e 2060 SVt_PVGV
90e5519e 2061 );
748a9306 2062 }
79072805
LW
2063 }
2064}
2065
1571675a
GS
2066NV
2067Perl_str_to_version(pTHX_ SV *sv)
2068{
2069 NV retval = 0.0;
2070 NV nshift = 1.0;
2071 STRLEN len;
cfd0369c 2072 const char *start = SvPV_const(sv,len);
9d4ba2ae 2073 const char * const end = start + len;
504618e9 2074 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2075
2076 PERL_ARGS_ASSERT_STR_TO_VERSION;
2077
1571675a 2078 while (start < end) {
ba210ebe 2079 STRLEN skip;
1571675a
GS
2080 UV n;
2081 if (utf)
9041c2e3 2082 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2083 else {
2084 n = *(U8*)start;
2085 skip = 1;
2086 }
2087 retval += ((NV)n)/nshift;
2088 start += skip;
2089 nshift *= 1000;
2090 }
2091 return retval;
2092}
2093
4e553d73 2094/*
ffb4593c
NT
2095 * S_force_version
2096 * Forces the next token to be a version number.
e759cc13
RGS
2097 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2098 * and if "guessing" is TRUE, then no new token is created (and the caller
2099 * must use an alternative parsing method).
ffb4593c
NT
2100 */
2101
76e3520e 2102STATIC char *
e759cc13 2103S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2104{
97aff369 2105 dVAR;
5f66b61c 2106 OP *version = NULL;
44dcb63b 2107 char *d;
5db06880
NC
2108#ifdef PERL_MAD
2109 I32 startoff = s - SvPVX(PL_linestr);
2110#endif
89bfa8cd 2111
7918f24d
NC
2112 PERL_ARGS_ASSERT_FORCE_VERSION;
2113
29595ff2 2114 s = SKIPSPACE1(s);
89bfa8cd 2115
44dcb63b 2116 d = s;
dd629d5b 2117 if (*d == 'v')
44dcb63b 2118 d++;
44dcb63b 2119 if (isDIGIT(*d)) {
e759cc13
RGS
2120 while (isDIGIT(*d) || *d == '_' || *d == '.')
2121 d++;
5db06880
NC
2122#ifdef PERL_MAD
2123 if (PL_madskills) {
cd81e915 2124 start_force(PL_curforce);
5db06880
NC
2125 curmad('X', newSVpvn(s,d-s));
2126 }
2127#endif
4e4da3ac 2128 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2129 SV *ver;
8d08d9ba
DG
2130#ifdef USE_LOCALE_NUMERIC
2131 char *loc = setlocale(LC_NUMERIC, "C");
2132#endif
6154021b 2133 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2134#ifdef USE_LOCALE_NUMERIC
2135 setlocale(LC_NUMERIC, loc);
2136#endif
6154021b 2137 version = pl_yylval.opval;
dd629d5b
GS
2138 ver = cSVOPx(version)->op_sv;
2139 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2140 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2141 SvNV_set(ver, str_to_version(ver));
1571675a 2142 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2143 }
89bfa8cd 2144 }
5db06880
NC
2145 else if (guessing) {
2146#ifdef PERL_MAD
2147 if (PL_madskills) {
cd81e915
NC
2148 sv_free(PL_nextwhite); /* let next token collect whitespace */
2149 PL_nextwhite = 0;
5db06880
NC
2150 s = SvPVX(PL_linestr) + startoff;
2151 }
2152#endif
e759cc13 2153 return s;
5db06880 2154 }
89bfa8cd 2155 }
2156
5db06880
NC
2157#ifdef PERL_MAD
2158 if (PL_madskills && !version) {
cd81e915
NC
2159 sv_free(PL_nextwhite); /* let next token collect whitespace */
2160 PL_nextwhite = 0;
5db06880
NC
2161 s = SvPVX(PL_linestr) + startoff;
2162 }
2163#endif
89bfa8cd 2164 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2165 start_force(PL_curforce);
9ded7720 2166 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2167 force_next(WORD);
89bfa8cd 2168
e759cc13 2169 return s;
89bfa8cd 2170}
2171
ffb4593c 2172/*
91152fc1
DG
2173 * S_force_strict_version
2174 * Forces the next token to be a version number using strict syntax rules.
2175 */
2176
2177STATIC char *
2178S_force_strict_version(pTHX_ char *s)
2179{
2180 dVAR;
2181 OP *version = NULL;
2182#ifdef PERL_MAD
2183 I32 startoff = s - SvPVX(PL_linestr);
2184#endif
2185 const char *errstr = NULL;
2186
2187 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2188
2189 while (isSPACE(*s)) /* leading whitespace */
2190 s++;
2191
2192 if (is_STRICT_VERSION(s,&errstr)) {
2193 SV *ver = newSV(0);
2194 s = (char *)scan_version(s, ver, 0);
2195 version = newSVOP(OP_CONST, 0, ver);
2196 }
4e4da3ac
Z
2197 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2198 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2199 {
91152fc1
DG
2200 PL_bufptr = s;
2201 if (errstr)
2202 yyerror(errstr); /* version required */
2203 return s;
2204 }
2205
2206#ifdef PERL_MAD
2207 if (PL_madskills && !version) {
2208 sv_free(PL_nextwhite); /* let next token collect whitespace */
2209 PL_nextwhite = 0;
2210 s = SvPVX(PL_linestr) + startoff;
2211 }
2212#endif
2213 /* NOTE: The parser sees the package name and the VERSION swapped */
2214 start_force(PL_curforce);
2215 NEXTVAL_NEXTTOKE.opval = version;
2216 force_next(WORD);
2217
2218 return s;
2219}
2220
2221/*
ffb4593c
NT
2222 * S_tokeq
2223 * Tokenize a quoted string passed in as an SV. It finds the next
2224 * chunk, up to end of string or a backslash. It may make a new
2225 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2226 * turns \\ into \.
2227 */
2228
76e3520e 2229STATIC SV *
cea2e8a9 2230S_tokeq(pTHX_ SV *sv)
79072805 2231{
97aff369 2232 dVAR;
79072805
LW
2233 register char *s;
2234 register char *send;
2235 register char *d;
b3ac6de7
IZ
2236 STRLEN len = 0;
2237 SV *pv = sv;
79072805 2238
7918f24d
NC
2239 PERL_ARGS_ASSERT_TOKEQ;
2240
79072805 2241 if (!SvLEN(sv))
b3ac6de7 2242 goto finish;
79072805 2243
a0d0e21e 2244 s = SvPV_force(sv, len);
21a311ee 2245 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2246 goto finish;
463ee0b2 2247 send = s + len;
79072805
LW
2248 while (s < send && *s != '\\')
2249 s++;
2250 if (s == send)
b3ac6de7 2251 goto finish;
79072805 2252 d = s;
be4731d2 2253 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2254 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2255 }
79072805
LW
2256 while (s < send) {
2257 if (*s == '\\') {
a0d0e21e 2258 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2259 s++; /* all that, just for this */
2260 }
2261 *d++ = *s++;
2262 }
2263 *d = '\0';
95a20fc0 2264 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2265 finish:
3280af22 2266 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2267 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2268 return sv;
2269}
2270
ffb4593c
NT
2271/*
2272 * Now come three functions related to double-quote context,
2273 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2274 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2275 * interact with PL_lex_state, and create fake ( ... ) argument lists
2276 * to handle functions and concatenation.
2277 * They assume that whoever calls them will be setting up a fake
2278 * join call, because each subthing puts a ',' after it. This lets
2279 * "lower \luPpEr"
2280 * become
2281 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2282 *
2283 * (I'm not sure whether the spurious commas at the end of lcfirst's
2284 * arguments and join's arguments are created or not).
2285 */
2286
2287/*
2288 * S_sublex_start
6154021b 2289 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2290 *
2291 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2292 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2293 *
2294 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2295 *
2296 * Everything else becomes a FUNC.
2297 *
2298 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2299 * had an OP_CONST or OP_READLINE). This just sets us up for a
2300 * call to S_sublex_push().
2301 */
2302
76e3520e 2303STATIC I32
cea2e8a9 2304S_sublex_start(pTHX)
79072805 2305{
97aff369 2306 dVAR;
6154021b 2307 register const I32 op_type = pl_yylval.ival;
79072805
LW
2308
2309 if (op_type == OP_NULL) {
6154021b 2310 pl_yylval.opval = PL_lex_op;
5f66b61c 2311 PL_lex_op = NULL;
79072805
LW
2312 return THING;
2313 }
2314 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2315 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2316
2317 if (SvTYPE(sv) == SVt_PVIV) {
2318 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2319 STRLEN len;
96a5add6 2320 const char * const p = SvPV_const(sv, len);
740cce10 2321 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2322 SvREFCNT_dec(sv);
2323 sv = nsv;
4e553d73 2324 }
6154021b 2325 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2326 PL_lex_stuff = NULL;
6f33ba73
RGS
2327 /* Allow <FH> // "foo" */
2328 if (op_type == OP_READLINE)
2329 PL_expect = XTERMORDORDOR;
79072805
LW
2330 return THING;
2331 }
e3f73d4e
RGS
2332 else if (op_type == OP_BACKTICK && PL_lex_op) {
2333 /* readpipe() vas overriden */
2334 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2335 pl_yylval.opval = PL_lex_op;
9b201d7d 2336 PL_lex_op = NULL;
e3f73d4e
RGS
2337 PL_lex_stuff = NULL;
2338 return THING;
2339 }
79072805 2340
3280af22 2341 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2342 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2343 PL_sublex_info.sub_op = PL_lex_op;
2344 PL_lex_state = LEX_INTERPPUSH;
55497cff 2345
3280af22
NIS
2346 PL_expect = XTERM;
2347 if (PL_lex_op) {
6154021b 2348 pl_yylval.opval = PL_lex_op;
5f66b61c 2349 PL_lex_op = NULL;
55497cff 2350 return PMFUNC;
2351 }
2352 else
2353 return FUNC;
2354}
2355
ffb4593c
NT
2356/*
2357 * S_sublex_push
2358 * Create a new scope to save the lexing state. The scope will be
2359 * ended in S_sublex_done. Returns a '(', starting the function arguments
2360 * to the uc, lc, etc. found before.
2361 * Sets PL_lex_state to LEX_INTERPCONCAT.
2362 */
2363
76e3520e 2364STATIC I32
cea2e8a9 2365S_sublex_push(pTHX)
55497cff 2366{
27da23d5 2367 dVAR;
f46d017c 2368 ENTER;
55497cff 2369
3280af22 2370 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2371 SAVEBOOL(PL_lex_dojoin);
3280af22 2372 SAVEI32(PL_lex_brackets);
3280af22
NIS
2373 SAVEI32(PL_lex_casemods);
2374 SAVEI32(PL_lex_starts);
651b5b28 2375 SAVEI8(PL_lex_state);
7766f137 2376 SAVEVPTR(PL_lex_inpat);
98246f1e 2377 SAVEI16(PL_lex_inwhat);
57843af0 2378 SAVECOPLINE(PL_curcop);
3280af22 2379 SAVEPPTR(PL_bufptr);
8452ff4b 2380 SAVEPPTR(PL_bufend);
3280af22
NIS
2381 SAVEPPTR(PL_oldbufptr);
2382 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2383 SAVEPPTR(PL_last_lop);
2384 SAVEPPTR(PL_last_uni);
3280af22
NIS
2385 SAVEPPTR(PL_linestart);
2386 SAVESPTR(PL_linestr);
8edd5f42
RGS
2387 SAVEGENERICPV(PL_lex_brackstack);
2388 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2389
2390 PL_linestr = PL_lex_stuff;
a0714e2c 2391 PL_lex_stuff = NULL;
3280af22 2392
9cbb5ea2
GS
2393 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2394 = SvPVX(PL_linestr);
3280af22 2395 PL_bufend += SvCUR(PL_linestr);
bd61b366 2396 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2397 SAVEFREESV(PL_linestr);
2398
2399 PL_lex_dojoin = FALSE;
2400 PL_lex_brackets = 0;
a02a5408
JC
2401 Newx(PL_lex_brackstack, 120, char);
2402 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2403 PL_lex_casemods = 0;
2404 *PL_lex_casestack = '\0';
2405 PL_lex_starts = 0;
2406 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2407 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2408
2409 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
2410 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2411 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2412 else
5f66b61c 2413 PL_lex_inpat = NULL;
79072805 2414
55497cff 2415 return '(';
79072805
LW
2416}
2417
ffb4593c
NT
2418/*
2419 * S_sublex_done
2420 * Restores lexer state after a S_sublex_push.
2421 */
2422
76e3520e 2423STATIC I32
cea2e8a9 2424S_sublex_done(pTHX)
79072805 2425{
27da23d5 2426 dVAR;
3280af22 2427 if (!PL_lex_starts++) {
396482e1 2428 SV * const sv = newSVpvs("");
9aa983d2
JH
2429 if (SvUTF8(PL_linestr))
2430 SvUTF8_on(sv);
3280af22 2431 PL_expect = XOPERATOR;
6154021b 2432 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2433 return THING;
2434 }
2435
3280af22
NIS
2436 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2437 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2438 return yylex();
79072805
LW
2439 }
2440
ffb4593c 2441 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
3280af22
NIS
2442 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2443 PL_linestr = PL_lex_repl;
2444 PL_lex_inpat = 0;
2445 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2446 PL_bufend += SvCUR(PL_linestr);
bd61b366 2447 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2448 SAVEFREESV(PL_linestr);
2449 PL_lex_dojoin = FALSE;
2450 PL_lex_brackets = 0;
3280af22
NIS
2451 PL_lex_casemods = 0;
2452 *PL_lex_casestack = '\0';
2453 PL_lex_starts = 0;
25da4f38 2454 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2455 PL_lex_state = LEX_INTERPNORMAL;
2456 PL_lex_starts++;
e9fa98b2
HS
2457 /* we don't clear PL_lex_repl here, so that we can check later
2458 whether this is an evalled subst; that means we rely on the
2459 logic to ensure sublex_done() is called again only via the
2460 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2461 }
e9fa98b2 2462 else {
3280af22 2463 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2464 PL_lex_repl = NULL;
e9fa98b2 2465 }
79072805 2466 return ',';
ffed7fef
LW
2467 }
2468 else {
5db06880
NC
2469#ifdef PERL_MAD
2470 if (PL_madskills) {
cd81e915
NC
2471 if (PL_thiswhite) {
2472 if (!PL_endwhite)
6b29d1f5 2473 PL_endwhite = newSVpvs("");
cd81e915
NC
2474 sv_catsv(PL_endwhite, PL_thiswhite);
2475 PL_thiswhite = 0;
2476 }
2477 if (PL_thistoken)
76f68e9b 2478 sv_setpvs(PL_thistoken,"");
5db06880 2479 else
cd81e915 2480 PL_realtokenstart = -1;
5db06880
NC
2481 }
2482#endif
f46d017c 2483 LEAVE;
3280af22
NIS
2484 PL_bufend = SvPVX(PL_linestr);
2485 PL_bufend += SvCUR(PL_linestr);
2486 PL_expect = XOPERATOR;
09bef843 2487 PL_sublex_info.sub_inwhat = 0;
79072805 2488 return ')';
ffed7fef
LW
2489 }
2490}
2491
02aa26ce
NT
2492/*
2493 scan_const
2494
2495 Extracts a pattern, double-quoted string, or transliteration. This
2496 is terrifying code.
2497
94def140 2498 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2499 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2500 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2501
94def140
TS
2502 Returns a pointer to the character scanned up to. If this is
2503 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2504 successfully parsed), will leave an OP for the substring scanned
6154021b 2505 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2506 by looking at the next characters herself.
2507
02aa26ce
NT
2508 In patterns:
2509 backslashes:
ff3f963a 2510 constants: \N{NAME} only
02aa26ce
NT
2511 case and quoting: \U \Q \E
2512 stops on @ and $, but not for $ as tail anchor
2513
2514 In transliterations:
2515 characters are VERY literal, except for - not at the start or end
94def140
TS
2516 of the string, which indicates a range. If the range is in bytes,
2517 scan_const expands the range to the full set of intermediate
2518 characters. If the range is in utf8, the hyphen is replaced with
2519 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2520
2521 In double-quoted strings:
2522 backslashes:
2523 double-quoted style: \r and \n
ff3f963a 2524 constants: \x31, etc.
94def140 2525 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2526 case and quoting: \U \Q \E
2527 stops on @ and $
2528
2529 scan_const does *not* construct ops to handle interpolated strings.
2530 It stops processing as soon as it finds an embedded $ or @ variable
2531 and leaves it to the caller to work out what's going on.
2532
94def140
TS
2533 embedded arrays (whether in pattern or not) could be:
2534 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2535
2536 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2537
2538 $ in pattern could be $foo or could be tail anchor. Assumption:
2539 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2540 followed by one of "()| \r\n\t"
02aa26ce
NT
2541
2542 \1 (backreferences) are turned into $1
2543
2544 The structure of the code is
2545 while (there's a character to process) {
94def140
TS
2546 handle transliteration ranges
2547 skip regexp comments /(?#comment)/ and codes /(?{code})/
2548 skip #-initiated comments in //x patterns
2549 check for embedded arrays
02aa26ce
NT
2550 check for embedded scalars
2551 if (backslash) {
94def140 2552 deprecate \1 in substitution replacements
02aa26ce
NT
2553 handle string-changing backslashes \l \U \Q \E, etc.
2554 switch (what was escaped) {
94def140 2555 handle \- in a transliteration (becomes a literal -)
ff3f963a 2556 if a pattern and not \N{, go treat as regular character
94def140
TS
2557 handle \132 (octal characters)
2558 handle \x15 and \x{1234} (hex characters)
ff3f963a 2559 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2560 handle \cV (control characters)
2561 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2562 } (end switch)
77a135fe 2563 continue
02aa26ce 2564 } (end if backslash)
77a135fe 2565 handle regular character
02aa26ce 2566 } (end while character to read)
4e553d73 2567
02aa26ce
NT
2568*/
2569
76e3520e 2570STATIC char *
cea2e8a9 2571S_scan_const(pTHX_ char *start)
79072805 2572{
97aff369 2573 dVAR;
3280af22 2574 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2575 SV *sv = newSV(send - start); /* sv for the constant. See
2576 note below on sizing. */
02aa26ce
NT
2577 register char *s = start; /* start of the constant */
2578 register char *d = SvPVX(sv); /* destination for copies */
2579 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2580 bool didrange = FALSE; /* did we just finish a range? */
2b9d42f0 2581 I32 has_utf8 = FALSE; /* Output constant is UTF8 */
77a135fe
KW
2582 I32 this_utf8 = UTF; /* Is the source string assumed
2583 to be UTF8? But, this can
2584 show as true when the source
2585 isn't utf8, as for example
2586 when it is entirely composed
2587 of hex constants */
2588
2589 /* Note on sizing: The scanned constant is placed into sv, which is
2590 * initialized by newSV() assuming one byte of output for every byte of
2591 * input. This routine expects newSV() to allocate an extra byte for a
2592 * trailing NUL, which this routine will append if it gets to the end of
2593 * the input. There may be more bytes of input than output (eg., \N{LATIN
2594 * CAPITAL LETTER A}), or more output than input if the constant ends up
2595 * recoded to utf8, but each time a construct is found that might increase
2596 * the needed size, SvGROW() is called. Its size parameter each time is
2597 * based on the best guess estimate at the time, namely the length used so
2598 * far, plus the length the current construct will occupy, plus room for
2599 * the trailing NUL, plus one byte for every input byte still unscanned */
2600
012bcf8d 2601 UV uv;
4c3a8340
TS
2602#ifdef EBCDIC
2603 UV literal_endpoint = 0;
e294cc5d 2604 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2605#endif
012bcf8d 2606
7918f24d
NC
2607 PERL_ARGS_ASSERT_SCAN_CONST;
2608
2b9d42f0
NIS
2609 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2610 /* If we are doing a trans and we know we want UTF8 set expectation */
2611 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2612 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2613 }
2614
2615
79072805 2616 while (s < send || dorange) {
ff3f963a 2617
02aa26ce 2618 /* get transliterations out of the way (they're most literal) */
3280af22 2619 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2620 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2621 if (dorange) {
1ba5c669
JH
2622 I32 i; /* current expanded character */
2623 I32 min; /* first character in range */
2624 I32 max; /* last character in range */
02aa26ce 2625
e294cc5d
JH
2626#ifdef EBCDIC
2627 UV uvmax = 0;
2628#endif
2629
2630 if (has_utf8
2631#ifdef EBCDIC
2632 && !native_range
2633#endif
2634 ) {
9d4ba2ae 2635 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2636 char *e = d++;
2637 while (e-- > c)
2638 *(e + 1) = *e;
25716404 2639 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2640 /* mark the range as done, and continue */
2641 dorange = FALSE;
2642 didrange = TRUE;
2643 continue;
2644 }
2b9d42f0 2645
95a20fc0 2646 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2647#ifdef EBCDIC
2648 SvGROW(sv,
2649 SvLEN(sv) + (has_utf8 ?
2650 (512 - UTF_CONTINUATION_MARK +
2651 UNISKIP(0x100))
2652 : 256));
2653 /* How many two-byte within 0..255: 128 in UTF-8,
2654 * 96 in UTF-8-mod. */
2655#else
9cbb5ea2 2656 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2657#endif
9cbb5ea2 2658 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2659#ifdef EBCDIC
2660 if (has_utf8) {
2661 int j;
2662 for (j = 0; j <= 1; j++) {
2663 char * const c = (char*)utf8_hop((U8*)d, -1);
2664 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2665 if (j)
2666 min = (U8)uv;
2667 else if (uv < 256)
2668 max = (U8)uv;
2669 else {
2670 max = (U8)0xff; /* only to \xff */
2671 uvmax = uv; /* \x{100} to uvmax */
2672 }
2673 d = c; /* eat endpoint chars */
2674 }
2675 }
2676 else {
2677#endif
2678 d -= 2; /* eat the first char and the - */
2679 min = (U8)*d; /* first char in range */
2680 max = (U8)d[1]; /* last char in range */
2681#ifdef EBCDIC
2682 }
2683#endif
8ada0baa 2684
c2e66d9e 2685 if (min > max) {
01ec43d0 2686 Perl_croak(aTHX_
d1573ac7 2687 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2688 (char)min, (char)max);
c2e66d9e
GS
2689 }
2690
c7f1f016 2691#ifdef EBCDIC
4c3a8340
TS
2692 if (literal_endpoint == 2 &&
2693 ((isLOWER(min) && isLOWER(max)) ||
2694 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2695 if (isLOWER(min)) {
2696 for (i = min; i <= max; i++)
2697 if (isLOWER(i))
db42d148 2698 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2699 } else {
2700 for (i = min; i <= max; i++)
2701 if (isUPPER(i))
db42d148 2702 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2703 }
2704 }
2705 else
2706#endif
2707 for (i = min; i <= max; i++)
e294cc5d
JH
2708#ifdef EBCDIC
2709 if (has_utf8) {
2710 const U8 ch = (U8)NATIVE_TO_UTF(i);
2711 if (UNI_IS_INVARIANT(ch))
2712 *d++ = (U8)i;
2713 else {
2714 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2715 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2716 }
2717 }
2718 else
2719#endif
2720 *d++ = (char)i;
2721
2722#ifdef EBCDIC
2723 if (uvmax) {
2724 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2725 if (uvmax > 0x101)
2726 *d++ = (char)UTF_TO_NATIVE(0xff);
2727 if (uvmax > 0x100)
2728 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2729 }
2730#endif
02aa26ce
NT
2731
2732 /* mark the range as done, and continue */
79072805 2733 dorange = FALSE;
01ec43d0 2734 didrange = TRUE;
4c3a8340
TS
2735#ifdef EBCDIC
2736 literal_endpoint = 0;
2737#endif
79072805 2738 continue;
4e553d73 2739 }
02aa26ce
NT
2740
2741 /* range begins (ignore - as first or last char) */
79072805 2742 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2743 if (didrange) {
1fafa243 2744 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2745 }
e294cc5d
JH
2746 if (has_utf8
2747#ifdef EBCDIC
2748 && !native_range
2749#endif
2750 ) {
25716404 2751 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2752 s++;
2753 continue;
2754 }
79072805
LW
2755 dorange = TRUE;
2756 s++;
01ec43d0
GS
2757 }
2758 else {
2759 didrange = FALSE;
4c3a8340
TS
2760#ifdef EBCDIC
2761 literal_endpoint = 0;
e294cc5d 2762 native_range = TRUE;
4c3a8340 2763#endif
01ec43d0 2764 }
79072805 2765 }
02aa26ce
NT
2766
2767 /* if we get here, we're not doing a transliteration */
2768
0f5d15d6
IZ
2769 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2770 except for the last char, which will be done separately. */
3280af22 2771 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2772 if (s[2] == '#') {
e994fd66 2773 while (s+1 < send && *s != ')')
db42d148 2774 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2775 }
2776 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2777 || (s[2] == '?' && s[3] == '{'))
155aba94 2778 {
cc6b7395 2779 I32 count = 1;
0f5d15d6 2780 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2781 char c;
2782
d9f97599
GS
2783 while (count && (c = *regparse)) {
2784 if (c == '\\' && regparse[1])
2785 regparse++;
4e553d73 2786 else if (c == '{')
cc6b7395 2787 count++;
4e553d73 2788 else if (c == '}')
cc6b7395 2789 count--;
d9f97599 2790 regparse++;
cc6b7395 2791 }
e994fd66 2792 if (*regparse != ')')
5bdf89e7 2793 regparse--; /* Leave one char for continuation. */
0f5d15d6 2794 while (s < regparse)
db42d148 2795 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2796 }
748a9306 2797 }
02aa26ce
NT
2798
2799 /* likewise skip #-initiated comments in //x patterns */
3280af22
NIS
2800 else if (*s == '#' && PL_lex_inpat &&
2801 ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
748a9306 2802 while (s+1 < send && *s != '\n')
db42d148 2803 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2804 }
02aa26ce 2805
5d1d4326 2806 /* check for embedded arrays
da6eedaa 2807 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2808 */
1749ea0d
TS
2809 else if (*s == '@' && s[1]) {
2810 if (isALNUM_lazy_if(s+1,UTF))
2811 break;
2812 if (strchr(":'{$", s[1]))
2813 break;
2814 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2815 break; /* in regexp, neither @+ nor @- are interpolated */
2816 }
02aa26ce
NT
2817
2818 /* check for embedded scalars. only stop if we're sure it's a
2819 variable.
2820 */
79072805 2821 else if (*s == '$') {
3280af22 2822 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2823 break;
77772344 2824 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2825 if (s[1] == '\\') {
2826 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2827 "Possible unintended interpolation of $\\ in regex");
77772344 2828 }
79072805 2829 break; /* in regexp, $ might be tail anchor */
77772344 2830 }
79072805 2831 }
02aa26ce 2832
2b9d42f0
NIS
2833 /* End of else if chain - OP_TRANS rejoin rest */
2834
02aa26ce 2835 /* backslashes */
79072805 2836 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2837 char* e; /* Can be used for ending '}', etc. */
2838
79072805 2839 s++;
02aa26ce 2840
7d0fc23c
KW
2841 /* warn on \1 - \9 in substitution replacements, but note that \11
2842 * is an octal; and \19 is \1 followed by '9' */
3280af22 2843 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2844 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2845 {
a2a5de95 2846 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2847 *--s = '$';
2848 break;
2849 }
02aa26ce
NT
2850
2851 /* string-change backslash escapes */
3280af22 2852 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2853 --s;
2854 break;
2855 }
ff3f963a
KW
2856 /* In a pattern, process \N, but skip any other backslash escapes.
2857 * This is because we don't want to translate an escape sequence
2858 * into a meta symbol and have the regex compiler use the meta
2859 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2860 * in spite of this, we do have to process \N here while the proper
2861 * charnames handler is in scope. See bugs #56444 and #62056.
2862 * There is a complication because \N in a pattern may also stand
2863 * for 'match a non-nl', and not mean a charname, in which case its
2864 * processing should be deferred to the regex compiler. To be a
2865 * charname it must be followed immediately by a '{', and not look
2866 * like \N followed by a curly quantifier, i.e., not something like
2867 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2868 * quantifier */
2869 else if (PL_lex_inpat
2870 && (*s != 'N'
2871 || s[1] != '{'
2872 || regcurly(s + 1)))
2873 {
cc74c5bd
TS
2874 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2875 goto default_action;
2876 }
02aa26ce 2877
79072805 2878 switch (*s) {
02aa26ce
NT
2879
2880 /* quoted - in transliterations */
79072805 2881 case '-':
3280af22 2882 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2883 *d++ = *s++;
2884 continue;
2885 }
2886 /* FALL THROUGH */
2887 default:
11b8faa4 2888 {
a2a5de95
NC
2889 if ((isALPHA(*s) || isDIGIT(*s)))
2890 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2891 "Unrecognized escape \\%c passed through",
2892 *s);
11b8faa4 2893 /* default action is to copy the quoted character */
f9a63242 2894 goto default_action;
11b8faa4 2895 }
02aa26ce 2896
632403cc 2897 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2898 case '0': case '1': case '2': case '3':
2899 case '4': case '5': case '6': case '7':
ba210ebe 2900 {
53305cf1
NC
2901 I32 flags = 0;
2902 STRLEN len = 3;
77a135fe 2903 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2904 s += len;
2905 }
012bcf8d 2906 goto NUM_ESCAPE_INSERT;
02aa26ce 2907
f0a2b745
KW
2908 /* eg. \o{24} indicates the octal constant \024 */
2909 case 'o':
2910 {
2911 STRLEN len;
454155d9 2912 const char* error;
f0a2b745 2913
454155d9 2914 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2915 s += len;
454155d9 2916 if (! valid) {
f0a2b745
KW
2917 yyerror(error);
2918 continue;
2919 }
2920 goto NUM_ESCAPE_INSERT;
2921 }
2922
77a135fe 2923 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2924 case 'x':
a0ed51b3
LW
2925 ++s;
2926 if (*s == '{') {
9d4ba2ae 2927 char* const e = strchr(s, '}');
a4c04bdc
NC
2928 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2929 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2930 STRLEN len;
355860ce 2931
53305cf1 2932 ++s;
adaeee49 2933 if (!e) {
a0ed51b3 2934 yyerror("Missing right brace on \\x{}");
355860ce 2935 continue;
ba210ebe 2936 }
53305cf1 2937 len = e - s;
77a135fe 2938 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2939 s = e + 1;
a0ed51b3
LW
2940 }
2941 else {
ba210ebe 2942 {
53305cf1 2943 STRLEN len = 2;
a4c04bdc 2944 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 2945 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
2946 s += len;
2947 }
012bcf8d
GS
2948 }
2949
2950 NUM_ESCAPE_INSERT:
ff3f963a
KW
2951 /* Insert oct or hex escaped character. There will always be
2952 * enough room in sv since such escapes will be longer than any
2953 * UTF-8 sequence they can end up as, except if they force us
2954 * to recode the rest of the string into utf8 */
ba7cea30 2955
77a135fe 2956 /* Here uv is the ordinal of the next character being added in
ff3f963a 2957 * unicode (converted from native). */
77a135fe 2958 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 2959 if (!has_utf8 && uv > 255) {
77a135fe
KW
2960 /* Might need to recode whatever we have accumulated so
2961 * far if it contains any chars variant in utf8 or
2962 * utf-ebcdic. */
2963
2964 SvCUR_set(sv, d - SvPVX_const(sv));
2965 SvPOK_on(sv);
2966 *d = '\0';
77a135fe 2967 /* See Note on sizing above. */
7bf79863
KW
2968 sv_utf8_upgrade_flags_grow(sv,
2969 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
2970 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
2971 d = SvPVX(sv) + SvCUR(sv);
2972 has_utf8 = TRUE;
012bcf8d
GS
2973 }
2974
77a135fe
KW
2975 if (has_utf8) {
2976 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
2977 if (PL_lex_inwhat == OP_TRANS &&
2978 PL_sublex_info.sub_op) {
2979 PL_sublex_info.sub_op->op_private |=
2980 (PL_lex_repl ? OPpTRANS_FROM_UTF
2981 : OPpTRANS_TO_UTF);
f9a63242 2982 }
e294cc5d
JH
2983#ifdef EBCDIC
2984 if (uv > 255 && !dorange)
2985 native_range = FALSE;
2986#endif
012bcf8d 2987 }
a0ed51b3 2988 else {
012bcf8d 2989 *d++ = (char)uv;
a0ed51b3 2990 }
012bcf8d
GS
2991 }
2992 else {
c4d5f83a 2993 *d++ = (char) uv;
a0ed51b3 2994 }
79072805 2995 continue;
02aa26ce 2996
4a2d328f 2997 case 'N':
ff3f963a
KW
2998 /* In a non-pattern \N must be a named character, like \N{LATIN
2999 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3000 * mean to match a non-newline. For non-patterns, named
3001 * characters are converted to their string equivalents. In
3002 * patterns, named characters are not converted to their
3003 * ultimate forms for the same reasons that other escapes
3004 * aren't. Instead, they are converted to the \N{U+...} form
3005 * to get the value from the charnames that is in effect right
3006 * now, while preserving the fact that it was a named character
3007 * so that the regex compiler knows this */
3008
3009 /* This section of code doesn't generally use the
3010 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3011 * a close examination of this macro and determined it is a
3012 * no-op except on utfebcdic variant characters. Every
3013 * character generated by this that would normally need to be
3014 * enclosed by this macro is invariant, so the macro is not
3015 * needed, and would complicate use of copy(). There are other
3016 * parts of this file where the macro is used inconsistently,
3017 * but are saved by it being a no-op */
3018
3019 /* The structure of this section of code (besides checking for
3020 * errors and upgrading to utf8) is:
3021 * Further disambiguate between the two meanings of \N, and if
3022 * not a charname, go process it elsewhere
0a96133f
KW
3023 * If of form \N{U+...}, pass it through if a pattern;
3024 * otherwise convert to utf8
3025 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3026 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3027
3028 /* Here, s points to the 'N'; the test below is guaranteed to
3029 * succeed if we are being called on a pattern as we already
3030 * know from a test above that the next character is a '{'.
3031 * On a non-pattern \N must mean 'named sequence, which
3032 * requires braces */
3033 s++;
3034 if (*s != '{') {
3035 yyerror("Missing braces on \\N{}");
3036 continue;
3037 }
3038 s++;
3039
0a96133f 3040 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3041 if (! (e = strchr(s, '}'))) {
3042 if (! PL_lex_inpat) {
5777a3f7 3043 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3044 } else {
3045 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3046 }
0a96133f 3047 continue;
ff3f963a 3048 }
cddc7ef4 3049
ff3f963a 3050 /* Here it looks like a named character */
cddc7ef4 3051
ff3f963a
KW
3052 if (PL_lex_inpat) {
3053
3054 /* XXX This block is temporary code. \N{} implies that the
3055 * pattern is to have Unicode semantics, and therefore
3056 * currently has to be encoded in utf8. By putting it in
3057 * utf8 now, we save a whole pass in the regular expression
3058 * compiler. Once that code is changed so Unicode
3059 * semantics doesn't necessarily have to be in utf8, this
3060 * block should be removed */
3061 if (!has_utf8) {
77a135fe 3062 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3063 SvPOK_on(sv);
e4f3eed8 3064 *d = '\0';
77a135fe 3065 /* See Note on sizing above. */
7bf79863 3066 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3067 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3068 /* 5 = '\N{' + cur char + NUL */
3069 (STRLEN)(send - s) + 5);
f08d6ad9 3070 d = SvPVX(sv) + SvCUR(sv);
89491803 3071 has_utf8 = TRUE;
ff3f963a
KW
3072 }
3073 }
423cee85 3074
ff3f963a
KW
3075 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3076 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3077 | PERL_SCAN_DISALLOW_PREFIX;
3078 STRLEN len;
3079
3080 /* For \N{U+...}, the '...' is a unicode value even on
3081 * EBCDIC machines */
3082 s += 2; /* Skip to next char after the 'U+' */
3083 len = e - s;
3084 uv = grok_hex(s, &len, &flags, NULL);
3085 if (len == 0 || len != (STRLEN)(e - s)) {
3086 yyerror("Invalid hexadecimal number in \\N{U+...}");
3087 s = e + 1;
3088 continue;
3089 }
3090
3091 if (PL_lex_inpat) {
3092
3093 /* Pass through to the regex compiler unchanged. The
3094 * reason we evaluated the number above is to make sure
0a96133f 3095 * there wasn't a syntax error. */
ff3f963a
KW
3096 s -= 5; /* Include the '\N{U+' */
3097 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3098 d += e - s + 1;
3099 }
3100 else { /* Not a pattern: convert the hex to string */
3101
3102 /* If destination is not in utf8, unconditionally
3103 * recode it to be so. This is because \N{} implies
3104 * Unicode semantics, and scalars have to be in utf8
3105 * to guarantee those semantics */
3106 if (! has_utf8) {
3107 SvCUR_set(sv, d - SvPVX_const(sv));
3108 SvPOK_on(sv);
3109 *d = '\0';
3110 /* See Note on sizing above. */
3111 sv_utf8_upgrade_flags_grow(
3112 sv,
3113 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3114 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3115 d = SvPVX(sv) + SvCUR(sv);
3116 has_utf8 = TRUE;
3117 }
3118
3119 /* Add the string to the output */
3120 if (UNI_IS_INVARIANT(uv)) {
3121 *d++ = (char) uv;
3122 }
3123 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3124 }
3125 }
3126 else { /* Here is \N{NAME} but not \N{U+...}. */
3127
3128 SV *res; /* result from charnames */
3129 const char *str; /* the string in 'res' */
3130 STRLEN len; /* its length */
3131
3132 /* Get the value for NAME */
3133 res = newSVpvn(s, e - s);
3134 res = new_constant( NULL, 0, "charnames",
3135 /* includes all of: \N{...} */
3136 res, NULL, s - 3, e - s + 4 );
3137
3138 /* Most likely res will be in utf8 already since the
3139 * standard charnames uses pack U, but a custom translator
3140 * can leave it otherwise, so make sure. XXX This can be
3141 * revisited to not have charnames use utf8 for characters
3142 * that don't need it when regexes don't have to be in utf8
3143 * for Unicode semantics. If doing so, remember EBCDIC */
3144 sv_utf8_upgrade(res);
3145 str = SvPV_const(res, len);
3146
3147 /* Don't accept malformed input */
3148 if (! is_utf8_string((U8 *) str, len)) {
3149 yyerror("Malformed UTF-8 returned by \\N");
3150 }
3151 else if (PL_lex_inpat) {
3152
3153 if (! len) { /* The name resolved to an empty string */
3154 Copy("\\N{}", d, 4, char);
3155 d += 4;
3156 }
3157 else {
3158 /* In order to not lose information for the regex
3159 * compiler, pass the result in the specially made
3160 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3161 * the code points in hex of each character
3162 * returned by charnames */
3163
3164 const char *str_end = str + len;
3165 STRLEN char_length; /* cur char's byte length */
3166 STRLEN output_length; /* and the number of bytes
3167 after this is translated
3168 into hex digits */
3169 const STRLEN off = d - SvPVX_const(sv);
3170
3171 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3172 * max('U+', '.'); and 1 for NUL */
3173 char hex_string[2 * UTF8_MAXBYTES + 5];
3174
3175 /* Get the first character of the result. */
3176 U32 uv = utf8n_to_uvuni((U8 *) str,
3177 len,
3178 &char_length,
3179 UTF8_ALLOW_ANYUV);
3180
3181 /* The call to is_utf8_string() above hopefully
3182 * guarantees that there won't be an error. But
3183 * it's easy here to make sure. The function just
3184 * above warns and returns 0 if invalid utf8, but
3185 * it can also return 0 if the input is validly a
3186 * NUL. Disambiguate */
3187 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3188 uv = UNICODE_REPLACEMENT;
3189 }
3190
3191 /* Convert first code point to hex, including the
3192 * boiler plate before it */
3193 sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
3194 output_length = strlen(hex_string);
3195
3196 /* Make sure there is enough space to hold it */
3197 d = off + SvGROW(sv, off
3198 + output_length
3199 + (STRLEN)(send - e)
3200 + 2); /* '}' + NUL */
3201 /* And output it */
3202 Copy(hex_string, d, output_length, char);
3203 d += output_length;
3204
3205 /* For each subsequent character, append dot and
3206 * its ordinal in hex */
3207 while ((str += char_length) < str_end) {
3208 const STRLEN off = d - SvPVX_const(sv);
3209 U32 uv = utf8n_to_uvuni((U8 *) str,
3210 str_end - str,
3211 &char_length,
3212 UTF8_ALLOW_ANYUV);
3213 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3214 uv = UNICODE_REPLACEMENT;
3215 }
3216
3217 sprintf(hex_string, ".%X", (unsigned int) uv);
3218 output_length = strlen(hex_string);
3219
3220 d = off + SvGROW(sv, off
3221 + output_length
3222 + (STRLEN)(send - e)
3223 + 2); /* '}' + NUL */
3224 Copy(hex_string, d, output_length, char);
3225 d += output_length;
3226 }
3227
3228 *d++ = '}'; /* Done. Add the trailing brace */
3229 }
3230 }
3231 else { /* Here, not in a pattern. Convert the name to a
3232 * string. */
3233
3234 /* If destination is not in utf8, unconditionally
3235 * recode it to be so. This is because \N{} implies
3236 * Unicode semantics, and scalars have to be in utf8
3237 * to guarantee those semantics */
3238 if (! has_utf8) {
3239 SvCUR_set(sv, d - SvPVX_const(sv));
3240 SvPOK_on(sv);
3241 *d = '\0';
3242 /* See Note on sizing above. */
3243 sv_utf8_upgrade_flags_grow(sv,
3244 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3245 len + (STRLEN)(send - s) + 1);
3246 d = SvPVX(sv) + SvCUR(sv);
3247 has_utf8 = TRUE;
3248 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3249
3250 /* See Note on sizing above. (NOTE: SvCUR() is not
3251 * set correctly here). */
3252 const STRLEN off = d - SvPVX_const(sv);
3253 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3254 }
3255 Copy(str, d, len, char);
3256 d += len;
423cee85 3257 }
423cee85 3258 SvREFCNT_dec(res);
cb233ae3
KW
3259
3260 /* Deprecate non-approved name syntax */
3261 if (ckWARN_d(WARN_DEPRECATED)) {
3262 bool problematic = FALSE;
3263 char* i = s;
3264
3265 /* For non-ut8 input, look to see that the first
3266 * character is an alpha, then loop through the rest
3267 * checking that each is a continuation */
3268 if (! this_utf8) {
3269 if (! isALPHAU(*i)) problematic = TRUE;
3270 else for (i = s + 1; i < e; i++) {
3271 if (isCHARNAME_CONT(*i)) continue;
3272 problematic = TRUE;
3273 break;
3274 }
3275 }
3276 else {
3277 /* Similarly for utf8. For invariants can check
3278 * directly. We accept anything above the latin1
3279 * range because it is immaterial to Perl if it is
3280 * correct or not, and is expensive to check. But
3281 * it is fairly easy in the latin1 range to convert
3282 * the variants into a single character and check
3283 * those */
3284 if (UTF8_IS_INVARIANT(*i)) {
3285 if (! isALPHAU(*i)) problematic = TRUE;
3286 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
3287 if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i,
3288 *(i+1)))))
3289 {
3290 problematic = TRUE;
3291 }
3292 }
3293 if (! problematic) for (i = s + UTF8SKIP(s);
3294 i < e;
3295 i+= UTF8SKIP(i))
3296 {
3297 if (UTF8_IS_INVARIANT(*i)) {
3298 if (isCHARNAME_CONT(*i)) continue;
3299 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3300 continue;
3301 } else if (isCHARNAME_CONT(
3302 UNI_TO_NATIVE(
3303 UTF8_ACCUMULATE(*i, *(i+1)))))
3304 {
3305 continue;
3306 }
3307 problematic = TRUE;
3308 break;
3309 }
3310 }
3311 if (problematic) {
6e1bad6c
KW
3312 /* The e-i passed to the final %.*s makes sure that
3313 * should the trailing NUL be missing that this
3314 * print won't run off the end of the string */
cb233ae3 3315 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3316 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3317 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3318 }
3319 }
3320 } /* End \N{NAME} */
ff3f963a
KW
3321#ifdef EBCDIC
3322 if (!dorange)
3323 native_range = FALSE; /* \N{} is defined to be Unicode */
3324#endif
3325 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3326 continue;
3327
02aa26ce 3328 /* \c is a control character */
79072805
LW
3329 case 'c':
3330 s++;
961ce445 3331 if (s < send) {
f9d13529 3332 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3333 }
961ce445
RGS
3334 else {
3335 yyerror("Missing control char name in \\c");
3336 }
79072805 3337 continue;
02aa26ce
NT
3338
3339 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3340 case 'b':
db42d148 3341 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3342 break;
3343 case 'n':
db42d148 3344 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3345 break;
3346 case 'r':
db42d148 3347 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3348 break;
3349 case 'f':
db42d148 3350 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3351 break;
3352 case 't':
db42d148 3353 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3354 break;
34a3fe2a 3355 case 'e':
db42d148 3356 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3357 break;
3358 case 'a':
db42d148 3359 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3360 break;
02aa26ce
NT
3361 } /* end switch */
3362
79072805
LW
3363 s++;
3364 continue;
02aa26ce 3365 } /* end if (backslash) */
4c3a8340
TS
3366#ifdef EBCDIC
3367 else
3368 literal_endpoint++;
3369#endif
02aa26ce 3370
f9a63242 3371 default_action:
77a135fe
KW
3372 /* If we started with encoded form, or already know we want it,
3373 then encode the next character */
3374 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3375 STRLEN len = 1;
77a135fe
KW
3376
3377
3378 /* One might think that it is wasted effort in the case of the
3379 * source being utf8 (this_utf8 == TRUE) to take the next character
3380 * in the source, convert it to an unsigned value, and then convert
3381 * it back again. But the source has not been validated here. The
3382 * routine that does the conversion checks for errors like
3383 * malformed utf8 */
3384
5f66b61c
AL
3385 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3386 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3387 if (!has_utf8) {
3388 SvCUR_set(sv, d - SvPVX_const(sv));
3389 SvPOK_on(sv);
3390 *d = '\0';
77a135fe 3391 /* See Note on sizing above. */
7bf79863
KW
3392 sv_utf8_upgrade_flags_grow(sv,
3393 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3394 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3395 d = SvPVX(sv) + SvCUR(sv);
3396 has_utf8 = TRUE;
3397 } else if (need > len) {
3398 /* encoded value larger than old, may need extra space (NOTE:
3399 * SvCUR() is not set correctly here). See Note on sizing
3400 * above. */
9d4ba2ae 3401 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3402 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3403 }
77a135fe
KW
3404 s += len;
3405
5f66b61c 3406 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3407#ifdef EBCDIC
3408 if (uv > 255 && !dorange)
3409 native_range = FALSE;
3410#endif
2b9d42f0
NIS
3411 }
3412 else {
3413 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3414 }
02aa26ce
NT
3415 } /* while loop to process each character */
3416
3417 /* terminate the string and set up the sv */
79072805 3418 *d = '\0';
95a20fc0 3419 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3420 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3421 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3422
79072805 3423 SvPOK_on(sv);
9f4817db 3424 if (PL_encoding && !has_utf8) {
d0063567
DK
3425 sv_recode_to_utf8(sv, PL_encoding);
3426 if (SvUTF8(sv))
3427 has_utf8 = TRUE;
9f4817db 3428 }
2b9d42f0 3429 if (has_utf8) {
7e2040f0 3430 SvUTF8_on(sv);
2b9d42f0 3431 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3432 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3433 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3434 }
3435 }
79072805 3436
02aa26ce 3437 /* shrink the sv if we allocated more than we used */
79072805 3438 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3439 SvPV_shrink_to_cur(sv);
79072805 3440 }
02aa26ce 3441
6154021b 3442 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3443 if (s > PL_bufptr) {
eb0d8d16
NC
3444 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3445 const char *const key = PL_lex_inpat ? "qr" : "q";
3446 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3447 const char *type;
3448 STRLEN typelen;
3449
3450 if (PL_lex_inwhat == OP_TRANS) {
3451 type = "tr";
3452 typelen = 2;
3453 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3454 type = "s";
3455 typelen = 1;
3456 } else {
3457 type = "qq";
3458 typelen = 2;
3459 }
3460
3461 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3462 type, typelen);
3463 }
6154021b 3464 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3465 } else
8990e307 3466 SvREFCNT_dec(sv);
79072805
LW
3467 return s;
3468}
3469
ffb4593c
NT
3470/* S_intuit_more
3471 * Returns TRUE if there's more to the expression (e.g., a subscript),
3472 * FALSE otherwise.
ffb4593c
NT
3473 *
3474 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3475 *
3476 * ->[ and ->{ return TRUE
3477 * { and [ outside a pattern are always subscripts, so return TRUE
3478 * if we're outside a pattern and it's not { or [, then return FALSE
3479 * if we're in a pattern and the first char is a {
3480 * {4,5} (any digits around the comma) returns FALSE
3481 * if we're in a pattern and the first char is a [
3482 * [] returns FALSE
3483 * [SOMETHING] has a funky algorithm to decide whether it's a
3484 * character class or not. It has to deal with things like
3485 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3486 * anything else returns TRUE
3487 */
3488
9cbb5ea2
GS
3489/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3490
76e3520e 3491STATIC int
cea2e8a9 3492S_intuit_more(pTHX_ register char *s)
79072805 3493{
97aff369 3494 dVAR;
7918f24d
NC
3495
3496 PERL_ARGS_ASSERT_INTUIT_MORE;
3497
3280af22 3498 if (PL_lex_brackets)
79072805
LW
3499 return TRUE;
3500 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3501 return TRUE;
3502 if (*s != '{' && *s != '[')
3503 return FALSE;
3280af22 3504 if (!PL_lex_inpat)
79072805
LW
3505 return TRUE;
3506
3507 /* In a pattern, so maybe we have {n,m}. */
3508 if (*s == '{') {
b3155d95 3509 if (regcurly(s)) {
79072805 3510 return FALSE;
b3155d95 3511 }
79072805 3512 return TRUE;
79072805
LW
3513 }
3514
3515 /* On the other hand, maybe we have a character class */
3516
3517 s++;
3518 if (*s == ']' || *s == '^')
3519 return FALSE;
3520 else {
ffb4593c 3521 /* this is terrifying, and it works */
79072805
LW
3522 int weight = 2; /* let's weigh the evidence */
3523 char seen[256];
f27ffc4a 3524 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3525 const char * const send = strchr(s,']');
3280af22 3526 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3527
3528 if (!send) /* has to be an expression */
3529 return TRUE;
3530
3531 Zero(seen,256,char);
3532 if (*s == '$')
3533 weight -= 3;
3534 else if (isDIGIT(*s)) {
3535 if (s[1] != ']') {
3536 if (isDIGIT(s[1]) && s[2] == ']')
3537 weight -= 10;
3538 }
3539 else
3540 weight -= 100;
3541 }
3542 for (; s < send; s++) {
3543 last_un_char = un_char;
3544 un_char = (unsigned char)*s;
3545 switch (*s) {
3546 case '@':
3547 case '&':
3548 case '$':
3549 weight -= seen[un_char] * 10;
7e2040f0 3550 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3551 int len;
8903cb82 3552 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3553 len = (int)strlen(tmpbuf);
3554 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3555 weight -= 100;
3556 else
3557 weight -= 10;
3558 }
3559 else if (*s == '$' && s[1] &&
93a17b20
LW
3560 strchr("[#!%*<>()-=",s[1])) {
3561 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3562 weight -= 10;
3563 else
3564 weight -= 1;
3565 }
3566 break;
3567 case '\\':
3568 un_char = 254;
3569 if (s[1]) {
93a17b20 3570 if (strchr("wds]",s[1]))
79072805 3571 weight += 100;
10edeb5d 3572 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3573 weight += 1;
93a17b20 3574 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3575 weight += 40;
3576 else if (isDIGIT(s[1])) {
3577 weight += 40;
3578 while (s[1] && isDIGIT(s[1]))
3579 s++;
3580 }
3581 }
3582 else
3583 weight += 100;
3584 break;
3585 case '-':
3586 if (s[1] == '\\')
3587 weight += 50;
93a17b20 3588 if (strchr("aA01! ",last_un_char))
79072805 3589 weight += 30;
93a17b20 3590 if (strchr("zZ79~",s[1]))
79072805 3591 weight += 30;
f27ffc4a
GS
3592 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3593 weight -= 5; /* cope with negative subscript */
79072805
LW
3594 break;
3595 default:
3792a11b
NC
3596 if (!isALNUM(last_un_char)
3597 && !(last_un_char == '$' || last_un_char == '@'
3598 || last_un_char == '&')
3599 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3600 char *d = tmpbuf;
3601 while (isALPHA(*s))
3602 *d++ = *s++;
3603 *d = '\0';
5458a98a 3604 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3605 weight -= 150;
3606 }
3607 if (un_char == last_un_char + 1)
3608 weight += 5;
3609 weight -= seen[un_char];
3610 break;
3611 }
3612 seen[un_char]++;
3613 }
3614 if (weight >= 0) /* probably a character class */
3615 return FALSE;
3616 }
3617
3618 return TRUE;
3619}
ffed7fef 3620
ffb4593c
NT
3621/*
3622 * S_intuit_method
3623 *
3624 * Does all the checking to disambiguate
3625 * foo bar
3626 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3627 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3628 *
3629 * First argument is the stuff after the first token, e.g. "bar".
3630 *
3631 * Not a method if bar is a filehandle.
3632 * Not a method if foo is a subroutine prototyped to take a filehandle.
3633 * Not a method if it's really "Foo $bar"
3634 * Method if it's "foo $bar"
3635 * Not a method if it's really "print foo $bar"
3636 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3637 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3638 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3639 * =>
3640 */
3641
76e3520e 3642STATIC int
62d55b22 3643S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3644{
97aff369 3645 dVAR;
a0d0e21e 3646 char *s = start + (*start == '$');
3280af22 3647 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3648 STRLEN len;
3649 GV* indirgv;
5db06880
NC
3650#ifdef PERL_MAD
3651 int soff;
3652#endif
a0d0e21e 3653
7918f24d
NC
3654 PERL_ARGS_ASSERT_INTUIT_METHOD;
3655
a0d0e21e 3656 if (gv) {
62d55b22 3657 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3658 return 0;
62d55b22
NC
3659 if (cv) {
3660 if (SvPOK(cv)) {
3661 const char *proto = SvPVX_const(cv);
3662 if (proto) {
3663 if (*proto == ';')
3664 proto++;
3665 if (*proto == '*')
3666 return 0;
3667 }
b6c543e3
IZ
3668 }
3669 } else
c35e046a 3670 gv = NULL;
a0d0e21e 3671 }
8903cb82 3672 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3673 /* start is the beginning of the possible filehandle/object,
3674 * and s is the end of it
3675 * tmpbuf is a copy of it
3676 */
3677
a0d0e21e 3678 if (*start == '$') {
3ef1310e
RGS
3679 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3680 isUPPER(*PL_tokenbuf))
a0d0e21e 3681 return 0;
5db06880
NC
3682#ifdef PERL_MAD
3683 len = start - SvPVX(PL_linestr);
3684#endif
29595ff2 3685 s = PEEKSPACE(s);
f0092767 3686#ifdef PERL_MAD
5db06880
NC
3687 start = SvPVX(PL_linestr) + len;
3688#endif
3280af22
NIS
3689 PL_bufptr = start;
3690 PL_expect = XREF;
a0d0e21e
LW
3691 return *s == '(' ? FUNCMETH : METHOD;
3692 }
5458a98a 3693 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3694 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3695 len -= 2;
3696 tmpbuf[len] = '\0';
5db06880
NC
3697#ifdef PERL_MAD
3698 soff = s - SvPVX(PL_linestr);
3699#endif
c3e0f903
GS
3700 goto bare_package;
3701 }
90e5519e 3702 indirgv = gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PVCV);
8ebc5c01 3703 if (indirgv && GvCVu(indirgv))
a0d0e21e
LW
3704 return 0;
3705 /* filehandle or package name makes it a method */
da51bb9b 3706 if (!gv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, 0)) {
5db06880
NC
3707#ifdef PERL_MAD
3708 soff = s - SvPVX(PL_linestr);
3709#endif
29595ff2 3710 s = PEEKSPACE(s);
3280af22 3711 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
55497cff 3712 return 0; /* no assumptions -- "=>" quotes bearword */
c3e0f903 3713 bare_package:
cd81e915 3714 start_force(PL_curforce);
9ded7720 3715 NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
64142370 3716 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
9ded7720 3717 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
5db06880
NC
3718 if (PL_madskills)
3719 curmad('X', newSVpvn(start,SvPVX(PL_linestr) + soff - start));
3280af22 3720 PL_expect = XTERM;
a0d0e21e 3721 force_next(WORD);
3280af22 3722 PL_bufptr = s;
5db06880
NC
3723#ifdef PERL_MAD
3724 PL_bufptr = SvPVX(PL_linestr) + soff; /* restart before space */
3725#endif
a0d0e21e
LW
3726 return *s == '(' ? FUNCMETH : METHOD;
3727 }
3728 }
3729 return 0;
3730}
3731
16d20bd9 3732/* Encoded script support. filter_add() effectively inserts a
4e553d73 3733 * 'pre-processing' function into the current source input stream.
16d20bd9
AD
3734 * Note that the filter function only applies to the current source file
3735 * (e.g., it will not affect files 'require'd or 'use'd by this one).
3736 *
3737 * The datasv parameter (which may be NULL) can be used to pass
3738 * private data to this instance of the filter. The filter function
3739 * can recover the SV using the FILTER_DATA macro and use it to
3740 * store private buffers and state information.
3741 *
3742 * The supplied datasv parameter is upgraded to a PVIO type
4755096e 3743 * and the IoDIRP/IoANY field is used to store the function pointer,
e0c19803 3744 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
16d20bd9
AD
3745 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
3746 * private use must be set using malloc'd pointers.
3747 */
16d20bd9
AD
3748
3749SV *
864dbfa3 3750Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
16d20bd9 3751{
97aff369 3752 dVAR;
f4c556ac 3753 if (!funcp)
a0714e2c 3754 return NULL;
f4c556ac 3755
5486870f
DM
3756 if (!PL_parser)
3757 return NULL;
3758
3280af22
NIS
3759 if (!PL_rsfp_filters)
3760 PL_rsfp_filters = newAV();
16d20bd9 3761 if (!datasv)
561b68a9 3762 datasv = newSV(0);
862a34c6 3763 SvUPGRADE(datasv, SVt_PVIO);
8141890a 3764 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
e0c19803 3765 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
f4c556ac 3766 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
55662e27
JH
3767 FPTR2DPTR(void *, IoANY(datasv)),
3768 SvPV_nolen(datasv)));
3280af22
NIS
3769 av_unshift(PL_rsfp_filters, 1);
3770 av_store(PL_rsfp_filters, 0, datasv) ;
16d20bd9
AD
3771 return(datasv);
3772}
4e553d73 3773
16d20bd9
AD
3774
3775/* Delete most recently added instance of this filter function. */
a0d0e21e 3776void
864dbfa3 3777Perl_filter_del(pTHX_ filter_t funcp)
16d20bd9