This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[PATCH] function to parse Perl statement sequence
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
51#define PL_lex_brackstack (PL_parser->lex_brackstack)
52#define PL_lex_casemods (PL_parser->lex_casemods)
53#define PL_lex_casestack (PL_parser->lex_casestack)
54#define PL_lex_defer (PL_parser->lex_defer)
55#define PL_lex_dojoin (PL_parser->lex_dojoin)
56#define PL_lex_expect (PL_parser->lex_expect)
57#define PL_lex_formbrack (PL_parser->lex_formbrack)
58#define PL_lex_inpat (PL_parser->lex_inpat)
59#define PL_lex_inwhat (PL_parser->lex_inwhat)
60#define PL_lex_op (PL_parser->lex_op)
61#define PL_lex_repl (PL_parser->lex_repl)
62#define PL_lex_starts (PL_parser->lex_starts)
63#define PL_lex_stuff (PL_parser->lex_stuff)
64#define PL_multi_start (PL_parser->multi_start)
65#define PL_multi_open (PL_parser->multi_open)
66#define PL_multi_close (PL_parser->multi_close)
67#define PL_pending_ident (PL_parser->pending_ident)
68#define PL_preambled (PL_parser->preambled)
69#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 70#define PL_linestr (PL_parser->linestr)
c2598295
DM
71#define PL_expect (PL_parser->expect)
72#define PL_copline (PL_parser->copline)
f06b5848
DM
73#define PL_bufptr (PL_parser->bufptr)
74#define PL_oldbufptr (PL_parser->oldbufptr)
75#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
76#define PL_linestart (PL_parser->linestart)
77#define PL_bufend (PL_parser->bufend)
78#define PL_last_uni (PL_parser->last_uni)
79#define PL_last_lop (PL_parser->last_lop)
80#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 81#define PL_lex_state (PL_parser->lex_state)
2f9285f8 82#define PL_rsfp (PL_parser->rsfp)
5486870f 83#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
84#define PL_in_my (PL_parser->in_my)
85#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 86#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 87#define PL_multi_end (PL_parser->multi_end)
13765c85 88#define PL_error_count (PL_parser->error_count)
199e78b7
DM
89
90#ifdef PERL_MAD
91# define PL_endwhite (PL_parser->endwhite)
92# define PL_faketokens (PL_parser->faketokens)
93# define PL_lasttoke (PL_parser->lasttoke)
94# define PL_nextwhite (PL_parser->nextwhite)
95# define PL_realtokenstart (PL_parser->realtokenstart)
96# define PL_skipwhite (PL_parser->skipwhite)
97# define PL_thisclose (PL_parser->thisclose)
98# define PL_thismad (PL_parser->thismad)
99# define PL_thisopen (PL_parser->thisopen)
100# define PL_thisstuff (PL_parser->thisstuff)
101# define PL_thistoken (PL_parser->thistoken)
102# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
103# define PL_thiswhite (PL_parser->thiswhite)
104# define PL_nexttoke (PL_parser->nexttoke)
105# define PL_curforce (PL_parser->curforce)
106#else
107# define PL_nexttoke (PL_parser->nexttoke)
108# define PL_nexttype (PL_parser->nexttype)
109# define PL_nextval (PL_parser->nextval)
199e78b7
DM
110#endif
111
16173588
NC
112/* This can't be done with embed.fnc, because struct yy_parser contains a
113 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
114static int
115S_pending_ident(pTHX);
199e78b7 116
0bd48802 117static const char ident_too_long[] = "Identifier too long";
8903cb82 118
29595ff2 119#ifdef PERL_MAD
29595ff2 120# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 121# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 122#else
5db06880 123# define CURMAD(slot,sv)
9ded7720 124# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
125#endif
126
9059aa12
LW
127#define XFAKEBRACK 128
128#define XENUMMASK 127
129
39e02b42
JH
130#ifdef USE_UTF8_SCRIPTS
131# define UTF (!IN_BYTES)
2b9d42f0 132#else
746b446a 133# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 134#endif
a0ed51b3 135
b1fc3636
CJ
136/* The maximum number of characters preceding the unrecognized one to display */
137#define UNRECOGNIZED_PRECEDE_COUNT 10
138
61f0cdd9 139/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
140 * 1999-02-27 mjd-perl-patch@plover.com */
141#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
142
bf4acbe4 143#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 144
ffb4593c
NT
145/* LEX_* are values for PL_lex_state, the state of the lexer.
146 * They are arranged oddly so that the guard on the switch statement
79072805
LW
147 * can get by with a single comparison (if the compiler is smart enough).
148 */
149
fb73857a
PP
150/* #define LEX_NOTPARSING 11 is done in perl.h. */
151
b6007c36
DM
152#define LEX_NORMAL 10 /* normal code (ie not within "...") */
153#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
154#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
155#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
156#define LEX_INTERPSTART 6 /* expecting the start of a $var */
157
158 /* at end of code, eg "$x" followed by: */
159#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
160#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
161
162#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
163 string or after \E, $foo, etc */
164#define LEX_INTERPCONST 2 /* NOT USED */
165#define LEX_FORMLINE 1 /* expecting a format line */
166#define LEX_KNOWNEXT 0 /* next token known; just return it */
167
79072805 168
bbf60fe6 169#ifdef DEBUGGING
27da23d5 170static const char* const lex_state_names[] = {
bbf60fe6
DM
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182};
183#endif
184
79072805
LW
185#ifdef ff_next
186#undef ff_next
d48672a2
LW
187#endif
188
79072805 189#include "keywords.h"
fe14fcc3 190
ffb4593c
NT
191/* CLINE is a macro that ensures PL_copline has a sane value */
192
ae986130
LW
193#ifdef CLINE
194#undef CLINE
195#endif
57843af0 196#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 197
5db06880 198#ifdef PERL_MAD
29595ff2
NC
199# define SKIPSPACE0(s) skipspace0(s)
200# define SKIPSPACE1(s) skipspace1(s)
201# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
202# define PEEKSPACE(s) skipspace2(s,0)
203#else
204# define SKIPSPACE0(s) skipspace(s)
205# define SKIPSPACE1(s) skipspace(s)
206# define SKIPSPACE2(s,tsv) skipspace(s)
207# define PEEKSPACE(s) skipspace(s)
208#endif
209
ffb4593c
NT
210/*
211 * Convenience functions to return different tokens and prime the
9cbb5ea2 212 * lexer for the next token. They all take an argument.
ffb4593c
NT
213 *
214 * TOKEN : generic token (used for '(', DOLSHARP, etc)
215 * OPERATOR : generic operator
216 * AOPERATOR : assignment operator
217 * PREBLOCK : beginning the block after an if, while, foreach, ...
218 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
219 * PREREF : *EXPR where EXPR is not a simple identifier
220 * TERM : expression term
221 * LOOPX : loop exiting command (goto, last, dump, etc)
222 * FTST : file test operator
223 * FUN0 : zero-argument function
2d2e263d 224 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
225 * BOop : bitwise or or xor
226 * BAop : bitwise and
227 * SHop : shift operator
228 * PWop : power operator
9cbb5ea2 229 * PMop : pattern-matching operator
ffb4593c
NT
230 * Aop : addition-level operator
231 * Mop : multiplication-level operator
232 * Eop : equality-testing operator
e5edeb50 233 * Rop : relational operator <= != gt
ffb4593c
NT
234 *
235 * Also see LOP and lop() below.
236 */
237
998054bd 238#ifdef DEBUGGING /* Serve -DT. */
704d4215 239# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 240#else
bbf60fe6 241# define REPORT(retval) (retval)
998054bd
SC
242#endif
243
bbf60fe6
DM
244#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
245#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
246#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
247#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
248#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
249#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
250#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
251#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
252#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
253#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
254#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
255#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
256#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
257#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
258#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
259#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
260#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
261#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
262#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
263#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 264
a687059c
LW
265/* This bit of chicanery makes a unary function followed by
266 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
267 * The UNIDOR macro is for unary functions that can be followed by the //
268 * operator (such as C<shift // 0>).
a687059c 269 */
376fcdbf 270#define UNI2(f,x) { \
6154021b 271 pl_yylval.ival = f; \
376fcdbf
AL
272 PL_expect = x; \
273 PL_bufptr = s; \
274 PL_last_uni = PL_oldbufptr; \
275 PL_last_lop_op = f; \
276 if (*s == '(') \
277 return REPORT( (int)FUNC1 ); \
29595ff2 278 s = PEEKSPACE(s); \
376fcdbf
AL
279 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
280 }
6f33ba73
RGS
281#define UNI(f) UNI2(f,XTERM)
282#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 283
376fcdbf 284#define UNIBRACK(f) { \
6154021b 285 pl_yylval.ival = f; \
376fcdbf
AL
286 PL_bufptr = s; \
287 PL_last_uni = PL_oldbufptr; \
288 if (*s == '(') \
289 return REPORT( (int)FUNC1 ); \
29595ff2 290 s = PEEKSPACE(s); \
376fcdbf
AL
291 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
292 }
79072805 293
9f68db38 294/* grandfather return to old style */
6154021b 295#define OLDLOP(f) return(pl_yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP)
79072805 296
8fa7f367
JH
297#ifdef DEBUGGING
298
6154021b 299/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
300enum token_type {
301 TOKENTYPE_NONE,
302 TOKENTYPE_IVAL,
6154021b 303 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
304 TOKENTYPE_PVAL,
305 TOKENTYPE_OPVAL,
306 TOKENTYPE_GVVAL
307};
308
6d4a66ac
NC
309static struct debug_tokens {
310 const int token;
311 enum token_type type;
312 const char *name;
313} const debug_tokens[] =
9041c2e3 314{
bbf60fe6
DM
315 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
316 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
317 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
318 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
319 { ARROW, TOKENTYPE_NONE, "ARROW" },
320 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
321 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
322 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
323 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
324 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 325 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
326 { DO, TOKENTYPE_NONE, "DO" },
327 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
328 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
329 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
330 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
331 { ELSE, TOKENTYPE_NONE, "ELSE" },
332 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
333 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
334 { FOR, TOKENTYPE_IVAL, "FOR" },
335 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
336 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
337 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
338 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
339 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
340 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 341 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
342 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
343 { IF, TOKENTYPE_IVAL, "IF" },
344 { LABEL, TOKENTYPE_PVAL, "LABEL" },
345 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
346 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
347 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
348 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
349 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
350 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
351 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
352 { MY, TOKENTYPE_IVAL, "MY" },
353 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
354 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
355 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
356 { OROP, TOKENTYPE_IVAL, "OROP" },
357 { OROR, TOKENTYPE_NONE, "OROR" },
358 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
359 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
360 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
361 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
362 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
363 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
364 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
365 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
366 { PREINC, TOKENTYPE_NONE, "PREINC" },
367 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
368 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
369 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
370 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
371 { SUB, TOKENTYPE_NONE, "SUB" },
372 { THING, TOKENTYPE_OPVAL, "THING" },
373 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
374 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
375 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
376 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
377 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
378 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 379 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
380 { WHILE, TOKENTYPE_IVAL, "WHILE" },
381 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 382 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 383 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
384};
385
6154021b 386/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 387
bbf60fe6 388STATIC int
704d4215 389S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 390{
97aff369 391 dVAR;
7918f24d
NC
392
393 PERL_ARGS_ASSERT_TOKEREPORT;
394
bbf60fe6 395 if (DEBUG_T_TEST) {
bd61b366 396 const char *name = NULL;
bbf60fe6 397 enum token_type type = TOKENTYPE_NONE;
f54cb97a 398 const struct debug_tokens *p;
396482e1 399 SV* const report = newSVpvs("<== ");
bbf60fe6 400
f54cb97a 401 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
402 if (p->token == (int)rv) {
403 name = p->name;
404 type = p->type;
405 break;
406 }
407 }
408 if (name)
54667de8 409 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
410 else if ((char)rv > ' ' && (char)rv < '~')
411 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
412 else if (!rv)
396482e1 413 sv_catpvs(report, "EOF");
bbf60fe6
DM
414 else
415 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
416 switch (type) {
417 case TOKENTYPE_NONE:
418 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
419 break;
420 case TOKENTYPE_IVAL:
704d4215 421 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
422 break;
423 case TOKENTYPE_OPNUM:
424 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 425 PL_op_name[lvalp->ival]);
bbf60fe6
DM
426 break;
427 case TOKENTYPE_PVAL:
704d4215 428 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
429 break;
430 case TOKENTYPE_OPVAL:
704d4215 431 if (lvalp->opval) {
401441c0 432 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
433 PL_op_name[lvalp->opval->op_type]);
434 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 435 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 436 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
437 }
438
439 }
401441c0 440 else
396482e1 441 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
442 break;
443 }
b6007c36 444 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
445 };
446 return (int)rv;
998054bd
SC
447}
448
b6007c36
DM
449
450/* print the buffer with suitable escapes */
451
452STATIC void
15f169a1 453S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 454{
396482e1 455 SV* const tmp = newSVpvs("");
7918f24d
NC
456
457 PERL_ARGS_ASSERT_PRINTBUF;
458
b6007c36
DM
459 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
460 SvREFCNT_dec(tmp);
461}
462
8fa7f367
JH
463#endif
464
8290c323
NC
465static int
466S_deprecate_commaless_var_list(pTHX) {
467 PL_expect = XTERM;
468 deprecate("comma-less variable list");
469 return REPORT(','); /* grandfather non-comma-format format */
470}
471
ffb4593c
NT
472/*
473 * S_ao
474 *
c963b151
BD
475 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
476 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
477 */
478
76e3520e 479STATIC int
cea2e8a9 480S_ao(pTHX_ int toketype)
a0d0e21e 481{
97aff369 482 dVAR;
3280af22
NIS
483 if (*PL_bufptr == '=') {
484 PL_bufptr++;
a0d0e21e 485 if (toketype == ANDAND)
6154021b 486 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 487 else if (toketype == OROR)
6154021b 488 pl_yylval.ival = OP_ORASSIGN;
c963b151 489 else if (toketype == DORDOR)
6154021b 490 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
491 toketype = ASSIGNOP;
492 }
493 return toketype;
494}
495
ffb4593c
NT
496/*
497 * S_no_op
498 * When Perl expects an operator and finds something else, no_op
499 * prints the warning. It always prints "<something> found where
500 * operator expected. It prints "Missing semicolon on previous line?"
501 * if the surprise occurs at the start of the line. "do you need to
502 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
503 * where the compiler doesn't know if foo is a method call or a function.
504 * It prints "Missing operator before end of line" if there's nothing
505 * after the missing operator, or "... before <...>" if there is something
506 * after the missing operator.
507 */
508
76e3520e 509STATIC void
15f169a1 510S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 511{
97aff369 512 dVAR;
9d4ba2ae
AL
513 char * const oldbp = PL_bufptr;
514 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 515
7918f24d
NC
516 PERL_ARGS_ASSERT_NO_OP;
517
1189a94a
GS
518 if (!s)
519 s = oldbp;
07c798fb 520 else
1189a94a 521 PL_bufptr = s;
cea2e8a9 522 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
523 if (ckWARN_d(WARN_SYNTAX)) {
524 if (is_first)
525 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
526 "\t(Missing semicolon on previous line?)\n");
527 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 528 const char *t;
c35e046a
AL
529 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
530 NOOP;
56da5a46
RGS
531 if (t < PL_bufptr && isSPACE(*t))
532 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
533 "\t(Do you need to predeclare %.*s?)\n",
551405c4 534 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
535 }
536 else {
537 assert(s >= oldbp);
538 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 539 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 540 }
07c798fb 541 }
3280af22 542 PL_bufptr = oldbp;
8990e307
LW
543}
544
ffb4593c
NT
545/*
546 * S_missingterm
547 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 548 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
549 * If we're in a delimited string and the delimiter is a control
550 * character, it's reformatted into a two-char sequence like ^C.
551 * This is fatal.
552 */
553
76e3520e 554STATIC void
cea2e8a9 555S_missingterm(pTHX_ char *s)
8990e307 556{
97aff369 557 dVAR;
8990e307
LW
558 char tmpbuf[3];
559 char q;
560 if (s) {
9d4ba2ae 561 char * const nl = strrchr(s,'\n');
d2719217 562 if (nl)
8990e307
LW
563 *nl = '\0';
564 }
463559e7 565 else if (isCNTRL(PL_multi_close)) {
8990e307 566 *tmpbuf = '^';
585ec06d 567 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
568 tmpbuf[2] = '\0';
569 s = tmpbuf;
570 }
571 else {
eb160463 572 *tmpbuf = (char)PL_multi_close;
8990e307
LW
573 tmpbuf[1] = '\0';
574 s = tmpbuf;
575 }
576 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 577 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 578}
79072805 579
ef89dcc3 580#define FEATURE_IS_ENABLED(name) \
0d863452 581 ((0 != (PL_hints & HINT_LOCALIZE_HH)) \
89529cee 582 && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name)))
4a731d7b 583/* The longest string we pass in. */
1863b879 584#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1)
4a731d7b 585
0d863452
RH
586/*
587 * S_feature_is_enabled
588 * Check whether the named feature is enabled.
589 */
590STATIC bool
15f169a1 591S_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 592{
97aff369 593 dVAR;
0d863452 594 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 595 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
596
597 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
598
4a731d7b
NC
599 assert(namelen <= MAX_FEATURE_LEN);
600 memcpy(&he_name[8], name, namelen);
d4c19fe8 601
7b9ef140 602 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
603}
604
ffb4593c 605/*
9cbb5ea2
GS
606 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
607 * utf16-to-utf8-reversed.
ffb4593c
NT
608 */
609
c39cd008
GS
610#ifdef PERL_CR_FILTER
611static void
612strip_return(SV *sv)
613{
95a20fc0 614 register const char *s = SvPVX_const(sv);
9d4ba2ae 615 register const char * const e = s + SvCUR(sv);
7918f24d
NC
616
617 PERL_ARGS_ASSERT_STRIP_RETURN;
618
c39cd008
GS
619 /* outer loop optimized to do nothing if there are no CR-LFs */
620 while (s < e) {
621 if (*s++ == '\r' && *s == '\n') {
622 /* hit a CR-LF, need to copy the rest */
623 register char *d = s - 1;
624 *d++ = *s++;
625 while (s < e) {
626 if (*s == '\r' && s[1] == '\n')
627 s++;
628 *d++ = *s++;
629 }
630 SvCUR(sv) -= s - d;
631 return;
632 }
633 }
634}
a868473f 635
76e3520e 636STATIC I32
c39cd008 637S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 638{
f54cb97a 639 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
640 if (count > 0 && !maxlen)
641 strip_return(sv);
642 return count;
a868473f
NIS
643}
644#endif
645
199e78b7
DM
646
647
ffb4593c
NT
648/*
649 * Perl_lex_start
5486870f 650 *
e3abe207 651 * Create a parser object and initialise its parser and lexer fields
5486870f
DM
652 *
653 * rsfp is the opened file handle to read from (if any),
654 *
655 * line holds any initial content already read from the file (or in
656 * the case of no file, such as an eval, the whole contents);
657 *
658 * new_filter indicates that this is a new file and it shouldn't inherit
659 * the filters from the current parser (ie require).
ffb4593c
NT
660 */
661
a0d0e21e 662void
5486870f 663Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, bool new_filter)
79072805 664{
97aff369 665 dVAR;
6ef55633 666 const char *s = NULL;
8990e307 667 STRLEN len;
5486870f 668 yy_parser *parser, *oparser;
acdf0a21
DM
669
670 /* create and initialise a parser */
671
199e78b7 672 Newxz(parser, 1, yy_parser);
5486870f 673 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
674 PL_parser = parser;
675
28ac2b49
Z
676 parser->stack = NULL;
677 parser->ps = NULL;
678 parser->stack_size = 0;
acdf0a21 679
e3abe207
DM
680 /* on scope exit, free this parser and restore any outer one */
681 SAVEPARSER(parser);
7c4baf47 682 parser->saved_curcop = PL_curcop;
e3abe207 683
acdf0a21 684 /* initialise lexer state */
8990e307 685
fb205e7a
DM
686#ifdef PERL_MAD
687 parser->curforce = -1;
688#else
689 parser->nexttoke = 0;
690#endif
ca4cfd28 691 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 692 parser->copline = NOLINE;
5afb0a62 693 parser->lex_state = LEX_NORMAL;
c2598295 694 parser->expect = XSTATE;
2f9285f8 695 parser->rsfp = rsfp;
56b27c9a 696 parser->rsfp_filters = (new_filter || !oparser) ? newAV()
502c6561 697 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 698
199e78b7
DM
699 Newx(parser->lex_brackstack, 120, char);
700 Newx(parser->lex_casestack, 12, char);
701 *parser->lex_casestack = '\0';
02b34bbe 702
10efb74f
NC
703 if (line) {
704 s = SvPV_const(line, len);
705 } else {
706 len = 0;
707 }
bdc0bf6f 708
10efb74f 709 if (!len) {
bdc0bf6f 710 parser->linestr = newSVpvs("\n;");
3e5c0189 711 } else if (SvREADONLY(line) || s[len-1] != ';' || !SvPOK(line)) {
719a9bb0
NC
712 /* avoid tie/overload weirdness */
713 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 714 if (s[len-1] != ';')
bdc0bf6f 715 sv_catpvs(parser->linestr, "\n;");
6c5ce11d
NC
716 } else {
717 SvTEMP_off(line);
718 SvREFCNT_inc_simple_void_NN(line);
bdc0bf6f 719 parser->linestr = line;
8990e307 720 }
f06b5848
DM
721 parser->oldoldbufptr =
722 parser->oldbufptr =
723 parser->bufptr =
724 parser->linestart = SvPVX(parser->linestr);
725 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
726 parser->last_lop = parser->last_uni = NULL;
79072805 727}
a687059c 728
e3abe207
DM
729
730/* delete a parser object */
731
732void
733Perl_parser_free(pTHX_ const yy_parser *parser)
734{
7918f24d
NC
735 PERL_ARGS_ASSERT_PARSER_FREE;
736
7c4baf47 737 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
738 SvREFCNT_dec(parser->linestr);
739
2f9285f8
DM
740 if (parser->rsfp == PerlIO_stdin())
741 PerlIO_clearerr(parser->rsfp);
799361c3
SH
742 else if (parser->rsfp && (!parser->old_parser ||
743 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 744 PerlIO_close(parser->rsfp);
5486870f 745 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 746
e3abe207
DM
747 Safefree(parser->lex_brackstack);
748 Safefree(parser->lex_casestack);
749 PL_parser = parser->old_parser;
750 Safefree(parser);
751}
752
753
ffb4593c
NT
754/*
755 * Perl_lex_end
9cbb5ea2
GS
756 * Finalizer for lexing operations. Must be called when the parser is
757 * done with the lexer.
ffb4593c
NT
758 */
759
463ee0b2 760void
864dbfa3 761Perl_lex_end(pTHX)
463ee0b2 762{
97aff369 763 dVAR;
3280af22 764 PL_doextract = FALSE;
463ee0b2
LW
765}
766
ffb4593c 767/*
f0e67a1d
Z
768=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
769
770Buffer scalar containing the chunk currently under consideration of the
771text currently being lexed. This is always a plain string scalar (for
772which C<SvPOK> is true). It is not intended to be used as a scalar by
773normal scalar means; instead refer to the buffer directly by the pointer
774variables described below.
775
776The lexer maintains various C<char*> pointers to things in the
777C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
778reallocated, all of these pointers must be updated. Don't attempt to
779do this manually, but rather use L</lex_grow_linestr> if you need to
780reallocate the buffer.
781
782The content of the text chunk in the buffer is commonly exactly one
783complete line of input, up to and including a newline terminator,
784but there are situations where it is otherwise. The octets of the
785buffer may be intended to be interpreted as either UTF-8 or Latin-1.
786The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
787flag on this scalar, which may disagree with it.
788
789For direct examination of the buffer, the variable
790L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
791lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
792of these pointers is usually preferable to examination of the scalar
793through normal scalar means.
794
795=for apidoc AmxU|char *|PL_parser-E<gt>bufend
796
797Direct pointer to the end of the chunk of text currently being lexed, the
798end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
799+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
800always located at the end of the buffer, and does not count as part of
801the buffer's contents.
802
803=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
804
805Points to the current position of lexing inside the lexer buffer.
806Characters around this point may be freely examined, within
807the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
808L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
809interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
810
811Lexing code (whether in the Perl core or not) moves this pointer past
812the characters that it consumes. It is also expected to perform some
813bookkeeping whenever a newline character is consumed. This movement
814can be more conveniently performed by the function L</lex_read_to>,
815which handles newlines appropriately.
816
817Interpretation of the buffer's octets can be abstracted out by
818using the slightly higher-level functions L</lex_peek_unichar> and
819L</lex_read_unichar>.
820
821=for apidoc AmxU|char *|PL_parser-E<gt>linestart
822
823Points to the start of the current line inside the lexer buffer.
824This is useful for indicating at which column an error occurred, and
825not much else. This must be updated by any lexing code that consumes
826a newline; the function L</lex_read_to> handles this detail.
827
828=cut
829*/
830
831/*
832=for apidoc Amx|bool|lex_bufutf8
833
834Indicates whether the octets in the lexer buffer
835(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
836of Unicode characters. If not, they should be interpreted as Latin-1
837characters. This is analogous to the C<SvUTF8> flag for scalars.
838
839In UTF-8 mode, it is not guaranteed that the lexer buffer actually
840contains valid UTF-8. Lexing code must be robust in the face of invalid
841encoding.
842
843The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
844is significant, but not the whole story regarding the input character
845encoding. Normally, when a file is being read, the scalar contains octets
846and its C<SvUTF8> flag is off, but the octets should be interpreted as
847UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
848however, the scalar may have the C<SvUTF8> flag on, and in this case its
849octets should be interpreted as UTF-8 unless the C<use bytes> pragma
850is in effect. This logic may change in the future; use this function
851instead of implementing the logic yourself.
852
853=cut
854*/
855
856bool
857Perl_lex_bufutf8(pTHX)
858{
859 return UTF;
860}
861
862/*
863=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
864
865Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
866at least I<len> octets (including terminating NUL). Returns a
867pointer to the reallocated buffer. This is necessary before making
868any direct modification of the buffer that would increase its length.
869L</lex_stuff_pvn> provides a more convenient way to insert text into
870the buffer.
871
872Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
873this function updates all of the lexer's variables that point directly
874into the buffer.
875
876=cut
877*/
878
879char *
880Perl_lex_grow_linestr(pTHX_ STRLEN len)
881{
882 SV *linestr;
883 char *buf;
884 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
885 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
886 linestr = PL_parser->linestr;
887 buf = SvPVX(linestr);
888 if (len <= SvLEN(linestr))
889 return buf;
890 bufend_pos = PL_parser->bufend - buf;
891 bufptr_pos = PL_parser->bufptr - buf;
892 oldbufptr_pos = PL_parser->oldbufptr - buf;
893 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
894 linestart_pos = PL_parser->linestart - buf;
895 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
896 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
897 buf = sv_grow(linestr, len);
898 PL_parser->bufend = buf + bufend_pos;
899 PL_parser->bufptr = buf + bufptr_pos;
900 PL_parser->oldbufptr = buf + oldbufptr_pos;
901 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
902 PL_parser->linestart = buf + linestart_pos;
903 if (PL_parser->last_uni)
904 PL_parser->last_uni = buf + last_uni_pos;
905 if (PL_parser->last_lop)
906 PL_parser->last_lop = buf + last_lop_pos;
907 return buf;
908}
909
910/*
83aa740e 911=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
912
913Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
914immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
915reallocating the buffer if necessary. This means that lexing code that
916runs later will see the characters as if they had appeared in the input.
917It is not recommended to do this as part of normal parsing, and most
918uses of this facility run the risk of the inserted characters being
919interpreted in an unintended manner.
920
921The string to be inserted is represented by I<len> octets starting
922at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
923according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
924The characters are recoded for the lexer buffer, according to how the
925buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 926to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
927function is more convenient.
928
929=cut
930*/
931
932void
83aa740e 933Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 934{
749123ff 935 dVAR;
f0e67a1d
Z
936 char *bufptr;
937 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
938 if (flags & ~(LEX_STUFF_UTF8))
939 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
940 if (UTF) {
941 if (flags & LEX_STUFF_UTF8) {
942 goto plain_copy;
943 } else {
944 STRLEN highhalf = 0;
83aa740e 945 const char *p, *e = pv+len;
f0e67a1d
Z
946 for (p = pv; p != e; p++)
947 highhalf += !!(((U8)*p) & 0x80);
948 if (!highhalf)
949 goto plain_copy;
950 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
951 bufptr = PL_parser->bufptr;
952 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
953 SvCUR_set(PL_parser->linestr,
954 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
955 PL_parser->bufend += len+highhalf;
956 for (p = pv; p != e; p++) {
957 U8 c = (U8)*p;
958 if (c & 0x80) {
959 *bufptr++ = (char)(0xc0 | (c >> 6));
960 *bufptr++ = (char)(0x80 | (c & 0x3f));
961 } else {
962 *bufptr++ = (char)c;
963 }
964 }
965 }
966 } else {
967 if (flags & LEX_STUFF_UTF8) {
968 STRLEN highhalf = 0;
83aa740e 969 const char *p, *e = pv+len;
f0e67a1d
Z
970 for (p = pv; p != e; p++) {
971 U8 c = (U8)*p;
972 if (c >= 0xc4) {
973 Perl_croak(aTHX_ "Lexing code attempted to stuff "
974 "non-Latin-1 character into Latin-1 input");
975 } else if (c >= 0xc2 && p+1 != e &&
976 (((U8)p[1]) & 0xc0) == 0x80) {
977 p++;
978 highhalf++;
979 } else if (c >= 0x80) {
980 /* malformed UTF-8 */
981 ENTER;
982 SAVESPTR(PL_warnhook);
983 PL_warnhook = PERL_WARNHOOK_FATAL;
984 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
985 LEAVE;
986 }
987 }
988 if (!highhalf)
989 goto plain_copy;
990 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
991 bufptr = PL_parser->bufptr;
992 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
993 SvCUR_set(PL_parser->linestr,
994 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
995 PL_parser->bufend += len-highhalf;
996 for (p = pv; p != e; p++) {
997 U8 c = (U8)*p;
998 if (c & 0x80) {
999 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1000 p++;
1001 } else {
1002 *bufptr++ = (char)c;
1003 }
1004 }
1005 } else {
1006 plain_copy:
1007 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1008 bufptr = PL_parser->bufptr;
1009 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1010 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1011 PL_parser->bufend += len;
1012 Copy(pv, bufptr, len, char);
1013 }
1014 }
1015}
1016
1017/*
9dcc53ea
Z
1018=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1019
1020Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1021immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1022reallocating the buffer if necessary. This means that lexing code that
1023runs later will see the characters as if they had appeared in the input.
1024It is not recommended to do this as part of normal parsing, and most
1025uses of this facility run the risk of the inserted characters being
1026interpreted in an unintended manner.
1027
1028The string to be inserted is represented by octets starting at I<pv>
1029and continuing to the first nul. These octets are interpreted as either
1030UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1031in I<flags>. The characters are recoded for the lexer buffer, according
1032to how the buffer is currently being interpreted (L</lex_bufutf8>).
1033If it is not convenient to nul-terminate a string to be inserted, the
1034L</lex_stuff_pvn> function is more appropriate.
1035
1036=cut
1037*/
1038
1039void
1040Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1041{
1042 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1043 lex_stuff_pvn(pv, strlen(pv), flags);
1044}
1045
1046/*
f0e67a1d
Z
1047=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1048
1049Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1050immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1051reallocating the buffer if necessary. This means that lexing code that
1052runs later will see the characters as if they had appeared in the input.
1053It is not recommended to do this as part of normal parsing, and most
1054uses of this facility run the risk of the inserted characters being
1055interpreted in an unintended manner.
1056
1057The string to be inserted is the string value of I<sv>. The characters
1058are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1059being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1060not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1061need to construct a scalar.
1062
1063=cut
1064*/
1065
1066void
1067Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1068{
1069 char *pv;
1070 STRLEN len;
1071 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1072 if (flags)
1073 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1074 pv = SvPV(sv, len);
1075 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1076}
1077
1078/*
1079=for apidoc Amx|void|lex_unstuff|char *ptr
1080
1081Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1082I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1083This hides the discarded text from any lexing code that runs later,
1084as if the text had never appeared.
1085
1086This is not the normal way to consume lexed text. For that, use
1087L</lex_read_to>.
1088
1089=cut
1090*/
1091
1092void
1093Perl_lex_unstuff(pTHX_ char *ptr)
1094{
1095 char *buf, *bufend;
1096 STRLEN unstuff_len;
1097 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1098 buf = PL_parser->bufptr;
1099 if (ptr < buf)
1100 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1101 if (ptr == buf)
1102 return;
1103 bufend = PL_parser->bufend;
1104 if (ptr > bufend)
1105 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1106 unstuff_len = ptr - buf;
1107 Move(ptr, buf, bufend+1-ptr, char);
1108 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1109 PL_parser->bufend = bufend - unstuff_len;
1110}
1111
1112/*
1113=for apidoc Amx|void|lex_read_to|char *ptr
1114
1115Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1116to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1117performing the correct bookkeeping whenever a newline character is passed.
1118This is the normal way to consume lexed text.
1119
1120Interpretation of the buffer's octets can be abstracted out by
1121using the slightly higher-level functions L</lex_peek_unichar> and
1122L</lex_read_unichar>.
1123
1124=cut
1125*/
1126
1127void
1128Perl_lex_read_to(pTHX_ char *ptr)
1129{
1130 char *s;
1131 PERL_ARGS_ASSERT_LEX_READ_TO;
1132 s = PL_parser->bufptr;
1133 if (ptr < s || ptr > PL_parser->bufend)
1134 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1135 for (; s != ptr; s++)
1136 if (*s == '\n') {
1137 CopLINE_inc(PL_curcop);
1138 PL_parser->linestart = s+1;
1139 }
1140 PL_parser->bufptr = ptr;
1141}
1142
1143/*
1144=for apidoc Amx|void|lex_discard_to|char *ptr
1145
1146Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1147up to I<ptr>. The remaining content of the buffer will be moved, and
1148all pointers into the buffer updated appropriately. I<ptr> must not
1149be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1150it is not permitted to discard text that has yet to be lexed.
1151
1152Normally it is not necessarily to do this directly, because it suffices to
1153use the implicit discarding behaviour of L</lex_next_chunk> and things
1154based on it. However, if a token stretches across multiple lines,
1f317c95 1155and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1156that purpose, then after completion of the token it would be wise to
1157explicitly discard the now-unneeded earlier lines, to avoid future
1158multi-line tokens growing the buffer without bound.
1159
1160=cut
1161*/
1162
1163void
1164Perl_lex_discard_to(pTHX_ char *ptr)
1165{
1166 char *buf;
1167 STRLEN discard_len;
1168 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1169 buf = SvPVX(PL_parser->linestr);
1170 if (ptr < buf)
1171 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1172 if (ptr == buf)
1173 return;
1174 if (ptr > PL_parser->bufptr)
1175 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1176 discard_len = ptr - buf;
1177 if (PL_parser->oldbufptr < ptr)
1178 PL_parser->oldbufptr = ptr;
1179 if (PL_parser->oldoldbufptr < ptr)
1180 PL_parser->oldoldbufptr = ptr;
1181 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1182 PL_parser->last_uni = NULL;
1183 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1184 PL_parser->last_lop = NULL;
1185 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1186 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1187 PL_parser->bufend -= discard_len;
1188 PL_parser->bufptr -= discard_len;
1189 PL_parser->oldbufptr -= discard_len;
1190 PL_parser->oldoldbufptr -= discard_len;
1191 if (PL_parser->last_uni)
1192 PL_parser->last_uni -= discard_len;
1193 if (PL_parser->last_lop)
1194 PL_parser->last_lop -= discard_len;
1195}
1196
1197/*
1198=for apidoc Amx|bool|lex_next_chunk|U32 flags
1199
1200Reads in the next chunk of text to be lexed, appending it to
1201L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1202looked to the end of the current chunk and wants to know more. It is
1203usual, but not necessary, for lexing to have consumed the entirety of
1204the current chunk at this time.
1205
1206If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1207chunk (i.e., the current chunk has been entirely consumed), normally the
1208current chunk will be discarded at the same time that the new chunk is
1209read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1210will not be discarded. If the current chunk has not been entirely
1211consumed, then it will not be discarded regardless of the flag.
1212
1213Returns true if some new text was added to the buffer, or false if the
1214buffer has reached the end of the input text.
1215
1216=cut
1217*/
1218
1219#define LEX_FAKE_EOF 0x80000000
1220
1221bool
1222Perl_lex_next_chunk(pTHX_ U32 flags)
1223{
1224 SV *linestr;
1225 char *buf;
1226 STRLEN old_bufend_pos, new_bufend_pos;
1227 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1228 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1229 bool got_some_for_debugger = 0;
f0e67a1d
Z
1230 bool got_some;
1231 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1232 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1233 linestr = PL_parser->linestr;
1234 buf = SvPVX(linestr);
1235 if (!(flags & LEX_KEEP_PREVIOUS) &&
1236 PL_parser->bufptr == PL_parser->bufend) {
1237 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1238 linestart_pos = 0;
1239 if (PL_parser->last_uni != PL_parser->bufend)
1240 PL_parser->last_uni = NULL;
1241 if (PL_parser->last_lop != PL_parser->bufend)
1242 PL_parser->last_lop = NULL;
1243 last_uni_pos = last_lop_pos = 0;
1244 *buf = 0;
1245 SvCUR(linestr) = 0;
1246 } else {
1247 old_bufend_pos = PL_parser->bufend - buf;
1248 bufptr_pos = PL_parser->bufptr - buf;
1249 oldbufptr_pos = PL_parser->oldbufptr - buf;
1250 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1251 linestart_pos = PL_parser->linestart - buf;
1252 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1253 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1254 }
1255 if (flags & LEX_FAKE_EOF) {
1256 goto eof;
1257 } else if (!PL_parser->rsfp) {
1258 got_some = 0;
1259 } else if (filter_gets(linestr, old_bufend_pos)) {
1260 got_some = 1;
17cc9359 1261 got_some_for_debugger = 1;
f0e67a1d 1262 } else {
580561a3
Z
1263 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1264 sv_setpvs(linestr, "");
f0e67a1d
Z
1265 eof:
1266 /* End of real input. Close filehandle (unless it was STDIN),
1267 * then add implicit termination.
1268 */
1269 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1270 PerlIO_clearerr(PL_parser->rsfp);
1271 else if (PL_parser->rsfp)
1272 (void)PerlIO_close(PL_parser->rsfp);
1273 PL_parser->rsfp = NULL;
1274 PL_doextract = FALSE;
1275#ifdef PERL_MAD
1276 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1277 PL_faketokens = 1;
1278#endif
1279 if (!PL_in_eval && PL_minus_p) {
1280 sv_catpvs(linestr,
1281 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1282 PL_minus_n = PL_minus_p = 0;
1283 } else if (!PL_in_eval && PL_minus_n) {
1284 sv_catpvs(linestr, /*{*/";}");
1285 PL_minus_n = 0;
1286 } else
1287 sv_catpvs(linestr, ";");
1288 got_some = 1;
1289 }
1290 buf = SvPVX(linestr);
1291 new_bufend_pos = SvCUR(linestr);
1292 PL_parser->bufend = buf + new_bufend_pos;
1293 PL_parser->bufptr = buf + bufptr_pos;
1294 PL_parser->oldbufptr = buf + oldbufptr_pos;
1295 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1296 PL_parser->linestart = buf + linestart_pos;
1297 if (PL_parser->last_uni)
1298 PL_parser->last_uni = buf + last_uni_pos;
1299 if (PL_parser->last_lop)
1300 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1301 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1302 PL_curstash != PL_debstash) {
1303 /* debugger active and we're not compiling the debugger code,
1304 * so store the line into the debugger's array of lines
1305 */
1306 update_debugger_info(NULL, buf+old_bufend_pos,
1307 new_bufend_pos-old_bufend_pos);
1308 }
1309 return got_some;
1310}
1311
1312/*
1313=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1314
1315Looks ahead one (Unicode) character in the text currently being lexed.
1316Returns the codepoint (unsigned integer value) of the next character,
1317or -1 if lexing has reached the end of the input text. To consume the
1318peeked character, use L</lex_read_unichar>.
1319
1320If the next character is in (or extends into) the next chunk of input
1321text, the next chunk will be read in. Normally the current chunk will be
1322discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1323then the current chunk will not be discarded.
1324
1325If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1326is encountered, an exception is generated.
1327
1328=cut
1329*/
1330
1331I32
1332Perl_lex_peek_unichar(pTHX_ U32 flags)
1333{
749123ff 1334 dVAR;
f0e67a1d
Z
1335 char *s, *bufend;
1336 if (flags & ~(LEX_KEEP_PREVIOUS))
1337 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1338 s = PL_parser->bufptr;
1339 bufend = PL_parser->bufend;
1340 if (UTF) {
1341 U8 head;
1342 I32 unichar;
1343 STRLEN len, retlen;
1344 if (s == bufend) {
1345 if (!lex_next_chunk(flags))
1346 return -1;
1347 s = PL_parser->bufptr;
1348 bufend = PL_parser->bufend;
1349 }
1350 head = (U8)*s;
1351 if (!(head & 0x80))
1352 return head;
1353 if (head & 0x40) {
1354 len = PL_utf8skip[head];
1355 while ((STRLEN)(bufend-s) < len) {
1356 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1357 break;
1358 s = PL_parser->bufptr;
1359 bufend = PL_parser->bufend;
1360 }
1361 }
1362 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1363 if (retlen == (STRLEN)-1) {
1364 /* malformed UTF-8 */
1365 ENTER;
1366 SAVESPTR(PL_warnhook);
1367 PL_warnhook = PERL_WARNHOOK_FATAL;
1368 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1369 LEAVE;
1370 }
1371 return unichar;
1372 } else {
1373 if (s == bufend) {
1374 if (!lex_next_chunk(flags))
1375 return -1;
1376 s = PL_parser->bufptr;
1377 }
1378 return (U8)*s;
1379 }
1380}
1381
1382/*
1383=for apidoc Amx|I32|lex_read_unichar|U32 flags
1384
1385Reads the next (Unicode) character in the text currently being lexed.
1386Returns the codepoint (unsigned integer value) of the character read,
1387and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1388if lexing has reached the end of the input text. To non-destructively
1389examine the next character, use L</lex_peek_unichar> instead.
1390
1391If the next character is in (or extends into) the next chunk of input
1392text, the next chunk will be read in. Normally the current chunk will be
1393discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1394then the current chunk will not be discarded.
1395
1396If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1397is encountered, an exception is generated.
1398
1399=cut
1400*/
1401
1402I32
1403Perl_lex_read_unichar(pTHX_ U32 flags)
1404{
1405 I32 c;
1406 if (flags & ~(LEX_KEEP_PREVIOUS))
1407 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1408 c = lex_peek_unichar(flags);
1409 if (c != -1) {
1410 if (c == '\n')
1411 CopLINE_inc(PL_curcop);
1412 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1413 }
1414 return c;
1415}
1416
1417/*
1418=for apidoc Amx|void|lex_read_space|U32 flags
1419
1420Reads optional spaces, in Perl style, in the text currently being
1421lexed. The spaces may include ordinary whitespace characters and
1422Perl-style comments. C<#line> directives are processed if encountered.
1423L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1424at a non-space character (or the end of the input text).
1425
1426If spaces extend into the next chunk of input text, the next chunk will
1427be read in. Normally the current chunk will be discarded at the same
1428time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1429chunk will not be discarded.
1430
1431=cut
1432*/
1433
f0998909
Z
1434#define LEX_NO_NEXT_CHUNK 0x80000000
1435
f0e67a1d
Z
1436void
1437Perl_lex_read_space(pTHX_ U32 flags)
1438{
1439 char *s, *bufend;
1440 bool need_incline = 0;
f0998909 1441 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1442 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1443#ifdef PERL_MAD
1444 if (PL_skipwhite) {
1445 sv_free(PL_skipwhite);
1446 PL_skipwhite = NULL;
1447 }
1448 if (PL_madskills)
1449 PL_skipwhite = newSVpvs("");
1450#endif /* PERL_MAD */
1451 s = PL_parser->bufptr;
1452 bufend = PL_parser->bufend;
1453 while (1) {
1454 char c = *s;
1455 if (c == '#') {
1456 do {
1457 c = *++s;
1458 } while (!(c == '\n' || (c == 0 && s == bufend)));
1459 } else if (c == '\n') {
1460 s++;
1461 PL_parser->linestart = s;
1462 if (s == bufend)
1463 need_incline = 1;
1464 else
1465 incline(s);
1466 } else if (isSPACE(c)) {
1467 s++;
1468 } else if (c == 0 && s == bufend) {
1469 bool got_more;
1470#ifdef PERL_MAD
1471 if (PL_madskills)
1472 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1473#endif /* PERL_MAD */
f0998909
Z
1474 if (flags & LEX_NO_NEXT_CHUNK)
1475 break;
f0e67a1d
Z
1476 PL_parser->bufptr = s;
1477 CopLINE_inc(PL_curcop);
1478 got_more = lex_next_chunk(flags);
1479 CopLINE_dec(PL_curcop);
1480 s = PL_parser->bufptr;
1481 bufend = PL_parser->bufend;
1482 if (!got_more)
1483 break;
1484 if (need_incline && PL_parser->rsfp) {
1485 incline(s);
1486 need_incline = 0;
1487 }
1488 } else {
1489 break;
1490 }
1491 }
1492#ifdef PERL_MAD
1493 if (PL_madskills)
1494 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1495#endif /* PERL_MAD */
1496 PL_parser->bufptr = s;
1497}
1498
1499/*
ffb4593c
NT
1500 * S_incline
1501 * This subroutine has nothing to do with tilting, whether at windmills
1502 * or pinball tables. Its name is short for "increment line". It
57843af0 1503 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1504 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1505 * # line 500 "foo.pm"
1506 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1507 */
1508
76e3520e 1509STATIC void
d9095cec 1510S_incline(pTHX_ const char *s)
463ee0b2 1511{
97aff369 1512 dVAR;
d9095cec
NC
1513 const char *t;
1514 const char *n;
1515 const char *e;
463ee0b2 1516
7918f24d
NC
1517 PERL_ARGS_ASSERT_INCLINE;
1518
57843af0 1519 CopLINE_inc(PL_curcop);
463ee0b2
LW
1520 if (*s++ != '#')
1521 return;
d4c19fe8
AL
1522 while (SPACE_OR_TAB(*s))
1523 s++;
73659bf1
GS
1524 if (strnEQ(s, "line", 4))
1525 s += 4;
1526 else
1527 return;
084592ab 1528 if (SPACE_OR_TAB(*s))
73659bf1 1529 s++;
4e553d73 1530 else
73659bf1 1531 return;
d4c19fe8
AL
1532 while (SPACE_OR_TAB(*s))
1533 s++;
463ee0b2
LW
1534 if (!isDIGIT(*s))
1535 return;
d4c19fe8 1536
463ee0b2
LW
1537 n = s;
1538 while (isDIGIT(*s))
1539 s++;
07714eb4 1540 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1541 return;
bf4acbe4 1542 while (SPACE_OR_TAB(*s))
463ee0b2 1543 s++;
73659bf1 1544 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1545 s++;
73659bf1
GS
1546 e = t + 1;
1547 }
463ee0b2 1548 else {
c35e046a
AL
1549 t = s;
1550 while (!isSPACE(*t))
1551 t++;
73659bf1 1552 e = t;
463ee0b2 1553 }
bf4acbe4 1554 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1555 e++;
1556 if (*e != '\n' && *e != '\0')
1557 return; /* false alarm */
1558
f4dd75d9 1559 if (t - s > 0) {
d9095cec 1560 const STRLEN len = t - s;
8a5ee598 1561#ifndef USE_ITHREADS
19bad673
NC
1562 SV *const temp_sv = CopFILESV(PL_curcop);
1563 const char *cf;
1564 STRLEN tmplen;
1565
1566 if (temp_sv) {
1567 cf = SvPVX(temp_sv);
1568 tmplen = SvCUR(temp_sv);
1569 } else {
1570 cf = NULL;
1571 tmplen = 0;
1572 }
1573
42d9b98d 1574 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1575 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1576 * to *{"::_<newfilename"} */
44867030
NC
1577 /* However, the long form of evals is only turned on by the
1578 debugger - usually they're "(eval %lu)" */
1579 char smallbuf[128];
1580 char *tmpbuf;
1581 GV **gvp;
d9095cec 1582 STRLEN tmplen2 = len;
798b63bc 1583 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1584 tmpbuf = smallbuf;
1585 else
2ae0db35 1586 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1587 tmpbuf[0] = '_';
1588 tmpbuf[1] = '<';
2ae0db35 1589 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1590 tmplen += 2;
8a5ee598
RGS
1591 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1592 if (gvp) {
44867030
NC
1593 char *tmpbuf2;
1594 GV *gv2;
1595
1596 if (tmplen2 + 2 <= sizeof smallbuf)
1597 tmpbuf2 = smallbuf;
1598 else
1599 Newx(tmpbuf2, tmplen2 + 2, char);
1600
1601 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1602 /* Either they malloc'd it, or we malloc'd it,
1603 so no prefix is present in ours. */
1604 tmpbuf2[0] = '_';
1605 tmpbuf2[1] = '<';
1606 }
1607
1608 memcpy(tmpbuf2 + 2, s, tmplen2);
1609 tmplen2 += 2;
1610
8a5ee598 1611 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1612 if (!isGV(gv2)) {
8a5ee598 1613 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1614 /* adjust ${"::_<newfilename"} to store the new file name */
1615 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
3cb1dbc6
NC
1616 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1617 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
e5527e4b 1618 }
44867030
NC
1619
1620 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1621 }
e66cf94c 1622 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1623 }
8a5ee598 1624#endif
05ec9bb3 1625 CopFILE_free(PL_curcop);
d9095cec 1626 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1627 }
57843af0 1628 CopLINE_set(PL_curcop, atoi(n)-1);
463ee0b2
LW
1629}
1630
29595ff2 1631#ifdef PERL_MAD
cd81e915 1632/* skip space before PL_thistoken */
29595ff2
NC
1633
1634STATIC char *
1635S_skipspace0(pTHX_ register char *s)
1636{
7918f24d
NC
1637 PERL_ARGS_ASSERT_SKIPSPACE0;
1638
29595ff2
NC
1639 s = skipspace(s);
1640 if (!PL_madskills)
1641 return s;
cd81e915
NC
1642 if (PL_skipwhite) {
1643 if (!PL_thiswhite)
6b29d1f5 1644 PL_thiswhite = newSVpvs("");
cd81e915
NC
1645 sv_catsv(PL_thiswhite, PL_skipwhite);
1646 sv_free(PL_skipwhite);
1647 PL_skipwhite = 0;
1648 }
1649 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1650 return s;
1651}
1652
cd81e915 1653/* skip space after PL_thistoken */
29595ff2
NC
1654
1655STATIC char *
1656S_skipspace1(pTHX_ register char *s)
1657{
d4c19fe8 1658 const char *start = s;
29595ff2
NC
1659 I32 startoff = start - SvPVX(PL_linestr);
1660
7918f24d
NC
1661 PERL_ARGS_ASSERT_SKIPSPACE1;
1662
29595ff2
NC
1663 s = skipspace(s);
1664 if (!PL_madskills)
1665 return s;
1666 start = SvPVX(PL_linestr) + startoff;
cd81e915 1667 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1668 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1669 PL_thistoken = newSVpvn(tstart, start - tstart);
1670 }
1671 PL_realtokenstart = -1;
1672 if (PL_skipwhite) {
1673 if (!PL_nextwhite)
6b29d1f5 1674 PL_nextwhite = newSVpvs("");
cd81e915
NC
1675 sv_catsv(PL_nextwhite, PL_skipwhite);
1676 sv_free(PL_skipwhite);
1677 PL_skipwhite = 0;
29595ff2
NC
1678 }
1679 return s;
1680}
1681
1682STATIC char *
1683S_skipspace2(pTHX_ register char *s, SV **svp)
1684{
c35e046a
AL
1685 char *start;
1686 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1687 const I32 startoff = s - SvPVX(PL_linestr);
1688
7918f24d
NC
1689 PERL_ARGS_ASSERT_SKIPSPACE2;
1690
29595ff2
NC
1691 s = skipspace(s);
1692 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1693 if (!PL_madskills || !svp)
1694 return s;
1695 start = SvPVX(PL_linestr) + startoff;
cd81e915 1696 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1697 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1698 PL_thistoken = newSVpvn(tstart, start - tstart);
1699 PL_realtokenstart = -1;
29595ff2 1700 }
cd81e915 1701 if (PL_skipwhite) {
29595ff2 1702 if (!*svp)
6b29d1f5 1703 *svp = newSVpvs("");
cd81e915
NC
1704 sv_setsv(*svp, PL_skipwhite);
1705 sv_free(PL_skipwhite);
1706 PL_skipwhite = 0;
29595ff2
NC
1707 }
1708
1709 return s;
1710}
1711#endif
1712
80a702cd 1713STATIC void
15f169a1 1714S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1715{
1716 AV *av = CopFILEAVx(PL_curcop);
1717 if (av) {
b9f83d2f 1718 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1719 if (orig_sv)
1720 sv_setsv(sv, orig_sv);
1721 else
1722 sv_setpvn(sv, buf, len);
80a702cd
RGS
1723 (void)SvIOK_on(sv);
1724 SvIV_set(sv, 0);
1725 av_store(av, (I32)CopLINE(PL_curcop), sv);
1726 }
1727}
1728
ffb4593c
NT
1729/*
1730 * S_skipspace
1731 * Called to gobble the appropriate amount and type of whitespace.
1732 * Skips comments as well.
1733 */
1734
76e3520e 1735STATIC char *
cea2e8a9 1736S_skipspace(pTHX_ register char *s)
a687059c 1737{
5db06880 1738#ifdef PERL_MAD
f0e67a1d
Z
1739 char *start = s;
1740#endif /* PERL_MAD */
7918f24d 1741 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1742#ifdef PERL_MAD
cd81e915
NC
1743 if (PL_skipwhite) {
1744 sv_free(PL_skipwhite);
f0e67a1d 1745 PL_skipwhite = NULL;
5db06880 1746 }
f0e67a1d 1747#endif /* PERL_MAD */
3280af22 1748 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1749 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1750 s++;
f0e67a1d
Z
1751 } else {
1752 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1753 PL_bufptr = s;
f0998909
Z
1754 lex_read_space(LEX_KEEP_PREVIOUS |
1755 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1756 LEX_NO_NEXT_CHUNK : 0));
3280af22 1757 s = PL_bufptr;
f0e67a1d
Z
1758 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1759 if (PL_linestart > PL_bufptr)
1760 PL_bufptr = PL_linestart;
1761 return s;
463ee0b2 1762 }
5db06880 1763#ifdef PERL_MAD
f0e67a1d
Z
1764 if (PL_madskills)
1765 PL_skipwhite = newSVpvn(start, s-start);
1766#endif /* PERL_MAD */
5db06880 1767 return s;
a687059c 1768}
378cc40b 1769
ffb4593c
NT
1770/*
1771 * S_check_uni
1772 * Check the unary operators to ensure there's no ambiguity in how they're
1773 * used. An ambiguous piece of code would be:
1774 * rand + 5
1775 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1776 * the +5 is its argument.
1777 */
1778
76e3520e 1779STATIC void
cea2e8a9 1780S_check_uni(pTHX)
ba106d47 1781{
97aff369 1782 dVAR;
d4c19fe8
AL
1783 const char *s;
1784 const char *t;
2f3197b3 1785
3280af22 1786 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1787 return;
3280af22
NIS
1788 while (isSPACE(*PL_last_uni))
1789 PL_last_uni++;
c35e046a
AL
1790 s = PL_last_uni;
1791 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1792 s++;
3280af22 1793 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1794 return;
6136c704 1795
9b387841
NC
1796 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1797 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1798 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1799}
1800
ffb4593c
NT
1801/*
1802 * LOP : macro to build a list operator. Its behaviour has been replaced
1803 * with a subroutine, S_lop() for which LOP is just another name.
1804 */
1805
a0d0e21e
LW
1806#define LOP(f,x) return lop(f,x,s)
1807
ffb4593c
NT
1808/*
1809 * S_lop
1810 * Build a list operator (or something that might be one). The rules:
1811 * - if we have a next token, then it's a list operator [why?]
1812 * - if the next thing is an opening paren, then it's a function
1813 * - else it's a list operator
1814 */
1815
76e3520e 1816STATIC I32
a0be28da 1817S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1818{
97aff369 1819 dVAR;
7918f24d
NC
1820
1821 PERL_ARGS_ASSERT_LOP;
1822
6154021b 1823 pl_yylval.ival = f;
35c8bce7 1824 CLINE;
3280af22
NIS
1825 PL_expect = x;
1826 PL_bufptr = s;
1827 PL_last_lop = PL_oldbufptr;
eb160463 1828 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1829#ifdef PERL_MAD
1830 if (PL_lasttoke)
1831 return REPORT(LSTOP);
1832#else
3280af22 1833 if (PL_nexttoke)
bbf60fe6 1834 return REPORT(LSTOP);
5db06880 1835#endif
79072805 1836 if (*s == '(')
bbf60fe6 1837 return REPORT(FUNC);
29595ff2 1838 s = PEEKSPACE(s);
79072805 1839 if (*s == '(')
bbf60fe6 1840 return REPORT(FUNC);
79072805 1841 else
bbf60fe6 1842 return REPORT(LSTOP);
79072805
LW
1843}
1844
5db06880
NC
1845#ifdef PERL_MAD
1846 /*
1847 * S_start_force
1848 * Sets up for an eventual force_next(). start_force(0) basically does
1849 * an unshift, while start_force(-1) does a push. yylex removes items
1850 * on the "pop" end.
1851 */
1852
1853STATIC void
1854S_start_force(pTHX_ int where)
1855{
1856 int i;
1857
cd81e915 1858 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1859 where = PL_lasttoke;
cd81e915
NC
1860 assert(PL_curforce < 0 || PL_curforce == where);
1861 if (PL_curforce != where) {
5db06880
NC
1862 for (i = PL_lasttoke; i > where; --i) {
1863 PL_nexttoke[i] = PL_nexttoke[i-1];
1864 }
1865 PL_lasttoke++;
1866 }
cd81e915 1867 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1868 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1869 PL_curforce = where;
1870 if (PL_nextwhite) {
5db06880 1871 if (PL_madskills)
6b29d1f5 1872 curmad('^', newSVpvs(""));
cd81e915 1873 CURMAD('_', PL_nextwhite);
5db06880
NC
1874 }
1875}
1876
1877STATIC void
1878S_curmad(pTHX_ char slot, SV *sv)
1879{
1880 MADPROP **where;
1881
1882 if (!sv)
1883 return;
cd81e915
NC
1884 if (PL_curforce < 0)
1885 where = &PL_thismad;
5db06880 1886 else
cd81e915 1887 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1888
cd81e915 1889 if (PL_faketokens)
76f68e9b 1890 sv_setpvs(sv, "");
5db06880
NC
1891 else {
1892 if (!IN_BYTES) {
1893 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1894 SvUTF8_on(sv);
1895 else if (PL_encoding) {
1896 sv_recode_to_utf8(sv, PL_encoding);
1897 }
1898 }
1899 }
1900
1901 /* keep a slot open for the head of the list? */
1902 if (slot != '_' && *where && (*where)->mad_key == '^') {
1903 (*where)->mad_key = slot;
daba3364 1904 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1905 (*where)->mad_val = (void*)sv;
1906 }
1907 else
1908 addmad(newMADsv(slot, sv), where, 0);
1909}
1910#else
b3f24c00
MHM
1911# define start_force(where) NOOP
1912# define curmad(slot, sv) NOOP
5db06880
NC
1913#endif
1914
ffb4593c
NT
1915/*
1916 * S_force_next
9cbb5ea2 1917 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1918 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1919 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1920 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1921 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1922 */
1923
4e553d73 1924STATIC void
cea2e8a9 1925S_force_next(pTHX_ I32 type)
79072805 1926{
97aff369 1927 dVAR;
704d4215
GG
1928#ifdef DEBUGGING
1929 if (DEBUG_T_TEST) {
1930 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1931 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1932 }
1933#endif
5db06880 1934#ifdef PERL_MAD
cd81e915 1935 if (PL_curforce < 0)
5db06880 1936 start_force(PL_lasttoke);
cd81e915 1937 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1938 if (PL_lex_state != LEX_KNOWNEXT)
1939 PL_lex_defer = PL_lex_state;
1940 PL_lex_state = LEX_KNOWNEXT;
1941 PL_lex_expect = PL_expect;
cd81e915 1942 PL_curforce = -1;
5db06880 1943#else
3280af22
NIS
1944 PL_nexttype[PL_nexttoke] = type;
1945 PL_nexttoke++;
1946 if (PL_lex_state != LEX_KNOWNEXT) {
1947 PL_lex_defer = PL_lex_state;
1948 PL_lex_expect = PL_expect;
1949 PL_lex_state = LEX_KNOWNEXT;
79072805 1950 }
5db06880 1951#endif
79072805
LW
1952}
1953
28ac2b49
Z
1954void
1955Perl_yyunlex(pTHX)
1956{
1957 if (PL_parser->yychar != YYEMPTY) {
1958 start_force(-1);
1959 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1960 force_next(PL_parser->yychar);
1961 PL_parser->yychar = YYEMPTY;
1962 }
1963}
1964
d0a148a6 1965STATIC SV *
15f169a1 1966S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1967{
97aff369 1968 dVAR;
740cce10 1969 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1970 !IN_BYTES
1971 && UTF
1972 && !is_ascii_string((const U8*)start, len)
740cce10 1973 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1974 return sv;
1975}
1976
ffb4593c
NT
1977/*
1978 * S_force_word
1979 * When the lexer knows the next thing is a word (for instance, it has
1980 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1981 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1982 * lookahead.
ffb4593c
NT
1983 *
1984 * Arguments:
b1b65b59 1985 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1986 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1987 * int check_keyword : if true, Perl checks to make sure the word isn't
1988 * a keyword (do this if the word is a label, e.g. goto FOO)
1989 * int allow_pack : if true, : characters will also be allowed (require,
1990 * use, etc. do this)
9cbb5ea2 1991 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1992 */
1993
76e3520e 1994STATIC char *
cea2e8a9 1995S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 1996{
97aff369 1997 dVAR;
463ee0b2
LW
1998 register char *s;
1999 STRLEN len;
4e553d73 2000
7918f24d
NC
2001 PERL_ARGS_ASSERT_FORCE_WORD;
2002
29595ff2 2003 start = SKIPSPACE1(start);
463ee0b2 2004 s = start;
7e2040f0 2005 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2006 (allow_pack && *s == ':') ||
15f0808c 2007 (allow_initial_tick && *s == '\'') )
a0d0e21e 2008 {
3280af22 2009 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2010 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2011 return start;
cd81e915 2012 start_force(PL_curforce);
5db06880
NC
2013 if (PL_madskills)
2014 curmad('X', newSVpvn(start,s-start));
463ee0b2 2015 if (token == METHOD) {
29595ff2 2016 s = SKIPSPACE1(s);
463ee0b2 2017 if (*s == '(')
3280af22 2018 PL_expect = XTERM;
463ee0b2 2019 else {
3280af22 2020 PL_expect = XOPERATOR;
463ee0b2 2021 }
79072805 2022 }
e74e6b3d 2023 if (PL_madskills)
63575281 2024 curmad('g', newSVpvs( "forced" ));
9ded7720 2025 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2026 = (OP*)newSVOP(OP_CONST,0,
2027 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2028 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2029 force_next(token);
2030 }
2031 return s;
2032}
2033
ffb4593c
NT
2034/*
2035 * S_force_ident
9cbb5ea2 2036 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2037 * text only contains the "foo" portion. The first argument is a pointer
2038 * to the "foo", and the second argument is the type symbol to prefix.
2039 * Forces the next token to be a "WORD".
9cbb5ea2 2040 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2041 */
2042
76e3520e 2043STATIC void
bfed75c6 2044S_force_ident(pTHX_ register const char *s, int kind)
79072805 2045{
97aff369 2046 dVAR;
7918f24d
NC
2047
2048 PERL_ARGS_ASSERT_FORCE_IDENT;
2049
c35e046a 2050 if (*s) {
90e5519e
NC
2051 const STRLEN len = strlen(s);
2052 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2053 start_force(PL_curforce);
9ded7720 2054 NEXTVAL_NEXTTOKE.opval = o;
79072805 2055 force_next(WORD);
748a9306 2056 if (kind) {
11343788 2057 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2058 /* XXX see note in pp_entereval() for why we forgo typo
2059 warnings if the symbol must be introduced in an eval.
2060 GSAR 96-10-12 */
90e5519e
NC
2061 gv_fetchpvn_flags(s, len,
2062 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2063 : GV_ADD,
2064 kind == '$' ? SVt_PV :
2065 kind == '@' ? SVt_PVAV :
2066 kind == '%' ? SVt_PVHV :
a0d0e21e 2067 SVt_PVGV
90e5519e 2068 );
748a9306 2069 }
79072805
LW
2070 }
2071}
2072
1571675a
GS
2073NV
2074Perl_str_to_version(pTHX_ SV *sv)
2075{
2076 NV retval = 0.0;
2077 NV nshift = 1.0;
2078 STRLEN len;
cfd0369c 2079 const char *start = SvPV_const(sv,len);
9d4ba2ae 2080 const char * const end = start + len;
504618e9 2081 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2082
2083 PERL_ARGS_ASSERT_STR_TO_VERSION;
2084
1571675a 2085 while (start < end) {
ba210ebe 2086 STRLEN skip;
1571675a
GS
2087 UV n;
2088 if (utf)
9041c2e3 2089 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2090 else {
2091 n = *(U8*)start;
2092 skip = 1;
2093 }
2094 retval += ((NV)n)/nshift;
2095 start += skip;
2096 nshift *= 1000;
2097 }
2098 return retval;
2099}
2100
4e553d73 2101/*
ffb4593c
NT
2102 * S_force_version
2103 * Forces the next token to be a version number.
e759cc13
RGS
2104 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2105 * and if "guessing" is TRUE, then no new token is created (and the caller
2106 * must use an alternative parsing method).
ffb4593c
NT
2107 */
2108
76e3520e 2109STATIC char *
e759cc13 2110S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2111{
97aff369 2112 dVAR;
5f66b61c 2113 OP *version = NULL;
44dcb63b 2114 char *d;
5db06880
NC
2115#ifdef PERL_MAD
2116 I32 startoff = s - SvPVX(PL_linestr);
2117#endif
89bfa8cd 2118
7918f24d
NC
2119 PERL_ARGS_ASSERT_FORCE_VERSION;
2120
29595ff2 2121 s = SKIPSPACE1(s);
89bfa8cd 2122
44dcb63b 2123 d = s;
dd629d5b 2124 if (*d == 'v')
44dcb63b 2125 d++;
44dcb63b 2126 if (isDIGIT(*d)) {
e759cc13
RGS
2127 while (isDIGIT(*d) || *d == '_' || *d == '.')
2128 d++;
5db06880
NC
2129#ifdef PERL_MAD
2130 if (PL_madskills) {
cd81e915 2131 start_force(PL_curforce);
5db06880
NC
2132 curmad('X', newSVpvn(s,d-s));
2133 }
2134#endif
4e4da3ac 2135 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2136 SV *ver;
8d08d9ba
DG
2137#ifdef USE_LOCALE_NUMERIC
2138 char *loc = setlocale(LC_NUMERIC, "C");
2139#endif
6154021b 2140 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2141#ifdef USE_LOCALE_NUMERIC
2142 setlocale(LC_NUMERIC, loc);
2143#endif
6154021b 2144 version = pl_yylval.opval;
dd629d5b
GS
2145 ver = cSVOPx(version)->op_sv;
2146 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2147 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2148 SvNV_set(ver, str_to_version(ver));
1571675a 2149 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2150 }
89bfa8cd 2151 }
5db06880
NC
2152 else if (guessing) {
2153#ifdef PERL_MAD
2154 if (PL_madskills) {
cd81e915
NC
2155 sv_free(PL_nextwhite); /* let next token collect whitespace */
2156 PL_nextwhite = 0;
5db06880
NC
2157 s = SvPVX(PL_linestr) + startoff;
2158 }
2159#endif
e759cc13 2160 return s;
5db06880 2161 }
89bfa8cd
PP
2162 }
2163
5db06880
NC
2164#ifdef PERL_MAD
2165 if (PL_madskills && !version) {
cd81e915
NC
2166 sv_free(PL_nextwhite); /* let next token collect whitespace */
2167 PL_nextwhite = 0;
5db06880
NC
2168 s = SvPVX(PL_linestr) + startoff;
2169 }
2170#endif
89bfa8cd 2171 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2172 start_force(PL_curforce);
9ded7720 2173 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2174 force_next(WORD);
89bfa8cd 2175
e759cc13 2176 return s;
89bfa8cd
PP
2177}
2178
ffb4593c 2179/*
91152fc1
DG
2180 * S_force_strict_version
2181 * Forces the next token to be a version number using strict syntax rules.
2182 */
2183
2184STATIC char *
2185S_force_strict_version(pTHX_ char *s)
2186{
2187 dVAR;
2188 OP *version = NULL;
2189#ifdef PERL_MAD
2190 I32 startoff = s - SvPVX(PL_linestr);
2191#endif
2192 const char *errstr = NULL;
2193
2194 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2195
2196 while (isSPACE(*s)) /* leading whitespace */
2197 s++;
2198
2199 if (is_STRICT_VERSION(s,&errstr)) {
2200 SV *ver = newSV(0);
2201 s = (char *)scan_version(s, ver, 0);
2202 version = newSVOP(OP_CONST, 0, ver);
2203 }
4e4da3ac
Z
2204 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2205 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2206 {
91152fc1
DG
2207 PL_bufptr = s;
2208 if (errstr)
2209 yyerror(errstr); /* version required */
2210 return s;
2211 }
2212
2213#ifdef PERL_MAD
2214 if (PL_madskills && !version) {
2215 sv_free(PL_nextwhite); /* let next token collect whitespace */
2216 PL_nextwhite = 0;
2217 s = SvPVX(PL_linestr) + startoff;
2218 }
2219#endif
2220 /* NOTE: The parser sees the package name and the VERSION swapped */
2221 start_force(PL_curforce);
2222 NEXTVAL_NEXTTOKE.opval = version;
2223 force_next(WORD);
2224
2225 return s;
2226}
2227
2228/*
ffb4593c
NT
2229 * S_tokeq
2230 * Tokenize a quoted string passed in as an SV. It finds the next
2231 * chunk, up to end of string or a backslash. It may make a new
2232 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2233 * turns \\ into \.
2234 */
2235
76e3520e 2236STATIC SV *
cea2e8a9 2237S_tokeq(pTHX_ SV *sv)
79072805 2238{
97aff369 2239 dVAR;
79072805
LW
2240 register char *s;
2241 register char *send;
2242 register char *d;
b3ac6de7
IZ
2243 STRLEN len = 0;
2244 SV *pv = sv;
79072805 2245
7918f24d
NC
2246 PERL_ARGS_ASSERT_TOKEQ;
2247
79072805 2248 if (!SvLEN(sv))
b3ac6de7 2249 goto finish;
79072805 2250
a0d0e21e 2251 s = SvPV_force(sv, len);
21a311ee 2252 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2253 goto finish;
463ee0b2 2254 send = s + len;
79072805
LW
2255 while (s < send && *s != '\\')
2256 s++;
2257 if (s == send)
b3ac6de7 2258 goto finish;
79072805 2259 d = s;
be4731d2 2260 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2261 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2262 }
79072805
LW
2263 while (s < send) {
2264 if (*s == '\\') {
a0d0e21e 2265 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2266 s++; /* all that, just for this */
2267 }
2268 *d++ = *s++;
2269 }
2270 *d = '\0';
95a20fc0 2271 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2272 finish:
3280af22 2273 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2274 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2275 return sv;
2276}
2277
ffb4593c
NT
2278/*
2279 * Now come three functions related to double-quote context,
2280 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2281 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2282 * interact with PL_lex_state, and create fake ( ... ) argument lists
2283 * to handle functions and concatenation.
2284 * They assume that whoever calls them will be setting up a fake
2285 * join call, because each subthing puts a ',' after it. This lets
2286 * "lower \luPpEr"
2287 * become
2288 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2289 *
2290 * (I'm not sure whether the spurious commas at the end of lcfirst's
2291 * arguments and join's arguments are created or not).
2292 */
2293
2294/*
2295 * S_sublex_start
6154021b 2296 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2297 *
2298 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2299 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2300 *
2301 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2302 *
2303 * Everything else becomes a FUNC.
2304 *
2305 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2306 * had an OP_CONST or OP_READLINE). This just sets us up for a
2307 * call to S_sublex_push().
2308 */
2309
76e3520e 2310STATIC I32
cea2e8a9 2311S_sublex_start(pTHX)
79072805 2312{
97aff369 2313 dVAR;
6154021b 2314 register const I32 op_type = pl_yylval.ival;
79072805
LW
2315
2316 if (op_type == OP_NULL) {
6154021b 2317 pl_yylval.opval = PL_lex_op;
5f66b61c 2318 PL_lex_op = NULL;
79072805
LW
2319 return THING;
2320 }
2321 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2322 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2323
2324 if (SvTYPE(sv) == SVt_PVIV) {
2325 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2326 STRLEN len;
96a5add6 2327 const char * const p = SvPV_const(sv, len);
740cce10 2328 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2329 SvREFCNT_dec(sv);
2330 sv = nsv;
4e553d73 2331 }
6154021b 2332 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2333 PL_lex_stuff = NULL;
6f33ba73
RGS
2334 /* Allow <FH> // "foo" */
2335 if (op_type == OP_READLINE)
2336 PL_expect = XTERMORDORDOR;
79072805
LW
2337 return THING;
2338 }
e3f73d4e
RGS
2339 else if (op_type == OP_BACKTICK && PL_lex_op) {
2340 /* readpipe() vas overriden */
2341 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2342 pl_yylval.opval = PL_lex_op;
9b201d7d 2343 PL_lex_op = NULL;
e3f73d4e
RGS
2344 PL_lex_stuff = NULL;
2345 return THING;
2346 }
79072805 2347
3280af22 2348 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2349 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2350 PL_sublex_info.sub_op = PL_lex_op;
2351 PL_lex_state = LEX_INTERPPUSH;
55497cff 2352
3280af22
NIS
2353 PL_expect = XTERM;
2354 if (PL_lex_op) {
6154021b 2355 pl_yylval.opval = PL_lex_op;
5f66b61c 2356 PL_lex_op = NULL;
55497cff
PP
2357 return PMFUNC;
2358 }
2359 else
2360 return FUNC;
2361}
2362
ffb4593c
NT
2363/*
2364 * S_sublex_push
2365 * Create a new scope to save the lexing state. The scope will be
2366 * ended in S_sublex_done. Returns a '(', starting the function arguments
2367 * to the uc, lc, etc. found before.
2368 * Sets PL_lex_state to LEX_INTERPCONCAT.
2369 */
2370
76e3520e 2371STATIC I32
cea2e8a9 2372S_sublex_push(pTHX)
55497cff 2373{
27da23d5 2374 dVAR;
f46d017c 2375 ENTER;
55497cff 2376
3280af22 2377 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2378 SAVEBOOL(PL_lex_dojoin);
3280af22 2379 SAVEI32(PL_lex_brackets);
3280af22
NIS
2380 SAVEI32(PL_lex_casemods);
2381 SAVEI32(PL_lex_starts);
651b5b28 2382 SAVEI8(PL_lex_state);
7766f137 2383 SAVEVPTR(PL_lex_inpat);
98246f1e 2384 SAVEI16(PL_lex_inwhat);
57843af0 2385 SAVECOPLINE(PL_curcop);
3280af22 2386 SAVEPPTR(PL_bufptr);
8452ff4b 2387 SAVEPPTR(PL_bufend);
3280af22
NIS
2388 SAVEPPTR(PL_oldbufptr);
2389 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2390 SAVEPPTR(PL_last_lop);
2391 SAVEPPTR(PL_last_uni);
3280af22
NIS
2392 SAVEPPTR(PL_linestart);
2393 SAVESPTR(PL_linestr);
8edd5f42
RGS
2394 SAVEGENERICPV(PL_lex_brackstack);
2395 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2396
2397 PL_linestr = PL_lex_stuff;
a0714e2c 2398 PL_lex_stuff = NULL;
3280af22 2399
9cbb5ea2
GS
2400 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2401 = SvPVX(PL_linestr);
3280af22 2402 PL_bufend += SvCUR(PL_linestr);
bd61b366 2403 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2404 SAVEFREESV(PL_linestr);
2405
2406 PL_lex_dojoin = FALSE;
2407 PL_lex_brackets = 0;
a02a5408
JC
2408 Newx(PL_lex_brackstack, 120, char);
2409 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2410 PL_lex_casemods = 0;
2411 *PL_lex_casestack = '\0';
2412 PL_lex_starts = 0;
2413 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2414 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2415
2416 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
2417 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2418 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2419 else
5f66b61c 2420 PL_lex_inpat = NULL;
79072805 2421
55497cff 2422 return '(';
79072805
LW
2423}
2424
ffb4593c
NT
2425/*
2426 * S_sublex_done
2427 * Restores lexer state after a S_sublex_push.
2428 */
2429
76e3520e 2430STATIC I32
cea2e8a9 2431S_sublex_done(pTHX)
79072805 2432{
27da23d5 2433 dVAR;
3280af22 2434 if (!PL_lex_starts++) {
396482e1 2435 SV * const sv = newSVpvs("");
9aa983d2
JH
2436 if (SvUTF8(PL_linestr))
2437 SvUTF8_on(sv);
3280af22 2438 PL_expect = XOPERATOR;
6154021b 2439 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2440 return THING;
2441 }
2442
3280af22
NIS
2443 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2444 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2445 return yylex();
79072805
LW
2446 }
2447
ffb4593c 2448 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
3280af22
NIS
2449 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2450 PL_linestr = PL_lex_repl;
2451 PL_lex_inpat = 0;
2452 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2453 PL_bufend += SvCUR(PL_linestr);
bd61b366 2454 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2455 SAVEFREESV(PL_linestr);
2456 PL_lex_dojoin = FALSE;
2457 PL_lex_brackets = 0;
3280af22
NIS
2458 PL_lex_casemods = 0;
2459 *PL_lex_casestack = '\0';
2460 PL_lex_starts = 0;
25da4f38 2461 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2462 PL_lex_state = LEX_INTERPNORMAL;
2463 PL_lex_starts++;
e9fa98b2
HS
2464 /* we don't clear PL_lex_repl here, so that we can check later
2465 whether this is an evalled subst; that means we rely on the
2466 logic to ensure sublex_done() is called again only via the
2467 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2468 }
e9fa98b2 2469 else {
3280af22 2470 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2471 PL_lex_repl = NULL;
e9fa98b2 2472 }
79072805 2473 return ',';
ffed7fef
LW
2474 }
2475 else {
5db06880
NC
2476#ifdef PERL_MAD
2477 if (PL_madskills) {
cd81e915
NC
2478 if (PL_thiswhite) {
2479 if (!PL_endwhite)
6b29d1f5 2480 PL_endwhite = newSVpvs("");
cd81e915
NC
2481 sv_catsv(PL_endwhite, PL_thiswhite);
2482 PL_thiswhite = 0;
2483 }
2484 if (PL_thistoken)
76f68e9b 2485 sv_setpvs(PL_thistoken,"");
5db06880 2486 else
cd81e915 2487 PL_realtokenstart = -1;
5db06880
NC
2488 }
2489#endif
f46d017c 2490 LEAVE;
3280af22
NIS
2491 PL_bufend = SvPVX(PL_linestr);
2492 PL_bufend += SvCUR(PL_linestr);
2493 PL_expect = XOPERATOR;
09bef843 2494 PL_sublex_info.sub_inwhat = 0;
79072805 2495 return ')';
ffed7fef
LW
2496 }
2497}
2498
02aa26ce
NT
2499/*
2500 scan_const
2501
2502 Extracts a pattern, double-quoted string, or transliteration. This
2503 is terrifying code.
2504
94def140 2505 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2506 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2507 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2508
94def140
ST
2509 Returns a pointer to the character scanned up to. If this is
2510 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2511 successfully parsed), will leave an OP for the substring scanned
6154021b 2512 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2513 by looking at the next characters herself.
2514
02aa26ce
NT
2515 In patterns:
2516 backslashes:
ff3f963a 2517 constants: \N{NAME} only
02aa26ce
NT
2518 case and quoting: \U \Q \E
2519 stops on @ and $, but not for $ as tail anchor
2520
2521 In transliterations:
2522 characters are VERY literal, except for - not at the start or end
94def140
ST
2523 of the string, which indicates a range. If the range is in bytes,
2524 scan_const expands the range to the full set of intermediate
2525 characters. If the range is in utf8, the hyphen is replaced with
2526 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2527
2528 In double-quoted strings:
2529 backslashes:
2530 double-quoted style: \r and \n
ff3f963a 2531 constants: \x31, etc.
94def140 2532 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2533 case and quoting: \U \Q \E
2534 stops on @ and $
2535
2536 scan_const does *not* construct ops to handle interpolated strings.
2537 It stops processing as soon as it finds an embedded $ or @ variable
2538 and leaves it to the caller to work out what's going on.
2539
94def140
ST
2540 embedded arrays (whether in pattern or not) could be:
2541 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2542
2543 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2544
2545 $ in pattern could be $foo or could be tail anchor. Assumption:
2546 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2547 followed by one of "()| \r\n\t"
02aa26ce
NT
2548
2549 \1 (backreferences) are turned into $1
2550
2551 The structure of the code is
2552 while (there's a character to process) {
94def140
ST
2553 handle transliteration ranges
2554 skip regexp comments /(?#comment)/ and codes /(?{code})/
2555 skip #-initiated comments in //x patterns
2556 check for embedded arrays
02aa26ce
NT
2557 check for embedded scalars
2558 if (backslash) {
94def140 2559 deprecate \1 in substitution replacements
02aa26ce
NT
2560 handle string-changing backslashes \l \U \Q \E, etc.
2561 switch (what was escaped) {
94def140 2562 handle \- in a transliteration (becomes a literal -)
ff3f963a 2563 if a pattern and not \N{, go treat as regular character
94def140
ST
2564 handle \132 (octal characters)
2565 handle \x15 and \x{1234} (hex characters)
ff3f963a 2566 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2567 handle \cV (control characters)
2568 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2569 } (end switch)
77a135fe 2570 continue
02aa26ce 2571 } (end if backslash)
77a135fe 2572 handle regular character
02aa26ce 2573 } (end while character to read)
4e553d73 2574
02aa26ce
NT
2575*/
2576
76e3520e 2577STATIC char *
cea2e8a9 2578S_scan_const(pTHX_ char *start)
79072805 2579{
97aff369 2580 dVAR;
3280af22 2581 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2582 SV *sv = newSV(send - start); /* sv for the constant. See
2583 note below on sizing. */
02aa26ce
NT
2584 register char *s = start; /* start of the constant */
2585 register char *d = SvPVX(sv); /* destination for copies */
2586 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2587 bool didrange = FALSE; /* did we just finish a range? */
2b9d42f0 2588 I32 has_utf8 = FALSE; /* Output constant is UTF8 */
77a135fe
KW
2589 I32 this_utf8 = UTF; /* Is the source string assumed
2590 to be UTF8? But, this can
2591 show as true when the source
2592 isn't utf8, as for example
2593 when it is entirely composed
2594 of hex constants */
2595
2596 /* Note on sizing: The scanned constant is placed into sv, which is
2597 * initialized by newSV() assuming one byte of output for every byte of
2598 * input. This routine expects newSV() to allocate an extra byte for a
2599 * trailing NUL, which this routine will append if it gets to the end of
2600 * the input. There may be more bytes of input than output (eg., \N{LATIN
2601 * CAPITAL LETTER A}), or more output than input if the constant ends up
2602 * recoded to utf8, but each time a construct is found that might increase
2603 * the needed size, SvGROW() is called. Its size parameter each time is
2604 * based on the best guess estimate at the time, namely the length used so
2605 * far, plus the length the current construct will occupy, plus room for
2606 * the trailing NUL, plus one byte for every input byte still unscanned */
2607
012bcf8d 2608 UV uv;
4c3a8340
ST
2609#ifdef EBCDIC
2610 UV literal_endpoint = 0;
e294cc5d 2611 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2612#endif
012bcf8d 2613
7918f24d
NC
2614 PERL_ARGS_ASSERT_SCAN_CONST;
2615
2b9d42f0
NIS
2616 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2617 /* If we are doing a trans and we know we want UTF8 set expectation */
2618 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2619 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2620 }
2621
2622
79072805 2623 while (s < send || dorange) {
ff3f963a 2624
02aa26ce 2625 /* get transliterations out of the way (they're most literal) */
3280af22 2626 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2627 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2628 if (dorange) {
1ba5c669
JH
2629 I32 i; /* current expanded character */
2630 I32 min; /* first character in range */
2631 I32 max; /* last character in range */
02aa26ce 2632
e294cc5d
JH
2633#ifdef EBCDIC
2634 UV uvmax = 0;
2635#endif
2636
2637 if (has_utf8
2638#ifdef EBCDIC
2639 && !native_range
2640#endif
2641 ) {
9d4ba2ae 2642 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2643 char *e = d++;
2644 while (e-- > c)
2645 *(e + 1) = *e;
25716404 2646 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2647 /* mark the range as done, and continue */
2648 dorange = FALSE;
2649 didrange = TRUE;
2650 continue;
2651 }
2b9d42f0 2652
95a20fc0 2653 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2654#ifdef EBCDIC
2655 SvGROW(sv,
2656 SvLEN(sv) + (has_utf8 ?
2657 (512 - UTF_CONTINUATION_MARK +
2658 UNISKIP(0x100))
2659 : 256));
2660 /* How many two-byte within 0..255: 128 in UTF-8,
2661 * 96 in UTF-8-mod. */
2662#else
9cbb5ea2 2663 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2664#endif
9cbb5ea2 2665 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2666#ifdef EBCDIC
2667 if (has_utf8) {
2668 int j;
2669 for (j = 0; j <= 1; j++) {
2670 char * const c = (char*)utf8_hop((U8*)d, -1);
2671 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2672 if (j)
2673 min = (U8)uv;
2674 else if (uv < 256)
2675 max = (U8)uv;
2676 else {
2677 max = (U8)0xff; /* only to \xff */
2678 uvmax = uv; /* \x{100} to uvmax */
2679 }
2680 d = c; /* eat endpoint chars */
2681 }
2682 }
2683 else {
2684#endif
2685 d -= 2; /* eat the first char and the - */
2686 min = (U8)*d; /* first char in range */
2687 max = (U8)d[1]; /* last char in range */
2688#ifdef EBCDIC
2689 }
2690#endif
8ada0baa 2691
c2e66d9e 2692 if (min > max) {
01ec43d0 2693 Perl_croak(aTHX_
d1573ac7 2694 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2695 (char)min, (char)max);
c2e66d9e
GS
2696 }
2697
c7f1f016 2698#ifdef EBCDIC
4c3a8340
ST
2699 if (literal_endpoint == 2 &&
2700 ((isLOWER(min) && isLOWER(max)) ||
2701 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2702 if (isLOWER(min)) {
2703 for (i = min; i <= max; i++)
2704 if (isLOWER(i))
db42d148 2705 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2706 } else {
2707 for (i = min; i <= max; i++)
2708 if (isUPPER(i))
db42d148 2709 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2710 }
2711 }
2712 else
2713#endif
2714 for (i = min; i <= max; i++)
e294cc5d
JH
2715#ifdef EBCDIC
2716 if (has_utf8) {
2717 const U8 ch = (U8)NATIVE_TO_UTF(i);
2718 if (UNI_IS_INVARIANT(ch))
2719 *d++ = (U8)i;
2720 else {
2721 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2722 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2723 }
2724 }
2725 else
2726#endif
2727 *d++ = (char)i;
2728
2729#ifdef EBCDIC
2730 if (uvmax) {
2731 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2732 if (uvmax > 0x101)
2733 *d++ = (char)UTF_TO_NATIVE(0xff);
2734 if (uvmax > 0x100)
2735 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2736 }
2737#endif
02aa26ce
NT
2738
2739 /* mark the range as done, and continue */
79072805 2740 dorange = FALSE;
01ec43d0 2741 didrange = TRUE;
4c3a8340
ST
2742#ifdef EBCDIC
2743 literal_endpoint = 0;
2744#endif
79072805 2745 continue;
4e553d73 2746 }
02aa26ce
NT
2747
2748 /* range begins (ignore - as first or last char) */
79072805 2749 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2750 if (didrange) {
1fafa243 2751 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2752 }
e294cc5d
JH
2753 if (has_utf8
2754#ifdef EBCDIC
2755 && !native_range
2756#endif
2757 ) {
25716404 2758 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2759 s++;
2760 continue;
2761 }
79072805
LW
2762 dorange = TRUE;
2763 s++;
01ec43d0
GS
2764 }
2765 else {
2766 didrange = FALSE;
4c3a8340
ST
2767#ifdef EBCDIC
2768 literal_endpoint = 0;
e294cc5d 2769 native_range = TRUE;
4c3a8340 2770#endif
01ec43d0 2771 }
79072805 2772 }
02aa26ce
NT
2773
2774 /* if we get here, we're not doing a transliteration */
2775
0f5d15d6
IZ
2776 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2777 except for the last char, which will be done separately. */
3280af22 2778 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2779 if (s[2] == '#') {
e994fd66 2780 while (s+1 < send && *s != ')')
db42d148 2781 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2782 }
2783 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2784 || (s[2] == '?' && s[3] == '{'))
155aba94 2785 {
cc6b7395 2786 I32 count = 1;
0f5d15d6 2787 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2788 char c;
2789
d9f97599
GS
2790 while (count && (c = *regparse)) {
2791 if (c == '\\' && regparse[1])
2792 regparse++;
4e553d73 2793 else if (c == '{')
cc6b7395 2794 count++;
4e553d73 2795 else if (c == '}')
cc6b7395 2796 count--;
d9f97599 2797 regparse++;
cc6b7395 2798 }
e994fd66 2799 if (*regparse != ')')
5bdf89e7 2800 regparse--; /* Leave one char for continuation. */
0f5d15d6 2801 while (s < regparse)
db42d148 2802 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2803 }
748a9306 2804 }
02aa26ce
NT
2805
2806 /* likewise skip #-initiated comments in //x patterns */
3280af22
NIS
2807 else if (*s == '#' && PL_lex_inpat &&
2808 ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
748a9306 2809 while (s+1 < send && *s != '\n')
db42d148 2810 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2811 }
02aa26ce 2812
5d1d4326 2813 /* check for embedded arrays
da6eedaa 2814 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2815 */
1749ea0d
ST
2816 else if (*s == '@' && s[1]) {
2817 if (isALNUM_lazy_if(s+1,UTF))
2818 break;
2819 if (strchr(":'{$", s[1]))
2820 break;
2821 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2822 break; /* in regexp, neither @+ nor @- are interpolated */
2823 }
02aa26ce
NT
2824
2825 /* check for embedded scalars. only stop if we're sure it's a
2826 variable.
2827 */
79072805 2828 else if (*s == '$') {
3280af22 2829 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2830 break;
77772344 2831 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2832 if (s[1] == '\\') {
2833 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2834 "Possible unintended interpolation of $\\ in regex");
77772344 2835 }
79072805 2836 break; /* in regexp, $ might be tail anchor */
77772344 2837 }
79072805 2838 }
02aa26ce 2839
2b9d42f0
NIS
2840 /* End of else if chain - OP_TRANS rejoin rest */
2841
02aa26ce 2842 /* backslashes */
79072805 2843 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2844 char* e; /* Can be used for ending '}', etc. */
2845
79072805 2846 s++;
02aa26ce 2847
7d0fc23c
KW
2848 /* warn on \1 - \9 in substitution replacements, but note that \11
2849 * is an octal; and \19 is \1 followed by '9' */
3280af22 2850 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2851 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2852 {
a2a5de95 2853 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2854 *--s = '$';
2855 break;
2856 }
02aa26ce
NT
2857
2858 /* string-change backslash escapes */
3280af22 2859 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2860 --s;
2861 break;
2862 }
ff3f963a
KW
2863 /* In a pattern, process \N, but skip any other backslash escapes.
2864 * This is because we don't want to translate an escape sequence
2865 * into a meta symbol and have the regex compiler use the meta
2866 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2867 * in spite of this, we do have to process \N here while the proper
2868 * charnames handler is in scope. See bugs #56444 and #62056.
2869 * There is a complication because \N in a pattern may also stand
2870 * for 'match a non-nl', and not mean a charname, in which case its
2871 * processing should be deferred to the regex compiler. To be a
2872 * charname it must be followed immediately by a '{', and not look
2873 * like \N followed by a curly quantifier, i.e., not something like
2874 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2875 * quantifier */
2876 else if (PL_lex_inpat
2877 && (*s != 'N'
2878 || s[1] != '{'
2879 || regcurly(s + 1)))
2880 {
cc74c5bd
ST
2881 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2882 goto default_action;
2883 }
02aa26ce 2884
79072805 2885 switch (*s) {
02aa26ce
NT
2886
2887 /* quoted - in transliterations */
79072805 2888 case '-':
3280af22 2889 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2890 *d++ = *s++;
2891 continue;
2892 }
2893 /* FALL THROUGH */
2894 default:
11b8faa4 2895 {
a2a5de95
NC
2896 if ((isALPHA(*s) || isDIGIT(*s)))
2897 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2898 "Unrecognized escape \\%c passed through",
2899 *s);
11b8faa4 2900 /* default action is to copy the quoted character */
f9a63242 2901 goto default_action;
11b8faa4 2902 }
02aa26ce 2903
632403cc 2904 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2905 case '0': case '1': case '2': case '3':
2906 case '4': case '5': case '6': case '7':
ba210ebe 2907 {
53305cf1
NC
2908 I32 flags = 0;
2909 STRLEN len = 3;
77a135fe 2910 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2911 s += len;
2912 }
012bcf8d 2913 goto NUM_ESCAPE_INSERT;
02aa26ce 2914
f0a2b745
KW
2915 /* eg. \o{24} indicates the octal constant \024 */
2916 case 'o':
2917 {
2918 STRLEN len;
454155d9 2919 const char* error;
f0a2b745 2920
454155d9 2921 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2922 s += len;
454155d9 2923 if (! valid) {
f0a2b745
KW
2924 yyerror(error);
2925 continue;
2926 }
2927 goto NUM_ESCAPE_INSERT;
2928 }
2929
77a135fe 2930 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2931 case 'x':
a0ed51b3
LW
2932 ++s;
2933 if (*s == '{') {
9d4ba2ae 2934 char* const e = strchr(s, '}');
a4c04bdc
NC
2935 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2936 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2937 STRLEN len;
355860ce 2938
53305cf1 2939 ++s;
adaeee49 2940 if (!e) {
a0ed51b3 2941 yyerror("Missing right brace on \\x{}");
355860ce 2942 continue;
ba210ebe 2943 }
53305cf1 2944 len = e - s;
77a135fe 2945 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2946 s = e + 1;
a0ed51b3
LW
2947 }
2948 else {
ba210ebe 2949 {
53305cf1 2950 STRLEN len = 2;
a4c04bdc 2951 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 2952 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
2953 s += len;
2954 }
012bcf8d
GS
2955 }
2956
2957 NUM_ESCAPE_INSERT:
ff3f963a
KW
2958 /* Insert oct or hex escaped character. There will always be
2959 * enough room in sv since such escapes will be longer than any
2960 * UTF-8 sequence they can end up as, except if they force us
2961 * to recode the rest of the string into utf8 */
ba7cea30 2962
77a135fe 2963 /* Here uv is the ordinal of the next character being added in
ff3f963a 2964 * unicode (converted from native). */
77a135fe 2965 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 2966 if (!has_utf8 && uv > 255) {
77a135fe
KW
2967 /* Might need to recode whatever we have accumulated so
2968 * far if it contains any chars variant in utf8 or
2969 * utf-ebcdic. */
2970
2971 SvCUR_set(sv, d - SvPVX_const(sv));
2972 SvPOK_on(sv);
2973 *d = '\0';
77a135fe 2974 /* See Note on sizing above. */
7bf79863
KW
2975 sv_utf8_upgrade_flags_grow(sv,
2976 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
2977 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
2978 d = SvPVX(sv) + SvCUR(sv);
2979 has_utf8 = TRUE;
012bcf8d
GS
2980 }
2981
77a135fe
KW
2982 if (has_utf8) {
2983 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
2984 if (PL_lex_inwhat == OP_TRANS &&
2985 PL_sublex_info.sub_op) {
2986 PL_sublex_info.sub_op->op_private |=
2987 (PL_lex_repl ? OPpTRANS_FROM_UTF
2988 : OPpTRANS_TO_UTF);
f9a63242 2989 }
e294cc5d
JH
2990#ifdef EBCDIC
2991 if (uv > 255 && !dorange)
2992 native_range = FALSE;
2993#endif
012bcf8d 2994 }
a0ed51b3 2995 else {
012bcf8d 2996 *d++ = (char)uv;
a0ed51b3 2997 }
012bcf8d
GS
2998 }
2999 else {
c4d5f83a 3000 *d++ = (char) uv;
a0ed51b3 3001 }
79072805 3002 continue;
02aa26ce 3003
4a2d328f 3004 case 'N':
ff3f963a
KW
3005 /* In a non-pattern \N must be a named character, like \N{LATIN
3006 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3007 * mean to match a non-newline. For non-patterns, named
3008 * characters are converted to their string equivalents. In
3009 * patterns, named characters are not converted to their
3010 * ultimate forms for the same reasons that other escapes
3011 * aren't. Instead, they are converted to the \N{U+...} form
3012 * to get the value from the charnames that is in effect right
3013 * now, while preserving the fact that it was a named character
3014 * so that the regex compiler knows this */
3015
3016 /* This section of code doesn't generally use the
3017 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3018 * a close examination of this macro and determined it is a
3019 * no-op except on utfebcdic variant characters. Every
3020 * character generated by this that would normally need to be
3021 * enclosed by this macro is invariant, so the macro is not
3022 * needed, and would complicate use of copy(). There are other
3023 * parts of this file where the macro is used inconsistently,
3024 * but are saved by it being a no-op */
3025
3026 /* The structure of this section of code (besides checking for
3027 * errors and upgrading to utf8) is:
3028 * Further disambiguate between the two meanings of \N, and if
3029 * not a charname, go process it elsewhere
0a96133f
KW
3030 * If of form \N{U+...}, pass it through if a pattern;
3031 * otherwise convert to utf8
3032 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3033 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3034
3035 /* Here, s points to the 'N'; the test below is guaranteed to
3036 * succeed if we are being called on a pattern as we already
3037 * know from a test above that the next character is a '{'.
3038 * On a non-pattern \N must mean 'named sequence, which
3039 * requires braces */
3040 s++;
3041 if (*s != '{') {
3042 yyerror("Missing braces on \\N{}");
3043 continue;
3044 }
3045 s++;
3046
0a96133f 3047 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3048 if (! (e = strchr(s, '}'))) {
3049 if (! PL_lex_inpat) {
5777a3f7 3050 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3051 } else {
3052 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3053 }
0a96133f 3054 continue;
ff3f963a 3055 }
cddc7ef4 3056
ff3f963a 3057 /* Here it looks like a named character */
cddc7ef4 3058
ff3f963a
KW
3059 if (PL_lex_inpat) {
3060
3061 /* XXX This block is temporary code. \N{} implies that the
3062 * pattern is to have Unicode semantics, and therefore
3063 * currently has to be encoded in utf8. By putting it in
3064 * utf8 now, we save a whole pass in the regular expression
3065 * compiler. Once that code is changed so Unicode
3066 * semantics doesn't necessarily have to be in utf8, this
3067 * block should be removed */
3068 if (!has_utf8) {
77a135fe 3069 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3070 SvPOK_on(sv);
e4f3eed8 3071 *d = '\0';
77a135fe 3072 /* See Note on sizing above. */
7bf79863 3073 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3074 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3075 /* 5 = '\N{' + cur char + NUL */
3076 (STRLEN)(send - s) + 5);
f08d6ad9 3077 d = SvPVX(sv) + SvCUR(sv);
89491803 3078 has_utf8 = TRUE;
ff3f963a
KW
3079 }
3080 }
423cee85 3081
ff3f963a
KW
3082 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3083 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3084 | PERL_SCAN_DISALLOW_PREFIX;
3085 STRLEN len;
3086
3087 /* For \N{U+...}, the '...' is a unicode value even on
3088 * EBCDIC machines */
3089 s += 2; /* Skip to next char after the 'U+' */
3090 len = e - s;
3091 uv = grok_hex(s, &len, &flags, NULL);
3092 if (len == 0 || len != (STRLEN)(e - s)) {
3093 yyerror("Invalid hexadecimal number in \\N{U+...}");
3094 s = e + 1;
3095 continue;
3096 }
3097
3098 if (PL_lex_inpat) {
3099
3100 /* Pass through to the regex compiler unchanged. The
3101 * reason we evaluated the number above is to make sure
0a96133f 3102 * there wasn't a syntax error. */
ff3f963a
KW
3103 s -= 5; /* Include the '\N{U+' */
3104 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3105 d += e - s + 1;
3106 }
3107 else { /* Not a pattern: convert the hex to string */
3108
3109 /* If destination is not in utf8, unconditionally
3110 * recode it to be so. This is because \N{} implies
3111 * Unicode semantics, and scalars have to be in utf8
3112 * to guarantee those semantics */
3113 if (! has_utf8) {
3114 SvCUR_set(sv, d - SvPVX_const(sv));
3115 SvPOK_on(sv);
3116 *d = '\0';
3117 /* See Note on sizing above. */
3118 sv_utf8_upgrade_flags_grow(
3119 sv,
3120 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3121 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3122 d = SvPVX(sv) + SvCUR(sv);
3123 has_utf8 = TRUE;
3124 }
3125
3126 /* Add the string to the output */
3127 if (UNI_IS_INVARIANT(uv)) {
3128 *d++ = (char) uv;
3129 }
3130 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3131 }
3132 }
3133 else { /* Here is \N{NAME} but not \N{U+...}. */
3134
3135 SV *res; /* result from charnames */
3136 const char *str; /* the string in 'res' */
3137 STRLEN len; /* its length */
3138
3139 /* Get the value for NAME */
3140 res = newSVpvn(s, e - s);
3141 res = new_constant( NULL, 0, "charnames",
3142 /* includes all of: \N{...} */
3143 res, NULL, s - 3, e - s + 4 );
3144
3145 /* Most likely res will be in utf8 already since the
3146 * standard charnames uses pack U, but a custom translator
3147 * can leave it otherwise, so make sure. XXX This can be
3148 * revisited to not have charnames use utf8 for characters
3149 * that don't need it when regexes don't have to be in utf8
3150 * for Unicode semantics. If doing so, remember EBCDIC */
3151 sv_utf8_upgrade(res);
3152 str = SvPV_const(res, len);
3153
3154 /* Don't accept malformed input */
3155 if (! is_utf8_string((U8 *) str, len)) {
3156 yyerror("Malformed UTF-8 returned by \\N");
3157 }
3158 else if (PL_lex_inpat) {
3159
3160 if (! len) { /* The name resolved to an empty string */
3161 Copy("\\N{}", d, 4, char);
3162 d += 4;
3163 }
3164 else {
3165 /* In order to not lose information for the regex
3166 * compiler, pass the result in the specially made
3167 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3168 * the code points in hex of each character
3169 * returned by charnames */
3170
3171 const char *str_end = str + len;
3172 STRLEN char_length; /* cur char's byte length */
3173 STRLEN output_length; /* and the number of bytes
3174 after this is translated
3175 into hex digits */
3176 const STRLEN off = d - SvPVX_const(sv);
3177
3178 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3179 * max('U+', '.'); and 1 for NUL */
3180 char hex_string[2 * UTF8_MAXBYTES + 5];
3181
3182 /* Get the first character of the result. */
3183 U32 uv = utf8n_to_uvuni((U8 *) str,
3184 len,
3185 &char_length,
3186 UTF8_ALLOW_ANYUV);
3187
3188 /* The call to is_utf8_string() above hopefully
3189 * guarantees that there won't be an error. But
3190 * it's easy here to make sure. The function just
3191 * above warns and returns 0 if invalid utf8, but
3192 * it can also return 0 if the input is validly a
3193 * NUL. Disambiguate */
3194 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3195 uv = UNICODE_REPLACEMENT;
3196 }
3197
3198 /* Convert first code point to hex, including the
3199 * boiler plate before it */
3200 sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
3201 output_length = strlen(hex_string);
3202
3203 /* Make sure there is enough space to hold it */
3204 d = off + SvGROW(sv, off
3205 + output_length
3206 + (STRLEN)(send - e)
3207 + 2); /* '}' + NUL */
3208 /* And output it */
3209 Copy(hex_string, d, output_length, char);
3210 d += output_length;
3211
3212 /* For each subsequent character, append dot and
3213 * its ordinal in hex */
3214 while ((str += char_length) < str_end) {
3215 const STRLEN off = d - SvPVX_const(sv);
3216 U32 uv = utf8n_to_uvuni((U8 *) str,
3217 str_end - str,
3218 &char_length,
3219 UTF8_ALLOW_ANYUV);
3220 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3221 uv = UNICODE_REPLACEMENT;
3222 }
3223
3224 sprintf(hex_string, ".%X", (unsigned int) uv);
3225 output_length = strlen(hex_string);
3226
3227 d = off + SvGROW(sv, off
3228 + output_length
3229 + (STRLEN)(send - e)
3230 + 2); /* '}' + NUL */
3231 Copy(hex_string, d, output_length, char);
3232 d += output_length;
3233 }
3234
3235 *d++ = '}'; /* Done. Add the trailing brace */
3236 }
3237 }
3238 else { /* Here, not in a pattern. Convert the name to a
3239 * string. */
3240
3241 /* If destination is not in utf8, unconditionally
3242 * recode it to be so. This is because \N{} implies
3243 * Unicode semantics, and scalars have to be in utf8
3244 * to guarantee those semantics */
3245 if (! has_utf8) {
3246 SvCUR_set(sv, d - SvPVX_const(sv));
3247 SvPOK_on(sv);
3248 *d = '\0';
3249 /* See Note on sizing above. */
3250 sv_utf8_upgrade_flags_grow(sv,
3251 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3252 len + (STRLEN)(send - s) + 1);
3253 d = SvPVX(sv) + SvCUR(sv);
3254 has_utf8 = TRUE;
3255 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3256
3257 /* See Note on sizing above. (NOTE: SvCUR() is not
3258 * set correctly here). */
3259 const STRLEN off = d - SvPVX_const(sv);
3260 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3261 }
3262 Copy(str, d, len, char);
3263 d += len;
423cee85 3264 }
423cee85 3265 SvREFCNT_dec(res);
cb233ae3
KW
3266
3267 /* Deprecate non-approved name syntax */
3268 if (ckWARN_d(WARN_DEPRECATED)) {
3269 bool problematic = FALSE;
3270 char* i = s;
3271
3272 /* For non-ut8 input, look to see that the first
3273 * character is an alpha, then loop through the rest
3274 * checking that each is a continuation */
3275 if (! this_utf8) {
3276 if (! isALPHAU(*i)) problematic = TRUE;
3277 else for (i = s + 1; i < e; i++) {
3278 if (isCHARNAME_CONT(*i)) continue;
3279 problematic = TRUE;
3280 break;
3281 }
3282 }
3283 else {
3284 /* Similarly for utf8. For invariants can check
3285 * directly. We accept anything above the latin1
3286 * range because it is immaterial to Perl if it is
3287 * correct or not, and is expensive to check. But
3288 * it is fairly easy in the latin1 range to convert
3289 * the variants into a single character and check
3290 * those */
3291 if (UTF8_IS_INVARIANT(*i)) {
3292 if (! isALPHAU(*i)) problematic = TRUE;
3293 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
3294 if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i,
3295 *(i+1)))))
3296 {
3297 problematic = TRUE;
3298 }
3299 }
3300 if (! problematic) for (i = s + UTF8SKIP(s);
3301 i < e;
3302 i+= UTF8SKIP(i))
3303 {
3304 if (UTF8_IS_INVARIANT(*i)) {
3305 if (isCHARNAME_CONT(*i)) continue;
3306 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3307 continue;
3308 } else if (isCHARNAME_CONT(
3309 UNI_TO_NATIVE(
3310 UTF8_ACCUMULATE(*i, *(i+1)))))
3311 {
3312 continue;
3313 }
3314 problematic = TRUE;
3315 break;
3316 }
3317 }
3318 if (problematic) {
6e1bad6c
KW
3319 /* The e-i passed to the final %.*s makes sure that
3320 * should the trailing NUL be missing that this
3321 * print won't run off the end of the string */
cb233ae3 3322 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3323 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3324 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3325 }
3326 }
3327 } /* End \N{NAME} */
ff3f963a
KW
3328#ifdef EBCDIC
3329 if (!dorange)
3330 native_range = FALSE; /* \N{} is defined to be Unicode */
3331#endif
3332 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3333 continue;
3334
02aa26ce 3335 /* \c is a control character */
79072805
LW
3336 case 'c':
3337 s++;
961ce445 3338 if (s < send) {
f9d13529 3339 *d++ = grok_bslash_c(*s++, 1);
ba210ebe 3340 }
961ce445
RGS
3341 else {
3342 yyerror("Missing control char name in \\c");
3343 }
79072805 3344 continue;
02aa26ce
NT
3345
3346 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3347 case 'b':
db42d148 3348 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3349 break;
3350 case 'n':
db42d148 3351 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3352 break;
3353 case 'r':
db42d148 3354 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3355 break;
3356 case 'f':
db42d148 3357 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3358 break;
3359 case 't':
db42d148 3360 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3361 break;
34a3fe2a 3362 case 'e':
db42d148 3363 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3364 break;
3365 case 'a':
db42d148 3366 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3367 break;
02aa26ce
NT
3368 } /* end switch */
3369
79072805
LW
3370 s++;
3371 continue;
02aa26ce 3372 } /* end if (backslash) */
4c3a8340
ST
3373#ifdef EBCDIC
3374 else
3375 literal_endpoint++;
3376#endif
02aa26ce 3377
f9a63242 3378 default_action:
77a135fe
KW
3379 /* If we started with encoded form, or already know we want it,
3380 then encode the next character */
3381 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3382 STRLEN len = 1;
77a135fe
KW
3383
3384
3385 /* One might think that it is wasted effort in the case of the
3386 * source being utf8 (this_utf8 == TRUE) to take the next character
3387 * in the source, convert it to an unsigned value, and then convert
3388 * it back again. But the source has not been validated here. The
3389 * routine that does the conversion checks for errors like
3390 * malformed utf8 */
3391
5f66b61c
AL
3392 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3393 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3394 if (!has_utf8) {
3395 SvCUR_set(sv, d - SvPVX_const(sv));
3396 SvPOK_on(sv);
3397 *d = '\0';
77a135fe 3398 /* See Note on sizing above. */
7bf79863
KW
3399 sv_utf8_upgrade_flags_grow(sv,
3400 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3401 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3402 d = SvPVX(sv) + SvCUR(sv);
3403 has_utf8 = TRUE;
3404 } else if (need > len) {
3405 /* encoded value larger than old, may need extra space (NOTE:
3406 * SvCUR() is not set correctly here). See Note on sizing
3407 * above. */
9d4ba2ae 3408 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3409 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3410 }
77a135fe
KW
3411 s += len;
3412
5f66b61c 3413 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3414#ifdef EBCDIC
3415 if (uv > 255 && !dorange)
3416 native_range = FALSE;
3417#endif
2b9d42f0
NIS
3418 }
3419 else {
3420 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3421 }
02aa26ce
NT
3422 } /* while loop to process each character */
3423
3424 /* terminate the string and set up the sv */
79072805 3425 *d = '\0';
95a20fc0 3426 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3427 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3428 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3429
79072805 3430 SvPOK_on(sv);
9f4817db 3431 if (PL_encoding && !has_utf8) {
d0063567
DK
3432 sv_recode_to_utf8(sv, PL_encoding);
3433 if (SvUTF8(sv))
3434 has_utf8 = TRUE;
9f4817db 3435 }
2b9d42f0 3436 if (has_utf8) {
7e2040f0 3437 SvUTF8_on(sv);
2b9d42f0 3438 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3439 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3440 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3441 }
3442 }
79072805 3443
02aa26ce 3444 /* shrink the sv if we allocated more than we used */
79072805 3445 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3446 SvPV_shrink_to_cur(sv);
79072805 3447 }
02aa26ce 3448
6154021b 3449 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3450 if (s > PL_bufptr) {
eb0d8d16
NC
3451 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3452 const char *const key = PL_lex_inpat ? "qr" : "q";
3453 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3454 const char *type;
3455 STRLEN typelen;
3456
3457 if (PL_lex_inwhat == OP_TRANS) {
3458 type = "tr";
3459 typelen = 2;
3460 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3461 type = "s";
3462 typelen = 1;
3463 } else {
3464 type = "qq";
3465 typelen = 2;
3466 }
3467
3468 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3469 type, typelen);
3470 }
6154021b 3471 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3472 } else
8990e307 3473 SvREFCNT_dec(sv);
79072805
LW
3474 return s;
3475}
3476
ffb4593c
NT
3477/* S_intuit_more
3478 * Returns TRUE if there's more to the expression (e.g., a subscript),
3479 * FALSE otherwise.
ffb4593c
NT
3480 *
3481 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3482 *
3483 * ->[ and ->{ return TRUE
3484 * { and [ outside a pattern are always subscripts, so return TRUE
3485 * if we're outside a pattern and it's not { or [, then return FALSE
3486 * if we're in a pattern and the first char is a {
3487 * {4,5} (any digits around the comma) returns FALSE
3488 * if we're in a pattern and the first char is a [
3489 * [] returns FALSE
3490 * [SOMETHING] has a funky algorithm to decide whether it's a
3491 * character class or not. It has to deal with things like
3492 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3493 * anything else returns TRUE
3494 */
3495
9cbb5ea2
GS
3496/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3497
76e3520e 3498STATIC int
cea2e8a9 3499S_intuit_more(pTHX_ register char *s)
79072805 3500{
97aff369 3501 dVAR;
7918f24d
NC
3502
3503 PERL_ARGS_ASSERT_INTUIT_MORE;
3504
3280af22 3505 if (PL_lex_brackets)
79072805
LW
3506 return TRUE;
3507 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3508 return TRUE;
3509 if (*s != '{' && *s != '[')
3510 return FALSE;
3280af22 3511 if (!PL_lex_inpat)
79072805
LW
3512 return TRUE;
3513
3514 /* In a pattern, so maybe we have {n,m}. */
3515 if (*s == '{') {
b3155d95 3516 if (regcurly(s)) {
79072805 3517 return FALSE;
b3155d95 3518 }
79072805 3519 return TRUE;
79072805
LW
3520 }
3521
3522 /* On the other hand, maybe we have a character class */
3523
3524 s++;
3525 if (*s == ']' || *s == '^')
3526 return FALSE;
3527 else {
ffb4593c 3528 /* this is terrifying, and it works */
79072805
LW
3529 int weight = 2; /* let's weigh the evidence */
3530 char seen[256];
f27ffc4a 3531 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3532 const char * const send = strchr(s,']');
3280af22 3533 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3534
3535 if (!send) /* has to be an expression */
3536 return TRUE;
3537
3538 Zero(seen,256,char);
3539 if (*s == '$')
3540 weight -= 3;
3541 else if (isDIGIT(*s)) {
3542 if (s[1] != ']') {
3543 if (isDIGIT(s[1]) && s[2] == ']')
3544 weight -= 10;
3545 }
3546 else
3547 weight -= 100;
3548 }
3549 for (; s < send; s++) {
3550 last_un_char = un_char;
3551 un_char = (unsigned char)*s;
3552 switch (*s) {
3553 case '@':
3554 case '&':
3555 case '$':
3556 weight -= seen[un_char] * 10;
7e2040f0 3557 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3558 int len;
8903cb82 3559 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3560 len = (int)strlen(tmpbuf);
3561 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3562 weight -= 100;
3563 else
3564 weight -= 10;
3565 }
3566 else if (*s == '$' && s[1] &&
93a17b20
LW
3567 strchr("[#!%*<>()-=",s[1])) {
3568 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3569 weight -= 10;
3570 else
3571 weight -= 1;
3572 }
3573 break;
3574 case '\\':
3575 un_char = 254;
3576 if (s[1]) {
93a17b20 3577 if (strchr("wds]",s[1]))
79072805 3578 weight += 100;
10edeb5d 3579 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3580 weight += 1;
93a17b20 3581 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3582 weight += 40;
3583 else if (isDIGIT(s[1])) {
3584 weight += 40;
3585 while (s[1] && isDIGIT(s[1]))
3586 s++;
3587 }
3588 }
3589 else
3590 weight += 100;
3591 break;
3592 case '-':
3593 if (s[1] == '\\')
3594 weight += 50;
93a17b20 3595 if (strchr("aA01! ",last_un_char))
79072805 3596 weight += 30;
93a17b20 3597 if (strchr("zZ79~",s[1]))
79072805 3598 weight += 30;
f27ffc4a
GS
3599 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3600 weight -= 5; /* cope with negative subscript */
79072805
LW
3601 break;
3602 default:
3792a11b
NC
3603 if (!isALNUM(last_un_char)
3604 && !(last_un_char == '$' || last_un_char == '@'
3605 || last_un_char == '&')
3606 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3607 char *d = tmpbuf;
3608 while (isALPHA(*s))
3609 *d++ = *s++;
3610 *d = '\0';
5458a98a 3611 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3612 weight -= 150;
3613 }
3614 if (un_char == last_un_char + 1)
3615 weight += 5;
3616 weight -= seen[un_char];
3617 break;
3618 }
3619 seen[un_char]++;
3620 }
3621 if (weight >= 0) /* probably a character class */
3622 return FALSE;
3623 }
3624
3625 return TRUE;
3626}
ffed7fef 3627
ffb4593c
NT
3628/*
3629 * S_intuit_method
3630 *
3631 * Does all the checking to disambiguate
3632 * foo bar
3633 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3634 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3635 *
3636 * First argument is the stuff after the first token, e.g. "bar".
3637 *
3638 * Not a method if bar is a filehandle.
3639 * Not a method if foo is a subroutine prototyped to take a filehandle.
3640 * Not a method if it's really "Foo $bar"
3641 * Method if it's "foo $bar"
3642 * Not a method if it's really "print foo $bar"
3643 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3644 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3645 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3646 * =>
3647 */
3648
76e3520e 3649STATIC int
62d55b22 3650S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3651{
97aff369 3652 dVAR;
a0d0e21e 3653 char *s = start + (*start == '$');
3280af22 3654 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3655 STRLEN len;
3656 GV* indirgv;
5db06880
NC
3657#ifdef PERL_MAD
3658 int soff;
3659#endif
a0d0e21e 3660
7918f24d
NC
3661 PERL_ARGS_ASSERT_INTUIT_METHOD;
3662
a0d0e21e 3663 if (gv) {
62d55b22 3664 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3665 return 0;
62d55b22
NC
3666 if (cv) {
3667 if (SvPOK(cv)) {
3668 const char *proto = SvPVX_const(cv);
3669 if (proto) {
3670 if (*proto == ';')
3671 proto++;
3672 if (*proto == '*')
3673 return 0;
3674 }
b6c543e3
IZ
3675 }
3676 } else
c35e046a 3677 gv = NULL;
a0d0e21e 3678 }
8903cb82 3679 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3680 /* start is the beginning of the possible filehandle/object,
3681 * and s is the end of it
3682 * tmpbuf is a copy of it
3683 */
3684
a0d0e21e 3685 if (*start == '$') {
3ef1310e
RGS
3686 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3687 isUPPER(*PL_tokenbuf))
a0d0e21e 3688 return 0;
5db06880
NC
3689#ifdef PERL_MAD
3690 len = start - SvPVX(PL_linestr);
3691#endif
29595ff2 3692 s = PEEKSPACE(s);
f0092767 3693#ifdef PERL_MAD
5db06880
NC
3694 start = SvPVX(PL_linestr) + len;
3695#endif
3280af22
NIS
3696 PL_bufptr = start;
3697 PL_expect = XREF;
a0d0e21e
LW
3698 return *s == '(' ? FUNCMETH : METHOD;
3699 }
5458a98a 3700 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3701 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3702 len -= 2;
3703 tmpbuf[len] = '\0';
5db06880
NC
3704#ifdef PERL_MAD
3705 soff = s - SvPVX(PL_linestr);
3706#endif
c3e0f903
GS
3707 goto bare_package;
3708 }
90e5519e 3709 indirgv = gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PVCV);
8ebc5c01 3710 if (indirgv && GvCVu(indirgv))
a0d0e21e
LW
3711 return 0;
3712 /* filehandle or package name makes it a method */
da51bb9b 3713 if (!gv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, 0)) {
5db06880
NC
3714#ifdef PERL_MAD
3715 soff = s - SvPVX(PL_linestr);
3716#endif
29595ff2 3717 s = PEEKSPACE(s);
3280af22 3718 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
55497cff 3719 return 0; /* no assumptions -- "=>" quotes bearword */
c3e0f903 3720 bare_package:
cd81e915 3721 start_force(PL_curforce);
9ded7720 3722 NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
64142370 3723 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
9ded7720 3724 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
5db06880
NC
3725 if (PL_madskills)
3726 curmad('X', newSVpvn(start,SvPVX(PL_linestr) + soff - start));
3280af22 3727 PL_expect = XTERM;
a0d0e21e 3728 force_next(WORD);
3280af22 3729 PL_bufptr = s;
5db06880
NC
3730#ifdef PERL_MAD
3731 PL_bufptr = SvPVX(PL_linestr) + soff; /* restart before space */
3732#endif
a0d0e21e
LW
3733 return *s == '(' ? FUNCMETH : METHOD;
3734 }
3735 }
3736 return 0;
3737}
3738
16d20bd9 3739/* Encoded script support. filter_add() effectively inserts a
4e553d73 3740 * 'pre-processing' function into the current source input stream.
16d20bd9
AD
3741 * Note that the filter function only applies to the current source file
3742 * (e.g., it will not affect files 'require'd or 'use'd by this one).
3743 *
3744 * The datasv parameter (which may be NULL) can be used to pass
3745 * private data to this instance of the filter. The filter function
3746 * can recover the SV using the FILTER_DATA macro and use it to
3747 * store private buffers and state information.
3748 *
3749 * The supplied datasv parameter is upgraded to a PVIO type
4755096e 3750 * and the IoDIRP/IoANY field is used to store the function pointer,
e0c19803 3751 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
16d20bd9
AD
3752 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
3753 * private use must be set using malloc'd pointers.
3754 */
16d20bd9
AD
3755
3756SV *
864dbfa3 3757Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
16d20bd9 3758{
97aff369 3759 dVAR;
f4c556ac 3760 if (!funcp)
a0714e2c 3761 return NULL;
f4c556ac 3762
5486870f
DM
3763 if (!PL_parser)
3764 return NULL;
3765
3280af22
NIS
3766 if (!PL_rsfp_filters)
3767 PL_rsfp_filters = newAV();
16d20bd9 3768 if (!datasv)
561b68a9 3769 datasv = newSV(0);
862a34c6 3770 SvUPGRADE(datasv, SVt_PVIO);
8141890a 3771 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
e0c19803 3772 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
f4c556ac 3773 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
55662e27
JH
3774 FPTR2DPTR(void *, IoANY(datasv)),
3775 SvPV_nolen(datasv)));