This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
rv2cv hooks should not create 2nd-class subs
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805 150 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
151 *
152 * These values refer to the various states within a sublex parse,
153 * i.e. within a double quotish string
79072805
LW
154 */
155
fb73857a
PP
156/* #define LEX_NOTPARSING 11 is done in perl.h. */
157
b6007c36
DM
158#define LEX_NORMAL 10 /* normal code (ie not within "...") */
159#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
160#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
161#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
162#define LEX_INTERPSTART 6 /* expecting the start of a $var */
163
164 /* at end of code, eg "$x" followed by: */
165#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
166#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
167
168#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
169 string or after \E, $foo, etc */
170#define LEX_INTERPCONST 2 /* NOT USED */
171#define LEX_FORMLINE 1 /* expecting a format line */
172#define LEX_KNOWNEXT 0 /* next token known; just return it */
173
79072805 174
bbf60fe6 175#ifdef DEBUGGING
27da23d5 176static const char* const lex_state_names[] = {
bbf60fe6
DM
177 "KNOWNEXT",
178 "FORMLINE",
179 "INTERPCONST",
180 "INTERPCONCAT",
181 "INTERPENDMAYBE",
182 "INTERPEND",
183 "INTERPSTART",
184 "INTERPPUSH",
185 "INTERPCASEMOD",
186 "INTERPNORMAL",
187 "NORMAL"
188};
189#endif
190
79072805
LW
191#ifdef ff_next
192#undef ff_next
d48672a2
LW
193#endif
194
79072805 195#include "keywords.h"
fe14fcc3 196
ffb4593c
NT
197/* CLINE is a macro that ensures PL_copline has a sane value */
198
ae986130
LW
199#ifdef CLINE
200#undef CLINE
201#endif
57843af0 202#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 203
5db06880 204#ifdef PERL_MAD
29595ff2
NC
205# define SKIPSPACE0(s) skipspace0(s)
206# define SKIPSPACE1(s) skipspace1(s)
207# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
208# define PEEKSPACE(s) skipspace2(s,0)
209#else
210# define SKIPSPACE0(s) skipspace(s)
211# define SKIPSPACE1(s) skipspace(s)
212# define SKIPSPACE2(s,tsv) skipspace(s)
213# define PEEKSPACE(s) skipspace(s)
214#endif
215
ffb4593c
NT
216/*
217 * Convenience functions to return different tokens and prime the
9cbb5ea2 218 * lexer for the next token. They all take an argument.
ffb4593c
NT
219 *
220 * TOKEN : generic token (used for '(', DOLSHARP, etc)
221 * OPERATOR : generic operator
222 * AOPERATOR : assignment operator
223 * PREBLOCK : beginning the block after an if, while, foreach, ...
224 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
225 * PREREF : *EXPR where EXPR is not a simple identifier
226 * TERM : expression term
227 * LOOPX : loop exiting command (goto, last, dump, etc)
228 * FTST : file test operator
229 * FUN0 : zero-argument function
7eb971ee 230 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 231 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
232 * BOop : bitwise or or xor
233 * BAop : bitwise and
234 * SHop : shift operator
235 * PWop : power operator
9cbb5ea2 236 * PMop : pattern-matching operator
ffb4593c
NT
237 * Aop : addition-level operator
238 * Mop : multiplication-level operator
239 * Eop : equality-testing operator
e5edeb50 240 * Rop : relational operator <= != gt
ffb4593c
NT
241 *
242 * Also see LOP and lop() below.
243 */
244
998054bd 245#ifdef DEBUGGING /* Serve -DT. */
704d4215 246# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 247#else
bbf60fe6 248# define REPORT(retval) (retval)
998054bd
SC
249#endif
250
bbf60fe6
DM
251#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
252#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
253#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
254#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
255#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
256#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
257#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
258#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
259#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
260#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 261#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
262#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
263#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
264#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
265#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
266#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
267#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
268#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
269#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
270#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
271#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 272
a687059c
LW
273/* This bit of chicanery makes a unary function followed by
274 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
275 * The UNIDOR macro is for unary functions that can be followed by the //
276 * operator (such as C<shift // 0>).
a687059c 277 */
d68ce4ac 278#define UNI3(f,x,have_x) { \
6154021b 279 pl_yylval.ival = f; \
d68ce4ac 280 if (have_x) PL_expect = x; \
376fcdbf
AL
281 PL_bufptr = s; \
282 PL_last_uni = PL_oldbufptr; \
283 PL_last_lop_op = f; \
284 if (*s == '(') \
285 return REPORT( (int)FUNC1 ); \
29595ff2 286 s = PEEKSPACE(s); \
376fcdbf
AL
287 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
288 }
d68ce4ac
FC
289#define UNI(f) UNI3(f,XTERM,1)
290#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
291#define UNIPROTO(f,optional) { \
292 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
293 OPERATOR(f); \
294 }
a687059c 295
d68ce4ac 296#define UNIBRACK(f) UNI3(f,0,0)
79072805 297
9f68db38 298/* grandfather return to old style */
78cdf107
Z
299#define OLDLOP(f) \
300 do { \
301 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
302 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
303 pl_yylval.ival = (f); \
304 PL_expect = XTERM; \
305 PL_bufptr = s; \
306 return (int)LSTOP; \
307 } while(0)
79072805 308
8fa7f367
JH
309#ifdef DEBUGGING
310
6154021b 311/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
312enum token_type {
313 TOKENTYPE_NONE,
314 TOKENTYPE_IVAL,
6154021b 315 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
316 TOKENTYPE_PVAL,
317 TOKENTYPE_OPVAL,
318 TOKENTYPE_GVVAL
319};
320
6d4a66ac
NC
321static struct debug_tokens {
322 const int token;
323 enum token_type type;
324 const char *name;
325} const debug_tokens[] =
9041c2e3 326{
bbf60fe6
DM
327 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
328 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
329 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
330 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
331 { ARROW, TOKENTYPE_NONE, "ARROW" },
332 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
333 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
334 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
335 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
336 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 337 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
338 { DO, TOKENTYPE_NONE, "DO" },
339 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
340 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
341 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
342 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
343 { ELSE, TOKENTYPE_NONE, "ELSE" },
344 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
345 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
346 { FOR, TOKENTYPE_IVAL, "FOR" },
347 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
348 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
349 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 350 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
351 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
352 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
353 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 354 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
355 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
356 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 357 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
358 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
359 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
360 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
361 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
362 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
363 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
364 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
365 { MY, TOKENTYPE_IVAL, "MY" },
366 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
367 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
368 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
369 { OROP, TOKENTYPE_IVAL, "OROP" },
370 { OROR, TOKENTYPE_NONE, "OROR" },
371 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
372 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
373 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
374 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
375 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
376 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
377 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
378 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
379 { PREINC, TOKENTYPE_NONE, "PREINC" },
380 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
381 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
382 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
383 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
384 { SUB, TOKENTYPE_NONE, "SUB" },
385 { THING, TOKENTYPE_OPVAL, "THING" },
386 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
387 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
388 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
389 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
390 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
391 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 392 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
393 { WHILE, TOKENTYPE_IVAL, "WHILE" },
394 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 395 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 396 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
397};
398
6154021b 399/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 400
bbf60fe6 401STATIC int
704d4215 402S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 403{
97aff369 404 dVAR;
7918f24d
NC
405
406 PERL_ARGS_ASSERT_TOKEREPORT;
407
bbf60fe6 408 if (DEBUG_T_TEST) {
bd61b366 409 const char *name = NULL;
bbf60fe6 410 enum token_type type = TOKENTYPE_NONE;
f54cb97a 411 const struct debug_tokens *p;
396482e1 412 SV* const report = newSVpvs("<== ");
bbf60fe6 413
f54cb97a 414 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
415 if (p->token == (int)rv) {
416 name = p->name;
417 type = p->type;
418 break;
419 }
420 }
421 if (name)
54667de8 422 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
423 else if ((char)rv > ' ' && (char)rv < '~')
424 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
425 else if (!rv)
396482e1 426 sv_catpvs(report, "EOF");
bbf60fe6
DM
427 else
428 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
429 switch (type) {
430 case TOKENTYPE_NONE:
431 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
432 break;
433 case TOKENTYPE_IVAL:
704d4215 434 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
435 break;
436 case TOKENTYPE_OPNUM:
437 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 438 PL_op_name[lvalp->ival]);
bbf60fe6
DM
439 break;
440 case TOKENTYPE_PVAL:
704d4215 441 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
442 break;
443 case TOKENTYPE_OPVAL:
704d4215 444 if (lvalp->opval) {
401441c0 445 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
446 PL_op_name[lvalp->opval->op_type]);
447 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 448 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 449 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
450 }
451
452 }
401441c0 453 else
396482e1 454 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
455 break;
456 }
b6007c36 457 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
458 };
459 return (int)rv;
998054bd
SC
460}
461
b6007c36
DM
462
463/* print the buffer with suitable escapes */
464
465STATIC void
15f169a1 466S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 467{
396482e1 468 SV* const tmp = newSVpvs("");
7918f24d
NC
469
470 PERL_ARGS_ASSERT_PRINTBUF;
471
b6007c36
DM
472 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
473 SvREFCNT_dec(tmp);
474}
475
8fa7f367
JH
476#endif
477
8290c323
NC
478static int
479S_deprecate_commaless_var_list(pTHX) {
480 PL_expect = XTERM;
481 deprecate("comma-less variable list");
482 return REPORT(','); /* grandfather non-comma-format format */
483}
484
ffb4593c
NT
485/*
486 * S_ao
487 *
c963b151
BD
488 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
489 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
490 */
491
76e3520e 492STATIC int
cea2e8a9 493S_ao(pTHX_ int toketype)
a0d0e21e 494{
97aff369 495 dVAR;
3280af22
NIS
496 if (*PL_bufptr == '=') {
497 PL_bufptr++;
a0d0e21e 498 if (toketype == ANDAND)
6154021b 499 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 500 else if (toketype == OROR)
6154021b 501 pl_yylval.ival = OP_ORASSIGN;
c963b151 502 else if (toketype == DORDOR)
6154021b 503 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
504 toketype = ASSIGNOP;
505 }
506 return toketype;
507}
508
ffb4593c
NT
509/*
510 * S_no_op
511 * When Perl expects an operator and finds something else, no_op
512 * prints the warning. It always prints "<something> found where
513 * operator expected. It prints "Missing semicolon on previous line?"
514 * if the surprise occurs at the start of the line. "do you need to
515 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
516 * where the compiler doesn't know if foo is a method call or a function.
517 * It prints "Missing operator before end of line" if there's nothing
518 * after the missing operator, or "... before <...>" if there is something
519 * after the missing operator.
520 */
521
76e3520e 522STATIC void
15f169a1 523S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 524{
97aff369 525 dVAR;
9d4ba2ae
AL
526 char * const oldbp = PL_bufptr;
527 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 528
7918f24d
NC
529 PERL_ARGS_ASSERT_NO_OP;
530
1189a94a
GS
531 if (!s)
532 s = oldbp;
07c798fb 533 else
1189a94a 534 PL_bufptr = s;
734ab321 535 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
536 if (ckWARN_d(WARN_SYNTAX)) {
537 if (is_first)
538 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
539 "\t(Missing semicolon on previous line?)\n");
540 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 541 const char *t;
734ab321
BF
542 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
543 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 544 NOOP;
56da5a46
RGS
545 if (t < PL_bufptr && isSPACE(*t))
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
547 "\t(Do you need to predeclare %"SVf"?)\n",
548 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
549 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
550 }
551 else {
552 assert(s >= oldbp);
553 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
554 "\t(Missing operator before %"SVf"?)\n",
555 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
556 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 557 }
07c798fb 558 }
3280af22 559 PL_bufptr = oldbp;
8990e307
LW
560}
561
ffb4593c
NT
562/*
563 * S_missingterm
564 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 565 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
566 * If we're in a delimited string and the delimiter is a control
567 * character, it's reformatted into a two-char sequence like ^C.
568 * This is fatal.
569 */
570
76e3520e 571STATIC void
cea2e8a9 572S_missingterm(pTHX_ char *s)
8990e307 573{
97aff369 574 dVAR;
8990e307
LW
575 char tmpbuf[3];
576 char q;
577 if (s) {
9d4ba2ae 578 char * const nl = strrchr(s,'\n');
d2719217 579 if (nl)
8990e307
LW
580 *nl = '\0';
581 }
463559e7 582 else if (isCNTRL(PL_multi_close)) {
8990e307 583 *tmpbuf = '^';
585ec06d 584 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
585 tmpbuf[2] = '\0';
586 s = tmpbuf;
587 }
588 else {
eb160463 589 *tmpbuf = (char)PL_multi_close;
8990e307
LW
590 tmpbuf[1] = '\0';
591 s = tmpbuf;
592 }
593 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 594 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 595}
79072805 596
dd0ac2b9
FC
597#include "feature.h"
598
0d863452 599/*
0d863452
RH
600 * Check whether the named feature is enabled.
601 */
26ea9e12 602bool
3fff3427 603Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 604{
97aff369 605 dVAR;
4a731d7b 606 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
607
608 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
609
610 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 611
26ea9e12
NC
612 if (namelen > MAX_FEATURE_LEN)
613 return FALSE;
3fff3427 614 memcpy(&he_name[8], name, namelen);
7d69d4a6 615
c8ca97b0
NC
616 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
617 REFCOUNTED_HE_EXISTS));
0d863452
RH
618}
619
ffb4593c 620/*
9cbb5ea2
GS
621 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
622 * utf16-to-utf8-reversed.
ffb4593c
NT
623 */
624
c39cd008
GS
625#ifdef PERL_CR_FILTER
626static void
627strip_return(SV *sv)
628{
95a20fc0 629 register const char *s = SvPVX_const(sv);
9d4ba2ae 630 register const char * const e = s + SvCUR(sv);
7918f24d
NC
631
632 PERL_ARGS_ASSERT_STRIP_RETURN;
633
c39cd008
GS
634 /* outer loop optimized to do nothing if there are no CR-LFs */
635 while (s < e) {
636 if (*s++ == '\r' && *s == '\n') {
637 /* hit a CR-LF, need to copy the rest */
638 register char *d = s - 1;
639 *d++ = *s++;
640 while (s < e) {
641 if (*s == '\r' && s[1] == '\n')
642 s++;
643 *d++ = *s++;
644 }
645 SvCUR(sv) -= s - d;
646 return;
647 }
648 }
649}
a868473f 650
76e3520e 651STATIC I32
c39cd008 652S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 653{
f54cb97a 654 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
655 if (count > 0 && !maxlen)
656 strip_return(sv);
657 return count;
a868473f
NIS
658}
659#endif
660
ffb4593c 661/*
8eaa0acf
Z
662=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
663
664Creates and initialises a new lexer/parser state object, supplying
665a context in which to lex and parse from a new source of Perl code.
666A pointer to the new state object is placed in L</PL_parser>. An entry
667is made on the save stack so that upon unwinding the new state object
668will be destroyed and the former value of L</PL_parser> will be restored.
669Nothing else need be done to clean up the parsing context.
670
671The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
672non-null, provides a string (in SV form) containing code to be parsed.
673A copy of the string is made, so subsequent modification of I<line>
674does not affect parsing. I<rsfp>, if non-null, provides an input stream
675from which code will be read to be parsed. If both are non-null, the
676code in I<line> comes first and must consist of complete lines of input,
677and I<rsfp> supplies the remainder of the source.
678
e368b3bd
FC
679The I<flags> parameter is reserved for future use. Currently it is only
680used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
681
682=cut
683*/
ffb4593c 684
27fcb6ee 685/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
686 can share filters with the current parser.
687 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
688 caller, hence isn't owned by the parser, so shouldn't be closed on parser
689 destruction. This is used to handle the case of defaulting to reading the
690 script from the standard input because no filename was given on the command
691 line (without getting confused by situation where STDIN has been closed, so
692 the script handle is opened on fd 0) */
27fcb6ee 693
a0d0e21e 694void
8eaa0acf 695Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 696{
97aff369 697 dVAR;
6ef55633 698 const char *s = NULL;
5486870f 699 yy_parser *parser, *oparser;
60d63348 700 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 701 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
702
703 /* create and initialise a parser */
704
199e78b7 705 Newxz(parser, 1, yy_parser);
5486870f 706 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
707 PL_parser = parser;
708
28ac2b49
Z
709 parser->stack = NULL;
710 parser->ps = NULL;
711 parser->stack_size = 0;
acdf0a21 712
e3abe207
DM
713 /* on scope exit, free this parser and restore any outer one */
714 SAVEPARSER(parser);
7c4baf47 715 parser->saved_curcop = PL_curcop;
e3abe207 716
acdf0a21 717 /* initialise lexer state */
8990e307 718
fb205e7a
DM
719#ifdef PERL_MAD
720 parser->curforce = -1;
721#else
722 parser->nexttoke = 0;
723#endif
ca4cfd28 724 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 725 parser->copline = NOLINE;
5afb0a62 726 parser->lex_state = LEX_NORMAL;
c2598295 727 parser->expect = XSTATE;
2f9285f8 728 parser->rsfp = rsfp;
27fcb6ee
FC
729 parser->rsfp_filters =
730 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
731 ? NULL
732 : MUTABLE_AV(SvREFCNT_inc(
733 oparser->rsfp_filters
734 ? oparser->rsfp_filters
735 : (oparser->rsfp_filters = newAV())
736 ));
2f9285f8 737
199e78b7
DM
738 Newx(parser->lex_brackstack, 120, char);
739 Newx(parser->lex_casestack, 12, char);
740 *parser->lex_casestack = '\0';
02b34bbe 741
10efb74f 742 if (line) {
0528fd32 743 STRLEN len;
10efb74f 744 s = SvPV_const(line, len);
0abcdfa4
FC
745 parser->linestr = flags & LEX_START_COPIED
746 ? SvREFCNT_inc_simple_NN(line)
747 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 748 if (!len || s[len-1] != ';')
bdc0bf6f 749 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
750 } else {
751 parser->linestr = newSVpvs("\n;");
8990e307 752 }
f06b5848
DM
753 parser->oldoldbufptr =
754 parser->oldbufptr =
755 parser->bufptr =
756 parser->linestart = SvPVX(parser->linestr);
757 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
758 parser->last_lop = parser->last_uni = NULL;
87606032
NC
759 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
760 |LEX_DONT_CLOSE_RSFP);
737c24fc 761
60d63348 762 parser->in_pod = parser->filtered = 0;
79072805 763}
a687059c 764
e3abe207
DM
765
766/* delete a parser object */
767
768void
769Perl_parser_free(pTHX_ const yy_parser *parser)
770{
7918f24d
NC
771 PERL_ARGS_ASSERT_PARSER_FREE;
772
7c4baf47 773 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
774 SvREFCNT_dec(parser->linestr);
775
87606032 776 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 777 PerlIO_clearerr(parser->rsfp);
799361c3
SH
778 else if (parser->rsfp && (!parser->old_parser ||
779 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 780 PerlIO_close(parser->rsfp);
5486870f 781 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 782
e3abe207
DM
783 Safefree(parser->lex_brackstack);
784 Safefree(parser->lex_casestack);
785 PL_parser = parser->old_parser;
786 Safefree(parser);
787}
788
789
ffb4593c 790/*
f0e67a1d
Z
791=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
792
793Buffer scalar containing the chunk currently under consideration of the
794text currently being lexed. This is always a plain string scalar (for
795which C<SvPOK> is true). It is not intended to be used as a scalar by
796normal scalar means; instead refer to the buffer directly by the pointer
797variables described below.
798
799The lexer maintains various C<char*> pointers to things in the
800C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
801reallocated, all of these pointers must be updated. Don't attempt to
802do this manually, but rather use L</lex_grow_linestr> if you need to
803reallocate the buffer.
804
805The content of the text chunk in the buffer is commonly exactly one
806complete line of input, up to and including a newline terminator,
807but there are situations where it is otherwise. The octets of the
808buffer may be intended to be interpreted as either UTF-8 or Latin-1.
809The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
810flag on this scalar, which may disagree with it.
811
812For direct examination of the buffer, the variable
813L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
814lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
815of these pointers is usually preferable to examination of the scalar
816through normal scalar means.
817
818=for apidoc AmxU|char *|PL_parser-E<gt>bufend
819
820Direct pointer to the end of the chunk of text currently being lexed, the
821end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
822+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
823always located at the end of the buffer, and does not count as part of
824the buffer's contents.
825
826=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
827
828Points to the current position of lexing inside the lexer buffer.
829Characters around this point may be freely examined, within
830the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
831L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
832interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
833
834Lexing code (whether in the Perl core or not) moves this pointer past
835the characters that it consumes. It is also expected to perform some
836bookkeeping whenever a newline character is consumed. This movement
837can be more conveniently performed by the function L</lex_read_to>,
838which handles newlines appropriately.
839
840Interpretation of the buffer's octets can be abstracted out by
841using the slightly higher-level functions L</lex_peek_unichar> and
842L</lex_read_unichar>.
843
844=for apidoc AmxU|char *|PL_parser-E<gt>linestart
845
846Points to the start of the current line inside the lexer buffer.
847This is useful for indicating at which column an error occurred, and
848not much else. This must be updated by any lexing code that consumes
849a newline; the function L</lex_read_to> handles this detail.
850
851=cut
852*/
853
854/*
855=for apidoc Amx|bool|lex_bufutf8
856
857Indicates whether the octets in the lexer buffer
858(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
859of Unicode characters. If not, they should be interpreted as Latin-1
860characters. This is analogous to the C<SvUTF8> flag for scalars.
861
862In UTF-8 mode, it is not guaranteed that the lexer buffer actually
863contains valid UTF-8. Lexing code must be robust in the face of invalid
864encoding.
865
866The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
867is significant, but not the whole story regarding the input character
868encoding. Normally, when a file is being read, the scalar contains octets
869and its C<SvUTF8> flag is off, but the octets should be interpreted as
870UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
871however, the scalar may have the C<SvUTF8> flag on, and in this case its
872octets should be interpreted as UTF-8 unless the C<use bytes> pragma
873is in effect. This logic may change in the future; use this function
874instead of implementing the logic yourself.
875
876=cut
877*/
878
879bool
880Perl_lex_bufutf8(pTHX)
881{
882 return UTF;
883}
884
885/*
886=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
887
888Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
889at least I<len> octets (including terminating NUL). Returns a
890pointer to the reallocated buffer. This is necessary before making
891any direct modification of the buffer that would increase its length.
892L</lex_stuff_pvn> provides a more convenient way to insert text into
893the buffer.
894
895Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
896this function updates all of the lexer's variables that point directly
897into the buffer.
898
899=cut
900*/
901
902char *
903Perl_lex_grow_linestr(pTHX_ STRLEN len)
904{
905 SV *linestr;
906 char *buf;
907 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
908 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
909 linestr = PL_parser->linestr;
910 buf = SvPVX(linestr);
911 if (len <= SvLEN(linestr))
912 return buf;
913 bufend_pos = PL_parser->bufend - buf;
914 bufptr_pos = PL_parser->bufptr - buf;
915 oldbufptr_pos = PL_parser->oldbufptr - buf;
916 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
917 linestart_pos = PL_parser->linestart - buf;
918 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
919 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
920 buf = sv_grow(linestr, len);
921 PL_parser->bufend = buf + bufend_pos;
922 PL_parser->bufptr = buf + bufptr_pos;
923 PL_parser->oldbufptr = buf + oldbufptr_pos;
924 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
925 PL_parser->linestart = buf + linestart_pos;
926 if (PL_parser->last_uni)
927 PL_parser->last_uni = buf + last_uni_pos;
928 if (PL_parser->last_lop)
929 PL_parser->last_lop = buf + last_lop_pos;
930 return buf;
931}
932
933/*
83aa740e 934=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
935
936Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
937immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
938reallocating the buffer if necessary. This means that lexing code that
939runs later will see the characters as if they had appeared in the input.
940It is not recommended to do this as part of normal parsing, and most
941uses of this facility run the risk of the inserted characters being
942interpreted in an unintended manner.
943
944The string to be inserted is represented by I<len> octets starting
945at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
946according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
947The characters are recoded for the lexer buffer, according to how the
948buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 949to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
950function is more convenient.
951
952=cut
953*/
954
955void
83aa740e 956Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 957{
749123ff 958 dVAR;
f0e67a1d
Z
959 char *bufptr;
960 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
961 if (flags & ~(LEX_STUFF_UTF8))
962 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
963 if (UTF) {
964 if (flags & LEX_STUFF_UTF8) {
965 goto plain_copy;
966 } else {
967 STRLEN highhalf = 0;
83aa740e 968 const char *p, *e = pv+len;
f0e67a1d
Z
969 for (p = pv; p != e; p++)
970 highhalf += !!(((U8)*p) & 0x80);
971 if (!highhalf)
972 goto plain_copy;
973 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
974 bufptr = PL_parser->bufptr;
975 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
976 SvCUR_set(PL_parser->linestr,
977 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
978 PL_parser->bufend += len+highhalf;
979 for (p = pv; p != e; p++) {
980 U8 c = (U8)*p;
981 if (c & 0x80) {
982 *bufptr++ = (char)(0xc0 | (c >> 6));
983 *bufptr++ = (char)(0x80 | (c & 0x3f));
984 } else {
985 *bufptr++ = (char)c;
986 }
987 }
988 }
989 } else {
990 if (flags & LEX_STUFF_UTF8) {
991 STRLEN highhalf = 0;
83aa740e 992 const char *p, *e = pv+len;
f0e67a1d
Z
993 for (p = pv; p != e; p++) {
994 U8 c = (U8)*p;
995 if (c >= 0xc4) {
996 Perl_croak(aTHX_ "Lexing code attempted to stuff "
997 "non-Latin-1 character into Latin-1 input");
998 } else if (c >= 0xc2 && p+1 != e &&
999 (((U8)p[1]) & 0xc0) == 0x80) {
1000 p++;
1001 highhalf++;
1002 } else if (c >= 0x80) {
1003 /* malformed UTF-8 */
1004 ENTER;
1005 SAVESPTR(PL_warnhook);
1006 PL_warnhook = PERL_WARNHOOK_FATAL;
1007 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1008 LEAVE;
1009 }
1010 }
1011 if (!highhalf)
1012 goto plain_copy;
1013 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1014 bufptr = PL_parser->bufptr;
1015 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1016 SvCUR_set(PL_parser->linestr,
1017 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1018 PL_parser->bufend += len-highhalf;
1019 for (p = pv; p != e; p++) {
1020 U8 c = (U8)*p;
1021 if (c & 0x80) {
1022 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1023 p++;
1024 } else {
1025 *bufptr++ = (char)c;
1026 }
1027 }
1028 } else {
1029 plain_copy:
1030 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1031 bufptr = PL_parser->bufptr;
1032 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1033 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1034 PL_parser->bufend += len;
1035 Copy(pv, bufptr, len, char);
1036 }
1037 }
1038}
1039
1040/*
9dcc53ea
Z
1041=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1042
1043Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1044immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1045reallocating the buffer if necessary. This means that lexing code that
1046runs later will see the characters as if they had appeared in the input.
1047It is not recommended to do this as part of normal parsing, and most
1048uses of this facility run the risk of the inserted characters being
1049interpreted in an unintended manner.
1050
1051The string to be inserted is represented by octets starting at I<pv>
1052and continuing to the first nul. These octets are interpreted as either
1053UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1054in I<flags>. The characters are recoded for the lexer buffer, according
1055to how the buffer is currently being interpreted (L</lex_bufutf8>).
1056If it is not convenient to nul-terminate a string to be inserted, the
1057L</lex_stuff_pvn> function is more appropriate.
1058
1059=cut
1060*/
1061
1062void
1063Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1064{
1065 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1066 lex_stuff_pvn(pv, strlen(pv), flags);
1067}
1068
1069/*
f0e67a1d
Z
1070=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1071
1072Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1073immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1074reallocating the buffer if necessary. This means that lexing code that
1075runs later will see the characters as if they had appeared in the input.
1076It is not recommended to do this as part of normal parsing, and most
1077uses of this facility run the risk of the inserted characters being
1078interpreted in an unintended manner.
1079
1080The string to be inserted is the string value of I<sv>. The characters
1081are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1082being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1083not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1084need to construct a scalar.
1085
1086=cut
1087*/
1088
1089void
1090Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1091{
1092 char *pv;
1093 STRLEN len;
1094 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1095 if (flags)
1096 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1097 pv = SvPV(sv, len);
1098 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1099}
1100
1101/*
1102=for apidoc Amx|void|lex_unstuff|char *ptr
1103
1104Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1105I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1106This hides the discarded text from any lexing code that runs later,
1107as if the text had never appeared.
1108
1109This is not the normal way to consume lexed text. For that, use
1110L</lex_read_to>.
1111
1112=cut
1113*/
1114
1115void
1116Perl_lex_unstuff(pTHX_ char *ptr)
1117{
1118 char *buf, *bufend;
1119 STRLEN unstuff_len;
1120 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1121 buf = PL_parser->bufptr;
1122 if (ptr < buf)
1123 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1124 if (ptr == buf)
1125 return;
1126 bufend = PL_parser->bufend;
1127 if (ptr > bufend)
1128 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1129 unstuff_len = ptr - buf;
1130 Move(ptr, buf, bufend+1-ptr, char);
1131 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1132 PL_parser->bufend = bufend - unstuff_len;
1133}
1134
1135/*
1136=for apidoc Amx|void|lex_read_to|char *ptr
1137
1138Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1139to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1140performing the correct bookkeeping whenever a newline character is passed.
1141This is the normal way to consume lexed text.
1142
1143Interpretation of the buffer's octets can be abstracted out by
1144using the slightly higher-level functions L</lex_peek_unichar> and
1145L</lex_read_unichar>.
1146
1147=cut
1148*/
1149
1150void
1151Perl_lex_read_to(pTHX_ char *ptr)
1152{
1153 char *s;
1154 PERL_ARGS_ASSERT_LEX_READ_TO;
1155 s = PL_parser->bufptr;
1156 if (ptr < s || ptr > PL_parser->bufend)
1157 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1158 for (; s != ptr; s++)
1159 if (*s == '\n') {
1160 CopLINE_inc(PL_curcop);
1161 PL_parser->linestart = s+1;
1162 }
1163 PL_parser->bufptr = ptr;
1164}
1165
1166/*
1167=for apidoc Amx|void|lex_discard_to|char *ptr
1168
1169Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1170up to I<ptr>. The remaining content of the buffer will be moved, and
1171all pointers into the buffer updated appropriately. I<ptr> must not
1172be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1173it is not permitted to discard text that has yet to be lexed.
1174
1175Normally it is not necessarily to do this directly, because it suffices to
1176use the implicit discarding behaviour of L</lex_next_chunk> and things
1177based on it. However, if a token stretches across multiple lines,
1f317c95 1178and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1179that purpose, then after completion of the token it would be wise to
1180explicitly discard the now-unneeded earlier lines, to avoid future
1181multi-line tokens growing the buffer without bound.
1182
1183=cut
1184*/
1185
1186void
1187Perl_lex_discard_to(pTHX_ char *ptr)
1188{
1189 char *buf;
1190 STRLEN discard_len;
1191 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1192 buf = SvPVX(PL_parser->linestr);
1193 if (ptr < buf)
1194 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1195 if (ptr == buf)
1196 return;
1197 if (ptr > PL_parser->bufptr)
1198 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1199 discard_len = ptr - buf;
1200 if (PL_parser->oldbufptr < ptr)
1201 PL_parser->oldbufptr = ptr;
1202 if (PL_parser->oldoldbufptr < ptr)
1203 PL_parser->oldoldbufptr = ptr;
1204 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1205 PL_parser->last_uni = NULL;
1206 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1207 PL_parser->last_lop = NULL;
1208 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1209 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1210 PL_parser->bufend -= discard_len;
1211 PL_parser->bufptr -= discard_len;
1212 PL_parser->oldbufptr -= discard_len;
1213 PL_parser->oldoldbufptr -= discard_len;
1214 if (PL_parser->last_uni)
1215 PL_parser->last_uni -= discard_len;
1216 if (PL_parser->last_lop)
1217 PL_parser->last_lop -= discard_len;
1218}
1219
1220/*
1221=for apidoc Amx|bool|lex_next_chunk|U32 flags
1222
1223Reads in the next chunk of text to be lexed, appending it to
1224L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1225looked to the end of the current chunk and wants to know more. It is
1226usual, but not necessary, for lexing to have consumed the entirety of
1227the current chunk at this time.
1228
1229If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1230chunk (i.e., the current chunk has been entirely consumed), normally the
1231current chunk will be discarded at the same time that the new chunk is
1232read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1233will not be discarded. If the current chunk has not been entirely
1234consumed, then it will not be discarded regardless of the flag.
1235
1236Returns true if some new text was added to the buffer, or false if the
1237buffer has reached the end of the input text.
1238
1239=cut
1240*/
1241
1242#define LEX_FAKE_EOF 0x80000000
1243
1244bool
1245Perl_lex_next_chunk(pTHX_ U32 flags)
1246{
1247 SV *linestr;
1248 char *buf;
1249 STRLEN old_bufend_pos, new_bufend_pos;
1250 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1251 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1252 bool got_some_for_debugger = 0;
f0e67a1d
Z
1253 bool got_some;
1254 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1255 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1256 linestr = PL_parser->linestr;
1257 buf = SvPVX(linestr);
1258 if (!(flags & LEX_KEEP_PREVIOUS) &&
1259 PL_parser->bufptr == PL_parser->bufend) {
1260 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1261 linestart_pos = 0;
1262 if (PL_parser->last_uni != PL_parser->bufend)
1263 PL_parser->last_uni = NULL;
1264 if (PL_parser->last_lop != PL_parser->bufend)
1265 PL_parser->last_lop = NULL;
1266 last_uni_pos = last_lop_pos = 0;
1267 *buf = 0;
1268 SvCUR(linestr) = 0;
1269 } else {
1270 old_bufend_pos = PL_parser->bufend - buf;
1271 bufptr_pos = PL_parser->bufptr - buf;
1272 oldbufptr_pos = PL_parser->oldbufptr - buf;
1273 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1274 linestart_pos = PL_parser->linestart - buf;
1275 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1276 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1277 }
1278 if (flags & LEX_FAKE_EOF) {
1279 goto eof;
60d63348 1280 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1281 got_some = 0;
1282 } else if (filter_gets(linestr, old_bufend_pos)) {
1283 got_some = 1;
17cc9359 1284 got_some_for_debugger = 1;
f0e67a1d 1285 } else {
580561a3
Z
1286 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1287 sv_setpvs(linestr, "");
f0e67a1d
Z
1288 eof:
1289 /* End of real input. Close filehandle (unless it was STDIN),
1290 * then add implicit termination.
1291 */
87606032 1292 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1293 PerlIO_clearerr(PL_parser->rsfp);
1294 else if (PL_parser->rsfp)
1295 (void)PerlIO_close(PL_parser->rsfp);
1296 PL_parser->rsfp = NULL;
60d63348 1297 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1298#ifdef PERL_MAD
1299 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1300 PL_faketokens = 1;
1301#endif
1302 if (!PL_in_eval && PL_minus_p) {
1303 sv_catpvs(linestr,
1304 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1305 PL_minus_n = PL_minus_p = 0;
1306 } else if (!PL_in_eval && PL_minus_n) {
1307 sv_catpvs(linestr, /*{*/";}");
1308 PL_minus_n = 0;
1309 } else
1310 sv_catpvs(linestr, ";");
1311 got_some = 1;
1312 }
1313 buf = SvPVX(linestr);
1314 new_bufend_pos = SvCUR(linestr);
1315 PL_parser->bufend = buf + new_bufend_pos;
1316 PL_parser->bufptr = buf + bufptr_pos;
1317 PL_parser->oldbufptr = buf + oldbufptr_pos;
1318 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1319 PL_parser->linestart = buf + linestart_pos;
1320 if (PL_parser->last_uni)
1321 PL_parser->last_uni = buf + last_uni_pos;
1322 if (PL_parser->last_lop)
1323 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1324 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1325 PL_curstash != PL_debstash) {
1326 /* debugger active and we're not compiling the debugger code,
1327 * so store the line into the debugger's array of lines
1328 */
1329 update_debugger_info(NULL, buf+old_bufend_pos,
1330 new_bufend_pos-old_bufend_pos);
1331 }
1332 return got_some;
1333}
1334
1335/*
1336=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1337
1338Looks ahead one (Unicode) character in the text currently being lexed.
1339Returns the codepoint (unsigned integer value) of the next character,
1340or -1 if lexing has reached the end of the input text. To consume the
1341peeked character, use L</lex_read_unichar>.
1342
1343If the next character is in (or extends into) the next chunk of input
1344text, the next chunk will be read in. Normally the current chunk will be
1345discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1346then the current chunk will not be discarded.
1347
1348If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1349is encountered, an exception is generated.
1350
1351=cut
1352*/
1353
1354I32
1355Perl_lex_peek_unichar(pTHX_ U32 flags)
1356{
749123ff 1357 dVAR;
f0e67a1d
Z
1358 char *s, *bufend;
1359 if (flags & ~(LEX_KEEP_PREVIOUS))
1360 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1361 s = PL_parser->bufptr;
1362 bufend = PL_parser->bufend;
1363 if (UTF) {
1364 U8 head;
1365 I32 unichar;
1366 STRLEN len, retlen;
1367 if (s == bufend) {
1368 if (!lex_next_chunk(flags))
1369 return -1;
1370 s = PL_parser->bufptr;
1371 bufend = PL_parser->bufend;
1372 }
1373 head = (U8)*s;
1374 if (!(head & 0x80))
1375 return head;
1376 if (head & 0x40) {
1377 len = PL_utf8skip[head];
1378 while ((STRLEN)(bufend-s) < len) {
1379 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1380 break;
1381 s = PL_parser->bufptr;
1382 bufend = PL_parser->bufend;
1383 }
1384 }
1385 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1386 if (retlen == (STRLEN)-1) {
1387 /* malformed UTF-8 */
1388 ENTER;
1389 SAVESPTR(PL_warnhook);
1390 PL_warnhook = PERL_WARNHOOK_FATAL;
1391 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1392 LEAVE;
1393 }
1394 return unichar;
1395 } else {
1396 if (s == bufend) {
1397 if (!lex_next_chunk(flags))
1398 return -1;
1399 s = PL_parser->bufptr;
1400 }
1401 return (U8)*s;
1402 }
1403}
1404
1405/*
1406=for apidoc Amx|I32|lex_read_unichar|U32 flags
1407
1408Reads the next (Unicode) character in the text currently being lexed.
1409Returns the codepoint (unsigned integer value) of the character read,
1410and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1411if lexing has reached the end of the input text. To non-destructively
1412examine the next character, use L</lex_peek_unichar> instead.
1413
1414If the next character is in (or extends into) the next chunk of input
1415text, the next chunk will be read in. Normally the current chunk will be
1416discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1417then the current chunk will not be discarded.
1418
1419If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1420is encountered, an exception is generated.
1421
1422=cut
1423*/
1424
1425I32
1426Perl_lex_read_unichar(pTHX_ U32 flags)
1427{
1428 I32 c;
1429 if (flags & ~(LEX_KEEP_PREVIOUS))
1430 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1431 c = lex_peek_unichar(flags);
1432 if (c != -1) {
1433 if (c == '\n')
1434 CopLINE_inc(PL_curcop);
d9018cbe
EB
1435 if (UTF)
1436 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1437 else
1438 ++(PL_parser->bufptr);
f0e67a1d
Z
1439 }
1440 return c;
1441}
1442
1443/*
1444=for apidoc Amx|void|lex_read_space|U32 flags
1445
1446Reads optional spaces, in Perl style, in the text currently being
1447lexed. The spaces may include ordinary whitespace characters and
1448Perl-style comments. C<#line> directives are processed if encountered.
1449L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1450at a non-space character (or the end of the input text).
1451
1452If spaces extend into the next chunk of input text, the next chunk will
1453be read in. Normally the current chunk will be discarded at the same
1454time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1455chunk will not be discarded.
1456
1457=cut
1458*/
1459
f0998909
Z
1460#define LEX_NO_NEXT_CHUNK 0x80000000
1461
f0e67a1d
Z
1462void
1463Perl_lex_read_space(pTHX_ U32 flags)
1464{
1465 char *s, *bufend;
1466 bool need_incline = 0;
f0998909 1467 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1468 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1469#ifdef PERL_MAD
1470 if (PL_skipwhite) {
1471 sv_free(PL_skipwhite);
1472 PL_skipwhite = NULL;
1473 }
1474 if (PL_madskills)
1475 PL_skipwhite = newSVpvs("");
1476#endif /* PERL_MAD */
1477 s = PL_parser->bufptr;
1478 bufend = PL_parser->bufend;
1479 while (1) {
1480 char c = *s;
1481 if (c == '#') {
1482 do {
1483 c = *++s;
1484 } while (!(c == '\n' || (c == 0 && s == bufend)));
1485 } else if (c == '\n') {
1486 s++;
1487 PL_parser->linestart = s;
1488 if (s == bufend)
1489 need_incline = 1;
1490 else
1491 incline(s);
1492 } else if (isSPACE(c)) {
1493 s++;
1494 } else if (c == 0 && s == bufend) {
1495 bool got_more;
1496#ifdef PERL_MAD
1497 if (PL_madskills)
1498 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1499#endif /* PERL_MAD */
f0998909
Z
1500 if (flags & LEX_NO_NEXT_CHUNK)
1501 break;
f0e67a1d
Z
1502 PL_parser->bufptr = s;
1503 CopLINE_inc(PL_curcop);
1504 got_more = lex_next_chunk(flags);
1505 CopLINE_dec(PL_curcop);
1506 s = PL_parser->bufptr;
1507 bufend = PL_parser->bufend;
1508 if (!got_more)
1509 break;
1510 if (need_incline && PL_parser->rsfp) {
1511 incline(s);
1512 need_incline = 0;
1513 }
1514 } else {
1515 break;
1516 }
1517 }
1518#ifdef PERL_MAD
1519 if (PL_madskills)
1520 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1521#endif /* PERL_MAD */
1522 PL_parser->bufptr = s;
1523}
1524
1525/*
ffb4593c
NT
1526 * S_incline
1527 * This subroutine has nothing to do with tilting, whether at windmills
1528 * or pinball tables. Its name is short for "increment line". It
57843af0 1529 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1530 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1531 * # line 500 "foo.pm"
1532 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1533 */
1534
76e3520e 1535STATIC void
d9095cec 1536S_incline(pTHX_ const char *s)
463ee0b2 1537{
97aff369 1538 dVAR;
d9095cec
NC
1539 const char *t;
1540 const char *n;
1541 const char *e;
8818d409 1542 line_t line_num;
463ee0b2 1543
7918f24d
NC
1544 PERL_ARGS_ASSERT_INCLINE;
1545
57843af0 1546 CopLINE_inc(PL_curcop);
463ee0b2
LW
1547 if (*s++ != '#')
1548 return;
d4c19fe8
AL
1549 while (SPACE_OR_TAB(*s))
1550 s++;
73659bf1
GS
1551 if (strnEQ(s, "line", 4))
1552 s += 4;
1553 else
1554 return;
084592ab 1555 if (SPACE_OR_TAB(*s))
73659bf1 1556 s++;
4e553d73 1557 else
73659bf1 1558 return;
d4c19fe8
AL
1559 while (SPACE_OR_TAB(*s))
1560 s++;
463ee0b2
LW
1561 if (!isDIGIT(*s))
1562 return;
d4c19fe8 1563
463ee0b2
LW
1564 n = s;
1565 while (isDIGIT(*s))
1566 s++;
07714eb4 1567 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1568 return;
bf4acbe4 1569 while (SPACE_OR_TAB(*s))
463ee0b2 1570 s++;
73659bf1 1571 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1572 s++;
73659bf1
GS
1573 e = t + 1;
1574 }
463ee0b2 1575 else {
c35e046a
AL
1576 t = s;
1577 while (!isSPACE(*t))
1578 t++;
73659bf1 1579 e = t;
463ee0b2 1580 }
bf4acbe4 1581 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1582 e++;
1583 if (*e != '\n' && *e != '\0')
1584 return; /* false alarm */
1585
8818d409
FC
1586 line_num = atoi(n)-1;
1587
f4dd75d9 1588 if (t - s > 0) {
d9095cec 1589 const STRLEN len = t - s;
19bad673
NC
1590 SV *const temp_sv = CopFILESV(PL_curcop);
1591 const char *cf;
1592 STRLEN tmplen;
1593
1594 if (temp_sv) {
1595 cf = SvPVX(temp_sv);
1596 tmplen = SvCUR(temp_sv);
1597 } else {
1598 cf = NULL;
1599 tmplen = 0;
1600 }
1601
d1299d44 1602 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1603 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1604 * to *{"::_<newfilename"} */
44867030
NC
1605 /* However, the long form of evals is only turned on by the
1606 debugger - usually they're "(eval %lu)" */
1607 char smallbuf[128];
1608 char *tmpbuf;
1609 GV **gvp;
d9095cec 1610 STRLEN tmplen2 = len;
798b63bc 1611 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1612 tmpbuf = smallbuf;
1613 else
2ae0db35 1614 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1615 tmpbuf[0] = '_';
1616 tmpbuf[1] = '<';
2ae0db35 1617 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1618 tmplen += 2;
8a5ee598
RGS
1619 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1620 if (gvp) {
44867030
NC
1621 char *tmpbuf2;
1622 GV *gv2;
1623
1624 if (tmplen2 + 2 <= sizeof smallbuf)
1625 tmpbuf2 = smallbuf;
1626 else
1627 Newx(tmpbuf2, tmplen2 + 2, char);
1628
1629 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1630 /* Either they malloc'd it, or we malloc'd it,
1631 so no prefix is present in ours. */
1632 tmpbuf2[0] = '_';
1633 tmpbuf2[1] = '<';
1634 }
1635
1636 memcpy(tmpbuf2 + 2, s, tmplen2);
1637 tmplen2 += 2;
1638
8a5ee598 1639 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1640 if (!isGV(gv2)) {
8a5ee598 1641 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1642 /* adjust ${"::_<newfilename"} to store the new file name */
1643 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1644 /* The line number may differ. If that is the case,
1645 alias the saved lines that are in the array.
1646 Otherwise alias the whole array. */
1647 if (CopLINE(PL_curcop) == line_num) {
1648 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1649 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1650 }
1651 else if (GvAV(*gvp)) {
1652 AV * const av = GvAV(*gvp);
1653 const I32 start = CopLINE(PL_curcop)+1;
1654 I32 items = AvFILLp(av) - start;
1655 if (items > 0) {
1656 AV * const av2 = GvAVn(gv2);
1657 SV **svp = AvARRAY(av) + start;
1658 I32 l = (I32)line_num+1;
1659 while (items--)
1660 av_store(av2, l++, SvREFCNT_inc(*svp++));
1661 }
1662 }
e5527e4b 1663 }
44867030
NC
1664
1665 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1666 }
e66cf94c 1667 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1668 }
05ec9bb3 1669 CopFILE_free(PL_curcop);
d9095cec 1670 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1671 }
8818d409 1672 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1673}
1674
29595ff2 1675#ifdef PERL_MAD
cd81e915 1676/* skip space before PL_thistoken */
29595ff2
NC
1677
1678STATIC char *
1679S_skipspace0(pTHX_ register char *s)
1680{
7918f24d
NC
1681 PERL_ARGS_ASSERT_SKIPSPACE0;
1682
29595ff2
NC
1683 s = skipspace(s);
1684 if (!PL_madskills)
1685 return s;
cd81e915
NC
1686 if (PL_skipwhite) {
1687 if (!PL_thiswhite)
6b29d1f5 1688 PL_thiswhite = newSVpvs("");
cd81e915
NC
1689 sv_catsv(PL_thiswhite, PL_skipwhite);
1690 sv_free(PL_skipwhite);
1691 PL_skipwhite = 0;
1692 }
1693 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1694 return s;
1695}
1696
cd81e915 1697/* skip space after PL_thistoken */
29595ff2
NC
1698
1699STATIC char *
1700S_skipspace1(pTHX_ register char *s)
1701{
d4c19fe8 1702 const char *start = s;
29595ff2
NC
1703 I32 startoff = start - SvPVX(PL_linestr);
1704
7918f24d
NC
1705 PERL_ARGS_ASSERT_SKIPSPACE1;
1706
29595ff2
NC
1707 s = skipspace(s);
1708 if (!PL_madskills)
1709 return s;
1710 start = SvPVX(PL_linestr) + startoff;
cd81e915 1711 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1712 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1713 PL_thistoken = newSVpvn(tstart, start - tstart);
1714 }
1715 PL_realtokenstart = -1;
1716 if (PL_skipwhite) {
1717 if (!PL_nextwhite)
6b29d1f5 1718 PL_nextwhite = newSVpvs("");
cd81e915
NC
1719 sv_catsv(PL_nextwhite, PL_skipwhite);
1720 sv_free(PL_skipwhite);
1721 PL_skipwhite = 0;
29595ff2
NC
1722 }
1723 return s;
1724}
1725
1726STATIC char *
1727S_skipspace2(pTHX_ register char *s, SV **svp)
1728{
c35e046a
AL
1729 char *start;
1730 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1731 const I32 startoff = s - SvPVX(PL_linestr);
1732
7918f24d
NC
1733 PERL_ARGS_ASSERT_SKIPSPACE2;
1734
29595ff2
NC
1735 s = skipspace(s);
1736 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1737 if (!PL_madskills || !svp)
1738 return s;
1739 start = SvPVX(PL_linestr) + startoff;
cd81e915 1740 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1741 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1742 PL_thistoken = newSVpvn(tstart, start - tstart);
1743 PL_realtokenstart = -1;
29595ff2 1744 }
cd81e915 1745 if (PL_skipwhite) {
29595ff2 1746 if (!*svp)
6b29d1f5 1747 *svp = newSVpvs("");
cd81e915
NC
1748 sv_setsv(*svp, PL_skipwhite);
1749 sv_free(PL_skipwhite);
1750 PL_skipwhite = 0;
29595ff2
NC
1751 }
1752
1753 return s;
1754}
1755#endif
1756
80a702cd 1757STATIC void
15f169a1 1758S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1759{
1760 AV *av = CopFILEAVx(PL_curcop);
1761 if (av) {
b9f83d2f 1762 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1763 if (orig_sv)
1764 sv_setsv(sv, orig_sv);
1765 else
1766 sv_setpvn(sv, buf, len);
80a702cd
RGS
1767 (void)SvIOK_on(sv);
1768 SvIV_set(sv, 0);
1769 av_store(av, (I32)CopLINE(PL_curcop), sv);
1770 }
1771}
1772
ffb4593c
NT
1773/*
1774 * S_skipspace
1775 * Called to gobble the appropriate amount and type of whitespace.
1776 * Skips comments as well.
1777 */
1778
76e3520e 1779STATIC char *
cea2e8a9 1780S_skipspace(pTHX_ register char *s)
a687059c 1781{
5db06880 1782#ifdef PERL_MAD
f0e67a1d
Z
1783 char *start = s;
1784#endif /* PERL_MAD */
7918f24d 1785 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1786#ifdef PERL_MAD
cd81e915
NC
1787 if (PL_skipwhite) {
1788 sv_free(PL_skipwhite);
f0e67a1d 1789 PL_skipwhite = NULL;
5db06880 1790 }
f0e67a1d 1791#endif /* PERL_MAD */
3280af22 1792 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1793 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1794 s++;
f0e67a1d
Z
1795 } else {
1796 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1797 PL_bufptr = s;
f0998909
Z
1798 lex_read_space(LEX_KEEP_PREVIOUS |
1799 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1800 LEX_NO_NEXT_CHUNK : 0));
3280af22 1801 s = PL_bufptr;
f0e67a1d
Z
1802 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1803 if (PL_linestart > PL_bufptr)
1804 PL_bufptr = PL_linestart;
1805 return s;
463ee0b2 1806 }
5db06880 1807#ifdef PERL_MAD
f0e67a1d
Z
1808 if (PL_madskills)
1809 PL_skipwhite = newSVpvn(start, s-start);
1810#endif /* PERL_MAD */
5db06880 1811 return s;
a687059c 1812}
378cc40b 1813
ffb4593c
NT
1814/*
1815 * S_check_uni
1816 * Check the unary operators to ensure there's no ambiguity in how they're
1817 * used. An ambiguous piece of code would be:
1818 * rand + 5
1819 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1820 * the +5 is its argument.
1821 */
1822
76e3520e 1823STATIC void
cea2e8a9 1824S_check_uni(pTHX)
ba106d47 1825{
97aff369 1826 dVAR;
d4c19fe8
AL
1827 const char *s;
1828 const char *t;
2f3197b3 1829
3280af22 1830 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1831 return;
3280af22
NIS
1832 while (isSPACE(*PL_last_uni))
1833 PL_last_uni++;
c35e046a
AL
1834 s = PL_last_uni;
1835 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1836 s++;
3280af22 1837 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1838 return;
6136c704 1839
9b387841
NC
1840 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1841 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1842 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1843}
1844
ffb4593c
NT
1845/*
1846 * LOP : macro to build a list operator. Its behaviour has been replaced
1847 * with a subroutine, S_lop() for which LOP is just another name.
1848 */
1849
a0d0e21e
LW
1850#define LOP(f,x) return lop(f,x,s)
1851
ffb4593c
NT
1852/*
1853 * S_lop
1854 * Build a list operator (or something that might be one). The rules:
1855 * - if we have a next token, then it's a list operator [why?]
1856 * - if the next thing is an opening paren, then it's a function
1857 * - else it's a list operator
1858 */
1859
76e3520e 1860STATIC I32
a0be28da 1861S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1862{
97aff369 1863 dVAR;
7918f24d
NC
1864
1865 PERL_ARGS_ASSERT_LOP;
1866
6154021b 1867 pl_yylval.ival = f;
35c8bce7 1868 CLINE;
3280af22
NIS
1869 PL_expect = x;
1870 PL_bufptr = s;
1871 PL_last_lop = PL_oldbufptr;
eb160463 1872 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1873#ifdef PERL_MAD
1874 if (PL_lasttoke)
78cdf107 1875 goto lstop;
5db06880 1876#else
3280af22 1877 if (PL_nexttoke)
78cdf107 1878 goto lstop;
5db06880 1879#endif
79072805 1880 if (*s == '(')
bbf60fe6 1881 return REPORT(FUNC);
29595ff2 1882 s = PEEKSPACE(s);
79072805 1883 if (*s == '(')
bbf60fe6 1884 return REPORT(FUNC);
78cdf107
Z
1885 else {
1886 lstop:
1887 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1888 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1889 return REPORT(LSTOP);
78cdf107 1890 }
79072805
LW
1891}
1892
5db06880
NC
1893#ifdef PERL_MAD
1894 /*
1895 * S_start_force
1896 * Sets up for an eventual force_next(). start_force(0) basically does
1897 * an unshift, while start_force(-1) does a push. yylex removes items
1898 * on the "pop" end.
1899 */
1900
1901STATIC void
1902S_start_force(pTHX_ int where)
1903{
1904 int i;
1905
cd81e915 1906 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1907 where = PL_lasttoke;
cd81e915
NC
1908 assert(PL_curforce < 0 || PL_curforce == where);
1909 if (PL_curforce != where) {
5db06880
NC
1910 for (i = PL_lasttoke; i > where; --i) {
1911 PL_nexttoke[i] = PL_nexttoke[i-1];
1912 }
1913 PL_lasttoke++;
1914 }
cd81e915 1915 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1916 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1917 PL_curforce = where;
1918 if (PL_nextwhite) {
5db06880 1919 if (PL_madskills)
6b29d1f5 1920 curmad('^', newSVpvs(""));
cd81e915 1921 CURMAD('_', PL_nextwhite);
5db06880
NC
1922 }
1923}
1924
1925STATIC void
1926S_curmad(pTHX_ char slot, SV *sv)
1927{
1928 MADPROP **where;
1929
1930 if (!sv)
1931 return;
cd81e915
NC
1932 if (PL_curforce < 0)
1933 where = &PL_thismad;
5db06880 1934 else
cd81e915 1935 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1936
cd81e915 1937 if (PL_faketokens)
76f68e9b 1938 sv_setpvs(sv, "");
5db06880
NC
1939 else {
1940 if (!IN_BYTES) {
1941 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1942 SvUTF8_on(sv);
1943 else if (PL_encoding) {
1944 sv_recode_to_utf8(sv, PL_encoding);
1945 }
1946 }
1947 }
1948
1949 /* keep a slot open for the head of the list? */
1950 if (slot != '_' && *where && (*where)->mad_key == '^') {
1951 (*where)->mad_key = slot;
daba3364 1952 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1953 (*where)->mad_val = (void*)sv;
1954 }
1955 else
1956 addmad(newMADsv(slot, sv), where, 0);
1957}
1958#else
b3f24c00
MHM
1959# define start_force(where) NOOP
1960# define curmad(slot, sv) NOOP
5db06880
NC
1961#endif
1962
ffb4593c
NT
1963/*
1964 * S_force_next
9cbb5ea2 1965 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1966 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1967 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1968 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1969 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1970 */
1971
4e553d73 1972STATIC void
cea2e8a9 1973S_force_next(pTHX_ I32 type)
79072805 1974{
97aff369 1975 dVAR;
704d4215
GG
1976#ifdef DEBUGGING
1977 if (DEBUG_T_TEST) {
1978 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1979 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1980 }
1981#endif
5db06880 1982#ifdef PERL_MAD
cd81e915 1983 if (PL_curforce < 0)
5db06880 1984 start_force(PL_lasttoke);
cd81e915 1985 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1986 if (PL_lex_state != LEX_KNOWNEXT)
1987 PL_lex_defer = PL_lex_state;
1988 PL_lex_state = LEX_KNOWNEXT;
1989 PL_lex_expect = PL_expect;
cd81e915 1990 PL_curforce = -1;
5db06880 1991#else
3280af22
NIS
1992 PL_nexttype[PL_nexttoke] = type;
1993 PL_nexttoke++;
1994 if (PL_lex_state != LEX_KNOWNEXT) {
1995 PL_lex_defer = PL_lex_state;
1996 PL_lex_expect = PL_expect;
1997 PL_lex_state = LEX_KNOWNEXT;
79072805 1998 }
5db06880 1999#endif
79072805
LW
2000}
2001
28ac2b49
Z
2002void
2003Perl_yyunlex(pTHX)
2004{
a7aaec61
Z
2005 int yyc = PL_parser->yychar;
2006 if (yyc != YYEMPTY) {
2007 if (yyc) {
2008 start_force(-1);
2009 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2010 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2011 PL_lex_allbrackets--;
a7aaec61 2012 PL_lex_brackets--;
78cdf107
Z
2013 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2014 } else if (yyc == '('/*)*/) {
2015 PL_lex_allbrackets--;
2016 yyc |= (2<<24);
a7aaec61
Z
2017 }
2018 force_next(yyc);
2019 }
28ac2b49
Z
2020 PL_parser->yychar = YYEMPTY;
2021 }
2022}
2023
d0a148a6 2024STATIC SV *
15f169a1 2025S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2026{
97aff369 2027 dVAR;
740cce10 2028 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2029 !IN_BYTES
2030 && UTF
2031 && !is_ascii_string((const U8*)start, len)
740cce10 2032 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2033 return sv;
2034}
2035
ffb4593c
NT
2036/*
2037 * S_force_word
2038 * When the lexer knows the next thing is a word (for instance, it has
2039 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2040 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2041 * lookahead.
ffb4593c
NT
2042 *
2043 * Arguments:
b1b65b59 2044 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2045 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2046 * int check_keyword : if true, Perl checks to make sure the word isn't
2047 * a keyword (do this if the word is a label, e.g. goto FOO)
2048 * int allow_pack : if true, : characters will also be allowed (require,
2049 * use, etc. do this)
9cbb5ea2 2050 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2051 */
2052
76e3520e 2053STATIC char *
cea2e8a9 2054S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2055{
97aff369 2056 dVAR;
463ee0b2
LW
2057 register char *s;
2058 STRLEN len;
4e553d73 2059
7918f24d
NC
2060 PERL_ARGS_ASSERT_FORCE_WORD;
2061
29595ff2 2062 start = SKIPSPACE1(start);
463ee0b2 2063 s = start;
7e2040f0 2064 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2065 (allow_pack && *s == ':') ||
15f0808c 2066 (allow_initial_tick && *s == '\'') )
a0d0e21e 2067 {
3280af22 2068 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2069 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2070 return start;
cd81e915 2071 start_force(PL_curforce);
5db06880
NC
2072 if (PL_madskills)
2073 curmad('X', newSVpvn(start,s-start));
463ee0b2 2074 if (token == METHOD) {
29595ff2 2075 s = SKIPSPACE1(s);
463ee0b2 2076 if (*s == '(')
3280af22 2077 PL_expect = XTERM;
463ee0b2 2078 else {
3280af22 2079 PL_expect = XOPERATOR;
463ee0b2 2080 }
79072805 2081 }
e74e6b3d 2082 if (PL_madskills)
63575281 2083 curmad('g', newSVpvs( "forced" ));
9ded7720 2084 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2085 = (OP*)newSVOP(OP_CONST,0,
2086 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2087 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2088 force_next(token);
2089 }
2090 return s;
2091}
2092
ffb4593c
NT
2093/*
2094 * S_force_ident
9cbb5ea2 2095 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2096 * text only contains the "foo" portion. The first argument is a pointer
2097 * to the "foo", and the second argument is the type symbol to prefix.
2098 * Forces the next token to be a "WORD".
9cbb5ea2 2099 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2100 */
2101
76e3520e 2102STATIC void
bfed75c6 2103S_force_ident(pTHX_ register const char *s, int kind)
79072805 2104{
97aff369 2105 dVAR;
7918f24d
NC
2106
2107 PERL_ARGS_ASSERT_FORCE_IDENT;
2108
c35e046a 2109 if (*s) {
90e5519e 2110 const STRLEN len = strlen(s);
728847b1
BF
2111 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2112 UTF ? SVf_UTF8 : 0));
cd81e915 2113 start_force(PL_curforce);
9ded7720 2114 NEXTVAL_NEXTTOKE.opval = o;
79072805 2115 force_next(WORD);
748a9306 2116 if (kind) {
11343788 2117 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2118 /* XXX see note in pp_entereval() for why we forgo typo
2119 warnings if the symbol must be introduced in an eval.
2120 GSAR 96-10-12 */
90e5519e 2121 gv_fetchpvn_flags(s, len,
728847b1
BF
2122 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2123 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2124 kind == '$' ? SVt_PV :
2125 kind == '@' ? SVt_PVAV :
2126 kind == '%' ? SVt_PVHV :
a0d0e21e 2127 SVt_PVGV
90e5519e 2128 );
748a9306 2129 }
79072805
LW
2130 }
2131}
2132
1571675a
GS
2133NV
2134Perl_str_to_version(pTHX_ SV *sv)
2135{
2136 NV retval = 0.0;
2137 NV nshift = 1.0;
2138 STRLEN len;
cfd0369c 2139 const char *start = SvPV_const(sv,len);
9d4ba2ae 2140 const char * const end = start + len;
504618e9 2141 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2142
2143 PERL_ARGS_ASSERT_STR_TO_VERSION;
2144
1571675a 2145 while (start < end) {
ba210ebe 2146 STRLEN skip;
1571675a
GS
2147 UV n;
2148 if (utf)
9041c2e3 2149 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2150 else {
2151 n = *(U8*)start;
2152 skip = 1;
2153 }
2154 retval += ((NV)n)/nshift;
2155 start += skip;
2156 nshift *= 1000;
2157 }
2158 return retval;
2159}
2160
4e553d73 2161/*
ffb4593c
NT
2162 * S_force_version
2163 * Forces the next token to be a version number.
e759cc13
RGS
2164 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2165 * and if "guessing" is TRUE, then no new token is created (and the caller
2166 * must use an alternative parsing method).
ffb4593c
NT
2167 */
2168
76e3520e 2169STATIC char *
e759cc13 2170S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2171{
97aff369 2172 dVAR;
5f66b61c 2173 OP *version = NULL;
44dcb63b 2174 char *d;
5db06880
NC
2175#ifdef PERL_MAD
2176 I32 startoff = s - SvPVX(PL_linestr);
2177#endif
89bfa8cd 2178
7918f24d
NC
2179 PERL_ARGS_ASSERT_FORCE_VERSION;
2180
29595ff2 2181 s = SKIPSPACE1(s);
89bfa8cd 2182
44dcb63b 2183 d = s;
dd629d5b 2184 if (*d == 'v')
44dcb63b 2185 d++;
44dcb63b 2186 if (isDIGIT(*d)) {
e759cc13
RGS
2187 while (isDIGIT(*d) || *d == '_' || *d == '.')
2188 d++;
5db06880
NC
2189#ifdef PERL_MAD
2190 if (PL_madskills) {
cd81e915 2191 start_force(PL_curforce);
5db06880
NC
2192 curmad('X', newSVpvn(s,d-s));
2193 }
2194#endif
4e4da3ac 2195 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2196 SV *ver;
8d08d9ba 2197#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2198 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2199 setlocale(LC_NUMERIC, "C");
8d08d9ba 2200#endif
6154021b 2201 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2202#ifdef USE_LOCALE_NUMERIC
2203 setlocale(LC_NUMERIC, loc);
909d3787 2204 Safefree(loc);
8d08d9ba 2205#endif
6154021b 2206 version = pl_yylval.opval;
dd629d5b
GS
2207 ver = cSVOPx(version)->op_sv;
2208 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2209 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2210 SvNV_set(ver, str_to_version(ver));
1571675a 2211 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2212 }
89bfa8cd 2213 }
5db06880
NC
2214 else if (guessing) {
2215#ifdef PERL_MAD
2216 if (PL_madskills) {
cd81e915
NC
2217 sv_free(PL_nextwhite); /* let next token collect whitespace */
2218 PL_nextwhite = 0;
5db06880
NC
2219 s = SvPVX(PL_linestr) + startoff;
2220 }
2221#endif
e759cc13 2222 return s;
5db06880 2223 }
89bfa8cd
PP
2224 }
2225
5db06880
NC
2226#ifdef PERL_MAD
2227 if (PL_madskills && !version) {
cd81e915
NC
2228 sv_free(PL_nextwhite); /* let next token collect whitespace */
2229 PL_nextwhite = 0;
5db06880
NC
2230 s = SvPVX(PL_linestr) + startoff;
2231 }
2232#endif
89bfa8cd 2233 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2234 start_force(PL_curforce);
9ded7720 2235 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2236 force_next(WORD);
89bfa8cd 2237
e759cc13 2238 return s;
89bfa8cd
PP
2239}
2240
ffb4593c 2241/*
91152fc1
DG
2242 * S_force_strict_version
2243 * Forces the next token to be a version number using strict syntax rules.
2244 */
2245
2246STATIC char *
2247S_force_strict_version(pTHX_ char *s)
2248{
2249 dVAR;
2250 OP *version = NULL;
2251#ifdef PERL_MAD
2252 I32 startoff = s - SvPVX(PL_linestr);
2253#endif
2254 const char *errstr = NULL;
2255
2256 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2257
2258 while (isSPACE(*s)) /* leading whitespace */
2259 s++;
2260
2261 if (is_STRICT_VERSION(s,&errstr)) {
2262 SV *ver = newSV(0);
2263 s = (char *)scan_version(s, ver, 0);
2264 version = newSVOP(OP_CONST, 0, ver);
2265 }
4e4da3ac
Z
2266 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2267 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2268 {
91152fc1
DG
2269 PL_bufptr = s;
2270 if (errstr)
2271 yyerror(errstr); /* version required */
2272 return s;
2273 }
2274
2275#ifdef PERL_MAD
2276 if (PL_madskills && !version) {
2277 sv_free(PL_nextwhite); /* let next token collect whitespace */
2278 PL_nextwhite = 0;
2279 s = SvPVX(PL_linestr) + startoff;
2280 }
2281#endif
2282 /* NOTE: The parser sees the package name and the VERSION swapped */
2283 start_force(PL_curforce);
2284 NEXTVAL_NEXTTOKE.opval = version;
2285 force_next(WORD);
2286
2287 return s;
2288}
2289
2290/*
ffb4593c
NT
2291 * S_tokeq
2292 * Tokenize a quoted string passed in as an SV. It finds the next
2293 * chunk, up to end of string or a backslash. It may make a new
2294 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2295 * turns \\ into \.
2296 */
2297
76e3520e 2298STATIC SV *
cea2e8a9 2299S_tokeq(pTHX_ SV *sv)
79072805 2300{
97aff369 2301 dVAR;
79072805
LW
2302 register char *s;
2303 register char *send;
2304 register char *d;
b3ac6de7
IZ
2305 STRLEN len = 0;
2306 SV *pv = sv;
79072805 2307
7918f24d
NC
2308 PERL_ARGS_ASSERT_TOKEQ;
2309
79072805 2310 if (!SvLEN(sv))
b3ac6de7 2311 goto finish;
79072805 2312
a0d0e21e 2313 s = SvPV_force(sv, len);
21a311ee 2314 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2315 goto finish;
463ee0b2 2316 send = s + len;
dcb21ed6
NC
2317 /* This is relying on the SV being "well formed" with a trailing '\0' */
2318 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2319 s++;
2320 if (s == send)
b3ac6de7 2321 goto finish;
79072805 2322 d = s;
be4731d2 2323 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2324 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2325 }
79072805
LW
2326 while (s < send) {
2327 if (*s == '\\') {
a0d0e21e 2328 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2329 s++; /* all that, just for this */
2330 }
2331 *d++ = *s++;
2332 }
2333 *d = '\0';
95a20fc0 2334 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2335 finish:
3280af22 2336 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2337 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2338 return sv;
2339}
2340
ffb4593c
NT
2341/*
2342 * Now come three functions related to double-quote context,
2343 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2344 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2345 * interact with PL_lex_state, and create fake ( ... ) argument lists
2346 * to handle functions and concatenation.
ecd24171
DM
2347 * For example,
2348 * "foo\lbar"
2349 * is tokenised as
2350 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2351 */
2352
2353/*
2354 * S_sublex_start
6154021b 2355 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2356 *
2357 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2358 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2359 *
2360 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2361 *
2362 * Everything else becomes a FUNC.
2363 *
2364 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2365 * had an OP_CONST or OP_READLINE). This just sets us up for a
2366 * call to S_sublex_push().
2367 */
2368
76e3520e 2369STATIC I32
cea2e8a9 2370S_sublex_start(pTHX)
79072805 2371{
97aff369 2372 dVAR;
6154021b 2373 register const I32 op_type = pl_yylval.ival;
79072805
LW
2374
2375 if (op_type == OP_NULL) {
6154021b 2376 pl_yylval.opval = PL_lex_op;
5f66b61c 2377 PL_lex_op = NULL;
79072805
LW
2378 return THING;
2379 }
2380 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2381 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2382
2383 if (SvTYPE(sv) == SVt_PVIV) {
2384 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2385 STRLEN len;
96a5add6 2386 const char * const p = SvPV_const(sv, len);
740cce10 2387 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2388 SvREFCNT_dec(sv);
2389 sv = nsv;
4e553d73 2390 }
6154021b 2391 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2392 PL_lex_stuff = NULL;
6f33ba73
RGS
2393 /* Allow <FH> // "foo" */
2394 if (op_type == OP_READLINE)
2395 PL_expect = XTERMORDORDOR;
79072805
LW
2396 return THING;
2397 }
e3f73d4e
RGS
2398 else if (op_type == OP_BACKTICK && PL_lex_op) {
2399 /* readpipe() vas overriden */
2400 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2401 pl_yylval.opval = PL_lex_op;
9b201d7d 2402 PL_lex_op = NULL;
e3f73d4e
RGS
2403 PL_lex_stuff = NULL;
2404 return THING;
2405 }
79072805 2406
3280af22 2407 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2408 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2409 PL_sublex_info.sub_op = PL_lex_op;
2410 PL_lex_state = LEX_INTERPPUSH;
55497cff 2411
3280af22
NIS
2412 PL_expect = XTERM;
2413 if (PL_lex_op) {
6154021b 2414 pl_yylval.opval = PL_lex_op;
5f66b61c 2415 PL_lex_op = NULL;
55497cff
PP
2416 return PMFUNC;
2417 }
2418 else
2419 return FUNC;
2420}
2421
ffb4593c
NT
2422/*
2423 * S_sublex_push
2424 * Create a new scope to save the lexing state. The scope will be
2425 * ended in S_sublex_done. Returns a '(', starting the function arguments
2426 * to the uc, lc, etc. found before.
2427 * Sets PL_lex_state to LEX_INTERPCONCAT.
2428 */
2429
76e3520e 2430STATIC I32
cea2e8a9 2431S_sublex_push(pTHX)
55497cff 2432{
27da23d5 2433 dVAR;
f46d017c 2434 ENTER;
55497cff 2435
3280af22 2436 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2437 SAVEBOOL(PL_lex_dojoin);
3280af22 2438 SAVEI32(PL_lex_brackets);
78cdf107
Z
2439 SAVEI32(PL_lex_allbrackets);
2440 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2441 SAVEI32(PL_lex_casemods);
2442 SAVEI32(PL_lex_starts);
651b5b28 2443 SAVEI8(PL_lex_state);
9da1dd8f 2444 SAVEPPTR(PL_sublex_info.re_eval_start);
7766f137 2445 SAVEVPTR(PL_lex_inpat);
98246f1e 2446 SAVEI16(PL_lex_inwhat);
57843af0 2447 SAVECOPLINE(PL_curcop);
3280af22 2448 SAVEPPTR(PL_bufptr);
8452ff4b 2449 SAVEPPTR(PL_bufend);
3280af22
NIS
2450 SAVEPPTR(PL_oldbufptr);
2451 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2452 SAVEPPTR(PL_last_lop);
2453 SAVEPPTR(PL_last_uni);
3280af22
NIS
2454 SAVEPPTR(PL_linestart);
2455 SAVESPTR(PL_linestr);
8edd5f42
RGS
2456 SAVEGENERICPV(PL_lex_brackstack);
2457 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2458
2459 PL_linestr = PL_lex_stuff;
a0714e2c 2460 PL_lex_stuff = NULL;
9da1dd8f 2461 PL_sublex_info.re_eval_start = NULL;
3280af22 2462
9cbb5ea2
GS
2463 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2464 = SvPVX(PL_linestr);
3280af22 2465 PL_bufend += SvCUR(PL_linestr);
bd61b366 2466 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2467 SAVEFREESV(PL_linestr);
2468
2469 PL_lex_dojoin = FALSE;
2470 PL_lex_brackets = 0;
78cdf107
Z
2471 PL_lex_allbrackets = 0;
2472 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2473 Newx(PL_lex_brackstack, 120, char);
2474 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2475 PL_lex_casemods = 0;
2476 *PL_lex_casestack = '\0';
2477 PL_lex_starts = 0;
2478 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2479 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2480
2481 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2482 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2483 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2484 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2485 else
5f66b61c 2486 PL_lex_inpat = NULL;
79072805 2487
55497cff 2488 return '(';
79072805
LW
2489}
2490
ffb4593c
NT
2491/*
2492 * S_sublex_done
2493 * Restores lexer state after a S_sublex_push.
2494 */
2495
76e3520e 2496STATIC I32
cea2e8a9 2497S_sublex_done(pTHX)
79072805 2498{
27da23d5 2499 dVAR;
3280af22 2500 if (!PL_lex_starts++) {
396482e1 2501 SV * const sv = newSVpvs("");
9aa983d2
JH
2502 if (SvUTF8(PL_linestr))
2503 SvUTF8_on(sv);
3280af22 2504 PL_expect = XOPERATOR;
6154021b 2505 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2506 return THING;
2507 }
2508
3280af22
NIS
2509 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2510 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2511 return yylex();
79072805
LW
2512 }
2513
ffb4593c 2514 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2515 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2516 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2517 PL_linestr = PL_lex_repl;
2518 PL_lex_inpat = 0;
2519 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2520 PL_bufend += SvCUR(PL_linestr);
bd61b366 2521 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2522 SAVEFREESV(PL_linestr);
2523 PL_lex_dojoin = FALSE;
2524 PL_lex_brackets = 0;
78cdf107
Z
2525 PL_lex_allbrackets = 0;
2526 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2527 PL_lex_casemods = 0;
2528 *PL_lex_casestack = '\0';
2529 PL_lex_starts = 0;
25da4f38 2530 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2531 PL_lex_state = LEX_INTERPNORMAL;
2532 PL_lex_starts++;
e9fa98b2
HS
2533 /* we don't clear PL_lex_repl here, so that we can check later
2534 whether this is an evalled subst; that means we rely on the
2535 logic to ensure sublex_done() is called again only via the
2536 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2537 }
e9fa98b2 2538 else {
3280af22 2539 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2540 PL_lex_repl = NULL;
e9fa98b2 2541 }
79072805 2542 return ',';
ffed7fef
LW
2543 }
2544 else {
5db06880
NC
2545#ifdef PERL_MAD
2546 if (PL_madskills) {
cd81e915
NC
2547 if (PL_thiswhite) {
2548 if (!PL_endwhite)
6b29d1f5 2549 PL_endwhite = newSVpvs("");
cd81e915
NC
2550 sv_catsv(PL_endwhite, PL_thiswhite);
2551 PL_thiswhite = 0;
2552 }
2553 if (PL_thistoken)
76f68e9b 2554 sv_setpvs(PL_thistoken,"");
5db06880 2555 else
cd81e915 2556 PL_realtokenstart = -1;
5db06880
NC
2557 }
2558#endif
f46d017c 2559 LEAVE;
3280af22
NIS
2560 PL_bufend = SvPVX(PL_linestr);
2561 PL_bufend += SvCUR(PL_linestr);
2562 PL_expect = XOPERATOR;
09bef843 2563 PL_sublex_info.sub_inwhat = 0;
79072805 2564 return ')';
ffed7fef
LW
2565 }
2566}
2567
02aa26ce
NT
2568/*
2569 scan_const
2570
9da1dd8f
DM
2571 Extracts the next constant part of a pattern, double-quoted string,
2572 or transliteration. This is terrifying code.
2573
2574 For example, in parsing the double-quoted string "ab\x63$d", it would
2575 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2576
94def140 2577 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2578 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2579 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2580
94def140
ST
2581 Returns a pointer to the character scanned up to. If this is
2582 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2583 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2584 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2585 by looking at the next characters herself.
2586
02aa26ce 2587 In patterns:
9da1dd8f
DM
2588 expand:
2589 \N{ABC} => \N{U+41.42.43}
2590
2591 pass through:
2592 all other \-char, including \N and \N{ apart from \N{ABC}
2593
2594 stops on:
2595 @ and $ where it appears to be a var, but not for $ as tail anchor
2596 \l \L \u \U \Q \E
2597 (?{ or (??{
2598
02aa26ce
NT
2599
2600 In transliterations:
2601 characters are VERY literal, except for - not at the start or end
94def140
ST
2602 of the string, which indicates a range. If the range is in bytes,
2603 scan_const expands the range to the full set of intermediate
2604 characters. If the range is in utf8, the hyphen is replaced with
2605 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2606
2607 In double-quoted strings:
2608 backslashes:
2609 double-quoted style: \r and \n
ff3f963a 2610 constants: \x31, etc.
94def140 2611 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2612 case and quoting: \U \Q \E
2613 stops on @ and $
2614
2615 scan_const does *not* construct ops to handle interpolated strings.
2616 It stops processing as soon as it finds an embedded $ or @ variable
2617 and leaves it to the caller to work out what's going on.
2618
94def140
ST
2619 embedded arrays (whether in pattern or not) could be:
2620 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2621
2622 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2623
2624 $ in pattern could be $foo or could be tail anchor. Assumption:
2625 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2626 followed by one of "()| \r\n\t"
02aa26ce 2627
9da1dd8f 2628 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2629
2630 The structure of the code is
2631 while (there's a character to process) {
94def140
ST
2632 handle transliteration ranges
2633 skip regexp comments /(?#comment)/ and codes /(?{code})/
2634 skip #-initiated comments in //x patterns
2635 check for embedded arrays
02aa26ce
NT
2636 check for embedded scalars
2637 if (backslash) {
94def140 2638 deprecate \1 in substitution replacements
02aa26ce
NT
2639 handle string-changing backslashes \l \U \Q \E, etc.
2640 switch (what was escaped) {
94def140 2641 handle \- in a transliteration (becomes a literal -)
ff3f963a 2642 if a pattern and not \N{, go treat as regular character
94def140
ST
2643 handle \132 (octal characters)
2644 handle \x15 and \x{1234} (hex characters)
ff3f963a 2645 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2646 handle \cV (control characters)
2647 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2648 } (end switch)
77a135fe 2649 continue
02aa26ce 2650 } (end if backslash)
77a135fe 2651 handle regular character
02aa26ce 2652 } (end while character to read)
4e553d73 2653
02aa26ce
NT
2654*/
2655
76e3520e 2656STATIC char *
cea2e8a9 2657S_scan_const(pTHX_ char *start)
79072805 2658{
97aff369 2659 dVAR;
3280af22 2660 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2661 SV *sv = newSV(send - start); /* sv for the constant. See
2662 note below on sizing. */
02aa26ce
NT
2663 register char *s = start; /* start of the constant */
2664 register char *d = SvPVX(sv); /* destination for copies */
2665 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2666 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2667 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2668 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2669 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2670 to be UTF8? But, this can
2671 show as true when the source
2672 isn't utf8, as for example
2673 when it is entirely composed
2674 of hex constants */
2675
2676 /* Note on sizing: The scanned constant is placed into sv, which is
2677 * initialized by newSV() assuming one byte of output for every byte of
2678 * input. This routine expects newSV() to allocate an extra byte for a
2679 * trailing NUL, which this routine will append if it gets to the end of
2680 * the input. There may be more bytes of input than output (eg., \N{LATIN
2681 * CAPITAL LETTER A}), or more output than input if the constant ends up
2682 * recoded to utf8, but each time a construct is found that might increase
2683 * the needed size, SvGROW() is called. Its size parameter each time is
2684 * based on the best guess estimate at the time, namely the length used so
2685 * far, plus the length the current construct will occupy, plus room for
2686 * the trailing NUL, plus one byte for every input byte still unscanned */
2687
012bcf8d 2688 UV uv;
4c3a8340
ST
2689#ifdef EBCDIC
2690 UV literal_endpoint = 0;
e294cc5d 2691 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2692#endif
012bcf8d 2693
7918f24d
NC
2694 PERL_ARGS_ASSERT_SCAN_CONST;
2695
bb16bae8 2696 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2697 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2698 /* If we are doing a trans and we know we want UTF8 set expectation */
2699 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2700 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2701 }
2702
2703
79072805 2704 while (s < send || dorange) {
ff3f963a 2705
02aa26ce 2706 /* get transliterations out of the way (they're most literal) */
3280af22 2707 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2708 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2709 if (dorange) {
1ba5c669
JH
2710 I32 i; /* current expanded character */
2711 I32 min; /* first character in range */
2712 I32 max; /* last character in range */
02aa26ce 2713
e294cc5d
JH
2714#ifdef EBCDIC
2715 UV uvmax = 0;
2716#endif
2717
2718 if (has_utf8
2719#ifdef EBCDIC
2720 && !native_range
2721#endif
2722 ) {
9d4ba2ae 2723 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2724 char *e = d++;
2725 while (e-- > c)
2726 *(e + 1) = *e;
25716404 2727 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2728 /* mark the range as done, and continue */
2729 dorange = FALSE;
2730 didrange = TRUE;
2731 continue;
2732 }
2b9d42f0 2733
95a20fc0 2734 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2735#ifdef EBCDIC
2736 SvGROW(sv,
2737 SvLEN(sv) + (has_utf8 ?
2738 (512 - UTF_CONTINUATION_MARK +
2739 UNISKIP(0x100))
2740 : 256));
2741 /* How many two-byte within 0..255: 128 in UTF-8,
2742 * 96 in UTF-8-mod. */
2743#else
9cbb5ea2 2744 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2745#endif
9cbb5ea2 2746 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2747#ifdef EBCDIC
2748 if (has_utf8) {
2749 int j;
2750 for (j = 0; j <= 1; j++) {
2751 char * const c = (char*)utf8_hop((U8*)d, -1);
2752 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2753 if (j)
2754 min = (U8)uv;
2755 else if (uv < 256)
2756 max = (U8)uv;
2757 else {
2758 max = (U8)0xff; /* only to \xff */
2759 uvmax = uv; /* \x{100} to uvmax */
2760 }
2761 d = c; /* eat endpoint chars */
2762 }
2763 }
2764 else {
2765#endif
2766 d -= 2; /* eat the first char and the - */
2767 min = (U8)*d; /* first char in range */
2768 max = (U8)d[1]; /* last char in range */
2769#ifdef EBCDIC
2770 }
2771#endif
8ada0baa 2772
c2e66d9e 2773 if (min > max) {
01ec43d0 2774 Perl_croak(aTHX_
d1573ac7 2775 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2776 (char)min, (char)max);
c2e66d9e
GS
2777 }
2778
c7f1f016 2779#ifdef EBCDIC
4c3a8340
ST
2780 if (literal_endpoint == 2 &&
2781 ((isLOWER(min) && isLOWER(max)) ||
2782 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2783 if (isLOWER(min)) {
2784 for (i = min; i <= max; i++)
2785 if (isLOWER(i))
db42d148 2786 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2787 } else {
2788 for (i = min; i <= max; i++)
2789 if (isUPPER(i))
db42d148 2790 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2791 }
2792 }
2793 else
2794#endif
2795 for (i = min; i <= max; i++)
e294cc5d
JH
2796#ifdef EBCDIC
2797 if (has_utf8) {
2798 const U8 ch = (U8)NATIVE_TO_UTF(i);
2799 if (UNI_IS_INVARIANT(ch))
2800 *d++ = (U8)i;
2801 else {
2802 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2803 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2804 }
2805 }
2806 else
2807#endif
2808 *d++ = (char)i;
2809
2810#ifdef EBCDIC
2811 if (uvmax) {
2812 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2813 if (uvmax > 0x101)
2814 *d++ = (char)UTF_TO_NATIVE(0xff);
2815 if (uvmax > 0x100)
2816 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2817 }
2818#endif
02aa26ce
NT
2819
2820 /* mark the range as done, and continue */
79072805 2821 dorange = FALSE;
01ec43d0 2822 didrange = TRUE;
4c3a8340
ST
2823#ifdef EBCDIC
2824 literal_endpoint = 0;
2825#endif
79072805 2826 continue;
4e553d73 2827 }
02aa26ce
NT
2828
2829 /* range begins (ignore - as first or last char) */
79072805 2830 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2831 if (didrange) {
1fafa243 2832 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2833 }
e294cc5d
JH
2834 if (has_utf8
2835#ifdef EBCDIC
2836 && !native_range
2837#endif
2838 ) {
25716404 2839 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2840 s++;
2841 continue;
2842 }
79072805
LW
2843 dorange = TRUE;
2844 s++;
01ec43d0
GS
2845 }
2846 else {
2847 didrange = FALSE;
4c3a8340
ST
2848#ifdef EBCDIC
2849 literal_endpoint = 0;
e294cc5d 2850 native_range = TRUE;
4c3a8340 2851#endif
01ec43d0 2852 }
79072805 2853 }
02aa26ce
NT
2854
2855 /* if we get here, we're not doing a transliteration */
2856
e4a2df84
DM
2857 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2858 char *s1 = s-1;
2859 int esc = 0;
2860 while (s1 >= start && *s1-- == '\\')
2861 esc = !esc;
2862 if (!esc)
2863 in_charclass = TRUE;
2864 }
2866decb 2865
e4a2df84
DM
2866 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2867 char *s1 = s-1;
2868 int esc = 0;
2869 while (s1 >= start && *s1-- == '\\')
2870 esc = !esc;
2871 if (!esc)
2872 in_charclass = FALSE;
2873 }
2866decb 2874
9da1dd8f
DM
2875 /* skip for regexp comments /(?#comment)/, except for the last
2876 * char, which will be done separately.
2877 * Stop on (?{..}) and friends */
2878
3280af22 2879 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2880 if (s[2] == '#') {
e994fd66 2881 while (s+1 < send && *s != ')')
db42d148 2882 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2883 }
2866decb 2884 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2885 ( s[2] == '{' /* This should match regcomp.c */
2886 || (s[2] == '?' && s[3] == '{')))
155aba94 2887 {
9da1dd8f 2888 break;
cc6b7395 2889 }
748a9306 2890 }
02aa26ce
NT
2891
2892 /* likewise skip #-initiated comments in //x patterns */
3280af22 2893 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2894 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2895 while (s+1 < send && *s != '\n')
db42d148 2896 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2897 }
02aa26ce 2898
9da1dd8f
DM
2899 /* no further processing of single-quoted regex */
2900 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2901 goto default_action;
2902
5d1d4326 2903 /* check for embedded arrays
da6eedaa 2904 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2905 */
1749ea0d
ST
2906 else if (*s == '@' && s[1]) {
2907 if (isALNUM_lazy_if(s+1,UTF))
2908 break;
2909 if (strchr(":'{$", s[1]))
2910 break;
2911 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2912 break; /* in regexp, neither @+ nor @- are interpolated */
2913 }
02aa26ce
NT
2914
2915 /* check for embedded scalars. only stop if we're sure it's a
2916 variable.
2917 */
79072805 2918 else if (*s == '$') {
3280af22 2919 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2920 break;
77772344 2921 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2922 if (s[1] == '\\') {
2923 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2924 "Possible unintended interpolation of $\\ in regex");
77772344 2925 }
79072805 2926 break; /* in regexp, $ might be tail anchor */
77772344 2927 }
79072805 2928 }
02aa26ce 2929
2b9d42f0
NIS
2930 /* End of else if chain - OP_TRANS rejoin rest */
2931
02aa26ce 2932 /* backslashes */
79072805 2933 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2934 char* e; /* Can be used for ending '}', etc. */
2935
79072805 2936 s++;
02aa26ce 2937
7d0fc23c
KW
2938 /* warn on \1 - \9 in substitution replacements, but note that \11
2939 * is an octal; and \19 is \1 followed by '9' */
3280af22 2940 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2941 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2942 {
a2a5de95 2943 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2944 *--s = '$';
2945 break;
2946 }
02aa26ce
NT
2947
2948 /* string-change backslash escapes */
838f2281 2949 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
2950 --s;
2951 break;
2952 }
ff3f963a
KW
2953 /* In a pattern, process \N, but skip any other backslash escapes.
2954 * This is because we don't want to translate an escape sequence
2955 * into a meta symbol and have the regex compiler use the meta
2956 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2957 * in spite of this, we do have to process \N here while the proper
2958 * charnames handler is in scope. See bugs #56444 and #62056.
2959 * There is a complication because \N in a pattern may also stand
2960 * for 'match a non-nl', and not mean a charname, in which case its
2961 * processing should be deferred to the regex compiler. To be a
2962 * charname it must be followed immediately by a '{', and not look
2963 * like \N followed by a curly quantifier, i.e., not something like
2964 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2965 * quantifier */
2966 else if (PL_lex_inpat
2967 && (*s != 'N'
2968 || s[1] != '{'
2969 || regcurly(s + 1)))
2970 {
cc74c5bd
ST
2971 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2972 goto default_action;
2973 }
02aa26ce 2974
79072805 2975 switch (*s) {
02aa26ce
NT
2976
2977 /* quoted - in transliterations */
79072805 2978 case '-':
3280af22 2979 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2980 *d++ = *s++;
2981 continue;
2982 }
2983 /* FALL THROUGH */
2984 default:
11b8faa4 2985 {
e4ca4584 2986 if ((isALNUMC(*s)))
a2a5de95
NC
2987 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2988 "Unrecognized escape \\%c passed through",
2989 *s);
11b8faa4 2990 /* default action is to copy the quoted character */
f9a63242 2991 goto default_action;
11b8faa4 2992 }
02aa26ce 2993
632403cc 2994 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2995 case '0': case '1': case '2': case '3':
2996 case '4': case '5': case '6': case '7':
ba210ebe 2997 {
53305cf1
NC
2998 I32 flags = 0;
2999 STRLEN len = 3;
77a135fe 3000 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3001 s += len;
3002 }
012bcf8d 3003 goto NUM_ESCAPE_INSERT;
02aa26ce 3004
f0a2b745
KW
3005 /* eg. \o{24} indicates the octal constant \024 */
3006 case 'o':
3007 {
3008 STRLEN len;
454155d9 3009 const char* error;
f0a2b745 3010
454155d9 3011 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3012 s += len;
454155d9 3013 if (! valid) {
f0a2b745
KW
3014 yyerror(error);
3015 continue;
3016 }
3017 goto NUM_ESCAPE_INSERT;
3018 }
3019
77a135fe 3020 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3021 case 'x':
a0481293 3022 {
53305cf1 3023 STRLEN len;
a0481293 3024 const char* error;
355860ce 3025
a0481293
KW
3026 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3027 s += len;
3028 if (! valid) {
3029 yyerror(error);
355860ce 3030 continue;
ba210ebe 3031 }
012bcf8d
GS
3032 }
3033
3034 NUM_ESCAPE_INSERT:
ff3f963a
KW
3035 /* Insert oct or hex escaped character. There will always be
3036 * enough room in sv since such escapes will be longer than any
3037 * UTF-8 sequence they can end up as, except if they force us
3038 * to recode the rest of the string into utf8 */
ba7cea30 3039
77a135fe 3040 /* Here uv is the ordinal of the next character being added in
ff3f963a 3041 * unicode (converted from native). */
77a135fe 3042 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3043 if (!has_utf8 && uv > 255) {
77a135fe
KW
3044 /* Might need to recode whatever we have accumulated so
3045 * far if it contains any chars variant in utf8 or
3046 * utf-ebcdic. */
3047
3048 SvCUR_set(sv, d - SvPVX_const(sv));
3049 SvPOK_on(sv);
3050 *d = '\0';
77a135fe 3051 /* See Note on sizing above. */
7bf79863
KW
3052 sv_utf8_upgrade_flags_grow(sv,
3053 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3054 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3055 d = SvPVX(sv) + SvCUR(sv);
3056 has_utf8 = TRUE;
012bcf8d
GS
3057 }
3058
77a135fe
KW
3059 if (has_utf8) {
3060 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3061 if (PL_lex_inwhat == OP_TRANS &&
3062 PL_sublex_info.sub_op) {
3063 PL_sublex_info.sub_op->op_private |=
3064 (PL_lex_repl ? OPpTRANS_FROM_UTF
3065 : OPpTRANS_TO_UTF);
f9a63242 3066 }
e294cc5d
JH
3067#ifdef EBCDIC
3068 if (uv > 255 && !dorange)
3069 native_range = FALSE;
3070#endif
012bcf8d 3071 }
a0ed51b3 3072 else {
012bcf8d 3073 *d++ = (char)uv;
a0ed51b3 3074 }
012bcf8d
GS
3075 }
3076 else {
c4d5f83a 3077 *d++ = (char) uv;
a0ed51b3 3078 }
79072805 3079 continue;
02aa26ce 3080
4a2d328f 3081 case 'N':
ff3f963a
KW
3082 /* In a non-pattern \N must be a named character, like \N{LATIN
3083 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3084 * mean to match a non-newline. For non-patterns, named
3085 * characters are converted to their string equivalents. In
3086 * patterns, named characters are not converted to their
3087 * ultimate forms for the same reasons that other escapes
3088 * aren't. Instead, they are converted to the \N{U+...} form
3089 * to get the value from the charnames that is in effect right
3090 * now, while preserving the fact that it was a named character
3091 * so that the regex compiler knows this */
3092
3093 /* This section of code doesn't generally use the
3094 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3095 * a close examination of this macro and determined it is a
3096 * no-op except on utfebcdic variant characters. Every
3097 * character generated by this that would normally need to be
3098 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3099 * needed, and would complicate use of copy(). XXX There are
3100 * other parts of this file where the macro is used
3101 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3102
3103 /* The structure of this section of code (besides checking for
3104 * errors and upgrading to utf8) is:
3105 * Further disambiguate between the two meanings of \N, and if
3106 * not a charname, go process it elsewhere
0a96133f
KW
3107 * If of form \N{U+...}, pass it through if a pattern;
3108 * otherwise convert to utf8
3109 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3110 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3111
3112 /* Here, s points to the 'N'; the test below is guaranteed to
3113 * succeed if we are being called on a pattern as we already
3114 * know from a test above that the next character is a '{'.
3115 * On a non-pattern \N must mean 'named sequence, which
3116 * requires braces */
3117 s++;
3118 if (*s != '{') {
3119 yyerror("Missing braces on \\N{}");
3120 continue;
3121 }
3122 s++;
3123
0a96133f 3124 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3125 if (! (e = strchr(s, '}'))) {
3126 if (! PL_lex_inpat) {
5777a3f7 3127 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3128 } else {
3129 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3130 }
0a96133f 3131 continue;
ff3f963a 3132 }
cddc7ef4 3133
ff3f963a 3134 /* Here it looks like a named character */
cddc7ef4 3135
ff3f963a
KW
3136 if (PL_lex_inpat) {
3137
3138 /* XXX This block is temporary code. \N{} implies that the
3139 * pattern is to have Unicode semantics, and therefore
3140 * currently has to be encoded in utf8. By putting it in
3141 * utf8 now, we save a whole pass in the regular expression
3142 * compiler. Once that code is changed so Unicode
3143 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3144 * block should be removed. However, the code that parses
3145 * the output of this would have to be changed to not
3146 * necessarily expect utf8 */
ff3f963a 3147 if (!has_utf8) {
77a135fe 3148 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3149 SvPOK_on(sv);
e4f3eed8 3150 *d = '\0';
77a135fe 3151 /* See Note on sizing above. */
7bf79863 3152 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3153 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3154 /* 5 = '\N{' + cur char + NUL */
3155 (STRLEN)(send - s) + 5);
f08d6ad9 3156 d = SvPVX(sv) + SvCUR(sv);
89491803 3157 has_utf8 = TRUE;
ff3f963a
KW
3158 }
3159 }
423cee85 3160
ff3f963a
KW
3161 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3162 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3163 | PERL_SCAN_DISALLOW_PREFIX;
3164 STRLEN len;
3165
3166 /* For \N{U+...}, the '...' is a unicode value even on
3167 * EBCDIC machines */
3168 s += 2; /* Skip to next char after the 'U+' */
3169 len = e - s;
3170 uv = grok_hex(s, &len, &flags, NULL);
3171 if (len == 0 || len != (STRLEN)(e - s)) {
3172 yyerror("Invalid hexadecimal number in \\N{U+...}");
3173 s = e + 1;
3174 continue;
3175 }
3176
3177 if (PL_lex_inpat) {
3178
e2a7e165
KW
3179 /* On non-EBCDIC platforms, pass through to the regex
3180 * compiler unchanged. The reason we evaluated the
3181 * number above is to make sure there wasn't a syntax
3182 * error. But on EBCDIC we convert to native so
3183 * downstream code can continue to assume it's native
3184 */
ff3f963a 3185 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3186#ifdef EBCDIC
3187 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3188 and the \0 */
3189 "\\N{U+%X}",
3190 (unsigned int) UNI_TO_NATIVE(uv));
3191#else
ff3f963a
KW
3192 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3193 d += e - s + 1;
e2a7e165 3194#endif
ff3f963a
KW
3195 }
3196 else { /* Not a pattern: convert the hex to string */
3197
3198 /* If destination is not in utf8, unconditionally
3199 * recode it to be so. This is because \N{} implies
3200 * Unicode semantics, and scalars have to be in utf8
3201 * to guarantee those semantics */
3202 if (! has_utf8) {
3203 SvCUR_set(sv, d - SvPVX_const(sv));
3204 SvPOK_on(sv);
3205 *d = '\0';
3206 /* See Note on sizing above. */
3207 sv_utf8_upgrade_flags_grow(
3208 sv,
3209 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3210 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3211 d = SvPVX(sv) + SvCUR(sv);
3212 has_utf8 = TRUE;
3213 }
3214
3215 /* Add the string to the output */
3216 if (UNI_IS_INVARIANT(uv)) {
3217 *d++ = (char) uv;
3218 }
3219 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3220 }
3221 }
3222 else { /* Here is \N{NAME} but not \N{U+...}. */
3223
3224 SV *res; /* result from charnames */
3225 const char *str; /* the string in 'res' */
3226 STRLEN len; /* its length */
3227
3228 /* Get the value for NAME */
3229 res = newSVpvn(s, e - s);
3230 res = new_constant( NULL, 0, "charnames",
3231 /* includes all of: \N{...} */
3232 res, NULL, s - 3, e - s + 4 );
3233
3234 /* Most likely res will be in utf8 already since the
3235 * standard charnames uses pack U, but a custom translator
3236 * can leave it otherwise, so make sure. XXX This can be
3237 * revisited to not have charnames use utf8 for characters
3238 * that don't need it when regexes don't have to be in utf8
3239 * for Unicode semantics. If doing so, remember EBCDIC */
3240 sv_utf8_upgrade(res);
3241 str = SvPV_const(res, len);
3242
3243 /* Don't accept malformed input */
3244 if (! is_utf8_string((U8 *) str, len)) {
3245 yyerror("Malformed UTF-8 returned by \\N");
3246 }
3247 else if (PL_lex_inpat) {
3248
3249 if (! len) { /* The name resolved to an empty string */
3250 Copy("\\N{}", d, 4, char);
3251 d += 4;
3252 }
3253 else {
3254 /* In order to not lose information for the regex
3255 * compiler, pass the result in the specially made
3256 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3257 * the code points in hex of each character
3258 * returned by charnames */
3259
3260 const char *str_end = str + len;
3261 STRLEN char_length; /* cur char's byte length */
3262 STRLEN output_length; /* and the number of bytes
3263 after this is translated
3264 into hex digits */
3265 const STRLEN off = d - SvPVX_const(sv);
3266
3267 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3268 * max('U+', '.'); and 1 for NUL */
3269 char hex_string[2 * UTF8_MAXBYTES + 5];
3270
3271 /* Get the first character of the result. */
3272 U32 uv = utf8n_to_uvuni((U8 *) str,
3273 len,
3274 &char_length,
3275 UTF8_ALLOW_ANYUV);
3276
3277 /* The call to is_utf8_string() above hopefully
3278 * guarantees that there won't be an error. But
3279 * it's easy here to make sure. The function just
3280 * above warns and returns 0 if invalid utf8, but
3281 * it can also return 0 if the input is validly a
3282 * NUL. Disambiguate */
3283 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3284 uv = UNICODE_REPLACEMENT;
3285 }
3286
3287 /* Convert first code point to hex, including the
e2a7e165
KW
3288 * boiler plate before it. For all these, we
3289 * convert to native format so that downstream code
3290 * can continue to assume the input is native */
78c35590 3291 output_length =
3353de27 3292 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3293 "\\N{U+%X",
3294 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3295
3296 /* Make sure there is enough space to hold it */
3297 d = off + SvGROW(sv, off
3298 + output_length
3299 + (STRLEN)(send - e)
3300 + 2); /* '}' + NUL */
3301 /* And output it */
3302 Copy(hex_string, d, output_length, char);
3303 d += output_length;
3304
3305 /* For each subsequent character, append dot and
3306 * its ordinal in hex */
3307 while ((str += char_length) < str_end) {
3308 const STRLEN off = d - SvPVX_const(sv);
3309 U32 uv = utf8n_to_uvuni((U8 *) str,
3310 str_end - str,
3311 &char_length,
3312 UTF8_ALLOW_ANYUV);
3313 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3314 uv = UNICODE_REPLACEMENT;
3315 }
3316
78c35590 3317 output_length =
3353de27 3318 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3319 ".%X",
3320 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3321
3322 d = off + SvGROW(sv, off
3323 + output_length
3324 + (STRLEN)(send - e)
3325 + 2); /* '}' + NUL */
3326 Copy(hex_string, d, output_length, char);
3327 d += output_length;
3328 }
3329
3330 *d++ = '}'; /* Done. Add the trailing brace */
3331 }
3332 }
3333 else { /* Here, not in a pattern. Convert the name to a
3334 * string. */
3335
3336 /* If destination is not in utf8, unconditionally
3337 * recode it to be so. This is because \N{} implies
3338 * Unicode semantics, and scalars have to be in utf8
3339 * to guarantee those semantics */
3340 if (! has_utf8) {
3341 SvCUR_set(sv, d - SvPVX_const(sv));
3342 SvPOK_on(sv);
3343 *d = '\0';
3344 /* See Note on sizing above. */
3345 sv_utf8_upgrade_flags_grow(sv,
3346 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3347 len + (STRLEN)(send - s) + 1);
3348 d = SvPVX(sv) + SvCUR(sv);
3349 has_utf8 = TRUE;
3350 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3351
3352 /* See Note on sizing above. (NOTE: SvCUR() is not
3353 * set correctly here). */
3354 const STRLEN off = d - SvPVX_const(sv);
3355 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3356 }
3357 Copy(str, d, len, char);
3358 d += len;
423cee85 3359 }
423cee85 3360 SvREFCNT_dec(res);
cb233ae3
KW
3361
3362 /* Deprecate non-approved name syntax */
3363 if (ckWARN_d(WARN_DEPRECATED)) {
3364 bool problematic = FALSE;
3365 char* i = s;
3366
3367 /* For non-ut8 input, look to see that the first
3368 * character is an alpha, then loop through the rest
3369 * checking that each is a continuation */
3370 if (! this_utf8) {
3371 if (! isALPHAU(*i)) problematic = TRUE;
3372 else for (i = s + 1; i < e; i++) {
3373 if (isCHARNAME_CONT(*i)) continue;
3374 problematic = TRUE;
3375 break;
3376 }
3377 }
3378 else {
3379 /* Similarly for utf8. For invariants can check
3380 * directly. We accept anything above the latin1
3381 * range because it is immaterial to Perl if it is
3382 * correct or not, and is expensive to check. But
3383 * it is fairly easy in the latin1 range to convert
3384 * the variants into a single character and check
3385 * those */
3386 if (UTF8_IS_INVARIANT(*i)) {
3387 if (! isALPHAU(*i)) problematic = TRUE;
3388 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3389 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3390 *(i+1)))))
3391 {
3392 problematic = TRUE;
3393 }
3394 }
3395 if (! problematic) for (i = s + UTF8SKIP(s);
3396 i < e;
3397 i+= UTF8SKIP(i))
3398 {
3399 if (UTF8_IS_INVARIANT(*i)) {
3400 if (isCHARNAME_CONT(*i)) continue;
3401 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3402 continue;
3403 } else if (isCHARNAME_CONT(
3404 UNI_TO_NATIVE(
81c14aa2 3405 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3406 {
3407 continue;
3408 }
3409 problematic = TRUE;
3410 break;
3411 }
3412 }
3413 if (problematic) {
6e1bad6c
KW
3414 /* The e-i passed to the final %.*s makes sure that
3415 * should the trailing NUL be missing that this
3416 * print won't run off the end of the string */
cb233ae3 3417 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3418 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3419 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3420 }
3421 }
3422 } /* End \N{NAME} */
ff3f963a
KW
3423#ifdef EBCDIC
3424 if (!dorange)
3425 native_range = FALSE; /* \N{} is defined to be Unicode */
3426#endif
3427 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3428 continue;
3429
02aa26ce 3430 /* \c is a control character */
79072805
LW
3431 case 'c':
3432 s++;
961ce445 3433 if (s < send) {
17a3df4c 3434 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3435 }
961ce445
RGS
3436 else {
3437 yyerror("Missing control char name in \\c");
3438 }
79072805 3439 continue;
02aa26ce
NT
3440
3441 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3442 case 'b':
db42d148 3443 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3444 break;
3445 case 'n':
db42d148 3446 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3447 break;
3448 case 'r':
db42d148 3449 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3450 break;
3451 case 'f':
db42d148 3452 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3453 break;
3454 case 't':
db42d148 3455 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3456 break;
34a3fe2a 3457 case 'e':
db42d148 3458 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3459 break;
3460 case 'a':
db42d148 3461 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3462 break;
02aa26ce
NT
3463 } /* end switch */
3464
79072805
LW
3465 s++;
3466 continue;
02aa26ce 3467 } /* end if (backslash) */
4c3a8340
ST
3468#ifdef EBCDIC
3469 else
3470 literal_endpoint++;
3471#endif
02aa26ce 3472
f9a63242 3473 default_action:
77a135fe
KW
3474 /* If we started with encoded form, or already know we want it,
3475 then encode the next character */
3476 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3477 STRLEN len = 1;
77a135fe
KW
3478
3479
3480 /* One might think that it is wasted effort in the case of the
3481 * source being utf8 (this_utf8 == TRUE) to take the next character
3482 * in the source, convert it to an unsigned value, and then convert
3483 * it back again. But the source has not been validated here. The
3484 * routine that does the conversion checks for errors like
3485 * malformed utf8 */
3486
5f66b61c
AL
3487 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3488 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3489 if (!has_utf8) {
3490 SvCUR_set(sv, d - SvPVX_const(sv));
3491 SvPOK_on(sv);
3492 *d = '\0';
77a135fe 3493 /* See Note on sizing above. */
7bf79863
KW
3494 sv_utf8_upgrade_flags_grow(sv,
3495 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3496 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3497 d = SvPVX(sv) + SvCUR(sv);
3498 has_utf8 = TRUE;
3499 } else if (need > len) {
3500 /* encoded value larger than old, may need extra space (NOTE:
3501 * SvCUR() is not set correctly here). See Note on sizing
3502 * above. */
9d4ba2ae 3503 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3504 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3505 }
77a135fe
KW
3506 s += len;
3507
5f66b61c 3508 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3509#ifdef EBCDIC
3510 if (uv > 255 && !dorange)
3511 native_range = FALSE;
3512#endif
2b9d42f0
NIS
3513 }
3514 else {
3515 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3516 }
02aa26ce
NT
3517 } /* while loop to process each character */
3518
3519 /* terminate the string and set up the sv */
79072805 3520 *d = '\0';
95a20fc0 3521 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3522 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3523 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3524 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3525
79072805 3526 SvPOK_on(sv);
9f4817db 3527 if (PL_encoding && !has_utf8) {
d0063567
DK
3528 sv_recode_to_utf8(sv, PL_encoding);
3529 if (SvUTF8(sv))
3530 has_utf8 = TRUE;
9f4817db 3531 }
2b9d42f0 3532 if (has_utf8) {
7e2040f0 3533 SvUTF8_on(sv);
2b9d42f0 3534 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3535 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3536 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3537 }
3538 }
79072805 3539
02aa26ce 3540 /* shrink the sv if we allocated more than we used */
79072805 3541 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3542 SvPV_shrink_to_cur(sv);
79072805 3543 }
02aa26ce 3544
6154021b 3545 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3546 if (s > PL_bufptr) {
eb0d8d16
NC
3547 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3548 const char *const key = PL_lex_inpat ? "qr" : "q";
3549 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3550 const char *type;
3551 STRLEN typelen;
3552
3553 if (PL_lex_inwhat == OP_TRANS) {
3554 type = "tr";
3555 typelen = 2;
3556 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3557 type = "s";
3558 typelen = 1;
9da1dd8f
DM
3559 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3560 type = "q";
3561 typelen = 1;
eb0d8d16
NC
3562 } else {
3563 type = "qq";
3564 typelen = 2;
3565 }
3566
3567 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3568 type, typelen);
3569 }
6154021b 3570 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3571 } else
8990e307 3572 SvREFCNT_dec(sv);
79072805
LW
3573 return s;
3574}
3575
ffb4593c
NT
3576/* S_intuit_more
3577 * Returns TRUE if there's more to the expression (e.g., a subscript),
3578 * FALSE otherwise.
ffb4593c
NT
3579 *
3580 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3581 *
3582 * ->[ and ->{ return TRUE
3583 * { and [ outside a pattern are always subscripts, so return TRUE
3584 * if we're outside a pattern and it's not { or [, then return FALSE
3585 * if we're in a pattern and the first char is a {
3586 * {4,5} (any digits around the comma) returns FALSE
3587 * if we're in a pattern and the first char is a [
3588 * [] returns FALSE
3589 * [SOMETHING] has a funky algorithm to decide whether it's a
3590 * character class or not. It has to deal with things like
3591 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3592 * anything else returns TRUE
3593 */
3594
9cbb5ea2
GS
3595/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3596
76e3520e 3597STATIC int
cea2e8a9 3598S_intuit_more(pTHX_ register char *s)
79072805 3599{
97aff369 3600 dVAR;
7918f24d
NC
3601
3602 PERL_ARGS_ASSERT_INTUIT_MORE;
3603
3280af22 3604 if (PL_lex_brackets)
79072805
LW
3605 return TRUE;
3606 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3607 return TRUE;
3608 if (*s != '{' && *s != '[')
3609 return FALSE;
3280af22 3610 if (!PL_lex_inpat)
79072805
LW
3611 return TRUE;
3612
3613 /* In a pattern, so maybe we have {n,m}. */
3614 if (*s == '{') {
b3155d95 3615 if (regcurly(s)) {
79072805 3616 return FALSE;
b3155d95 3617 }
79072805 3618 return TRUE;
79072805
LW
3619 }
3620
3621 /* On the other hand, maybe we have a character class */
3622
3623 s++;
3624 if (*s == ']' || *s == '^')
3625 return FALSE;
3626 else {
ffb4593c 3627 /* this is terrifying, and it works */
79072805
LW
3628 int weight = 2; /* let's weigh the evidence */
3629 char seen[256];
f27ffc4a 3630 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3631 const char * const send = strchr(s,']');
3280af22 3632 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3633
3634 if (!send) /* has to be an expression */
3635 return TRUE;
3636
3637 Zero(seen,256,char);
3638 if (*s == '$')
3639 weight -= 3;
3640 else if (isDIGIT(*s)) {
3641 if (s[1] != ']') {
3642 if (isDIGIT(s[1]) && s[2] == ']')
3643 weight -= 10;
3644 }
3645 else
3646 weight -= 100;
3647 }
3648 for (; s < send; s++) {
3649 last_un_char = un_char;
3650 un_char = (unsigned char)*s;
3651 switch (*s) {
3652 case '@':
3653 case '&':
3654 case '$':
3655 weight -= seen[un_char] * 10;
7e2040f0 3656 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3657 int len;
8903cb82 3658 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3659 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3660 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3661 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3662 weight -= 100;
3663 else
3664 weight -= 10;
3665 }
3666 else if (*s == '$' && s[1] &&
93a17b20
LW
3667 strchr("[#!%*<>()-=",s[1])) {
3668 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3669 weight -= 10;
3670 else
3671 weight -= 1;
3672 }
3673 break;
3674 case '\\':
3675 un_char = 254;
3676 if (s[1]) {
93a17b20 3677 if (strchr("wds]",s[1]))
79072805 3678 weight += 100;
10edeb5d 3679 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3680 weight += 1;
93a17b20 3681 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3682 weight += 40;
3683 else if (isDIGIT(s[1])) {
3684 weight += 40;
3685 while (s[1] && isDIGIT(s[1]))
3686 s++;
3687 }
3688 }
3689 else
3690 weight += 100;
3691 break;
3692 case '-':
3693 if (s[1] == '\\')
3694 weight += 50;
93a17b20 3695 if (strchr("aA01! ",last_un_char))
79072805 3696 weight += 30;
93a17b20 3697 if (strchr("zZ79~",s[1]))
79072805 3698 weight += 30;
f27ffc4a
GS
3699 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3700 weight -= 5; /* cope with negative subscript */
79072805
LW
3701 break;
3702 default:
3792a11b
NC
3703 if (!isALNUM(last_un_char)
3704 && !(last_un_char == '$' || last_un_char == '@'
3705 || last_un_char == '&')
3706 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3707 char *d = tmpbuf;
3708 while (isALPHA(*s))
3709 *d++ = *s++;
3710 *d = '\0';
5458a98a 3711 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3712 weight -= 150;
3713 }
3714 if (un_char == last_un_char + 1)
3715 weight += 5;
3716 weight -= seen[un_char];
3717 break;
3718 }
3719 seen[un_char]++;
3720 }
3721 if (weight >= 0) /* probably a character class */
3722 return FALSE;
3723 }
3724
3725 return TRUE;
3726}
ffed7fef 3727
ffb4593c
NT
3728/*
3729 * S_intuit_method
3730 *
3731 * Does all the checking to disambiguate
3732 * foo bar
3733 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3734 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3735 *
3736 * First argument is the stuff after the first token, e.g. "bar".
3737 *
a4fd4a89 3738 * Not a method if foo is a filehandle.
ffb4593c
NT
3739 * Not a method if foo is a subroutine prototyped to take a filehandle.
3740 * Not a method if it's really "Foo $bar"
3741 * Method if it's "foo $bar"
3742 * Not a method if it's really "print foo $bar"
3743 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3744 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3745 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3746 * =>
3747 */
3748
76e3520e 3749STATIC int
62d55b22 3750S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3751{
97aff369 3752 dVAR;
a0d0e21e 3753 char *s = start + (*start == '$');
3280af22 3754 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3755 STRLEN len;
3756 GV* indirgv;
5db06880
NC
3757#ifdef PERL_MAD
3758 int soff;
3759#endif
a0d0e21e 3760
7918f24d
NC
3761 PERL_ARGS_ASSERT_INTUIT_METHOD;
3762
a0d0e21e 3763 if (gv) {
62d55b22 3764 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3765 return 0;
62d55b22
NC
3766 if (cv) {
3767 if (SvPOK(cv)) {