This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Prevent double frees/crashes with format syntax errs
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805 150 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
151 *
152 * These values refer to the various states within a sublex parse,
153 * i.e. within a double quotish string
79072805
LW
154 */
155
fb73857a
PP
156/* #define LEX_NOTPARSING 11 is done in perl.h. */
157
b6007c36
DM
158#define LEX_NORMAL 10 /* normal code (ie not within "...") */
159#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
160#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
161#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
162#define LEX_INTERPSTART 6 /* expecting the start of a $var */
163
164 /* at end of code, eg "$x" followed by: */
165#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
166#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
167
168#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
169 string or after \E, $foo, etc */
170#define LEX_INTERPCONST 2 /* NOT USED */
171#define LEX_FORMLINE 1 /* expecting a format line */
172#define LEX_KNOWNEXT 0 /* next token known; just return it */
173
79072805 174
bbf60fe6 175#ifdef DEBUGGING
27da23d5 176static const char* const lex_state_names[] = {
bbf60fe6
DM
177 "KNOWNEXT",
178 "FORMLINE",
179 "INTERPCONST",
180 "INTERPCONCAT",
181 "INTERPENDMAYBE",
182 "INTERPEND",
183 "INTERPSTART",
184 "INTERPPUSH",
185 "INTERPCASEMOD",
186 "INTERPNORMAL",
187 "NORMAL"
188};
189#endif
190
79072805
LW
191#ifdef ff_next
192#undef ff_next
d48672a2
LW
193#endif
194
79072805 195#include "keywords.h"
fe14fcc3 196
ffb4593c
NT
197/* CLINE is a macro that ensures PL_copline has a sane value */
198
ae986130
LW
199#ifdef CLINE
200#undef CLINE
201#endif
57843af0 202#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 203
5db06880 204#ifdef PERL_MAD
29595ff2
NC
205# define SKIPSPACE0(s) skipspace0(s)
206# define SKIPSPACE1(s) skipspace1(s)
207# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
208# define PEEKSPACE(s) skipspace2(s,0)
209#else
210# define SKIPSPACE0(s) skipspace(s)
211# define SKIPSPACE1(s) skipspace(s)
212# define SKIPSPACE2(s,tsv) skipspace(s)
213# define PEEKSPACE(s) skipspace(s)
214#endif
215
ffb4593c
NT
216/*
217 * Convenience functions to return different tokens and prime the
9cbb5ea2 218 * lexer for the next token. They all take an argument.
ffb4593c
NT
219 *
220 * TOKEN : generic token (used for '(', DOLSHARP, etc)
221 * OPERATOR : generic operator
222 * AOPERATOR : assignment operator
223 * PREBLOCK : beginning the block after an if, while, foreach, ...
224 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
225 * PREREF : *EXPR where EXPR is not a simple identifier
226 * TERM : expression term
227 * LOOPX : loop exiting command (goto, last, dump, etc)
228 * FTST : file test operator
229 * FUN0 : zero-argument function
7eb971ee 230 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 231 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
232 * BOop : bitwise or or xor
233 * BAop : bitwise and
234 * SHop : shift operator
235 * PWop : power operator
9cbb5ea2 236 * PMop : pattern-matching operator
ffb4593c
NT
237 * Aop : addition-level operator
238 * Mop : multiplication-level operator
239 * Eop : equality-testing operator
e5edeb50 240 * Rop : relational operator <= != gt
ffb4593c
NT
241 *
242 * Also see LOP and lop() below.
243 */
244
998054bd 245#ifdef DEBUGGING /* Serve -DT. */
704d4215 246# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 247#else
bbf60fe6 248# define REPORT(retval) (retval)
998054bd
SC
249#endif
250
bbf60fe6
DM
251#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
252#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
253#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
254#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
255#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
256#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
257#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
258#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
259#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
260#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 261#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
262#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
263#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
264#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
265#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
266#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
267#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
268#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
269#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
270#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
271#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 272
a687059c
LW
273/* This bit of chicanery makes a unary function followed by
274 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
275 * The UNIDOR macro is for unary functions that can be followed by the //
276 * operator (such as C<shift // 0>).
a687059c 277 */
d68ce4ac 278#define UNI3(f,x,have_x) { \
6154021b 279 pl_yylval.ival = f; \
d68ce4ac 280 if (have_x) PL_expect = x; \
376fcdbf
AL
281 PL_bufptr = s; \
282 PL_last_uni = PL_oldbufptr; \
283 PL_last_lop_op = f; \
284 if (*s == '(') \
285 return REPORT( (int)FUNC1 ); \
29595ff2 286 s = PEEKSPACE(s); \
376fcdbf
AL
287 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
288 }
d68ce4ac
FC
289#define UNI(f) UNI3(f,XTERM,1)
290#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
291#define UNIPROTO(f,optional) { \
292 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
293 OPERATOR(f); \
294 }
a687059c 295
d68ce4ac 296#define UNIBRACK(f) UNI3(f,0,0)
79072805 297
9f68db38 298/* grandfather return to old style */
78cdf107
Z
299#define OLDLOP(f) \
300 do { \
301 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
302 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
303 pl_yylval.ival = (f); \
304 PL_expect = XTERM; \
305 PL_bufptr = s; \
306 return (int)LSTOP; \
307 } while(0)
79072805 308
8fa7f367
JH
309#ifdef DEBUGGING
310
6154021b 311/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
312enum token_type {
313 TOKENTYPE_NONE,
314 TOKENTYPE_IVAL,
6154021b 315 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6 316 TOKENTYPE_PVAL,
aeaef349 317 TOKENTYPE_OPVAL
bbf60fe6
DM
318};
319
6d4a66ac
NC
320static struct debug_tokens {
321 const int token;
322 enum token_type type;
323 const char *name;
324} const debug_tokens[] =
9041c2e3 325{
bbf60fe6
DM
326 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
327 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
328 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
329 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
330 { ARROW, TOKENTYPE_NONE, "ARROW" },
331 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
332 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
333 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
334 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
335 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 336 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
337 { DO, TOKENTYPE_NONE, "DO" },
338 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
339 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
340 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
341 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
342 { ELSE, TOKENTYPE_NONE, "ELSE" },
343 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
344 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
345 { FOR, TOKENTYPE_IVAL, "FOR" },
346 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
347 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
348 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 349 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
350 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
351 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
352 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 353 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
354 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
355 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 356 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
357 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
358 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
359 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
360 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
361 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
362 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
363 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
364 { MY, TOKENTYPE_IVAL, "MY" },
365 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
366 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
367 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
368 { OROP, TOKENTYPE_IVAL, "OROP" },
369 { OROR, TOKENTYPE_NONE, "OROR" },
370 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
f3f204dc 371 { PEG, TOKENTYPE_NONE, "PEG" },
88e1f1a2
JV
372 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
373 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
374 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
375 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
376 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
377 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
378 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
379 { PREINC, TOKENTYPE_NONE, "PREINC" },
380 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
f3f204dc 381 { QWLIST, TOKENTYPE_OPVAL, "QWLIST" },
bbf60fe6
DM
382 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
383 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
f3f204dc 384 { REQUIRE, TOKENTYPE_NONE, "REQUIRE" },
bbf60fe6
DM
385 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
386 { SUB, TOKENTYPE_NONE, "SUB" },
387 { THING, TOKENTYPE_OPVAL, "THING" },
388 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
389 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
390 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
391 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
392 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
393 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 394 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
395 { WHILE, TOKENTYPE_IVAL, "WHILE" },
396 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 397 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 398 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
399};
400
6154021b 401/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 402
bbf60fe6 403STATIC int
704d4215 404S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 405{
97aff369 406 dVAR;
7918f24d
NC
407
408 PERL_ARGS_ASSERT_TOKEREPORT;
409
bbf60fe6 410 if (DEBUG_T_TEST) {
bd61b366 411 const char *name = NULL;
bbf60fe6 412 enum token_type type = TOKENTYPE_NONE;
f54cb97a 413 const struct debug_tokens *p;
396482e1 414 SV* const report = newSVpvs("<== ");
bbf60fe6 415
f54cb97a 416 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
417 if (p->token == (int)rv) {
418 name = p->name;
419 type = p->type;
420 break;
421 }
422 }
423 if (name)
54667de8 424 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
425 else if ((char)rv > ' ' && (char)rv < '~')
426 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
427 else if (!rv)
396482e1 428 sv_catpvs(report, "EOF");
bbf60fe6
DM
429 else
430 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
431 switch (type) {
432 case TOKENTYPE_NONE:
bbf60fe6
DM
433 break;
434 case TOKENTYPE_IVAL:
704d4215 435 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
436 break;
437 case TOKENTYPE_OPNUM:
438 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 439 PL_op_name[lvalp->ival]);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_PVAL:
704d4215 442 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
443 break;
444 case TOKENTYPE_OPVAL:
704d4215 445 if (lvalp->opval) {
401441c0 446 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
447 PL_op_name[lvalp->opval->op_type]);
448 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 449 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 450 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
451 }
452
453 }
401441c0 454 else
396482e1 455 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
456 break;
457 }
b6007c36 458 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
459 };
460 return (int)rv;
998054bd
SC
461}
462
b6007c36
DM
463
464/* print the buffer with suitable escapes */
465
466STATIC void
15f169a1 467S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 468{
396482e1 469 SV* const tmp = newSVpvs("");
7918f24d
NC
470
471 PERL_ARGS_ASSERT_PRINTBUF;
472
b6007c36
DM
473 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
474 SvREFCNT_dec(tmp);
475}
476
8fa7f367
JH
477#endif
478
8290c323
NC
479static int
480S_deprecate_commaless_var_list(pTHX) {
481 PL_expect = XTERM;
482 deprecate("comma-less variable list");
483 return REPORT(','); /* grandfather non-comma-format format */
484}
485
ffb4593c
NT
486/*
487 * S_ao
488 *
c963b151
BD
489 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
490 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
491 */
492
76e3520e 493STATIC int
cea2e8a9 494S_ao(pTHX_ int toketype)
a0d0e21e 495{
97aff369 496 dVAR;
3280af22
NIS
497 if (*PL_bufptr == '=') {
498 PL_bufptr++;
a0d0e21e 499 if (toketype == ANDAND)
6154021b 500 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 501 else if (toketype == OROR)
6154021b 502 pl_yylval.ival = OP_ORASSIGN;
c963b151 503 else if (toketype == DORDOR)
6154021b 504 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
505 toketype = ASSIGNOP;
506 }
507 return toketype;
508}
509
ffb4593c
NT
510/*
511 * S_no_op
512 * When Perl expects an operator and finds something else, no_op
513 * prints the warning. It always prints "<something> found where
514 * operator expected. It prints "Missing semicolon on previous line?"
515 * if the surprise occurs at the start of the line. "do you need to
516 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
517 * where the compiler doesn't know if foo is a method call or a function.
518 * It prints "Missing operator before end of line" if there's nothing
519 * after the missing operator, or "... before <...>" if there is something
520 * after the missing operator.
521 */
522
76e3520e 523STATIC void
15f169a1 524S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 525{
97aff369 526 dVAR;
9d4ba2ae
AL
527 char * const oldbp = PL_bufptr;
528 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 529
7918f24d
NC
530 PERL_ARGS_ASSERT_NO_OP;
531
1189a94a
GS
532 if (!s)
533 s = oldbp;
07c798fb 534 else
1189a94a 535 PL_bufptr = s;
734ab321 536 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
537 if (ckWARN_d(WARN_SYNTAX)) {
538 if (is_first)
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
540 "\t(Missing semicolon on previous line?)\n");
541 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 542 const char *t;
734ab321
BF
543 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
544 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 545 NOOP;
56da5a46
RGS
546 if (t < PL_bufptr && isSPACE(*t))
547 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
548 "\t(Do you need to predeclare %"SVf"?)\n",
549 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
550 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
551 }
552 else {
553 assert(s >= oldbp);
554 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
555 "\t(Missing operator before %"SVf"?)\n",
556 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
557 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 558 }
07c798fb 559 }
3280af22 560 PL_bufptr = oldbp;
8990e307
LW
561}
562
ffb4593c
NT
563/*
564 * S_missingterm
565 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 566 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
567 * If we're in a delimited string and the delimiter is a control
568 * character, it's reformatted into a two-char sequence like ^C.
569 * This is fatal.
570 */
571
76e3520e 572STATIC void
cea2e8a9 573S_missingterm(pTHX_ char *s)
8990e307 574{
97aff369 575 dVAR;
8990e307
LW
576 char tmpbuf[3];
577 char q;
578 if (s) {
9d4ba2ae 579 char * const nl = strrchr(s,'\n');
d2719217 580 if (nl)
8990e307
LW
581 *nl = '\0';
582 }
463559e7 583 else if (isCNTRL(PL_multi_close)) {
8990e307 584 *tmpbuf = '^';
585ec06d 585 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
586 tmpbuf[2] = '\0';
587 s = tmpbuf;
588 }
589 else {
eb160463 590 *tmpbuf = (char)PL_multi_close;
8990e307
LW
591 tmpbuf[1] = '\0';
592 s = tmpbuf;
593 }
594 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 595 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 596}
79072805 597
dd0ac2b9
FC
598#include "feature.h"
599
0d863452 600/*
0d863452
RH
601 * Check whether the named feature is enabled.
602 */
26ea9e12 603bool
3fff3427 604Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 605{
97aff369 606 dVAR;
4a731d7b 607 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
608
609 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
610
611 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 612
26ea9e12
NC
613 if (namelen > MAX_FEATURE_LEN)
614 return FALSE;
3fff3427 615 memcpy(&he_name[8], name, namelen);
7d69d4a6 616
c8ca97b0
NC
617 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
618 REFCOUNTED_HE_EXISTS));
0d863452
RH
619}
620
ffb4593c 621/*
9cbb5ea2
GS
622 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
623 * utf16-to-utf8-reversed.
ffb4593c
NT
624 */
625
c39cd008
GS
626#ifdef PERL_CR_FILTER
627static void
628strip_return(SV *sv)
629{
95a20fc0 630 register const char *s = SvPVX_const(sv);
9d4ba2ae 631 register const char * const e = s + SvCUR(sv);
7918f24d
NC
632
633 PERL_ARGS_ASSERT_STRIP_RETURN;
634
c39cd008
GS
635 /* outer loop optimized to do nothing if there are no CR-LFs */
636 while (s < e) {
637 if (*s++ == '\r' && *s == '\n') {
638 /* hit a CR-LF, need to copy the rest */
639 register char *d = s - 1;
640 *d++ = *s++;
641 while (s < e) {
642 if (*s == '\r' && s[1] == '\n')
643 s++;
644 *d++ = *s++;
645 }
646 SvCUR(sv) -= s - d;
647 return;
648 }
649 }
650}
a868473f 651
76e3520e 652STATIC I32
c39cd008 653S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 654{
f54cb97a 655 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
656 if (count > 0 && !maxlen)
657 strip_return(sv);
658 return count;
a868473f
NIS
659}
660#endif
661
ffb4593c 662/*
8eaa0acf
Z
663=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
664
665Creates and initialises a new lexer/parser state object, supplying
666a context in which to lex and parse from a new source of Perl code.
667A pointer to the new state object is placed in L</PL_parser>. An entry
668is made on the save stack so that upon unwinding the new state object
669will be destroyed and the former value of L</PL_parser> will be restored.
670Nothing else need be done to clean up the parsing context.
671
672The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
673non-null, provides a string (in SV form) containing code to be parsed.
674A copy of the string is made, so subsequent modification of I<line>
675does not affect parsing. I<rsfp>, if non-null, provides an input stream
676from which code will be read to be parsed. If both are non-null, the
677code in I<line> comes first and must consist of complete lines of input,
678and I<rsfp> supplies the remainder of the source.
679
e368b3bd
FC
680The I<flags> parameter is reserved for future use. Currently it is only
681used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
682
683=cut
684*/
ffb4593c 685
27fcb6ee 686/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
687 can share filters with the current parser.
688 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
689 caller, hence isn't owned by the parser, so shouldn't be closed on parser
690 destruction. This is used to handle the case of defaulting to reading the
691 script from the standard input because no filename was given on the command
692 line (without getting confused by situation where STDIN has been closed, so
693 the script handle is opened on fd 0) */
27fcb6ee 694
a0d0e21e 695void
8eaa0acf 696Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 697{
97aff369 698 dVAR;
6ef55633 699 const char *s = NULL;
5486870f 700 yy_parser *parser, *oparser;
60d63348 701 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 702 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
703
704 /* create and initialise a parser */
705
199e78b7 706 Newxz(parser, 1, yy_parser);
5486870f 707 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
708 PL_parser = parser;
709
28ac2b49
Z
710 parser->stack = NULL;
711 parser->ps = NULL;
712 parser->stack_size = 0;
acdf0a21 713
e3abe207
DM
714 /* on scope exit, free this parser and restore any outer one */
715 SAVEPARSER(parser);
7c4baf47 716 parser->saved_curcop = PL_curcop;
e3abe207 717
acdf0a21 718 /* initialise lexer state */
8990e307 719
fb205e7a
DM
720#ifdef PERL_MAD
721 parser->curforce = -1;
722#else
723 parser->nexttoke = 0;
724#endif
ca4cfd28 725 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 726 parser->copline = NOLINE;
5afb0a62 727 parser->lex_state = LEX_NORMAL;
c2598295 728 parser->expect = XSTATE;
2f9285f8 729 parser->rsfp = rsfp;
27fcb6ee
FC
730 parser->rsfp_filters =
731 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
732 ? NULL
733 : MUTABLE_AV(SvREFCNT_inc(
734 oparser->rsfp_filters
735 ? oparser->rsfp_filters
736 : (oparser->rsfp_filters = newAV())
737 ));
2f9285f8 738
199e78b7
DM
739 Newx(parser->lex_brackstack, 120, char);
740 Newx(parser->lex_casestack, 12, char);
741 *parser->lex_casestack = '\0';
02b34bbe 742
10efb74f 743 if (line) {
0528fd32 744 STRLEN len;
10efb74f 745 s = SvPV_const(line, len);
0abcdfa4
FC
746 parser->linestr = flags & LEX_START_COPIED
747 ? SvREFCNT_inc_simple_NN(line)
748 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 749 if (!len || s[len-1] != ';')
bdc0bf6f 750 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
751 } else {
752 parser->linestr = newSVpvs("\n;");
8990e307 753 }
f06b5848
DM
754 parser->oldoldbufptr =
755 parser->oldbufptr =
756 parser->bufptr =
757 parser->linestart = SvPVX(parser->linestr);
758 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
759 parser->last_lop = parser->last_uni = NULL;
87606032
NC
760 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
761 |LEX_DONT_CLOSE_RSFP);
737c24fc 762
60d63348 763 parser->in_pod = parser->filtered = 0;
79072805 764}
a687059c 765
e3abe207
DM
766
767/* delete a parser object */
768
769void
770Perl_parser_free(pTHX_ const yy_parser *parser)
771{
7918f24d
NC
772 PERL_ARGS_ASSERT_PARSER_FREE;
773
7c4baf47 774 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
775 SvREFCNT_dec(parser->linestr);
776
87606032 777 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 778 PerlIO_clearerr(parser->rsfp);
799361c3
SH
779 else if (parser->rsfp && (!parser->old_parser ||
780 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 781 PerlIO_close(parser->rsfp);
5486870f 782 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 783
e3abe207
DM
784 Safefree(parser->lex_brackstack);
785 Safefree(parser->lex_casestack);
786 PL_parser = parser->old_parser;
787 Safefree(parser);
788}
789
790
ffb4593c 791/*
f0e67a1d
Z
792=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
793
794Buffer scalar containing the chunk currently under consideration of the
795text currently being lexed. This is always a plain string scalar (for
796which C<SvPOK> is true). It is not intended to be used as a scalar by
797normal scalar means; instead refer to the buffer directly by the pointer
798variables described below.
799
800The lexer maintains various C<char*> pointers to things in the
801C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
802reallocated, all of these pointers must be updated. Don't attempt to
803do this manually, but rather use L</lex_grow_linestr> if you need to
804reallocate the buffer.
805
806The content of the text chunk in the buffer is commonly exactly one
807complete line of input, up to and including a newline terminator,
808but there are situations where it is otherwise. The octets of the
809buffer may be intended to be interpreted as either UTF-8 or Latin-1.
810The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
811flag on this scalar, which may disagree with it.
812
813For direct examination of the buffer, the variable
814L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
815lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
816of these pointers is usually preferable to examination of the scalar
817through normal scalar means.
818
819=for apidoc AmxU|char *|PL_parser-E<gt>bufend
820
821Direct pointer to the end of the chunk of text currently being lexed, the
822end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
823+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
824always located at the end of the buffer, and does not count as part of
825the buffer's contents.
826
827=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
828
829Points to the current position of lexing inside the lexer buffer.
830Characters around this point may be freely examined, within
831the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
832L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
833interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
834
835Lexing code (whether in the Perl core or not) moves this pointer past
836the characters that it consumes. It is also expected to perform some
837bookkeeping whenever a newline character is consumed. This movement
838can be more conveniently performed by the function L</lex_read_to>,
839which handles newlines appropriately.
840
841Interpretation of the buffer's octets can be abstracted out by
842using the slightly higher-level functions L</lex_peek_unichar> and
843L</lex_read_unichar>.
844
845=for apidoc AmxU|char *|PL_parser-E<gt>linestart
846
847Points to the start of the current line inside the lexer buffer.
848This is useful for indicating at which column an error occurred, and
849not much else. This must be updated by any lexing code that consumes
850a newline; the function L</lex_read_to> handles this detail.
851
852=cut
853*/
854
855/*
856=for apidoc Amx|bool|lex_bufutf8
857
858Indicates whether the octets in the lexer buffer
859(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
860of Unicode characters. If not, they should be interpreted as Latin-1
861characters. This is analogous to the C<SvUTF8> flag for scalars.
862
863In UTF-8 mode, it is not guaranteed that the lexer buffer actually
864contains valid UTF-8. Lexing code must be robust in the face of invalid
865encoding.
866
867The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
868is significant, but not the whole story regarding the input character
869encoding. Normally, when a file is being read, the scalar contains octets
870and its C<SvUTF8> flag is off, but the octets should be interpreted as
871UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
872however, the scalar may have the C<SvUTF8> flag on, and in this case its
873octets should be interpreted as UTF-8 unless the C<use bytes> pragma
874is in effect. This logic may change in the future; use this function
875instead of implementing the logic yourself.
876
877=cut
878*/
879
880bool
881Perl_lex_bufutf8(pTHX)
882{
883 return UTF;
884}
885
886/*
887=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
888
889Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
890at least I<len> octets (including terminating NUL). Returns a
891pointer to the reallocated buffer. This is necessary before making
892any direct modification of the buffer that would increase its length.
893L</lex_stuff_pvn> provides a more convenient way to insert text into
894the buffer.
895
896Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
897this function updates all of the lexer's variables that point directly
898into the buffer.
899
900=cut
901*/
902
903char *
904Perl_lex_grow_linestr(pTHX_ STRLEN len)
905{
906 SV *linestr;
907 char *buf;
908 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 909 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
910 linestr = PL_parser->linestr;
911 buf = SvPVX(linestr);
912 if (len <= SvLEN(linestr))
913 return buf;
914 bufend_pos = PL_parser->bufend - buf;
915 bufptr_pos = PL_parser->bufptr - buf;
916 oldbufptr_pos = PL_parser->oldbufptr - buf;
917 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
918 linestart_pos = PL_parser->linestart - buf;
919 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
920 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
c7641931
DM
921 re_eval_start_pos = PL_sublex_info.re_eval_start ?
922 PL_sublex_info.re_eval_start - buf : 0;
923
f0e67a1d 924 buf = sv_grow(linestr, len);
c7641931 925
f0e67a1d
Z
926 PL_parser->bufend = buf + bufend_pos;
927 PL_parser->bufptr = buf + bufptr_pos;
928 PL_parser->oldbufptr = buf + oldbufptr_pos;
929 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
930 PL_parser->linestart = buf + linestart_pos;
931 if (PL_parser->last_uni)
932 PL_parser->last_uni = buf + last_uni_pos;
933 if (PL_parser->last_lop)
934 PL_parser->last_lop = buf + last_lop_pos;
c7641931
DM
935 if (PL_sublex_info.re_eval_start)
936 PL_sublex_info.re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
937 return buf;
938}
939
940/*
83aa740e 941=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
942
943Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
944immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
945reallocating the buffer if necessary. This means that lexing code that
946runs later will see the characters as if they had appeared in the input.
947It is not recommended to do this as part of normal parsing, and most
948uses of this facility run the risk of the inserted characters being
949interpreted in an unintended manner.
950
951The string to be inserted is represented by I<len> octets starting
952at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
953according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
954The characters are recoded for the lexer buffer, according to how the
955buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 956to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
957function is more convenient.
958
959=cut
960*/
961
962void
83aa740e 963Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 964{
749123ff 965 dVAR;
f0e67a1d
Z
966 char *bufptr;
967 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
968 if (flags & ~(LEX_STUFF_UTF8))
969 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
970 if (UTF) {
971 if (flags & LEX_STUFF_UTF8) {
972 goto plain_copy;
973 } else {
974 STRLEN highhalf = 0;
83aa740e 975 const char *p, *e = pv+len;
f0e67a1d
Z
976 for (p = pv; p != e; p++)
977 highhalf += !!(((U8)*p) & 0x80);
978 if (!highhalf)
979 goto plain_copy;
980 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
981 bufptr = PL_parser->bufptr;
982 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
983 SvCUR_set(PL_parser->linestr,
984 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
985 PL_parser->bufend += len+highhalf;
986 for (p = pv; p != e; p++) {
987 U8 c = (U8)*p;
988 if (c & 0x80) {
989 *bufptr++ = (char)(0xc0 | (c >> 6));
990 *bufptr++ = (char)(0x80 | (c & 0x3f));
991 } else {
992 *bufptr++ = (char)c;
993 }
994 }
995 }
996 } else {
997 if (flags & LEX_STUFF_UTF8) {
998 STRLEN highhalf = 0;
83aa740e 999 const char *p, *e = pv+len;
f0e67a1d
Z
1000 for (p = pv; p != e; p++) {
1001 U8 c = (U8)*p;
1002 if (c >= 0xc4) {
1003 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1004 "non-Latin-1 character into Latin-1 input");
1005 } else if (c >= 0xc2 && p+1 != e &&
1006 (((U8)p[1]) & 0xc0) == 0x80) {
1007 p++;
1008 highhalf++;
1009 } else if (c >= 0x80) {
1010 /* malformed UTF-8 */
1011 ENTER;
1012 SAVESPTR(PL_warnhook);
1013 PL_warnhook = PERL_WARNHOOK_FATAL;
1014 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1015 LEAVE;
1016 }
1017 }
1018 if (!highhalf)
1019 goto plain_copy;
1020 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1021 bufptr = PL_parser->bufptr;
1022 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1023 SvCUR_set(PL_parser->linestr,
1024 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1025 PL_parser->bufend += len-highhalf;
1026 for (p = pv; p != e; p++) {
1027 U8 c = (U8)*p;
1028 if (c & 0x80) {
1029 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1030 p++;
1031 } else {
1032 *bufptr++ = (char)c;
1033 }
1034 }
1035 } else {
1036 plain_copy:
1037 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1038 bufptr = PL_parser->bufptr;
1039 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1040 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1041 PL_parser->bufend += len;
1042 Copy(pv, bufptr, len, char);
1043 }
1044 }
1045}
1046
1047/*
9dcc53ea
Z
1048=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1049
1050Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1051immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1052reallocating the buffer if necessary. This means that lexing code that
1053runs later will see the characters as if they had appeared in the input.
1054It is not recommended to do this as part of normal parsing, and most
1055uses of this facility run the risk of the inserted characters being
1056interpreted in an unintended manner.
1057
1058The string to be inserted is represented by octets starting at I<pv>
1059and continuing to the first nul. These octets are interpreted as either
1060UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1061in I<flags>. The characters are recoded for the lexer buffer, according
1062to how the buffer is currently being interpreted (L</lex_bufutf8>).
1063If it is not convenient to nul-terminate a string to be inserted, the
1064L</lex_stuff_pvn> function is more appropriate.
1065
1066=cut
1067*/
1068
1069void
1070Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1071{
1072 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1073 lex_stuff_pvn(pv, strlen(pv), flags);
1074}
1075
1076/*
f0e67a1d
Z
1077=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1078
1079Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1080immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1081reallocating the buffer if necessary. This means that lexing code that
1082runs later will see the characters as if they had appeared in the input.
1083It is not recommended to do this as part of normal parsing, and most
1084uses of this facility run the risk of the inserted characters being
1085interpreted in an unintended manner.
1086
1087The string to be inserted is the string value of I<sv>. The characters
1088are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1089being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1090not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1091need to construct a scalar.
1092
1093=cut
1094*/
1095
1096void
1097Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1098{
1099 char *pv;
1100 STRLEN len;
1101 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1102 if (flags)
1103 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1104 pv = SvPV(sv, len);
1105 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1106}
1107
1108/*
1109=for apidoc Amx|void|lex_unstuff|char *ptr
1110
1111Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1112I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1113This hides the discarded text from any lexing code that runs later,
1114as if the text had never appeared.
1115
1116This is not the normal way to consume lexed text. For that, use
1117L</lex_read_to>.
1118
1119=cut
1120*/
1121
1122void
1123Perl_lex_unstuff(pTHX_ char *ptr)
1124{
1125 char *buf, *bufend;
1126 STRLEN unstuff_len;
1127 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1128 buf = PL_parser->bufptr;
1129 if (ptr < buf)
1130 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1131 if (ptr == buf)
1132 return;
1133 bufend = PL_parser->bufend;
1134 if (ptr > bufend)
1135 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1136 unstuff_len = ptr - buf;
1137 Move(ptr, buf, bufend+1-ptr, char);
1138 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1139 PL_parser->bufend = bufend - unstuff_len;
1140}
1141
1142/*
1143=for apidoc Amx|void|lex_read_to|char *ptr
1144
1145Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1146to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1147performing the correct bookkeeping whenever a newline character is passed.
1148This is the normal way to consume lexed text.
1149
1150Interpretation of the buffer's octets can be abstracted out by
1151using the slightly higher-level functions L</lex_peek_unichar> and
1152L</lex_read_unichar>.
1153
1154=cut
1155*/
1156
1157void
1158Perl_lex_read_to(pTHX_ char *ptr)
1159{
1160 char *s;
1161 PERL_ARGS_ASSERT_LEX_READ_TO;
1162 s = PL_parser->bufptr;
1163 if (ptr < s || ptr > PL_parser->bufend)
1164 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1165 for (; s != ptr; s++)
1166 if (*s == '\n') {
1167 CopLINE_inc(PL_curcop);
1168 PL_parser->linestart = s+1;
1169 }
1170 PL_parser->bufptr = ptr;
1171}
1172
1173/*
1174=for apidoc Amx|void|lex_discard_to|char *ptr
1175
1176Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1177up to I<ptr>. The remaining content of the buffer will be moved, and
1178all pointers into the buffer updated appropriately. I<ptr> must not
1179be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1180it is not permitted to discard text that has yet to be lexed.
1181
1182Normally it is not necessarily to do this directly, because it suffices to
1183use the implicit discarding behaviour of L</lex_next_chunk> and things
1184based on it. However, if a token stretches across multiple lines,
1f317c95 1185and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1186that purpose, then after completion of the token it would be wise to
1187explicitly discard the now-unneeded earlier lines, to avoid future
1188multi-line tokens growing the buffer without bound.
1189
1190=cut
1191*/
1192
1193void
1194Perl_lex_discard_to(pTHX_ char *ptr)
1195{
1196 char *buf;
1197 STRLEN discard_len;
1198 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1199 buf = SvPVX(PL_parser->linestr);
1200 if (ptr < buf)
1201 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1202 if (ptr == buf)
1203 return;
1204 if (ptr > PL_parser->bufptr)
1205 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1206 discard_len = ptr - buf;
1207 if (PL_parser->oldbufptr < ptr)
1208 PL_parser->oldbufptr = ptr;
1209 if (PL_parser->oldoldbufptr < ptr)
1210 PL_parser->oldoldbufptr = ptr;
1211 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1212 PL_parser->last_uni = NULL;
1213 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1214 PL_parser->last_lop = NULL;
1215 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1216 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1217 PL_parser->bufend -= discard_len;
1218 PL_parser->bufptr -= discard_len;
1219 PL_parser->oldbufptr -= discard_len;
1220 PL_parser->oldoldbufptr -= discard_len;
1221 if (PL_parser->last_uni)
1222 PL_parser->last_uni -= discard_len;
1223 if (PL_parser->last_lop)
1224 PL_parser->last_lop -= discard_len;
1225}
1226
1227/*
1228=for apidoc Amx|bool|lex_next_chunk|U32 flags
1229
1230Reads in the next chunk of text to be lexed, appending it to
1231L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1232looked to the end of the current chunk and wants to know more. It is
1233usual, but not necessary, for lexing to have consumed the entirety of
1234the current chunk at this time.
1235
1236If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1237chunk (i.e., the current chunk has been entirely consumed), normally the
1238current chunk will be discarded at the same time that the new chunk is
1239read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1240will not be discarded. If the current chunk has not been entirely
1241consumed, then it will not be discarded regardless of the flag.
1242
1243Returns true if some new text was added to the buffer, or false if the
1244buffer has reached the end of the input text.
1245
1246=cut
1247*/
1248
1249#define LEX_FAKE_EOF 0x80000000
1250
1251bool
1252Perl_lex_next_chunk(pTHX_ U32 flags)
1253{
1254 SV *linestr;
1255 char *buf;
1256 STRLEN old_bufend_pos, new_bufend_pos;
1257 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1258 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1259 bool got_some_for_debugger = 0;
f0e67a1d
Z
1260 bool got_some;
1261 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1262 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1263 linestr = PL_parser->linestr;
1264 buf = SvPVX(linestr);
1265 if (!(flags & LEX_KEEP_PREVIOUS) &&
1266 PL_parser->bufptr == PL_parser->bufend) {
1267 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1268 linestart_pos = 0;
1269 if (PL_parser->last_uni != PL_parser->bufend)
1270 PL_parser->last_uni = NULL;
1271 if (PL_parser->last_lop != PL_parser->bufend)
1272 PL_parser->last_lop = NULL;
1273 last_uni_pos = last_lop_pos = 0;
1274 *buf = 0;
1275 SvCUR(linestr) = 0;
1276 } else {
1277 old_bufend_pos = PL_parser->bufend - buf;
1278 bufptr_pos = PL_parser->bufptr - buf;
1279 oldbufptr_pos = PL_parser->oldbufptr - buf;
1280 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1281 linestart_pos = PL_parser->linestart - buf;
1282 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1283 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1284 }
1285 if (flags & LEX_FAKE_EOF) {
1286 goto eof;
60d63348 1287 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1288 got_some = 0;
1289 } else if (filter_gets(linestr, old_bufend_pos)) {
1290 got_some = 1;
17cc9359 1291 got_some_for_debugger = 1;
f0e67a1d 1292 } else {
580561a3
Z
1293 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1294 sv_setpvs(linestr, "");
f0e67a1d
Z
1295 eof:
1296 /* End of real input. Close filehandle (unless it was STDIN),
1297 * then add implicit termination.
1298 */
87606032 1299 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1300 PerlIO_clearerr(PL_parser->rsfp);
1301 else if (PL_parser->rsfp)
1302 (void)PerlIO_close(PL_parser->rsfp);
1303 PL_parser->rsfp = NULL;
60d63348 1304 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1305#ifdef PERL_MAD
1306 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1307 PL_faketokens = 1;
1308#endif
1309 if (!PL_in_eval && PL_minus_p) {
1310 sv_catpvs(linestr,
1311 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1312 PL_minus_n = PL_minus_p = 0;
1313 } else if (!PL_in_eval && PL_minus_n) {
1314 sv_catpvs(linestr, /*{*/";}");
1315 PL_minus_n = 0;
1316 } else
1317 sv_catpvs(linestr, ";");
1318 got_some = 1;
1319 }
1320 buf = SvPVX(linestr);
1321 new_bufend_pos = SvCUR(linestr);
1322 PL_parser->bufend = buf + new_bufend_pos;
1323 PL_parser->bufptr = buf + bufptr_pos;
1324 PL_parser->oldbufptr = buf + oldbufptr_pos;
1325 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1326 PL_parser->linestart = buf + linestart_pos;
1327 if (PL_parser->last_uni)
1328 PL_parser->last_uni = buf + last_uni_pos;
1329 if (PL_parser->last_lop)
1330 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1331 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1332 PL_curstash != PL_debstash) {
1333 /* debugger active and we're not compiling the debugger code,
1334 * so store the line into the debugger's array of lines
1335 */
1336 update_debugger_info(NULL, buf+old_bufend_pos,
1337 new_bufend_pos-old_bufend_pos);
1338 }
1339 return got_some;
1340}
1341
1342/*
1343=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1344
1345Looks ahead one (Unicode) character in the text currently being lexed.
1346Returns the codepoint (unsigned integer value) of the next character,
1347or -1 if lexing has reached the end of the input text. To consume the
1348peeked character, use L</lex_read_unichar>.
1349
1350If the next character is in (or extends into) the next chunk of input
1351text, the next chunk will be read in. Normally the current chunk will be
1352discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1353then the current chunk will not be discarded.
1354
1355If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1356is encountered, an exception is generated.
1357
1358=cut
1359*/
1360
1361I32
1362Perl_lex_peek_unichar(pTHX_ U32 flags)
1363{
749123ff 1364 dVAR;
f0e67a1d
Z
1365 char *s, *bufend;
1366 if (flags & ~(LEX_KEEP_PREVIOUS))
1367 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1368 s = PL_parser->bufptr;
1369 bufend = PL_parser->bufend;
1370 if (UTF) {
1371 U8 head;
1372 I32 unichar;
1373 STRLEN len, retlen;
1374 if (s == bufend) {
1375 if (!lex_next_chunk(flags))
1376 return -1;
1377 s = PL_parser->bufptr;
1378 bufend = PL_parser->bufend;
1379 }
1380 head = (U8)*s;
1381 if (!(head & 0x80))
1382 return head;
1383 if (head & 0x40) {
1384 len = PL_utf8skip[head];
1385 while ((STRLEN)(bufend-s) < len) {
1386 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1387 break;
1388 s = PL_parser->bufptr;
1389 bufend = PL_parser->bufend;
1390 }
1391 }
1392 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1393 if (retlen == (STRLEN)-1) {
1394 /* malformed UTF-8 */
1395 ENTER;
1396 SAVESPTR(PL_warnhook);
1397 PL_warnhook = PERL_WARNHOOK_FATAL;
1398 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1399 LEAVE;
1400 }
1401 return unichar;
1402 } else {
1403 if (s == bufend) {
1404 if (!lex_next_chunk(flags))
1405 return -1;
1406 s = PL_parser->bufptr;
1407 }
1408 return (U8)*s;
1409 }
1410}
1411
1412/*
1413=for apidoc Amx|I32|lex_read_unichar|U32 flags
1414
1415Reads the next (Unicode) character in the text currently being lexed.
1416Returns the codepoint (unsigned integer value) of the character read,
1417and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1418if lexing has reached the end of the input text. To non-destructively
1419examine the next character, use L</lex_peek_unichar> instead.
1420
1421If the next character is in (or extends into) the next chunk of input
1422text, the next chunk will be read in. Normally the current chunk will be
1423discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1424then the current chunk will not be discarded.
1425
1426If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1427is encountered, an exception is generated.
1428
1429=cut
1430*/
1431
1432I32
1433Perl_lex_read_unichar(pTHX_ U32 flags)
1434{
1435 I32 c;
1436 if (flags & ~(LEX_KEEP_PREVIOUS))
1437 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1438 c = lex_peek_unichar(flags);
1439 if (c != -1) {
1440 if (c == '\n')
1441 CopLINE_inc(PL_curcop);
d9018cbe
EB
1442 if (UTF)
1443 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1444 else
1445 ++(PL_parser->bufptr);
f0e67a1d
Z
1446 }
1447 return c;
1448}
1449
1450/*
1451=for apidoc Amx|void|lex_read_space|U32 flags
1452
1453Reads optional spaces, in Perl style, in the text currently being
1454lexed. The spaces may include ordinary whitespace characters and
1455Perl-style comments. C<#line> directives are processed if encountered.
1456L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1457at a non-space character (or the end of the input text).
1458
1459If spaces extend into the next chunk of input text, the next chunk will
1460be read in. Normally the current chunk will be discarded at the same
1461time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1462chunk will not be discarded.
1463
1464=cut
1465*/
1466
f0998909
Z
1467#define LEX_NO_NEXT_CHUNK 0x80000000
1468
f0e67a1d
Z
1469void
1470Perl_lex_read_space(pTHX_ U32 flags)
1471{
1472 char *s, *bufend;
1473 bool need_incline = 0;
f0998909 1474 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1475 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1476#ifdef PERL_MAD
1477 if (PL_skipwhite) {
1478 sv_free(PL_skipwhite);
1479 PL_skipwhite = NULL;
1480 }
1481 if (PL_madskills)
1482 PL_skipwhite = newSVpvs("");
1483#endif /* PERL_MAD */
1484 s = PL_parser->bufptr;
1485 bufend = PL_parser->bufend;
1486 while (1) {
1487 char c = *s;
1488 if (c == '#') {
1489 do {
1490 c = *++s;
1491 } while (!(c == '\n' || (c == 0 && s == bufend)));
1492 } else if (c == '\n') {
1493 s++;
1494 PL_parser->linestart = s;
1495 if (s == bufend)
1496 need_incline = 1;
1497 else
1498 incline(s);
1499 } else if (isSPACE(c)) {
1500 s++;
1501 } else if (c == 0 && s == bufend) {
1502 bool got_more;
1503#ifdef PERL_MAD
1504 if (PL_madskills)
1505 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1506#endif /* PERL_MAD */
f0998909
Z
1507 if (flags & LEX_NO_NEXT_CHUNK)
1508 break;
f0e67a1d
Z
1509 PL_parser->bufptr = s;
1510 CopLINE_inc(PL_curcop);
1511 got_more = lex_next_chunk(flags);
1512 CopLINE_dec(PL_curcop);
1513 s = PL_parser->bufptr;
1514 bufend = PL_parser->bufend;
1515 if (!got_more)
1516 break;
1517 if (need_incline && PL_parser->rsfp) {
1518 incline(s);
1519 need_incline = 0;
1520 }
1521 } else {
1522 break;
1523 }
1524 }
1525#ifdef PERL_MAD
1526 if (PL_madskills)
1527 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1528#endif /* PERL_MAD */
1529 PL_parser->bufptr = s;
1530}
1531
1532/*
ffb4593c
NT
1533 * S_incline
1534 * This subroutine has nothing to do with tilting, whether at windmills
1535 * or pinball tables. Its name is short for "increment line". It
57843af0 1536 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1537 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1538 * # line 500 "foo.pm"
1539 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1540 */
1541
76e3520e 1542STATIC void
d9095cec 1543S_incline(pTHX_ const char *s)
463ee0b2 1544{
97aff369 1545 dVAR;
d9095cec
NC
1546 const char *t;
1547 const char *n;
1548 const char *e;
8818d409 1549 line_t line_num;
463ee0b2 1550
7918f24d
NC
1551 PERL_ARGS_ASSERT_INCLINE;
1552
57843af0 1553 CopLINE_inc(PL_curcop);
463ee0b2
LW
1554 if (*s++ != '#')
1555 return;
d4c19fe8
AL
1556 while (SPACE_OR_TAB(*s))
1557 s++;
73659bf1
GS
1558 if (strnEQ(s, "line", 4))
1559 s += 4;
1560 else
1561 return;
084592ab 1562 if (SPACE_OR_TAB(*s))
73659bf1 1563 s++;
4e553d73 1564 else
73659bf1 1565 return;
d4c19fe8
AL
1566 while (SPACE_OR_TAB(*s))
1567 s++;
463ee0b2
LW
1568 if (!isDIGIT(*s))
1569 return;
d4c19fe8 1570
463ee0b2
LW
1571 n = s;
1572 while (isDIGIT(*s))
1573 s++;
07714eb4 1574 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1575 return;
bf4acbe4 1576 while (SPACE_OR_TAB(*s))
463ee0b2 1577 s++;
73659bf1 1578 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1579 s++;
73659bf1
GS
1580 e = t + 1;
1581 }
463ee0b2 1582 else {
c35e046a
AL
1583 t = s;
1584 while (!isSPACE(*t))
1585 t++;
73659bf1 1586 e = t;
463ee0b2 1587 }
bf4acbe4 1588 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1589 e++;
1590 if (*e != '\n' && *e != '\0')
1591 return; /* false alarm */
1592
8818d409
FC
1593 line_num = atoi(n)-1;
1594
f4dd75d9 1595 if (t - s > 0) {
d9095cec 1596 const STRLEN len = t - s;
19bad673
NC
1597 SV *const temp_sv = CopFILESV(PL_curcop);
1598 const char *cf;
1599 STRLEN tmplen;
1600
1601 if (temp_sv) {
1602 cf = SvPVX(temp_sv);
1603 tmplen = SvCUR(temp_sv);
1604 } else {
1605 cf = NULL;
1606 tmplen = 0;
1607 }
1608
d1299d44 1609 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1610 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1611 * to *{"::_<newfilename"} */
44867030
NC
1612 /* However, the long form of evals is only turned on by the
1613 debugger - usually they're "(eval %lu)" */
1614 char smallbuf[128];
1615 char *tmpbuf;
1616 GV **gvp;
d9095cec 1617 STRLEN tmplen2 = len;
798b63bc 1618 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1619 tmpbuf = smallbuf;
1620 else
2ae0db35 1621 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1622 tmpbuf[0] = '_';
1623 tmpbuf[1] = '<';
2ae0db35 1624 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1625 tmplen += 2;
8a5ee598
RGS
1626 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1627 if (gvp) {
44867030
NC
1628 char *tmpbuf2;
1629 GV *gv2;
1630
1631 if (tmplen2 + 2 <= sizeof smallbuf)
1632 tmpbuf2 = smallbuf;
1633 else
1634 Newx(tmpbuf2, tmplen2 + 2, char);
1635
1636 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1637 /* Either they malloc'd it, or we malloc'd it,
1638 so no prefix is present in ours. */
1639 tmpbuf2[0] = '_';
1640 tmpbuf2[1] = '<';
1641 }
1642
1643 memcpy(tmpbuf2 + 2, s, tmplen2);
1644 tmplen2 += 2;
1645
8a5ee598 1646 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1647 if (!isGV(gv2)) {
8a5ee598 1648 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1649 /* adjust ${"::_<newfilename"} to store the new file name */
1650 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1651 /* The line number may differ. If that is the case,
1652 alias the saved lines that are in the array.
1653 Otherwise alias the whole array. */
1654 if (CopLINE(PL_curcop) == line_num) {
1655 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1656 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1657 }
1658 else if (GvAV(*gvp)) {
1659 AV * const av = GvAV(*gvp);
1660 const I32 start = CopLINE(PL_curcop)+1;
1661 I32 items = AvFILLp(av) - start;
1662 if (items > 0) {
1663 AV * const av2 = GvAVn(gv2);
1664 SV **svp = AvARRAY(av) + start;
1665 I32 l = (I32)line_num+1;
1666 while (items--)
1667 av_store(av2, l++, SvREFCNT_inc(*svp++));
1668 }
1669 }
e5527e4b 1670 }
44867030
NC
1671
1672 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1673 }
e66cf94c 1674 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1675 }
05ec9bb3 1676 CopFILE_free(PL_curcop);
d9095cec 1677 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1678 }
8818d409 1679 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1680}
1681
29595ff2 1682#ifdef PERL_MAD
cd81e915 1683/* skip space before PL_thistoken */
29595ff2
NC
1684
1685STATIC char *
1686S_skipspace0(pTHX_ register char *s)
1687{
7918f24d
NC
1688 PERL_ARGS_ASSERT_SKIPSPACE0;
1689
29595ff2
NC
1690 s = skipspace(s);
1691 if (!PL_madskills)
1692 return s;
cd81e915
NC
1693 if (PL_skipwhite) {
1694 if (!PL_thiswhite)
6b29d1f5 1695 PL_thiswhite = newSVpvs("");
cd81e915
NC
1696 sv_catsv(PL_thiswhite, PL_skipwhite);
1697 sv_free(PL_skipwhite);
1698 PL_skipwhite = 0;
1699 }
1700 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1701 return s;
1702}
1703
cd81e915 1704/* skip space after PL_thistoken */
29595ff2
NC
1705
1706STATIC char *
1707S_skipspace1(pTHX_ register char *s)
1708{
d4c19fe8 1709 const char *start = s;
29595ff2
NC
1710 I32 startoff = start - SvPVX(PL_linestr);
1711
7918f24d
NC
1712 PERL_ARGS_ASSERT_SKIPSPACE1;
1713
29595ff2
NC
1714 s = skipspace(s);
1715 if (!PL_madskills)
1716 return s;
1717 start = SvPVX(PL_linestr) + startoff;
cd81e915 1718 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1719 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1720 PL_thistoken = newSVpvn(tstart, start - tstart);
1721 }
1722 PL_realtokenstart = -1;
1723 if (PL_skipwhite) {
1724 if (!PL_nextwhite)
6b29d1f5 1725 PL_nextwhite = newSVpvs("");
cd81e915
NC
1726 sv_catsv(PL_nextwhite, PL_skipwhite);
1727 sv_free(PL_skipwhite);
1728 PL_skipwhite = 0;
29595ff2
NC
1729 }
1730 return s;
1731}
1732
1733STATIC char *
1734S_skipspace2(pTHX_ register char *s, SV **svp)
1735{
c35e046a
AL
1736 char *start;
1737 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1738 const I32 startoff = s - SvPVX(PL_linestr);
1739
7918f24d
NC
1740 PERL_ARGS_ASSERT_SKIPSPACE2;
1741
29595ff2
NC
1742 s = skipspace(s);
1743 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1744 if (!PL_madskills || !svp)
1745 return s;
1746 start = SvPVX(PL_linestr) + startoff;
cd81e915 1747 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1748 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1749 PL_thistoken = newSVpvn(tstart, start - tstart);
1750 PL_realtokenstart = -1;
29595ff2 1751 }
cd81e915 1752 if (PL_skipwhite) {
29595ff2 1753 if (!*svp)
6b29d1f5 1754 *svp = newSVpvs("");
cd81e915
NC
1755 sv_setsv(*svp, PL_skipwhite);
1756 sv_free(PL_skipwhite);
1757 PL_skipwhite = 0;
29595ff2
NC
1758 }
1759
1760 return s;
1761}
1762#endif
1763
80a702cd 1764STATIC void
15f169a1 1765S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1766{
1767 AV *av = CopFILEAVx(PL_curcop);
1768 if (av) {
b9f83d2f 1769 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1770 if (orig_sv)
1771 sv_setsv(sv, orig_sv);
1772 else
1773 sv_setpvn(sv, buf, len);
80a702cd
RGS
1774 (void)SvIOK_on(sv);
1775 SvIV_set(sv, 0);
1776 av_store(av, (I32)CopLINE(PL_curcop), sv);
1777 }
1778}
1779
ffb4593c
NT
1780/*
1781 * S_skipspace
1782 * Called to gobble the appropriate amount and type of whitespace.
1783 * Skips comments as well.
1784 */
1785
76e3520e 1786STATIC char *
cea2e8a9 1787S_skipspace(pTHX_ register char *s)
a687059c 1788{
5db06880 1789#ifdef PERL_MAD
f0e67a1d
Z
1790 char *start = s;
1791#endif /* PERL_MAD */
7918f24d 1792 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1793#ifdef PERL_MAD
cd81e915
NC
1794 if (PL_skipwhite) {
1795 sv_free(PL_skipwhite);
f0e67a1d 1796 PL_skipwhite = NULL;
5db06880 1797 }
f0e67a1d 1798#endif /* PERL_MAD */
3280af22 1799 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1800 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1801 s++;
f0e67a1d
Z
1802 } else {
1803 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1804 PL_bufptr = s;
f0998909
Z
1805 lex_read_space(LEX_KEEP_PREVIOUS |
1806 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1807 LEX_NO_NEXT_CHUNK : 0));
3280af22 1808 s = PL_bufptr;
f0e67a1d
Z
1809 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1810 if (PL_linestart > PL_bufptr)
1811 PL_bufptr = PL_linestart;
1812 return s;
463ee0b2 1813 }
5db06880 1814#ifdef PERL_MAD
f0e67a1d
Z
1815 if (PL_madskills)
1816 PL_skipwhite = newSVpvn(start, s-start);
1817#endif /* PERL_MAD */
5db06880 1818 return s;
a687059c 1819}
378cc40b 1820
ffb4593c
NT
1821/*
1822 * S_check_uni
1823 * Check the unary operators to ensure there's no ambiguity in how they're
1824 * used. An ambiguous piece of code would be:
1825 * rand + 5
1826 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1827 * the +5 is its argument.
1828 */
1829
76e3520e 1830STATIC void
cea2e8a9 1831S_check_uni(pTHX)
ba106d47 1832{
97aff369 1833 dVAR;
d4c19fe8
AL
1834 const char *s;
1835 const char *t;
2f3197b3 1836
3280af22 1837 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1838 return;
3280af22
NIS
1839 while (isSPACE(*PL_last_uni))
1840 PL_last_uni++;
c35e046a
AL
1841 s = PL_last_uni;
1842 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1843 s++;
3280af22 1844 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1845 return;
6136c704 1846
9b387841
NC
1847 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1848 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1849 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1850}
1851
ffb4593c
NT
1852/*
1853 * LOP : macro to build a list operator. Its behaviour has been replaced
1854 * with a subroutine, S_lop() for which LOP is just another name.
1855 */
1856
a0d0e21e
LW
1857#define LOP(f,x) return lop(f,x,s)
1858
ffb4593c
NT
1859/*
1860 * S_lop
1861 * Build a list operator (or something that might be one). The rules:
1862 * - if we have a next token, then it's a list operator [why?]
1863 * - if the next thing is an opening paren, then it's a function
1864 * - else it's a list operator
1865 */
1866
76e3520e 1867STATIC I32
a0be28da 1868S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1869{
97aff369 1870 dVAR;
7918f24d
NC
1871
1872 PERL_ARGS_ASSERT_LOP;
1873
6154021b 1874 pl_yylval.ival = f;
35c8bce7 1875 CLINE;
3280af22
NIS
1876 PL_expect = x;
1877 PL_bufptr = s;
1878 PL_last_lop = PL_oldbufptr;
eb160463 1879 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1880#ifdef PERL_MAD
1881 if (PL_lasttoke)
78cdf107 1882 goto lstop;
5db06880 1883#else
3280af22 1884 if (PL_nexttoke)
78cdf107 1885 goto lstop;
5db06880 1886#endif
79072805 1887 if (*s == '(')
bbf60fe6 1888 return REPORT(FUNC);
29595ff2 1889 s = PEEKSPACE(s);
79072805 1890 if (*s == '(')
bbf60fe6 1891 return REPORT(FUNC);
78cdf107
Z
1892 else {
1893 lstop:
1894 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1895 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1896 return REPORT(LSTOP);
78cdf107 1897 }
79072805
LW
1898}
1899
5db06880
NC
1900#ifdef PERL_MAD
1901 /*
1902 * S_start_force
1903 * Sets up for an eventual force_next(). start_force(0) basically does
1904 * an unshift, while start_force(-1) does a push. yylex removes items
1905 * on the "pop" end.
1906 */
1907
1908STATIC void
1909S_start_force(pTHX_ int where)
1910{
1911 int i;
1912
cd81e915 1913 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1914 where = PL_lasttoke;
cd81e915
NC
1915 assert(PL_curforce < 0 || PL_curforce == where);
1916 if (PL_curforce != where) {
5db06880
NC
1917 for (i = PL_lasttoke; i > where; --i) {
1918 PL_nexttoke[i] = PL_nexttoke[i-1];
1919 }
1920 PL_lasttoke++;
1921 }
cd81e915 1922 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1923 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1924 PL_curforce = where;
1925 if (PL_nextwhite) {
5db06880 1926 if (PL_madskills)
6b29d1f5 1927 curmad('^', newSVpvs(""));
cd81e915 1928 CURMAD('_', PL_nextwhite);
5db06880
NC
1929 }
1930}
1931
1932STATIC void
1933S_curmad(pTHX_ char slot, SV *sv)
1934{
1935 MADPROP **where;
1936
1937 if (!sv)
1938 return;
cd81e915
NC
1939 if (PL_curforce < 0)
1940 where = &PL_thismad;
5db06880 1941 else
cd81e915 1942 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1943
cd81e915 1944 if (PL_faketokens)
76f68e9b 1945 sv_setpvs(sv, "");
5db06880
NC
1946 else {
1947 if (!IN_BYTES) {
1948 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1949 SvUTF8_on(sv);
1950 else if (PL_encoding) {
1951 sv_recode_to_utf8(sv, PL_encoding);
1952 }
1953 }
1954 }
1955
1956 /* keep a slot open for the head of the list? */
1957 if (slot != '_' && *where && (*where)->mad_key == '^') {
1958 (*where)->mad_key = slot;
daba3364 1959 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1960 (*where)->mad_val = (void*)sv;
1961 }
1962 else
1963 addmad(newMADsv(slot, sv), where, 0);
1964}
1965#else
b3f24c00
MHM
1966# define start_force(where) NOOP
1967# define curmad(slot, sv) NOOP
5db06880
NC
1968#endif
1969
ffb4593c
NT
1970/*
1971 * S_force_next
9cbb5ea2 1972 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1973 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1974 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1975 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1976 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1977 */
1978
4e553d73 1979STATIC void
cea2e8a9 1980S_force_next(pTHX_ I32 type)
79072805 1981{
97aff369 1982 dVAR;
704d4215
GG
1983#ifdef DEBUGGING
1984 if (DEBUG_T_TEST) {
1985 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1986 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1987 }
1988#endif
6c7ae946
FC
1989 /* Don’t let opslab_force_free snatch it */
1990 if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) {
1991 assert(!NEXTVAL_NEXTTOKE.opval->op_savefree);
1992 NEXTVAL_NEXTTOKE.opval->op_savefree = 1;
1993 }
5db06880 1994#ifdef PERL_MAD
cd81e915 1995 if (PL_curforce < 0)
5db06880 1996 start_force(PL_lasttoke);
cd81e915 1997 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1998 if (PL_lex_state != LEX_KNOWNEXT)
1999 PL_lex_defer = PL_lex_state;
2000 PL_lex_state = LEX_KNOWNEXT;
2001 PL_lex_expect = PL_expect;
cd81e915 2002 PL_curforce = -1;
5db06880 2003#else
3280af22
NIS
2004 PL_nexttype[PL_nexttoke] = type;
2005 PL_nexttoke++;
2006 if (PL_lex_state != LEX_KNOWNEXT) {
2007 PL_lex_defer = PL_lex_state;
2008 PL_lex_expect = PL_expect;
2009 PL_lex_state = LEX_KNOWNEXT;
79072805 2010 }
5db06880 2011#endif
79072805
LW
2012}
2013
28ac2b49
Z
2014void
2015Perl_yyunlex(pTHX)
2016{
a7aaec61
Z
2017 int yyc = PL_parser->yychar;
2018 if (yyc != YYEMPTY) {
2019 if (yyc) {
2020 start_force(-1);
2021 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2022 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2023 PL_lex_allbrackets--;
a7aaec61 2024 PL_lex_brackets--;
78cdf107
Z
2025 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2026 } else if (yyc == '('/*)*/) {
2027 PL_lex_allbrackets--;
2028 yyc |= (2<<24);
a7aaec61
Z
2029 }
2030 force_next(yyc);
2031 }
28ac2b49
Z
2032 PL_parser->yychar = YYEMPTY;
2033 }
2034}
2035
d0a148a6 2036STATIC SV *
15f169a1 2037S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2038{
97aff369 2039 dVAR;
740cce10 2040 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2041 !IN_BYTES
2042 && UTF
2043 && !is_ascii_string((const U8*)start, len)
740cce10 2044 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2045 return sv;
2046}
2047
ffb4593c
NT
2048/*
2049 * S_force_word
2050 * When the lexer knows the next thing is a word (for instance, it has
2051 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2052 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2053 * lookahead.
ffb4593c
NT
2054 *
2055 * Arguments:
b1b65b59 2056 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2057 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2058 * int check_keyword : if true, Perl checks to make sure the word isn't
2059 * a keyword (do this if the word is a label, e.g. goto FOO)
2060 * int allow_pack : if true, : characters will also be allowed (require,
2061 * use, etc. do this)
9cbb5ea2 2062 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2063 */
2064
76e3520e 2065STATIC char *
cea2e8a9 2066S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2067{
97aff369 2068 dVAR;
463ee0b2
LW
2069 register char *s;
2070 STRLEN len;
4e553d73 2071
7918f24d
NC
2072 PERL_ARGS_ASSERT_FORCE_WORD;
2073
29595ff2 2074 start = SKIPSPACE1(start);
463ee0b2 2075 s = start;
7e2040f0 2076 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2077 (allow_pack && *s == ':') ||
15f0808c 2078 (allow_initial_tick && *s == '\'') )
a0d0e21e 2079 {
3280af22 2080 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2081 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2082 return start;
cd81e915 2083 start_force(PL_curforce);
5db06880
NC
2084 if (PL_madskills)
2085 curmad('X', newSVpvn(start,s-start));
463ee0b2 2086 if (token == METHOD) {
29595ff2 2087 s = SKIPSPACE1(s);
463ee0b2 2088 if (*s == '(')
3280af22 2089 PL_expect = XTERM;
463ee0b2 2090 else {
3280af22 2091 PL_expect = XOPERATOR;
463ee0b2 2092 }
79072805 2093 }
e74e6b3d 2094 if (PL_madskills)
63575281 2095 curmad('g', newSVpvs( "forced" ));
9ded7720 2096 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2097 = (OP*)newSVOP(OP_CONST,0,
2098 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2099 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2100 force_next(token);
2101 }
2102 return s;
2103}
2104
ffb4593c
NT
2105/*
2106 * S_force_ident
9cbb5ea2 2107 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2108 * text only contains the "foo" portion. The first argument is a pointer
2109 * to the "foo", and the second argument is the type symbol to prefix.
2110 * Forces the next token to be a "WORD".
9cbb5ea2 2111 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2112 */
2113
76e3520e 2114STATIC void
bfed75c6 2115S_force_ident(pTHX_ register const char *s, int kind)
79072805 2116{
97aff369 2117 dVAR;
7918f24d
NC
2118
2119 PERL_ARGS_ASSERT_FORCE_IDENT;
2120
c35e046a 2121 if (*s) {
90e5519e 2122 const STRLEN len = strlen(s);
728847b1
BF
2123 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2124 UTF ? SVf_UTF8 : 0));
cd81e915 2125 start_force(PL_curforce);
9ded7720 2126 NEXTVAL_NEXTTOKE.opval = o;
79072805 2127 force_next(WORD);
748a9306 2128 if (kind) {
11343788 2129 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2130 /* XXX see note in pp_entereval() for why we forgo typo
2131 warnings if the symbol must be introduced in an eval.
2132 GSAR 96-10-12 */
90e5519e 2133 gv_fetchpvn_flags(s, len,
728847b1
BF
2134 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2135 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2136 kind == '$' ? SVt_PV :
2137 kind == '@' ? SVt_PVAV :
2138 kind == '%' ? SVt_PVHV :
a0d0e21e 2139 SVt_PVGV
90e5519e 2140 );
748a9306 2141 }
79072805
LW
2142 }
2143}
2144
1571675a
GS
2145NV
2146Perl_str_to_version(pTHX_ SV *sv)
2147{
2148 NV retval = 0.0;
2149 NV nshift = 1.0;
2150 STRLEN len;
cfd0369c 2151 const char *start = SvPV_const(sv,len);
9d4ba2ae 2152 const char * const end = start + len;
504618e9 2153 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2154
2155 PERL_ARGS_ASSERT_STR_TO_VERSION;
2156
1571675a 2157 while (start < end) {
ba210ebe 2158 STRLEN skip;
1571675a
GS
2159 UV n;
2160 if (utf)
9041c2e3 2161 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2162 else {
2163 n = *(U8*)start;
2164 skip = 1;
2165 }
2166 retval += ((NV)n)/nshift;
2167 start += skip;
2168 nshift *= 1000;
2169 }
2170 return retval;
2171}
2172
4e553d73 2173/*
ffb4593c
NT
2174 * S_force_version
2175 * Forces the next token to be a version number.
e759cc13
RGS
2176 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2177 * and if "guessing" is TRUE, then no new token is created (and the caller
2178 * must use an alternative parsing method).
ffb4593c
NT
2179 */
2180
76e3520e 2181STATIC char *
e759cc13 2182S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2183{
97aff369 2184 dVAR;
5f66b61c 2185 OP *version = NULL;
44dcb63b 2186 char *d;
5db06880
NC
2187#ifdef PERL_MAD
2188 I32 startoff = s - SvPVX(PL_linestr);
2189#endif
89bfa8cd 2190
7918f24d
NC
2191 PERL_ARGS_ASSERT_FORCE_VERSION;
2192
29595ff2 2193 s = SKIPSPACE1(s);
89bfa8cd 2194
44dcb63b 2195 d = s;
dd629d5b 2196 if (*d == 'v')
44dcb63b 2197 d++;
44dcb63b 2198 if (isDIGIT(*d)) {
e759cc13
RGS
2199 while (isDIGIT(*d) || *d == '_' || *d == '.')
2200 d++;
5db06880
NC
2201#ifdef PERL_MAD
2202 if (PL_madskills) {
cd81e915 2203 start_force(PL_curforce);
5db06880
NC
2204 curmad('X', newSVpvn(s,d-s));
2205 }
2206#endif
4e4da3ac 2207 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2208 SV *ver;
8d08d9ba 2209#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2210 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2211 setlocale(LC_NUMERIC, "C");
8d08d9ba 2212#endif
6154021b 2213 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2214#ifdef USE_LOCALE_NUMERIC
2215 setlocale(LC_NUMERIC, loc);
909d3787 2216 Safefree(loc);
8d08d9ba 2217#endif
6154021b 2218 version = pl_yylval.opval;
dd629d5b
GS
2219 ver = cSVOPx(version)->op_sv;
2220 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2221 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2222 SvNV_set(ver, str_to_version(ver));
1571675a 2223 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2224 }
89bfa8cd 2225 }
5db06880
NC
2226 else if (guessing) {
2227#ifdef PERL_MAD
2228 if (PL_madskills) {
cd81e915
NC
2229 sv_free(PL_nextwhite); /* let next token collect whitespace */
2230 PL_nextwhite = 0;
5db06880
NC
2231 s = SvPVX(PL_linestr) + startoff;
2232 }
2233#endif
e759cc13 2234 return s;
5db06880 2235 }
89bfa8cd
PP
2236 }
2237
5db06880
NC
2238#ifdef PERL_MAD
2239 if (PL_madskills && !version) {
cd81e915
NC
2240 sv_free(PL_nextwhite); /* let next token collect whitespace */
2241 PL_nextwhite = 0;
5db06880
NC
2242 s = SvPVX(PL_linestr) + startoff;
2243 }
2244#endif
89bfa8cd 2245 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2246 start_force(PL_curforce);
9ded7720 2247 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2248 force_next(WORD);
89bfa8cd 2249
e759cc13 2250 return s;
89bfa8cd
PP
2251}
2252
ffb4593c 2253/*
91152fc1
DG
2254 * S_force_strict_version
2255 * Forces the next token to be a version number using strict syntax rules.
2256 */
2257
2258STATIC char *
2259S_force_strict_version(pTHX_ char *s)
2260{
2261 dVAR;
2262 OP *version = NULL;
2263#ifdef PERL_MAD
2264 I32 startoff = s - SvPVX(PL_linestr);
2265#endif
2266 const char *errstr = NULL;
2267
2268 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2269
2270 while (isSPACE(*s)) /* leading whitespace */
2271 s++;
2272
2273 if (is_STRICT_VERSION(s,&errstr)) {
2274 SV *ver = newSV(0);
2275 s = (char *)scan_version(s, ver, 0);
2276 version = newSVOP(OP_CONST, 0, ver);
2277 }
4e4da3ac
Z
2278 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2279 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2280 {
91152fc1
DG
2281 PL_bufptr = s;
2282 if (errstr)
2283 yyerror(errstr); /* version required */
2284 return s;
2285 }
2286
2287#ifdef PERL_MAD
2288 if (PL_madskills && !version) {
2289 sv_free(PL_nextwhite); /* let next token collect whitespace */
2290 PL_nextwhite = 0;
2291 s = SvPVX(PL_linestr) + startoff;
2292 }
2293#endif
2294 /* NOTE: The parser sees the package name and the VERSION swapped */
2295 start_force(PL_curforce);
2296 NEXTVAL_NEXTTOKE.opval = version;
2297 force_next(WORD);
2298
2299 return s;
2300}
2301
2302/*
ffb4593c
NT
2303 * S_tokeq
2304 * Tokenize a quoted string passed in as an SV. It finds the next
2305 * chunk, up to end of string or a backslash. It may make a new
2306 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2307 * turns \\ into \.
2308 */
2309
76e3520e 2310STATIC SV *
cea2e8a9 2311S_tokeq(pTHX_ SV *sv)
79072805 2312{
97aff369 2313 dVAR;
79072805
LW
2314 register char *s;
2315 register char *send;
2316 register char *d;
b3ac6de7
IZ
2317 STRLEN len = 0;
2318 SV *pv = sv;
79072805 2319
7918f24d
NC
2320 PERL_ARGS_ASSERT_TOKEQ;
2321
79072805 2322 if (!SvLEN(sv))
b3ac6de7 2323 goto finish;
79072805 2324
a0d0e21e 2325 s = SvPV_force(sv, len);
21a311ee 2326 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2327 goto finish;
463ee0b2 2328 send = s + len;
dcb21ed6
NC
2329 /* This is relying on the SV being "well formed" with a trailing '\0' */
2330 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2331 s++;
2332 if (s == send)
b3ac6de7 2333 goto finish;
79072805 2334 d = s;
be4731d2 2335 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2336 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2337 }
79072805
LW
2338 while (s < send) {
2339 if (*s == '\\') {
a0d0e21e 2340 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2341 s++; /* all that, just for this */
2342 }
2343 *d++ = *s++;
2344 }
2345 *d = '\0';
95a20fc0 2346 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2347 finish:
3280af22 2348 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2349 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2350 return sv;
2351}
2352
ffb4593c
NT
2353/*
2354 * Now come three functions related to double-quote context,
2355 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2356 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2357 * interact with PL_lex_state, and create fake ( ... ) argument lists
2358 * to handle functions and concatenation.
ecd24171
DM
2359 * For example,
2360 * "foo\lbar"
2361 * is tokenised as
2362 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2363 */
2364
2365/*
2366 * S_sublex_start
6154021b 2367 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2368 *
2369 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2370 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2371 *
2372 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2373 *
2374 * Everything else becomes a FUNC.
2375 *
2376 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2377 * had an OP_CONST or OP_READLINE). This just sets us up for a
2378 * call to S_sublex_push().
2379 */
2380
76e3520e 2381STATIC I32
cea2e8a9 2382S_sublex_start(pTHX)
79072805 2383{
97aff369 2384 dVAR;
6154021b 2385 register const I32 op_type = pl_yylval.ival;
79072805
LW
2386
2387 if (op_type == OP_NULL) {
6154021b 2388 pl_yylval.opval = PL_lex_op;
5f66b61c 2389 PL_lex_op = NULL;
79072805
LW
2390 return THING;
2391 }
2392 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2393 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2394
2395 if (SvTYPE(sv) == SVt_PVIV) {
2396 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2397 STRLEN len;
96a5add6 2398 const char * const p = SvPV_const(sv, len);
740cce10 2399 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2400 SvREFCNT_dec(sv);
2401 sv = nsv;
4e553d73 2402 }
6154021b 2403 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2404 PL_lex_stuff = NULL;
6f33ba73
RGS
2405 /* Allow <FH> // "foo" */
2406 if (op_type == OP_READLINE)
2407 PL_expect = XTERMORDORDOR;
79072805
LW
2408 return THING;
2409 }
e3f73d4e
RGS
2410 else if (op_type == OP_BACKTICK && PL_lex_op) {
2411 /* readpipe() vas overriden */
2412 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2413 pl_yylval.opval = PL_lex_op;
9b201d7d 2414 PL_lex_op = NULL;
e3f73d4e
RGS
2415 PL_lex_stuff = NULL;
2416 return THING;
2417 }
79072805 2418
3280af22 2419 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2420 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2421 PL_sublex_info.sub_op = PL_lex_op;
2422 PL_lex_state = LEX_INTERPPUSH;
55497cff 2423
3280af22
NIS
2424 PL_expect = XTERM;
2425 if (PL_lex_op) {
6154021b 2426 pl_yylval.opval = PL_lex_op;
5f66b61c 2427 PL_lex_op = NULL;
55497cff
PP
2428 return PMFUNC;
2429 }
2430 else
2431 return FUNC;
2432}
2433
ffb4593c
NT
2434/*
2435 * S_sublex_push
2436 * Create a new scope to save the lexing state. The scope will be
2437 * ended in S_sublex_done. Returns a '(', starting the function arguments
2438 * to the uc, lc, etc. found before.
2439 * Sets PL_lex_state to LEX_INTERPCONCAT.
2440 */
2441
76e3520e 2442STATIC I32
cea2e8a9 2443S_sublex_push(pTHX)
55497cff 2444{
27da23d5 2445 dVAR;
f46d017c 2446 ENTER;
55497cff 2447
3280af22 2448 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2449 SAVEBOOL(PL_lex_dojoin);
3280af22 2450 SAVEI32(PL_lex_brackets);
78cdf107 2451 SAVEI32(PL_lex_allbrackets);
b27dce25 2452 SAVEI32(PL_lex_formbrack);
78cdf107 2453 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2454 SAVEI32(PL_lex_casemods);
2455 SAVEI32(PL_lex_starts);
651b5b28 2456 SAVEI8(PL_lex_state);
9da1dd8f 2457 SAVEPPTR(PL_sublex_info.re_eval_start);
7766f137 2458 SAVEVPTR(PL_lex_inpat);
98246f1e 2459 SAVEI16(PL_lex_inwhat);
57843af0 2460 SAVECOPLINE(PL_curcop);
3280af22 2461 SAVEPPTR(PL_bufptr);
8452ff4b 2462 SAVEPPTR(PL_bufend);
3280af22
NIS
2463 SAVEPPTR(PL_oldbufptr);
2464 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2465 SAVEPPTR(PL_last_lop);
2466 SAVEPPTR(PL_last_uni);
3280af22
NIS
2467 SAVEPPTR(PL_linestart);
2468 SAVESPTR(PL_linestr);
8edd5f42
RGS
2469 SAVEGENERICPV(PL_lex_brackstack);
2470 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2471
2472 PL_linestr = PL_lex_stuff;
a0714e2c 2473 PL_lex_stuff = NULL;
9da1dd8f 2474 PL_sublex_info.re_eval_start = NULL;
3280af22 2475
9cbb5ea2
GS
2476 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2477 = SvPVX(PL_linestr);
3280af22 2478 PL_bufend += SvCUR(PL_linestr);
bd61b366 2479 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2480 SAVEFREESV(PL_linestr);
2481
2482 PL_lex_dojoin = FALSE;
b27dce25 2483 PL_lex_brackets = PL_lex_formbrack = 0;
78cdf107
Z
2484 PL_lex_allbrackets = 0;
2485 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2486 Newx(PL_lex_brackstack, 120, char);
2487 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2488 PL_lex_casemods = 0;
2489 *PL_lex_casestack = '\0';
2490 PL_lex_starts = 0;
2491 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2492 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2493
2494 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2495 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2496 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2497 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2498 else
5f66b61c 2499 PL_lex_inpat = NULL;
79072805 2500
55497cff 2501 return '(';
79072805
LW
2502}
2503
ffb4593c
NT
2504/*
2505 * S_sublex_done
2506 * Restores lexer state after a S_sublex_push.
2507 */
2508
76e3520e 2509STATIC I32
cea2e8a9 2510S_sublex_done(pTHX)
79072805 2511{
27da23d5 2512 dVAR;
3280af22 2513 if (!PL_lex_starts++) {
396482e1 2514 SV * const sv = newSVpvs("");
9aa983d2
JH
2515 if (SvUTF8(PL_linestr))
2516 SvUTF8_on(sv);
3280af22 2517 PL_expect = XOPERATOR;
6154021b 2518 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2519 return THING;
2520 }
2521
3280af22
NIS
2522 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2523 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2524 return yylex();
79072805
LW
2525 }
2526
ffb4593c 2527 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2528 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2529 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2530 PL_linestr = PL_lex_repl;
2531 PL_lex_inpat = 0;
2532 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2533 PL_bufend += SvCUR(PL_linestr);
bd61b366 2534 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2535 SAVEFREESV(PL_linestr);
2536 PL_lex_dojoin = FALSE;
2537 PL_lex_brackets = 0;
78cdf107
Z
2538 PL_lex_allbrackets = 0;
2539 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2540 PL_lex_casemods = 0;
2541 *PL_lex_casestack = '\0';
2542 PL_lex_starts = 0;
25da4f38 2543 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2544 PL_lex_state = LEX_INTERPNORMAL;
2545 PL_lex_starts++;
e9fa98b2
HS
2546 /* we don't clear PL_lex_repl here, so that we can check later
2547 whether this is an evalled subst; that means we rely on the
2548 logic to ensure sublex_done() is called again only via the
2549 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2550 }
e9fa98b2 2551 else {
3280af22 2552 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2553 PL_lex_repl = NULL;
e9fa98b2 2554 }
79072805 2555 return ',';
ffed7fef
LW
2556 }
2557 else {
5db06880
NC
2558#ifdef PERL_MAD
2559 if (PL_madskills) {
cd81e915
NC
2560 if (PL_thiswhite) {
2561 if (!PL_endwhite)
6b29d1f5 2562 PL_endwhite = newSVpvs("");
cd81e915
NC
2563 sv_catsv(PL_endwhite, PL_thiswhite);
2564 PL_thiswhite = 0;
2565 }
2566 if (PL_thistoken)
76f68e9b 2567 sv_setpvs(PL_thistoken,"");
5db06880 2568 else
cd81e915 2569 PL_realtokenstart = -1;
5db06880
NC
2570 }
2571#endif
f46d017c 2572 LEAVE;
3280af22
NIS
2573 PL_bufend = SvPVX(PL_linestr);
2574 PL_bufend += SvCUR(PL_linestr);
2575 PL_expect = XOPERATOR;
09bef843 2576 PL_sublex_info.sub_inwhat = 0;
79072805 2577 return ')';
ffed7fef
LW
2578 }
2579}
2580
02aa26ce
NT
2581/*
2582 scan_const
2583
9da1dd8f
DM
2584 Extracts the next constant part of a pattern, double-quoted string,
2585 or transliteration. This is terrifying code.
2586
2587 For example, in parsing the double-quoted string "ab\x63$d", it would
2588 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2589
94def140 2590 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2591 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2592 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2593
94def140
ST
2594 Returns a pointer to the character scanned up to. If this is
2595 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2596 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2597 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2598 by looking at the next characters herself.
2599
02aa26ce 2600 In patterns:
9da1dd8f
DM
2601 expand:
2602 \N{ABC} => \N{U+41.42.43}
2603
2604 pass through:
2605 all other \-char, including \N and \N{ apart from \N{ABC}
2606
2607 stops on:
2608 @ and $ where it appears to be a var, but not for $ as tail anchor
2609 \l \L \u \U \Q \E
2610 (?{ or (??{
2611
02aa26ce
NT
2612
2613 In transliterations:
2614 characters are VERY literal, except for - not at the start or end
94def140
ST
2615 of the string, which indicates a range. If the range is in bytes,
2616 scan_const expands the range to the full set of intermediate
2617 characters. If the range is in utf8, the hyphen is replaced with
2618 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2619
2620 In double-quoted strings:
2621 backslashes:
2622 double-quoted style: \r and \n
ff3f963a 2623 constants: \x31, etc.
94def140 2624 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2625 case and quoting: \U \Q \E
2626 stops on @ and $
2627
2628 scan_const does *not* construct ops to handle interpolated strings.
2629 It stops processing as soon as it finds an embedded $ or @ variable
2630 and leaves it to the caller to work out what's going on.
2631
94def140
ST
2632 embedded arrays (whether in pattern or not) could be:
2633 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2634
2635 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2636
2637 $ in pattern could be $foo or could be tail anchor. Assumption:
2638 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2639 followed by one of "()| \r\n\t"
02aa26ce 2640
9da1dd8f 2641 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2642
2643 The structure of the code is
2644 while (there's a character to process) {
94def140
ST
2645 handle transliteration ranges
2646 skip regexp comments /(?#comment)/ and codes /(?{code})/
2647 skip #-initiated comments in //x patterns
2648 check for embedded arrays
02aa26ce
NT
2649 check for embedded scalars
2650 if (backslash) {
94def140 2651 deprecate \1 in substitution replacements
02aa26ce
NT
2652 handle string-changing backslashes \l \U \Q \E, etc.
2653 switch (what was escaped) {
94def140 2654 handle \- in a transliteration (becomes a literal -)
ff3f963a 2655 if a pattern and not \N{, go treat as regular character
94def140
ST
2656 handle \132 (octal characters)
2657 handle \x15 and \x{1234} (hex characters)
ff3f963a 2658 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2659 handle \cV (control characters)
2660 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2661 } (end switch)
77a135fe 2662 continue
02aa26ce 2663 } (end if backslash)
77a135fe 2664 handle regular character
02aa26ce 2665 } (end while character to read)
4e553d73 2666
02aa26ce
NT
2667*/
2668
76e3520e 2669STATIC char *
cea2e8a9 2670S_scan_const(pTHX_ char *start)
79072805 2671{
97aff369 2672 dVAR;
3280af22 2673 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2674 SV *sv = newSV(send - start); /* sv for the constant. See
2675 note below on sizing. */
02aa26ce
NT
2676 register char *s = start; /* start of the constant */
2677 register char *d = SvPVX(sv); /* destination for copies */
2678 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2679 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2680 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2681 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2682 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2683 to be UTF8? But, this can
2684 show as true when the source
2685 isn't utf8, as for example
2686 when it is entirely composed
2687 of hex constants */
2688
2689 /* Note on sizing: The scanned constant is placed into sv, which is
2690 * initialized by newSV() assuming one byte of output for every byte of
2691 * input. This routine expects newSV() to allocate an extra byte for a
2692 * trailing NUL, which this routine will append if it gets to the end of
2693 * the input. There may be more bytes of input than output (eg., \N{LATIN
2694 * CAPITAL LETTER A}), or more output than input if the constant ends up
2695 * recoded to utf8, but each time a construct is found that might increase
2696 * the needed size, SvGROW() is called. Its size parameter each time is
2697 * based on the best guess estimate at the time, namely the length used so
2698 * far, plus the length the current construct will occupy, plus room for
2699 * the trailing NUL, plus one byte for every input byte still unscanned */
2700
012bcf8d 2701 UV uv;
4c3a8340
ST
2702#ifdef EBCDIC
2703 UV literal_endpoint = 0;
e294cc5d 2704 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2705#endif
012bcf8d 2706
7918f24d
NC
2707 PERL_ARGS_ASSERT_SCAN_CONST;
2708
bb16bae8 2709 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2710 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2711 /* If we are doing a trans and we know we want UTF8 set expectation */
2712 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2713 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2714 }
2715
2716
79072805 2717 while (s < send || dorange) {
ff3f963a 2718
02aa26ce 2719 /* get transliterations out of the way (they're most literal) */
3280af22 2720 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2721 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2722 if (dorange) {
1ba5c669
JH
2723 I32 i; /* current expanded character */
2724 I32 min; /* first character in range */
2725 I32 max; /* last character in range */
02aa26ce 2726
e294cc5d
JH
2727#ifdef EBCDIC
2728 UV uvmax = 0;
2729#endif
2730
2731 if (has_utf8
2732#ifdef EBCDIC
2733 && !native_range
2734#endif
2735 ) {
9d4ba2ae 2736 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2737 char *e = d++;
2738 while (e-- > c)
2739 *(e + 1) = *e;
25716404 2740 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2741 /* mark the range as done, and continue */
2742 dorange = FALSE;
2743 didrange = TRUE;
2744 continue;
2745 }
2b9d42f0 2746
95a20fc0 2747 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2748#ifdef EBCDIC
2749 SvGROW(sv,
2750 SvLEN(sv) + (has_utf8 ?
2751 (512 - UTF_CONTINUATION_MARK +
2752 UNISKIP(0x100))
2753 : 256));
2754 /* How many two-byte within 0..255: 128 in UTF-8,
2755 * 96 in UTF-8-mod. */
2756#else
9cbb5ea2 2757 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2758#endif
9cbb5ea2 2759 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2760#ifdef EBCDIC
2761 if (has_utf8) {
2762 int j;
2763 for (j = 0; j <= 1; j++) {
2764 char * const c = (char*)utf8_hop((U8*)d, -1);
2765 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2766 if (j)
2767 min = (U8)uv;
2768 else if (uv < 256)
2769 max = (U8)uv;
2770 else {
2771 max = (U8)0xff; /* only to \xff */
2772 uvmax = uv; /* \x{100} to uvmax */
2773 }
2774 d = c; /* eat endpoint chars */
2775 }
2776 }
2777 else {
2778#endif
2779 d -= 2; /* eat the first char and the - */
2780 min = (U8)*d; /* first char in range */
2781 max = (U8)d[1]; /* last char in range */
2782#ifdef EBCDIC
2783 }
2784#endif
8ada0baa 2785
c2e66d9e 2786 if (min > max) {
01ec43d0 2787 Perl_croak(aTHX_
d1573ac7 2788 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2789 (char)min, (char)max);
c2e66d9e
GS
2790 }
2791
c7f1f016 2792#ifdef EBCDIC
4c3a8340
ST
2793 if (literal_endpoint == 2 &&
2794 ((isLOWER(min) && isLOWER(max)) ||
2795 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2796 if (isLOWER(min)) {
2797 for (i = min; i <= max; i++)
2798 if (isLOWER(i))
db42d148 2799 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2800 } else {
2801 for (i = min; i <= max; i++)
2802 if (isUPPER(i))
db42d148 2803 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2804 }
2805 }
2806 else
2807#endif
2808 for (i = min; i <= max; i++)
e294cc5d
JH
2809#ifdef EBCDIC
2810 if (has_utf8) {
2811 const U8 ch = (U8)NATIVE_TO_UTF(i);
2812 if (UNI_IS_INVARIANT(ch))
2813 *d++ = (U8)i;
2814 else {
2815 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2816 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2817 }
2818 }
2819 else
2820#endif
2821 *d++ = (char)i;
2822
2823#ifdef EBCDIC
2824 if (uvmax) {
2825 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2826 if (uvmax > 0x101)
2827 *d++ = (char)UTF_TO_NATIVE(0xff);
2828 if (uvmax > 0x100)
2829 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2830 }
2831#endif
02aa26ce
NT
2832
2833 /* mark the range as done, and continue */
79072805 2834 dorange = FALSE;
01ec43d0 2835 didrange = TRUE;
4c3a8340
ST
2836#ifdef EBCDIC
2837 literal_endpoint = 0;
2838#endif
79072805 2839 continue;
4e553d73 2840 }
02aa26ce
NT
2841
2842 /* range begins (ignore - as first or last char) */
79072805 2843 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2844 if (didrange) {
1fafa243 2845 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2846 }
e294cc5d
JH
2847 if (has_utf8
2848#ifdef EBCDIC
2849 && !native_range
2850#endif
2851 ) {
25716404 2852 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2853 s++;
2854 continue;
2855 }
79072805
LW
2856 dorange = TRUE;
2857 s++;
01ec43d0
GS
2858 }
2859 else {
2860 didrange = FALSE;
4c3a8340
ST
2861#ifdef EBCDIC
2862 literal_endpoint = 0;
e294cc5d 2863 native_range = TRUE;
4c3a8340 2864#endif
01ec43d0 2865 }
79072805 2866 }
02aa26ce
NT
2867
2868 /* if we get here, we're not doing a transliteration */
2869
e4a2df84
DM
2870 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2871 char *s1 = s-1;
2872 int esc = 0;
2873 while (s1 >= start && *s1-- == '\\')
2874 esc = !esc;
2875 if (!esc)
2876 in_charclass = TRUE;
2877 }
2866decb 2878
e4a2df84
DM
2879 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2880 char *s1 = s-1;
2881 int esc = 0;
2882 while (s1 >= start && *s1-- == '\\')
2883 esc = !esc;
2884 if (!esc)
2885 in_charclass = FALSE;
2886 }
2866decb 2887
9da1dd8f
DM
2888 /* skip for regexp comments /(?#comment)/, except for the last
2889 * char, which will be done separately.
2890 * Stop on (?{..}) and friends */
2891
3280af22 2892 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2893 if (s[2] == '#') {
e994fd66 2894 while (s+1 < send && *s != ')')
db42d148 2895 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2896 }
2866decb 2897 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2898 ( s[2] == '{' /* This should match regcomp.c */
2899 || (s[2] == '?' && s[3] == '{')))
155aba94 2900 {
9da1dd8f 2901 break;
cc6b7395 2902 }
748a9306 2903 }
02aa26ce
NT
2904
2905 /* likewise skip #-initiated comments in //x patterns */
3280af22 2906 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2907 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2908 while (s+1 < send && *s != '\n')
db42d148 2909 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2910 }
02aa26ce 2911
9da1dd8f
DM
2912 /* no further processing of single-quoted regex */
2913 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2914 goto default_action;
2915
5d1d4326 2916 /* check for embedded arrays
da6eedaa 2917 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2918 */
1749ea0d
ST
2919 else if (*s == '@' && s[1]) {
2920 if (isALNUM_lazy_if(s+1,UTF))
2921 break;
2922 if (strchr(":'{$", s[1]))
2923 break;
2924 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2925 break; /* in regexp, neither @+ nor @- are interpolated */
2926 }
02aa26ce
NT
2927
2928 /* check for embedded scalars. only stop if we're sure it's a
2929 variable.
2930 */
79072805 2931 else if (*s == '$') {
3280af22 2932 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2933 break;
77772344 2934 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2935 if (s[1] == '\\') {
2936 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2937 "Possible unintended interpolation of $\\ in regex");
77772344 2938 }
79072805 2939 break; /* in regexp, $ might be tail anchor */
77772344 2940 }
79072805 2941 }
02aa26ce 2942
2b9d42f0
NIS
2943 /* End of else if chain - OP_TRANS rejoin rest */
2944
02aa26ce 2945 /* backslashes */
79072805 2946 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2947 char* e; /* Can be used for ending '}', etc. */
2948
79072805 2949 s++;
02aa26ce 2950
7d0fc23c
KW
2951 /* warn on \1 - \9 in substitution replacements, but note that \11
2952 * is an octal; and \19 is \1 followed by '9' */
3280af22 2953 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2954 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2955 {
a2a5de95 2956 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2957 *--s = '$';
2958 break;
2959 }
02aa26ce
NT
2960
2961 /* string-change backslash escapes */
838f2281 2962 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
2963 --s;
2964 break;
2965 }
ff3f963a
KW
2966 /* In a pattern, process \N, but skip any other backslash escapes.
2967 * This is because we don't want to translate an escape sequence
2968 * into a meta symbol and have the regex compiler use the meta
2969 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2970 * in spite of this, we do have to process \N here while the proper
2971 * charnames handler is in scope. See bugs #56444 and #62056.
2972 * There is a complication because \N in a pattern may also stand
2973 * for 'match a non-nl', and not mean a charname, in which case its
2974 * processing should be deferred to the regex compiler. To be a
2975 * charname it must be followed immediately by a '{', and not look
2976 * like \N followed by a curly quantifier, i.e., not something like
2977 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2978 * quantifier */
2979 else if (PL_lex_inpat
2980 && (*s != 'N'
2981 || s[1] != '{'
2982 || regcurly(s + 1)))
2983 {
cc74c5bd
ST
2984 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2985 goto default_action;
2986 }
02aa26ce 2987
79072805 2988 switch (*s) {
02aa26ce
NT
2989
2990 /* quoted - in transliterations */
79072805 2991 case '-':
3280af22 2992 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2993 *d++ = *s++;
2994 continue;
2995 }
2996 /* FALL THROUGH */
2997 default:
11b8faa4 2998 {
e4ca4584 2999 if ((isALNUMC(*s)))
a2a5de95
NC
3000 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3001 "Unrecognized escape \\%c passed through",
3002 *s);
11b8faa4 3003 /* default action is to copy the quoted character */
f9a63242 3004 goto default_action;
11b8faa4 3005 }
02aa26ce 3006
632403cc 3007 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3008 case '0': case '1': case '2': case '3':
3009 case '4': case '5': case '6': case '7':
ba210ebe 3010 {
53305cf1
NC
3011 I32 flags = 0;
3012 STRLEN len = 3;
77a135fe 3013 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3014 s += len;
3015 }
012bcf8d 3016 goto NUM_ESCAPE_INSERT;
02aa26ce 3017
f0a2b745
KW
3018 /* eg. \o{24} indicates the octal constant \024 */
3019 case 'o':
3020 {
3021 STRLEN len;
454155d9 3022 const char* error;
f0a2b745 3023
454155d9 3024 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3025 s += len;
454155d9 3026 if (! valid) {
f0a2b745
KW
3027 yyerror(error);
3028 continue;
3029 }
3030 goto NUM_ESCAPE_INSERT;
3031 }
3032
77a135fe 3033 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3034 case 'x':
a0481293 3035 {
53305cf1 3036 STRLEN len;
a0481293 3037 const char* error;
355860ce 3038
a0481293
KW
3039 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3040 s += len;
3041 if (! valid) {
3042 yyerror(error);
355860ce 3043 continue;
ba210ebe 3044 }
012bcf8d
GS
3045 }
3046
3047 NUM_ESCAPE_INSERT:
ff3f963a
KW
3048 /* Insert oct or hex escaped character. There will always be
3049 * enough room in sv since such escapes will be longer than any
3050 * UTF-8 sequence they can end up as, except if they force us
3051 * to recode the rest of the string into utf8 */
ba7cea30 3052
77a135fe 3053 /* Here uv is the ordinal of the next character being added in
ff3f963a 3054 * unicode (converted from native). */
77a135fe 3055 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3056 if (!has_utf8 && uv > 255) {
77a135fe
KW
3057 /* Might need to recode whatever we have accumulated so
3058 * far if it contains any chars variant in utf8 or
3059 * utf-ebcdic. */
3060
3061 SvCUR_set(sv, d - SvPVX_const(sv));
3062 SvPOK_on(sv);
3063 *d = '\0';
77a135fe 3064 /* See Note on sizing above. */
7bf79863
KW
3065 sv_utf8_upgrade_flags_grow(sv,
3066 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3067 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3068 d = SvPVX(sv) + SvCUR(sv);
3069 has_utf8 = TRUE;
012bcf8d
GS
3070 }
3071
77a135fe
KW
3072 if (has_utf8) {
3073 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3074 if (PL_lex_inwhat == OP_TRANS &&
3075 PL_sublex_info.sub_op) {
3076 PL_sublex_info.sub_op->op_private |=
3077 (PL_lex_repl ? OPpTRANS_FROM_UTF
3078 : OPpTRANS_TO_UTF);
f9a63242 3079 }
e294cc5d
JH
3080#ifdef EBCDIC
3081 if (uv > 255 && !dorange)
3082 native_range = FALSE;
3083#endif
012bcf8d 3084 }
a0ed51b3 3085 else {
012bcf8d 3086 *d++ = (char)uv;
a0ed51b3 3087 }
012bcf8d
GS
3088 }
3089 else {
c4d5f83a 3090 *d++ = (char) uv;
a0ed51b3 3091 }
79072805 3092 continue;
02aa26ce 3093
4a2d328f 3094 case 'N':
ff3f963a
KW
3095 /* In a non-pattern \N must be a named character, like \N{LATIN
3096 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3097 * mean to match a non-newline. For non-patterns, named
3098 * characters are converted to their string equivalents. In
3099 * patterns, named characters are not converted to their
3100 * ultimate forms for the same reasons that other escapes
3101 * aren't. Instead, they are converted to the \N{U+...} form
3102 * to get the value from the charnames that is in effect right
3103 * now, while preserving the fact that it was a named character
3104 * so that the regex compiler knows this */
3105
3106 /* This section of code doesn't generally use the
3107 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3108 * a close examination of this macro and determined it is a
3109 * no-op except on utfebcdic variant characters. Every
3110 * character generated by this that would normally need to be
3111 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3112 * needed, and would complicate use of copy(). XXX There are
3113 * other parts of this file where the macro is used
3114 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3115
3116 /* The structure of this section of code (besides checking for
3117 * errors and upgrading to utf8) is:
3118 * Further disambiguate between the two meanings of \N, and if
3119 * not a charname, go process it elsewhere
0a96133f
KW
3120 * If of form \N{U+...}, pass it through if a pattern;
3121 * otherwise convert to utf8
3122 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3123 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3124
3125 /* Here, s points to the 'N'; the test below is guaranteed to
3126 * succeed if we are being called on a pattern as we already
3127 * know from a test above that the next character is a '{'.
3128 * On a non-pattern \N must mean 'named sequence, which
3129 * requires braces */
3130 s++;
3131 if (*s != '{') {
3132 yyerror("Missing braces on \\N{}");
3133 continue;
3134 }
3135 s++;
3136
0a96133f 3137 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3138 if (! (e = strchr(s, '}'))) {
3139 if (! PL_lex_inpat) {
5777a3f7 3140 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3141 } else {
3142 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3143 }
0a96133f 3144 continue;
ff3f963a 3145 }
cddc7ef4 3146
ff3f963a 3147 /* Here it looks like a named character */
cddc7ef4 3148
ff3f963a
KW
3149 if (PL_lex_inpat) {
3150
3151 /* XXX This block is temporary code. \N{} implies that the
3152 * pattern is to have Unicode semantics, and therefore
3153 * currently has to be encoded in utf8. By putting it in
3154 * utf8 now, we save a whole pass in the regular expression
3155 * compiler. Once that code is changed so Unicode
3156 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3157 * block should be removed. However, the code that parses
3158 * the output of this would have to be changed to not
3159 * necessarily expect utf8 */
ff3f963a 3160 if (!has_utf8) {
77a135fe 3161 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3162 SvPOK_on(sv);
e4f3eed8 3163 *d = '\0';
77a135fe 3164 /* See Note on sizing above. */
7bf79863 3165 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3166 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3167 /* 5 = '\N{' + cur char + NUL */
3168 (STRLEN)(send - s) + 5);
f08d6ad9 3169 d = SvPVX(sv) + SvCUR(sv);
89491803 3170 has_utf8 = TRUE;
ff3f963a
KW
3171 }
3172 }
423cee85 3173
ff3f963a
KW
3174 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3175 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3176 | PERL_SCAN_DISALLOW_PREFIX;
3177 STRLEN len;
3178
3179 /* For \N{U+...}, the '...' is a unicode value even on
3180 * EBCDIC machines */
3181 s += 2; /* Skip to next char after the 'U+' */
3182 len = e - s;
3183 uv = grok_hex(s, &len, &flags, NULL);
3184 if (len == 0 || len != (STRLEN)(e - s)) {
3185 yyerror("Invalid hexadecimal number in \\N{U+...}");
3186 s = e + 1;
3187 continue;
3188 }
3189
3190 if (PL_lex_inpat) {
3191
e2a7e165
KW
3192 /* On non-EBCDIC platforms, pass through to the regex
3193 * compiler unchanged. The reason we evaluated the
3194 * number above is to make sure there wasn't a syntax
3195 * error. But on EBCDIC we convert to native so
3196 * downstream code can continue to assume it's native
3197 */
ff3f963a 3198 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3199#ifdef EBCDIC
3200 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3201 and the \0 */
3202 "\\N{U+%X}",
3203 (unsigned int) UNI_TO_NATIVE(uv));
3204#else
ff3f963a
KW
3205 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3206 d += e - s + 1;
e2a7e165 3207#endif
ff3f963a
KW
3208 }
3209 else { /* Not a pattern: convert the hex to string */
3210
3211 /* If destination is not in utf8, unconditionally
3212 * recode it to be so. This is because \N{} implies
3213 * Unicode semantics, and scalars have to be in utf8
3214 * to guarantee those semantics */
3215 if (! has_utf8) {
3216 SvCUR_set(sv, d - SvPVX_const(sv));
3217 SvPOK_on(sv);
3218 *d = '\0';
3219 /* See Note on sizing above. */
3220 sv_utf8_upgrade_flags_grow(
3221 sv,
3222 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3223 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3224 d = SvPVX(sv) + SvCUR(sv);
3225 has_utf8 = TRUE;
3226 }
3227
3228 /* Add the string to the output */
3229 if (UNI_IS_INVARIANT(uv)) {
3230 *d++ = (char) uv;
3231 }
3232 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3233 }
3234 }
3235 else { /* Here is \N{NAME} but not \N{U+...}. */
3236
3237 SV *res; /* result from charnames */
3238 const char *str; /* the string in 'res' */
3239 STRLEN len; /* its length */
3240
3241 /* Get the value for NAME */
3242 res = newSVpvn(s, e - s);
3243 res = new_constant( NULL, 0, "charnames",
3244 /* includes all of: \N{...} */
3245 res, NULL, s - 3, e - s + 4 );
3246
3247 /* Most likely res will be in utf8 already since the
3248 * standard charnames uses pack U, but a custom translator
3249 * can leave it otherwise, so make sure. XXX This can be
3250 * revisited to not have charnames use utf8 for characters
3251 * that don't need it when regexes don't have to be in utf8
3252 * for Unicode semantics. If doing so, remember EBCDIC */
3253 sv_utf8_upgrade(res);
3254 str = SvPV_const(res, len);
3255
3256 /* Don't accept malformed input */
3257 if (! is_utf8_string((U8 *) str, len)) {
3258 yyerror("Malformed UTF-8 returned by \\N");
3259 }
3260 else if (PL_lex_inpat) {
3261
3262 if (! len) { /* The name resolved to an empty string */
3263 Copy("\\N{}", d, 4, char);
3264 d += 4;
3265 }
3266 else {
3267 /* In order to not lose information for the regex
3268 * compiler, pass the result in the specially made
3269 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3270 * the code points in hex of each character
3271 * returned by charnames */
3272
3273 const char *str_end = str + len;
3274 STRLEN char_length; /* cur char's byte length */
3275 STRLEN output_length; /* and the number of bytes
3276 after this is translated
3277 into hex digits */
3278 const STRLEN off = d - SvPVX_const(sv);
3279
3280 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3281 * max('U+', '.'); and 1 for NUL */
3282 char hex_string[2 * UTF8_MAXBYTES + 5];
3283
3284 /* Get the first character of the result. */
3285 U32 uv = utf8n_to_uvuni((U8 *) str,
3286 len,
3287 &char_length,
3288 UTF8_ALLOW_ANYUV);
3289
3290 /* The call to is_utf8_string() above hopefully
3291 * guarantees that there won't be an error. But
3292 * it's easy here to make sure. The function just
3293 * above warns and returns 0 if invalid utf8, but
3294 * it can also return 0 if the input is validly a
3295 * NUL. Disambiguate */
3296 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3297 uv = UNICODE_REPLACEMENT;
3298 }
3299
3300 /* Convert first code point to hex, including the
e2a7e165
KW
3301 * boiler plate before it. For all these, we
3302 * convert to native format so that downstream code
3303 * can continue to assume the input is native */
78c35590 3304 output_length =
3353de27 3305 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3306 "\\N{U+%X",
3307 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3308
3309 /* Make sure there is enough space to hold it */
3310 d = off + SvGROW(sv, off
3311 + output_length
3312 + (STRLEN)(send - e)
3313 + 2); /* '}' + NUL */
3314 /* And output it */
3315 Copy(hex_string, d, output_length, char);
3316 d += output_length;
3317
3318 /* For each subsequent character, append dot and
3319 * its ordinal in hex */
3320 while ((str += char_length) < str_end) {
3321 const STRLEN off = d - SvPVX_const(sv);
3322 U32 uv = utf8n_to_uvuni((U8 *) str,
3323 str_end - str,
3324 &char_length,
3325 UTF8_ALLOW_ANYUV);
3326 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3327 uv = UNICODE_REPLACEMENT;
3328 }
3329
78c35590 3330 output_length =
3353de27 3331 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3332 ".%X",
3333 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3334
3335 d = off + SvGROW(sv, off
3336 + output_length
3337 + (STRLEN)(send - e)
3338 + 2); /* '}' + NUL */
3339 Copy(hex_string, d, output_length, char);
3340 d += output_length;
3341 }
3342
3343 *d++ = '}'; /* Done. Add the trailing brace */
3344 }
3345 }
3346 else { /* Here, not in a pattern. Convert the name to a
3347 * string. */
3348
3349 /* If destination is not in utf8, unconditionally
3350 * recode it to be so. This is because \N{} implies
3351 * Unicode semantics, and scalars have to be in utf8
3352 * to guarantee those semantics */
3353 if (! has_utf8) {
3354 SvCUR_set(sv, d - SvPVX_const(sv));
3355 SvPOK_on(sv);
3356 *d = '\0';
3357 /* See Note on sizing above. */
3358 sv_utf8_upgrade_flags_grow(sv,
3359 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3360 len + (STRLEN)(send - s) + 1);
3361 d = SvPVX(sv) + SvCUR(sv);
3362 has_utf8 = TRUE;
3363 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3364
3365 /* See Note on sizing above. (NOTE: SvCUR() is not
3366 * set correctly here). */
3367 const STRLEN off = d - SvPVX_const(sv);
3368 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3369 }
3370 Copy(str, d, len, char);
3371 d += len;
423cee85 3372 }
423cee85 3373 SvREFCNT_dec(res);
cb233ae3
KW
3374
3375 /* Deprecate non-approved name syntax */
3376 if (ckWARN_d(WARN_DEPRECATED)) {
3377 bool problematic = FALSE;
3378 char* i = s;
3379
3380 /* For non-ut8 input, look to see that the first
3381 * character is an alpha, then loop through the rest
3382 * checking that each is a continuation */
3383 if (! this_utf8) {
3384 if (! isALPHAU(*i)) problematic = TRUE;
3385 else for (i = s + 1; i < e; i++) {
3386 if (isCHARNAME_CONT(*i)) continue;
3387 problematic = TRUE;
3388 break;
3389 }
3390 }
3391 else {
3392 /* Similarly for utf8. For invariants can check
3393 * directly. We accept anything above the latin1
3394 * range because it is immaterial to Perl if it is
3395 * correct or not, and is expensive to check. But
3396 * it is fairly easy in the latin1 range to convert
3397 * the variants into a single character and check
3398 * those */
3399 if (UTF8_IS_INVARIANT(*i)) {
3400 if (! isALPHAU(*i)) problematic = TRUE;
3401 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3402 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3403 *(i+1)))))
3404 {
3405 problematic = TRUE;
3406 }
3407 }
3408 if (! problematic) for (i = s + UTF8SKIP(s);
3409 i < e;
3410 i+= UTF8SKIP(i))
3411 {
3412 if (UTF8_IS_INVARIANT(*i)) {
3413 if (isCHARNAME_CONT(*i)) continue;
3414 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3415 continue;
3416 } else if (isCHARNAME_CONT(
3417 UNI_TO_NATIVE(
81c14aa2 3418 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3419 {
3420 continue;
3421 }
3422 problematic = TRUE;
3423 break;
3424 }
3425 }
3426 if (problematic) {
6e1bad6c
KW
3427 /* The e-i passed to the final %.*s makes sure that
3428 * should the trailing NUL be missing that this
3429 * print won't run off the end of the string */
cb233ae3 3430 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3431 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3432 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3433 }
3434 }
3435 } /* End \N{NAME} */
ff3f963a
KW
3436#ifdef EBCDIC
3437 if (!dorange)
3438 native_range = FALSE; /* \N{} is defined to be Unicode */
3439#endif
3440 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3441 continue;
3442
02aa26ce 3443 /* \c is a control character */
79072805
LW
3444 case 'c':
3445 s++;
961ce445 3446 if (s < send) {
17a3df4c 3447 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3448 }
961ce445
RGS
3449 else {
3450 yyerror("Missing control char name in \\c");
3451 }
79072805 3452 continue;
02aa26ce
NT
3453
3454 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3455 case 'b':
db42d148 3456 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3457 break;
3458 case 'n':
db42d148 3459 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3460 break;
3461 case 'r':
db42d148 3462 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3463 break;
3464 case 'f':
db42d148 3465 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3466 break;
3467 case 't':
db42d148 3468 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3469 break;
34a3fe2a 3470 case 'e':
db42d148 3471 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3472 break;
3473 case 'a':
db42d148 3474 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3475 break;
02aa26ce
NT
3476 } /* end switch */
3477
79072805
LW
3478 s++;
3479 continue;
02aa26ce 3480 } /* end if (backslash) */
4c3a8340
ST
3481#ifdef EBCDIC
3482 else
3483 literal_endpoint++;
3484#endif
02aa26ce 3485
f9a63242 3486 default_action:
77a135fe
KW
3487 /* If we started with encoded form, or already know we want it,
3488 then encode the next character */
3489 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3490 STRLEN len = 1;
77a135fe
KW
3491
3492
3493 /* One might think that it is wasted effort in the case of the
3494 * source being utf8 (this_utf8 == TRUE) to take the next character
3495 * in the source, convert it to an unsigned value, and then convert
3496 * it back again. But the source has not been validated here. The
3497 * routine that does the conversion checks for errors like
3498 * malformed utf8 */
3499
5f66b61c
AL
3500 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3501 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3502 if (!has_utf8) {
3503 SvCUR_set(sv, d - SvPVX_const(sv));
3504 SvPOK_on(sv);
3505 *d = '\0';
77a135fe 3506 /* See Note on sizing above. */
7bf79863
KW
3507 sv_utf8_upgrade_flags_grow(sv,
3508 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3509 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3510 d = SvPVX(sv) + SvCUR(sv);
3511 has_utf8 = TRUE;
3512 } else if (need > len) {
3513 /* encoded value larger than old, may need extra space (NOTE:
3514 * SvCUR() is not set correctly here). See Note on sizing
3515 * above. */
9d4ba2ae 3516 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3517 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3518 }
77a135fe
KW
3519 s += len;
3520
5f66b61c 3521 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3522#ifdef EBCDIC
3523 if (uv > 255 && !dorange)
3524 native_range = FALSE;
3525#endif
2b9d42f0
NIS
3526 }
3527 else {
3528 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3529 }
02aa26ce
NT
3530 } /* while loop to process each character */
3531
3532 /* terminate the string and set up the sv */
79072805 3533 *d = '\0';
95a20fc0 3534 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3535 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3536 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3537 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3538
79072805 3539 SvPOK_on(sv);
9f4817db 3540 if (PL_encoding && !has_utf8) {
d0063567
DK
3541 sv_recode_to_utf8(sv, PL_encoding);
3542 if (SvUTF8(sv))
3543 has_utf8 = TRUE;
9f4817db 3544 }
2b9d42f0 3545 if (has_utf8) {
7e2040f0 3546 SvUTF8_on(sv);
2b9d42f0 3547 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3548 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3549 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3550 }
3551 }
79072805 3552
02aa26ce 3553 /* shrink the sv if we allocated more than we used */
79072805 3554 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3555 SvPV_shrink_to_cur(sv);
79072805 3556 }
02aa26ce 3557
6154021b 3558 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3559 if (s > PL_bufptr) {
eb0d8d16
NC
3560 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3561 const char *const key = PL_lex_inpat ? "qr" : "q";
3562 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3563 const char *type;
3564 STRLEN typelen;
3565
3566 if (PL_lex_inwhat == OP_TRANS) {
3567 type = "tr";
3568 typelen = 2;
3569 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3570 type = "s";
3571 typelen = 1;
9da1dd8f
DM
3572 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3573 type = "q";
3574 typelen = 1;
eb0d8d16
NC
3575 } else {
3576 type = "qq";
3577 typelen = 2;
3578 }
3579
3580 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3581 type, typelen);
3582 }
6154021b 3583 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3584 } else
8990e307 3585 SvREFCNT_dec(sv);
79072805
LW
3586 return s;
3587}
3588
ffb4593c
NT
3589/* S_intuit_more
3590 * Returns TRUE if there's more to the expression (e.g., a subscript),
3591 * FALSE otherwise.
ffb4593c
NT
3592 *
3593 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3594 *
3595 * ->[ and ->{ return TRUE
3596 * { and [ outside a pattern are always subscripts, so return TRUE
3597 * if we're outside a pattern and it's not { or [, then return FALSE
3598 * if we're in a pattern and the first char is a {
3599 * {4,5} (any digits around the comma) returns FALSE
3600 * if we're in a pattern and the first char is a [
3601 * [] returns FALSE
3602 * [SOMETHING] has a funky algorithm to decide whether it's a
3603 * character class or not. It has to deal with things like
3604 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3605 * anything else returns TRUE
3606 */
3607
9cbb5ea2
GS
3608/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3609
76e3520e 3610STATIC int
cea2e8a9 3611S_intuit_more(pTHX_ register char *s)
79072805 3612{
97aff369 3613 dVAR;
7918f24d
NC
3614
3615 PERL_ARGS_ASSERT_INTUIT_MORE;
3616
3280af22 3617 if (PL_lex_brackets)
79072805
LW
3618 return TRUE;
3619 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3620 return TRUE;
3621 if (*s != '{' && *s != '[')
3622 return FALSE;
3280af22 3623 if (!PL_lex_inpat)
79072805
LW
3624 return TRUE;
3625
3626 /* In a pattern, so maybe we have {n,m}. */
3627 if (*s == '{') {
b3155d95 3628 if (regcurly(s)) {
79072805 3629 return FALSE;
b3155d95 3630 }
79072805 3631 return TRUE;
79072805
LW
3632 }
3633
3634 /* On the other hand, maybe we have a character class */
3635
3636 s++;
3637 if (*s == ']' || *s == '^')
3638 return FALSE;
3639 else {
ffb4593c 3640 /* this is terrifying, and it works */
79072805
LW
3641 int weight = 2; /* let's weigh the evidence */
3642 char seen[256];
f27ffc4a 3643 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3644 const char * const send = strchr(s,']');
3280af22 3645 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3646
3647 if (!send) /* has to be an expression */
3648 return TRUE;
3649
3650 Zero(seen,256,char);
3651 if (*s == '$')
3652 weight -= 3;
3653 else if (isDIGIT(*s)) {
3654 if (s[1] != ']') {
3655 if (isDIGIT(s[1]) && s[2] == ']')
3656 weight -= 10;
3657 }
3658 else
3659 weight -= 100;
3660 }
3661 for (; s < send; s++) {
3662 last_un_char = un_char;
3663 un_char = (unsigned char)*s;
3664 switch (*s) {
3665 case '@':
3666 case '&':
3667 case '$':
3668 weight -= seen[un_char] * 10;
7e2040f0 3669 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3670 int len;
8903cb82 3671 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3672 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3673 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3674 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3675 weight -= 100;
3676 else
3677 weight -= 10;
3678 }
3679 else if (*s == '$' && s[1] &&
93a17b20
LW
3680 strchr("[#!%*<>()-=",s[1])) {
3681 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3682 weight -= 10;
3683 else
3684 weight -= 1;
3685 }
3686 break;
3687 case '\\':
3688 un_char = 254;
3689 if (s[1]) {
93a17b20 3690 if (strchr("wds]",s[1]))
79072805 3691 weight += 100;
10edeb5d 3692 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3693 weight += 1;
93a17b20 3694 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3695 weight += 40;
3696 else if (isDIGIT(s[1])) {
3697 weight += 40;
3698 while (s[1] && isDIGIT(s[1]))
3699 s++;
3700 }
3701 }
3702 else
3703 weight += 100;
3704 break;
3705 case '-':
3706 if (s[1] == '\\')
3707 weight += 50;
93a17b20 3708 if (strchr("aA01! ",last_un_char))
79072805 3709 weight += 30;
93a17b20 3710 if (strchr("zZ79~",s[1]))
79072805 3711 weight += 30;
f27ffc4a
GS
3712 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3713 weight -= 5; /* cope with negative subscript */
79072805
LW
3714 break;
3715 default:
3792a11b
NC
3716 if (!isALNUM(last_un_char)
3717 && !(last_un_char == '$' || last_un_char == '@'
3718 || last_un_char == '&')
3719 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3720 char *d = tmpbuf;
3721 while (isALPHA(*s))
3722 *d++ = *s++;
3723 *d = '\0';
5458a98a 3724 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3725 weight -= 150;
3726 }
3727 if (un_char == last_un_char + 1)
3728 weight += 5;
3729 weight -= seen[un_char];
3730 break;
3731 }
3732 seen[un_char]++;
3733 }
3734 if (weight >= 0) /* probably a character class */
3735 return FALSE;
3736 }
3737
3738 return TRUE;
3739}
ffed7fef 3740
ffb4593c
NT
3741/*
3742 * S_intuit_method
3743 *
3744 * Does all the checking to disambiguate
3745 * foo bar
3746 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3747 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3748 *
3749 * First argument is the stuff after the first token, e.g. "bar".
3750 *
a4fd4a89 3751 * Not a method if foo is a filehandle.
ffb4593c
NT
3752 * Not a method if foo is a subroutine prototyped to take a filehandle.
3753 * Not a method if it's really "Foo $bar"
3754 * Method if it's "foo $bar"
3755 * Not a method if it's really "print foo $bar"
3756 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3757 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3758 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3759 * =>
3760 */