This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Avoid redundant copies in string evals
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a
PP
153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
376fcdbf 275#define UNI2(f,x) { \
6154021b 276 pl_yylval.ival = f; \
376fcdbf
AL
277 PL_expect = x; \
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
6f33ba73
RGS
286#define UNI(f) UNI2(f,XTERM)
287#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 288
376fcdbf 289#define UNIBRACK(f) { \
6154021b 290 pl_yylval.ival = f; \
376fcdbf
AL
291 PL_bufptr = s; \
292 PL_last_uni = PL_oldbufptr; \
293 if (*s == '(') \
294 return REPORT( (int)FUNC1 ); \
29595ff2 295 s = PEEKSPACE(s); \
376fcdbf
AL
296 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
297 }
79072805 298
9f68db38 299/* grandfather return to old style */
78cdf107
Z
300#define OLDLOP(f) \
301 do { \
302 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
303 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
304 pl_yylval.ival = (f); \
305 PL_expect = XTERM; \
306 PL_bufptr = s; \
307 return (int)LSTOP; \
308 } while(0)
79072805 309
8fa7f367
JH
310#ifdef DEBUGGING
311
6154021b 312/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
313enum token_type {
314 TOKENTYPE_NONE,
315 TOKENTYPE_IVAL,
6154021b 316 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
317 TOKENTYPE_PVAL,
318 TOKENTYPE_OPVAL,
319 TOKENTYPE_GVVAL
320};
321
6d4a66ac
NC
322static struct debug_tokens {
323 const int token;
324 enum token_type type;
325 const char *name;
326} const debug_tokens[] =
9041c2e3 327{
bbf60fe6
DM
328 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
329 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
330 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
331 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
332 { ARROW, TOKENTYPE_NONE, "ARROW" },
333 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
334 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
335 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
336 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
337 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 338 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
339 { DO, TOKENTYPE_NONE, "DO" },
340 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
341 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
342 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
343 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
344 { ELSE, TOKENTYPE_NONE, "ELSE" },
345 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
346 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
347 { FOR, TOKENTYPE_IVAL, "FOR" },
348 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
349 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
350 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 351 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
352 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
353 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
354 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 355 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
356 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
357 { IF, TOKENTYPE_IVAL, "IF" },
358 { LABEL, TOKENTYPE_PVAL, "LABEL" },
359 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
360 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
361 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
362 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
363 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
364 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
365 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
366 { MY, TOKENTYPE_IVAL, "MY" },
367 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
368 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
369 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
370 { OROP, TOKENTYPE_IVAL, "OROP" },
371 { OROR, TOKENTYPE_NONE, "OROR" },
372 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
373 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
374 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
375 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
376 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
377 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
378 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
379 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
380 { PREINC, TOKENTYPE_NONE, "PREINC" },
381 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
382 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
383 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
384 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
385 { SUB, TOKENTYPE_NONE, "SUB" },
386 { THING, TOKENTYPE_OPVAL, "THING" },
387 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
388 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
389 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
390 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
391 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
392 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 393 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
394 { WHILE, TOKENTYPE_IVAL, "WHILE" },
395 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 396 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 397 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
398};
399
6154021b 400/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 401
bbf60fe6 402STATIC int
704d4215 403S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 404{
97aff369 405 dVAR;
7918f24d
NC
406
407 PERL_ARGS_ASSERT_TOKEREPORT;
408
bbf60fe6 409 if (DEBUG_T_TEST) {
bd61b366 410 const char *name = NULL;
bbf60fe6 411 enum token_type type = TOKENTYPE_NONE;
f54cb97a 412 const struct debug_tokens *p;
396482e1 413 SV* const report = newSVpvs("<== ");
bbf60fe6 414
f54cb97a 415 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
416 if (p->token == (int)rv) {
417 name = p->name;
418 type = p->type;
419 break;
420 }
421 }
422 if (name)
54667de8 423 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
424 else if ((char)rv > ' ' && (char)rv < '~')
425 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
426 else if (!rv)
396482e1 427 sv_catpvs(report, "EOF");
bbf60fe6
DM
428 else
429 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
430 switch (type) {
431 case TOKENTYPE_NONE:
432 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
433 break;
434 case TOKENTYPE_IVAL:
704d4215 435 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
436 break;
437 case TOKENTYPE_OPNUM:
438 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 439 PL_op_name[lvalp->ival]);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_PVAL:
704d4215 442 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
443 break;
444 case TOKENTYPE_OPVAL:
704d4215 445 if (lvalp->opval) {
401441c0 446 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
447 PL_op_name[lvalp->opval->op_type]);
448 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 449 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 450 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
451 }
452
453 }
401441c0 454 else
396482e1 455 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
456 break;
457 }
b6007c36 458 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
459 };
460 return (int)rv;
998054bd
SC
461}
462
b6007c36
DM
463
464/* print the buffer with suitable escapes */
465
466STATIC void
15f169a1 467S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 468{
396482e1 469 SV* const tmp = newSVpvs("");
7918f24d
NC
470
471 PERL_ARGS_ASSERT_PRINTBUF;
472
b6007c36
DM
473 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
474 SvREFCNT_dec(tmp);
475}
476
8fa7f367
JH
477#endif
478
8290c323
NC
479static int
480S_deprecate_commaless_var_list(pTHX) {
481 PL_expect = XTERM;
482 deprecate("comma-less variable list");
483 return REPORT(','); /* grandfather non-comma-format format */
484}
485
ffb4593c
NT
486/*
487 * S_ao
488 *
c963b151
BD
489 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
490 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
491 */
492
76e3520e 493STATIC int
cea2e8a9 494S_ao(pTHX_ int toketype)
a0d0e21e 495{
97aff369 496 dVAR;
3280af22
NIS
497 if (*PL_bufptr == '=') {
498 PL_bufptr++;
a0d0e21e 499 if (toketype == ANDAND)
6154021b 500 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 501 else if (toketype == OROR)
6154021b 502 pl_yylval.ival = OP_ORASSIGN;
c963b151 503 else if (toketype == DORDOR)
6154021b 504 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
505 toketype = ASSIGNOP;
506 }
507 return toketype;
508}
509
ffb4593c
NT
510/*
511 * S_no_op
512 * When Perl expects an operator and finds something else, no_op
513 * prints the warning. It always prints "<something> found where
514 * operator expected. It prints "Missing semicolon on previous line?"
515 * if the surprise occurs at the start of the line. "do you need to
516 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
517 * where the compiler doesn't know if foo is a method call or a function.
518 * It prints "Missing operator before end of line" if there's nothing
519 * after the missing operator, or "... before <...>" if there is something
520 * after the missing operator.
521 */
522
76e3520e 523STATIC void
15f169a1 524S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 525{
97aff369 526 dVAR;
9d4ba2ae
AL
527 char * const oldbp = PL_bufptr;
528 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 529
7918f24d
NC
530 PERL_ARGS_ASSERT_NO_OP;
531
1189a94a
GS
532 if (!s)
533 s = oldbp;
07c798fb 534 else
1189a94a 535 PL_bufptr = s;
cea2e8a9 536 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
537 if (ckWARN_d(WARN_SYNTAX)) {
538 if (is_first)
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
540 "\t(Missing semicolon on previous line?)\n");
541 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 542 const char *t;
c35e046a
AL
543 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
544 NOOP;
56da5a46
RGS
545 if (t < PL_bufptr && isSPACE(*t))
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
547 "\t(Do you need to predeclare %.*s?)\n",
551405c4 548 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
549 }
550 else {
551 assert(s >= oldbp);
552 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 553 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 554 }
07c798fb 555 }
3280af22 556 PL_bufptr = oldbp;
8990e307
LW
557}
558
ffb4593c
NT
559/*
560 * S_missingterm
561 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 562 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
563 * If we're in a delimited string and the delimiter is a control
564 * character, it's reformatted into a two-char sequence like ^C.
565 * This is fatal.
566 */
567
76e3520e 568STATIC void
cea2e8a9 569S_missingterm(pTHX_ char *s)
8990e307 570{
97aff369 571 dVAR;
8990e307
LW
572 char tmpbuf[3];
573 char q;
574 if (s) {
9d4ba2ae 575 char * const nl = strrchr(s,'\n');
d2719217 576 if (nl)
8990e307
LW
577 *nl = '\0';
578 }
463559e7 579 else if (isCNTRL(PL_multi_close)) {
8990e307 580 *tmpbuf = '^';
585ec06d 581 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
582 tmpbuf[2] = '\0';
583 s = tmpbuf;
584 }
585 else {
eb160463 586 *tmpbuf = (char)PL_multi_close;
8990e307
LW
587 tmpbuf[1] = '\0';
588 s = tmpbuf;
589 }
590 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 591 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 592}
79072805 593
0d863452 594/*
0d863452
RH
595 * Check whether the named feature is enabled.
596 */
26ea9e12
NC
597bool
598Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 599{
97aff369 600 dVAR;
0d863452 601 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 602 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
603
604 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
605
26ea9e12
NC
606 if (namelen > MAX_FEATURE_LEN)
607 return FALSE;
4a731d7b 608 memcpy(&he_name[8], name, namelen);
d4c19fe8 609
7b9ef140 610 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
611}
612
ffb4593c 613/*
9cbb5ea2
GS
614 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
615 * utf16-to-utf8-reversed.
ffb4593c
NT
616 */
617
c39cd008
GS
618#ifdef PERL_CR_FILTER
619static void
620strip_return(SV *sv)
621{
95a20fc0 622 register const char *s = SvPVX_const(sv);
9d4ba2ae 623 register const char * const e = s + SvCUR(sv);
7918f24d
NC
624
625 PERL_ARGS_ASSERT_STRIP_RETURN;
626
c39cd008
GS
627 /* outer loop optimized to do nothing if there are no CR-LFs */
628 while (s < e) {
629 if (*s++ == '\r' && *s == '\n') {
630 /* hit a CR-LF, need to copy the rest */
631 register char *d = s - 1;
632 *d++ = *s++;
633 while (s < e) {
634 if (*s == '\r' && s[1] == '\n')
635 s++;
636 *d++ = *s++;
637 }
638 SvCUR(sv) -= s - d;
639 return;
640 }
641 }
642}
a868473f 643
76e3520e 644STATIC I32
c39cd008 645S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 646{
f54cb97a 647 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
648 if (count > 0 && !maxlen)
649 strip_return(sv);
650 return count;
a868473f
NIS
651}
652#endif
653
ffb4593c 654/*
8eaa0acf
Z
655=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
656
657Creates and initialises a new lexer/parser state object, supplying
658a context in which to lex and parse from a new source of Perl code.
659A pointer to the new state object is placed in L</PL_parser>. An entry
660is made on the save stack so that upon unwinding the new state object
661will be destroyed and the former value of L</PL_parser> will be restored.
662Nothing else need be done to clean up the parsing context.
663
664The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
665non-null, provides a string (in SV form) containing code to be parsed.
666A copy of the string is made, so subsequent modification of I<line>
667does not affect parsing. I<rsfp>, if non-null, provides an input stream
668from which code will be read to be parsed. If both are non-null, the
669code in I<line> comes first and must consist of complete lines of input,
670and I<rsfp> supplies the remainder of the source.
671
672The I<flags> parameter is reserved for future use, and must always
27fcb6ee
FC
673be zero, except for one flag that is currently reserved for perl's internal
674use.
8eaa0acf
Z
675
676=cut
677*/
ffb4593c 678
27fcb6ee
FC
679/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
680 can share filters with the current parser. */
681
a0d0e21e 682void
8eaa0acf 683Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 684{
97aff369 685 dVAR;
6ef55633 686 const char *s = NULL;
8990e307 687 STRLEN len;
5486870f 688 yy_parser *parser, *oparser;
60d63348 689 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 690 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
691
692 /* create and initialise a parser */
693
199e78b7 694 Newxz(parser, 1, yy_parser);
5486870f 695 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
696 PL_parser = parser;
697
28ac2b49
Z
698 parser->stack = NULL;
699 parser->ps = NULL;
700 parser->stack_size = 0;
acdf0a21 701
e3abe207
DM
702 /* on scope exit, free this parser and restore any outer one */
703 SAVEPARSER(parser);
7c4baf47 704 parser->saved_curcop = PL_curcop;
e3abe207 705
acdf0a21 706 /* initialise lexer state */
8990e307 707
fb205e7a
DM
708#ifdef PERL_MAD
709 parser->curforce = -1;
710#else
711 parser->nexttoke = 0;
712#endif
ca4cfd28 713 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 714 parser->copline = NOLINE;
5afb0a62 715 parser->lex_state = LEX_NORMAL;
c2598295 716 parser->expect = XSTATE;
2f9285f8 717 parser->rsfp = rsfp;
27fcb6ee
FC
718 parser->rsfp_filters =
719 !(flags & LEX_START_SAME_FILTER) || !oparser
720 ? newAV()
721 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 722
199e78b7
DM
723 Newx(parser->lex_brackstack, 120, char);
724 Newx(parser->lex_casestack, 12, char);
725 *parser->lex_casestack = '\0';
02b34bbe 726
10efb74f
NC
727 if (line) {
728 s = SvPV_const(line, len);
0abcdfa4
FC
729 parser->linestr = flags & LEX_START_COPIED
730 ? SvREFCNT_inc_simple_NN(line)
731 : newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 732 if (s[len-1] != ';')
bdc0bf6f 733 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
734 } else {
735 parser->linestr = newSVpvs("\n;");
8990e307 736 }
f06b5848
DM
737 parser->oldoldbufptr =
738 parser->oldbufptr =
739 parser->bufptr =
740 parser->linestart = SvPVX(parser->linestr);
741 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
742 parser->last_lop = parser->last_uni = NULL;
60d63348 743 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES);
737c24fc 744
60d63348 745 parser->in_pod = parser->filtered = 0;
79072805 746}
a687059c 747
e3abe207
DM
748
749/* delete a parser object */
750
751void
752Perl_parser_free(pTHX_ const yy_parser *parser)
753{
7918f24d
NC
754 PERL_ARGS_ASSERT_PARSER_FREE;
755
7c4baf47 756 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
757 SvREFCNT_dec(parser->linestr);
758
2f9285f8
DM
759 if (parser->rsfp == PerlIO_stdin())
760 PerlIO_clearerr(parser->rsfp);
799361c3
SH
761 else if (parser->rsfp && (!parser->old_parser ||
762 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 763 PerlIO_close(parser->rsfp);
5486870f 764 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 765
e3abe207
DM
766 Safefree(parser->lex_brackstack);
767 Safefree(parser->lex_casestack);
768 PL_parser = parser->old_parser;
769 Safefree(parser);
770}
771
772
ffb4593c 773/*
f0e67a1d
Z
774=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
775
776Buffer scalar containing the chunk currently under consideration of the
777text currently being lexed. This is always a plain string scalar (for
778which C<SvPOK> is true). It is not intended to be used as a scalar by
779normal scalar means; instead refer to the buffer directly by the pointer
780variables described below.
781
782The lexer maintains various C<char*> pointers to things in the
783C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
784reallocated, all of these pointers must be updated. Don't attempt to
785do this manually, but rather use L</lex_grow_linestr> if you need to
786reallocate the buffer.
787
788The content of the text chunk in the buffer is commonly exactly one
789complete line of input, up to and including a newline terminator,
790but there are situations where it is otherwise. The octets of the
791buffer may be intended to be interpreted as either UTF-8 or Latin-1.
792The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
793flag on this scalar, which may disagree with it.
794
795For direct examination of the buffer, the variable
796L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
797lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
798of these pointers is usually preferable to examination of the scalar
799through normal scalar means.
800
801=for apidoc AmxU|char *|PL_parser-E<gt>bufend
802
803Direct pointer to the end of the chunk of text currently being lexed, the
804end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
805+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
806always located at the end of the buffer, and does not count as part of
807the buffer's contents.
808
809=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
810
811Points to the current position of lexing inside the lexer buffer.
812Characters around this point may be freely examined, within
813the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
814L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
815interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
816
817Lexing code (whether in the Perl core or not) moves this pointer past
818the characters that it consumes. It is also expected to perform some
819bookkeeping whenever a newline character is consumed. This movement
820can be more conveniently performed by the function L</lex_read_to>,
821which handles newlines appropriately.
822
823Interpretation of the buffer's octets can be abstracted out by
824using the slightly higher-level functions L</lex_peek_unichar> and
825L</lex_read_unichar>.
826
827=for apidoc AmxU|char *|PL_parser-E<gt>linestart
828
829Points to the start of the current line inside the lexer buffer.
830This is useful for indicating at which column an error occurred, and
831not much else. This must be updated by any lexing code that consumes
832a newline; the function L</lex_read_to> handles this detail.
833
834=cut
835*/
836
837/*
838=for apidoc Amx|bool|lex_bufutf8
839
840Indicates whether the octets in the lexer buffer
841(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
842of Unicode characters. If not, they should be interpreted as Latin-1
843characters. This is analogous to the C<SvUTF8> flag for scalars.
844
845In UTF-8 mode, it is not guaranteed that the lexer buffer actually
846contains valid UTF-8. Lexing code must be robust in the face of invalid
847encoding.
848
849The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
850is significant, but not the whole story regarding the input character
851encoding. Normally, when a file is being read, the scalar contains octets
852and its C<SvUTF8> flag is off, but the octets should be interpreted as
853UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
854however, the scalar may have the C<SvUTF8> flag on, and in this case its
855octets should be interpreted as UTF-8 unless the C<use bytes> pragma
856is in effect. This logic may change in the future; use this function
857instead of implementing the logic yourself.
858
859=cut
860*/
861
862bool
863Perl_lex_bufutf8(pTHX)
864{
865 return UTF;
866}
867
868/*
869=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
870
871Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
872at least I<len> octets (including terminating NUL). Returns a
873pointer to the reallocated buffer. This is necessary before making
874any direct modification of the buffer that would increase its length.
875L</lex_stuff_pvn> provides a more convenient way to insert text into
876the buffer.
877
878Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
879this function updates all of the lexer's variables that point directly
880into the buffer.
881
882=cut
883*/
884
885char *
886Perl_lex_grow_linestr(pTHX_ STRLEN len)
887{
888 SV *linestr;
889 char *buf;
890 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
891 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
892 linestr = PL_parser->linestr;
893 buf = SvPVX(linestr);
894 if (len <= SvLEN(linestr))
895 return buf;
896 bufend_pos = PL_parser->bufend - buf;
897 bufptr_pos = PL_parser->bufptr - buf;
898 oldbufptr_pos = PL_parser->oldbufptr - buf;
899 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
900 linestart_pos = PL_parser->linestart - buf;
901 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
902 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
903 buf = sv_grow(linestr, len);
904 PL_parser->bufend = buf + bufend_pos;
905 PL_parser->bufptr = buf + bufptr_pos;
906 PL_parser->oldbufptr = buf + oldbufptr_pos;
907 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
908 PL_parser->linestart = buf + linestart_pos;
909 if (PL_parser->last_uni)
910 PL_parser->last_uni = buf + last_uni_pos;
911 if (PL_parser->last_lop)
912 PL_parser->last_lop = buf + last_lop_pos;
913 return buf;
914}
915
916/*
83aa740e 917=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
918
919Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
920immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
921reallocating the buffer if necessary. This means that lexing code that
922runs later will see the characters as if they had appeared in the input.
923It is not recommended to do this as part of normal parsing, and most
924uses of this facility run the risk of the inserted characters being
925interpreted in an unintended manner.
926
927The string to be inserted is represented by I<len> octets starting
928at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
929according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
930The characters are recoded for the lexer buffer, according to how the
931buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 932to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
933function is more convenient.
934
935=cut
936*/
937
938void
83aa740e 939Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 940{
749123ff 941 dVAR;
f0e67a1d
Z
942 char *bufptr;
943 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
944 if (flags & ~(LEX_STUFF_UTF8))
945 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
946 if (UTF) {
947 if (flags & LEX_STUFF_UTF8) {
948 goto plain_copy;
949 } else {
950 STRLEN highhalf = 0;
83aa740e 951 const char *p, *e = pv+len;
f0e67a1d
Z
952 for (p = pv; p != e; p++)
953 highhalf += !!(((U8)*p) & 0x80);
954 if (!highhalf)
955 goto plain_copy;
956 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
957 bufptr = PL_parser->bufptr;
958 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
959 SvCUR_set(PL_parser->linestr,
960 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
961 PL_parser->bufend += len+highhalf;
962 for (p = pv; p != e; p++) {
963 U8 c = (U8)*p;
964 if (c & 0x80) {
965 *bufptr++ = (char)(0xc0 | (c >> 6));
966 *bufptr++ = (char)(0x80 | (c & 0x3f));
967 } else {
968 *bufptr++ = (char)c;
969 }
970 }
971 }
972 } else {
973 if (flags & LEX_STUFF_UTF8) {
974 STRLEN highhalf = 0;
83aa740e 975 const char *p, *e = pv+len;
f0e67a1d
Z
976 for (p = pv; p != e; p++) {
977 U8 c = (U8)*p;
978 if (c >= 0xc4) {
979 Perl_croak(aTHX_ "Lexing code attempted to stuff "
980 "non-Latin-1 character into Latin-1 input");
981 } else if (c >= 0xc2 && p+1 != e &&
982 (((U8)p[1]) & 0xc0) == 0x80) {
983 p++;
984 highhalf++;
985 } else if (c >= 0x80) {
986 /* malformed UTF-8 */
987 ENTER;
988 SAVESPTR(PL_warnhook);
989 PL_warnhook = PERL_WARNHOOK_FATAL;
990 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
991 LEAVE;
992 }
993 }
994 if (!highhalf)
995 goto plain_copy;
996 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
997 bufptr = PL_parser->bufptr;
998 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
999 SvCUR_set(PL_parser->linestr,
1000 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1001 PL_parser->bufend += len-highhalf;
1002 for (p = pv; p != e; p++) {
1003 U8 c = (U8)*p;
1004 if (c & 0x80) {
1005 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1006 p++;
1007 } else {
1008 *bufptr++ = (char)c;
1009 }
1010 }
1011 } else {
1012 plain_copy:
1013 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1014 bufptr = PL_parser->bufptr;
1015 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1016 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1017 PL_parser->bufend += len;
1018 Copy(pv, bufptr, len, char);
1019 }
1020 }
1021}
1022
1023/*
9dcc53ea
Z
1024=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1025
1026Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1027immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1028reallocating the buffer if necessary. This means that lexing code that
1029runs later will see the characters as if they had appeared in the input.
1030It is not recommended to do this as part of normal parsing, and most
1031uses of this facility run the risk of the inserted characters being
1032interpreted in an unintended manner.
1033
1034The string to be inserted is represented by octets starting at I<pv>
1035and continuing to the first nul. These octets are interpreted as either
1036UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1037in I<flags>. The characters are recoded for the lexer buffer, according
1038to how the buffer is currently being interpreted (L</lex_bufutf8>).
1039If it is not convenient to nul-terminate a string to be inserted, the
1040L</lex_stuff_pvn> function is more appropriate.
1041
1042=cut
1043*/
1044
1045void
1046Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1047{
1048 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1049 lex_stuff_pvn(pv, strlen(pv), flags);
1050}
1051
1052/*
f0e67a1d
Z
1053=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1054
1055Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1056immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1057reallocating the buffer if necessary. This means that lexing code that
1058runs later will see the characters as if they had appeared in the input.
1059It is not recommended to do this as part of normal parsing, and most
1060uses of this facility run the risk of the inserted characters being
1061interpreted in an unintended manner.
1062
1063The string to be inserted is the string value of I<sv>. The characters
1064are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1065being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1066not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1067need to construct a scalar.
1068
1069=cut
1070*/
1071
1072void
1073Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1074{
1075 char *pv;
1076 STRLEN len;
1077 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1078 if (flags)
1079 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1080 pv = SvPV(sv, len);
1081 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1082}
1083
1084/*
1085=for apidoc Amx|void|lex_unstuff|char *ptr
1086
1087Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1088I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1089This hides the discarded text from any lexing code that runs later,
1090as if the text had never appeared.
1091
1092This is not the normal way to consume lexed text. For that, use
1093L</lex_read_to>.
1094
1095=cut
1096*/
1097
1098void
1099Perl_lex_unstuff(pTHX_ char *ptr)
1100{
1101 char *buf, *bufend;
1102 STRLEN unstuff_len;
1103 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1104 buf = PL_parser->bufptr;
1105 if (ptr < buf)
1106 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1107 if (ptr == buf)
1108 return;
1109 bufend = PL_parser->bufend;
1110 if (ptr > bufend)
1111 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1112 unstuff_len = ptr - buf;
1113 Move(ptr, buf, bufend+1-ptr, char);
1114 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1115 PL_parser->bufend = bufend - unstuff_len;
1116}
1117
1118/*
1119=for apidoc Amx|void|lex_read_to|char *ptr
1120
1121Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1122to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1123performing the correct bookkeeping whenever a newline character is passed.
1124This is the normal way to consume lexed text.
1125
1126Interpretation of the buffer's octets can be abstracted out by
1127using the slightly higher-level functions L</lex_peek_unichar> and
1128L</lex_read_unichar>.
1129
1130=cut
1131*/
1132
1133void
1134Perl_lex_read_to(pTHX_ char *ptr)
1135{
1136 char *s;
1137 PERL_ARGS_ASSERT_LEX_READ_TO;
1138 s = PL_parser->bufptr;
1139 if (ptr < s || ptr > PL_parser->bufend)
1140 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1141 for (; s != ptr; s++)
1142 if (*s == '\n') {
1143 CopLINE_inc(PL_curcop);
1144 PL_parser->linestart = s+1;
1145 }
1146 PL_parser->bufptr = ptr;
1147}
1148
1149/*
1150=for apidoc Amx|void|lex_discard_to|char *ptr
1151
1152Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1153up to I<ptr>. The remaining content of the buffer will be moved, and
1154all pointers into the buffer updated appropriately. I<ptr> must not
1155be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1156it is not permitted to discard text that has yet to be lexed.
1157
1158Normally it is not necessarily to do this directly, because it suffices to
1159use the implicit discarding behaviour of L</lex_next_chunk> and things
1160based on it. However, if a token stretches across multiple lines,
1f317c95 1161and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1162that purpose, then after completion of the token it would be wise to
1163explicitly discard the now-unneeded earlier lines, to avoid future
1164multi-line tokens growing the buffer without bound.
1165
1166=cut
1167*/
1168
1169void
1170Perl_lex_discard_to(pTHX_ char *ptr)
1171{
1172 char *buf;
1173 STRLEN discard_len;
1174 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1175 buf = SvPVX(PL_parser->linestr);
1176 if (ptr < buf)
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1178 if (ptr == buf)
1179 return;
1180 if (ptr > PL_parser->bufptr)
1181 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1182 discard_len = ptr - buf;
1183 if (PL_parser->oldbufptr < ptr)
1184 PL_parser->oldbufptr = ptr;
1185 if (PL_parser->oldoldbufptr < ptr)
1186 PL_parser->oldoldbufptr = ptr;
1187 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1188 PL_parser->last_uni = NULL;
1189 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1190 PL_parser->last_lop = NULL;
1191 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1192 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1193 PL_parser->bufend -= discard_len;
1194 PL_parser->bufptr -= discard_len;
1195 PL_parser->oldbufptr -= discard_len;
1196 PL_parser->oldoldbufptr -= discard_len;
1197 if (PL_parser->last_uni)
1198 PL_parser->last_uni -= discard_len;
1199 if (PL_parser->last_lop)
1200 PL_parser->last_lop -= discard_len;
1201}
1202
1203/*
1204=for apidoc Amx|bool|lex_next_chunk|U32 flags
1205
1206Reads in the next chunk of text to be lexed, appending it to
1207L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1208looked to the end of the current chunk and wants to know more. It is
1209usual, but not necessary, for lexing to have consumed the entirety of
1210the current chunk at this time.
1211
1212If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1213chunk (i.e., the current chunk has been entirely consumed), normally the
1214current chunk will be discarded at the same time that the new chunk is
1215read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1216will not be discarded. If the current chunk has not been entirely
1217consumed, then it will not be discarded regardless of the flag.
1218
1219Returns true if some new text was added to the buffer, or false if the
1220buffer has reached the end of the input text.
1221
1222=cut
1223*/
1224
1225#define LEX_FAKE_EOF 0x80000000
1226
1227bool
1228Perl_lex_next_chunk(pTHX_ U32 flags)
1229{
1230 SV *linestr;
1231 char *buf;
1232 STRLEN old_bufend_pos, new_bufend_pos;
1233 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1234 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1235 bool got_some_for_debugger = 0;
f0e67a1d
Z
1236 bool got_some;
1237 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1238 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1239 linestr = PL_parser->linestr;
1240 buf = SvPVX(linestr);
1241 if (!(flags & LEX_KEEP_PREVIOUS) &&
1242 PL_parser->bufptr == PL_parser->bufend) {
1243 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1244 linestart_pos = 0;
1245 if (PL_parser->last_uni != PL_parser->bufend)
1246 PL_parser->last_uni = NULL;
1247 if (PL_parser->last_lop != PL_parser->bufend)
1248 PL_parser->last_lop = NULL;
1249 last_uni_pos = last_lop_pos = 0;
1250 *buf = 0;
1251 SvCUR(linestr) = 0;
1252 } else {
1253 old_bufend_pos = PL_parser->bufend - buf;
1254 bufptr_pos = PL_parser->bufptr - buf;
1255 oldbufptr_pos = PL_parser->oldbufptr - buf;
1256 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1257 linestart_pos = PL_parser->linestart - buf;
1258 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1259 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1260 }
1261 if (flags & LEX_FAKE_EOF) {
1262 goto eof;
60d63348 1263 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1264 got_some = 0;
1265 } else if (filter_gets(linestr, old_bufend_pos)) {
1266 got_some = 1;
17cc9359 1267 got_some_for_debugger = 1;
f0e67a1d 1268 } else {
580561a3
Z
1269 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1270 sv_setpvs(linestr, "");
f0e67a1d
Z
1271 eof:
1272 /* End of real input. Close filehandle (unless it was STDIN),
1273 * then add implicit termination.
1274 */
1275 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1276 PerlIO_clearerr(PL_parser->rsfp);
1277 else if (PL_parser->rsfp)
1278 (void)PerlIO_close(PL_parser->rsfp);
1279 PL_parser->rsfp = NULL;
60d63348 1280 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1281#ifdef PERL_MAD
1282 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1283 PL_faketokens = 1;
1284#endif
1285 if (!PL_in_eval && PL_minus_p) {
1286 sv_catpvs(linestr,
1287 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1288 PL_minus_n = PL_minus_p = 0;
1289 } else if (!PL_in_eval && PL_minus_n) {
1290 sv_catpvs(linestr, /*{*/";}");
1291 PL_minus_n = 0;
1292 } else
1293 sv_catpvs(linestr, ";");
1294 got_some = 1;
1295 }
1296 buf = SvPVX(linestr);
1297 new_bufend_pos = SvCUR(linestr);
1298 PL_parser->bufend = buf + new_bufend_pos;
1299 PL_parser->bufptr = buf + bufptr_pos;
1300 PL_parser->oldbufptr = buf + oldbufptr_pos;
1301 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1302 PL_parser->linestart = buf + linestart_pos;
1303 if (PL_parser->last_uni)
1304 PL_parser->last_uni = buf + last_uni_pos;
1305 if (PL_parser->last_lop)
1306 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1307 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1308 PL_curstash != PL_debstash) {
1309 /* debugger active and we're not compiling the debugger code,
1310 * so store the line into the debugger's array of lines
1311 */
1312 update_debugger_info(NULL, buf+old_bufend_pos,
1313 new_bufend_pos-old_bufend_pos);
1314 }
1315 return got_some;
1316}
1317
1318/*
1319=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1320
1321Looks ahead one (Unicode) character in the text currently being lexed.
1322Returns the codepoint (unsigned integer value) of the next character,
1323or -1 if lexing has reached the end of the input text. To consume the
1324peeked character, use L</lex_read_unichar>.
1325
1326If the next character is in (or extends into) the next chunk of input
1327text, the next chunk will be read in. Normally the current chunk will be
1328discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1329then the current chunk will not be discarded.
1330
1331If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1332is encountered, an exception is generated.
1333
1334=cut
1335*/
1336
1337I32
1338Perl_lex_peek_unichar(pTHX_ U32 flags)
1339{
749123ff 1340 dVAR;
f0e67a1d
Z
1341 char *s, *bufend;
1342 if (flags & ~(LEX_KEEP_PREVIOUS))
1343 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1344 s = PL_parser->bufptr;
1345 bufend = PL_parser->bufend;
1346 if (UTF) {
1347 U8 head;
1348 I32 unichar;
1349 STRLEN len, retlen;
1350 if (s == bufend) {
1351 if (!lex_next_chunk(flags))
1352 return -1;
1353 s = PL_parser->bufptr;
1354 bufend = PL_parser->bufend;
1355 }
1356 head = (U8)*s;
1357 if (!(head & 0x80))
1358 return head;
1359 if (head & 0x40) {
1360 len = PL_utf8skip[head];
1361 while ((STRLEN)(bufend-s) < len) {
1362 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1363 break;
1364 s = PL_parser->bufptr;
1365 bufend = PL_parser->bufend;
1366 }
1367 }
1368 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1369 if (retlen == (STRLEN)-1) {
1370 /* malformed UTF-8 */
1371 ENTER;
1372 SAVESPTR(PL_warnhook);
1373 PL_warnhook = PERL_WARNHOOK_FATAL;
1374 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1375 LEAVE;
1376 }
1377 return unichar;
1378 } else {
1379 if (s == bufend) {
1380 if (!lex_next_chunk(flags))
1381 return -1;
1382 s = PL_parser->bufptr;
1383 }
1384 return (U8)*s;
1385 }
1386}
1387
1388/*
1389=for apidoc Amx|I32|lex_read_unichar|U32 flags
1390
1391Reads the next (Unicode) character in the text currently being lexed.
1392Returns the codepoint (unsigned integer value) of the character read,
1393and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1394if lexing has reached the end of the input text. To non-destructively
1395examine the next character, use L</lex_peek_unichar> instead.
1396
1397If the next character is in (or extends into) the next chunk of input
1398text, the next chunk will be read in. Normally the current chunk will be
1399discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1400then the current chunk will not be discarded.
1401
1402If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1403is encountered, an exception is generated.
1404
1405=cut
1406*/
1407
1408I32
1409Perl_lex_read_unichar(pTHX_ U32 flags)
1410{
1411 I32 c;
1412 if (flags & ~(LEX_KEEP_PREVIOUS))
1413 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1414 c = lex_peek_unichar(flags);
1415 if (c != -1) {
1416 if (c == '\n')
1417 CopLINE_inc(PL_curcop);
d9018cbe
EB
1418 if (UTF)
1419 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1420 else
1421 ++(PL_parser->bufptr);
f0e67a1d
Z
1422 }
1423 return c;
1424}
1425
1426/*
1427=for apidoc Amx|void|lex_read_space|U32 flags
1428
1429Reads optional spaces, in Perl style, in the text currently being
1430lexed. The spaces may include ordinary whitespace characters and
1431Perl-style comments. C<#line> directives are processed if encountered.
1432L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1433at a non-space character (or the end of the input text).
1434
1435If spaces extend into the next chunk of input text, the next chunk will
1436be read in. Normally the current chunk will be discarded at the same
1437time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1438chunk will not be discarded.
1439
1440=cut
1441*/
1442
f0998909
Z
1443#define LEX_NO_NEXT_CHUNK 0x80000000
1444
f0e67a1d
Z
1445void
1446Perl_lex_read_space(pTHX_ U32 flags)
1447{
1448 char *s, *bufend;
1449 bool need_incline = 0;
f0998909 1450 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1451 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1452#ifdef PERL_MAD
1453 if (PL_skipwhite) {
1454 sv_free(PL_skipwhite);
1455 PL_skipwhite = NULL;
1456 }
1457 if (PL_madskills)
1458 PL_skipwhite = newSVpvs("");
1459#endif /* PERL_MAD */
1460 s = PL_parser->bufptr;
1461 bufend = PL_parser->bufend;
1462 while (1) {
1463 char c = *s;
1464 if (c == '#') {
1465 do {
1466 c = *++s;
1467 } while (!(c == '\n' || (c == 0 && s == bufend)));
1468 } else if (c == '\n') {
1469 s++;
1470 PL_parser->linestart = s;
1471 if (s == bufend)
1472 need_incline = 1;
1473 else
1474 incline(s);
1475 } else if (isSPACE(c)) {
1476 s++;
1477 } else if (c == 0 && s == bufend) {
1478 bool got_more;
1479#ifdef PERL_MAD
1480 if (PL_madskills)
1481 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1482#endif /* PERL_MAD */
f0998909
Z
1483 if (flags & LEX_NO_NEXT_CHUNK)
1484 break;
f0e67a1d
Z
1485 PL_parser->bufptr = s;
1486 CopLINE_inc(PL_curcop);
1487 got_more = lex_next_chunk(flags);
1488 CopLINE_dec(PL_curcop);
1489 s = PL_parser->bufptr;
1490 bufend = PL_parser->bufend;
1491 if (!got_more)
1492 break;
1493 if (need_incline && PL_parser->rsfp) {
1494 incline(s);
1495 need_incline = 0;
1496 }
1497 } else {
1498 break;
1499 }
1500 }
1501#ifdef PERL_MAD
1502 if (PL_madskills)
1503 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1504#endif /* PERL_MAD */
1505 PL_parser->bufptr = s;
1506}
1507
1508/*
ffb4593c
NT
1509 * S_incline
1510 * This subroutine has nothing to do with tilting, whether at windmills
1511 * or pinball tables. Its name is short for "increment line". It
57843af0 1512 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1513 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1514 * # line 500 "foo.pm"
1515 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1516 */
1517
76e3520e 1518STATIC void
d9095cec 1519S_incline(pTHX_ const char *s)
463ee0b2 1520{
97aff369 1521 dVAR;
d9095cec
NC
1522 const char *t;
1523 const char *n;
1524 const char *e;
8818d409 1525 line_t line_num;
463ee0b2 1526
7918f24d
NC
1527 PERL_ARGS_ASSERT_INCLINE;
1528
57843af0 1529 CopLINE_inc(PL_curcop);
463ee0b2
LW
1530 if (*s++ != '#')
1531 return;
d4c19fe8
AL
1532 while (SPACE_OR_TAB(*s))
1533 s++;
73659bf1
GS
1534 if (strnEQ(s, "line", 4))
1535 s += 4;
1536 else
1537 return;
084592ab 1538 if (SPACE_OR_TAB(*s))
73659bf1 1539 s++;
4e553d73 1540 else
73659bf1 1541 return;
d4c19fe8
AL
1542 while (SPACE_OR_TAB(*s))
1543 s++;
463ee0b2
LW
1544 if (!isDIGIT(*s))
1545 return;
d4c19fe8 1546
463ee0b2
LW
1547 n = s;
1548 while (isDIGIT(*s))
1549 s++;
07714eb4 1550 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1551 return;
bf4acbe4 1552 while (SPACE_OR_TAB(*s))
463ee0b2 1553 s++;
73659bf1 1554 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1555 s++;
73659bf1
GS
1556 e = t + 1;
1557 }
463ee0b2 1558 else {
c35e046a
AL
1559 t = s;
1560 while (!isSPACE(*t))
1561 t++;
73659bf1 1562 e = t;
463ee0b2 1563 }
bf4acbe4 1564 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1565 e++;
1566 if (*e != '\n' && *e != '\0')
1567 return; /* false alarm */
1568
8818d409
FC
1569 line_num = atoi(n)-1;
1570
f4dd75d9 1571 if (t - s > 0) {
d9095cec 1572 const STRLEN len = t - s;
19bad673
NC
1573 SV *const temp_sv = CopFILESV(PL_curcop);
1574 const char *cf;
1575 STRLEN tmplen;
1576
1577 if (temp_sv) {
1578 cf = SvPVX(temp_sv);
1579 tmplen = SvCUR(temp_sv);
1580 } else {
1581 cf = NULL;
1582 tmplen = 0;
1583 }
1584
d1299d44 1585 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1586 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1587 * to *{"::_<newfilename"} */
44867030
NC
1588 /* However, the long form of evals is only turned on by the
1589 debugger - usually they're "(eval %lu)" */
1590 char smallbuf[128];
1591 char *tmpbuf;
1592 GV **gvp;
d9095cec 1593 STRLEN tmplen2 = len;
798b63bc 1594 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1595 tmpbuf = smallbuf;
1596 else
2ae0db35 1597 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1598 tmpbuf[0] = '_';
1599 tmpbuf[1] = '<';
2ae0db35 1600 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1601 tmplen += 2;
8a5ee598
RGS
1602 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1603 if (gvp) {
44867030
NC
1604 char *tmpbuf2;
1605 GV *gv2;
1606
1607 if (tmplen2 + 2 <= sizeof smallbuf)
1608 tmpbuf2 = smallbuf;
1609 else
1610 Newx(tmpbuf2, tmplen2 + 2, char);
1611
1612 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1613 /* Either they malloc'd it, or we malloc'd it,
1614 so no prefix is present in ours. */
1615 tmpbuf2[0] = '_';
1616 tmpbuf2[1] = '<';
1617 }
1618
1619 memcpy(tmpbuf2 + 2, s, tmplen2);
1620 tmplen2 += 2;
1621
8a5ee598 1622 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1623 if (!isGV(gv2)) {
8a5ee598 1624 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1625 /* adjust ${"::_<newfilename"} to store the new file name */
1626 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1627 /* The line number may differ. If that is the case,
1628 alias the saved lines that are in the array.
1629 Otherwise alias the whole array. */
1630 if (CopLINE(PL_curcop) == line_num) {
1631 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1632 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1633 }
1634 else if (GvAV(*gvp)) {
1635 AV * const av = GvAV(*gvp);
1636 const I32 start = CopLINE(PL_curcop)+1;
1637 I32 items = AvFILLp(av) - start;
1638 if (items > 0) {
1639 AV * const av2 = GvAVn(gv2);
1640 SV **svp = AvARRAY(av) + start;
1641 I32 l = (I32)line_num+1;
1642 while (items--)
1643 av_store(av2, l++, SvREFCNT_inc(*svp++));
1644 }
1645 }
e5527e4b 1646 }
44867030
NC
1647
1648 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1649 }
e66cf94c 1650 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1651 }
05ec9bb3 1652 CopFILE_free(PL_curcop);
d9095cec 1653 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1654 }
8818d409 1655 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1656}
1657
29595ff2 1658#ifdef PERL_MAD
cd81e915 1659/* skip space before PL_thistoken */
29595ff2
NC
1660
1661STATIC char *
1662S_skipspace0(pTHX_ register char *s)
1663{
7918f24d
NC
1664 PERL_ARGS_ASSERT_SKIPSPACE0;
1665
29595ff2
NC
1666 s = skipspace(s);
1667 if (!PL_madskills)
1668 return s;
cd81e915
NC
1669 if (PL_skipwhite) {
1670 if (!PL_thiswhite)
6b29d1f5 1671 PL_thiswhite = newSVpvs("");
cd81e915
NC
1672 sv_catsv(PL_thiswhite, PL_skipwhite);
1673 sv_free(PL_skipwhite);
1674 PL_skipwhite = 0;
1675 }
1676 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1677 return s;
1678}
1679
cd81e915 1680/* skip space after PL_thistoken */
29595ff2
NC
1681
1682STATIC char *
1683S_skipspace1(pTHX_ register char *s)
1684{
d4c19fe8 1685 const char *start = s;
29595ff2
NC
1686 I32 startoff = start - SvPVX(PL_linestr);
1687
7918f24d
NC
1688 PERL_ARGS_ASSERT_SKIPSPACE1;
1689
29595ff2
NC
1690 s = skipspace(s);
1691 if (!PL_madskills)
1692 return s;
1693 start = SvPVX(PL_linestr) + startoff;
cd81e915 1694 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1695 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1696 PL_thistoken = newSVpvn(tstart, start - tstart);
1697 }
1698 PL_realtokenstart = -1;
1699 if (PL_skipwhite) {
1700 if (!PL_nextwhite)
6b29d1f5 1701 PL_nextwhite = newSVpvs("");
cd81e915
NC
1702 sv_catsv(PL_nextwhite, PL_skipwhite);
1703 sv_free(PL_skipwhite);
1704 PL_skipwhite = 0;
29595ff2
NC
1705 }
1706 return s;
1707}
1708
1709STATIC char *
1710S_skipspace2(pTHX_ register char *s, SV **svp)
1711{
c35e046a
AL
1712 char *start;
1713 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1714 const I32 startoff = s - SvPVX(PL_linestr);
1715
7918f24d
NC
1716 PERL_ARGS_ASSERT_SKIPSPACE2;
1717
29595ff2
NC
1718 s = skipspace(s);
1719 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1720 if (!PL_madskills || !svp)
1721 return s;
1722 start = SvPVX(PL_linestr) + startoff;
cd81e915 1723 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1724 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1725 PL_thistoken = newSVpvn(tstart, start - tstart);
1726 PL_realtokenstart = -1;
29595ff2 1727 }
cd81e915 1728 if (PL_skipwhite) {
29595ff2 1729 if (!*svp)
6b29d1f5 1730 *svp = newSVpvs("");
cd81e915
NC
1731 sv_setsv(*svp, PL_skipwhite);
1732 sv_free(PL_skipwhite);
1733 PL_skipwhite = 0;
29595ff2
NC
1734 }
1735
1736 return s;
1737}
1738#endif
1739
80a702cd 1740STATIC void
15f169a1 1741S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1742{
1743 AV *av = CopFILEAVx(PL_curcop);
1744 if (av) {
b9f83d2f 1745 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1746 if (orig_sv)
1747 sv_setsv(sv, orig_sv);
1748 else
1749 sv_setpvn(sv, buf, len);
80a702cd
RGS
1750 (void)SvIOK_on(sv);
1751 SvIV_set(sv, 0);
1752 av_store(av, (I32)CopLINE(PL_curcop), sv);
1753 }
1754}
1755
ffb4593c
NT
1756/*
1757 * S_skipspace
1758 * Called to gobble the appropriate amount and type of whitespace.
1759 * Skips comments as well.
1760 */
1761
76e3520e 1762STATIC char *
cea2e8a9 1763S_skipspace(pTHX_ register char *s)
a687059c 1764{
5db06880 1765#ifdef PERL_MAD
f0e67a1d
Z
1766 char *start = s;
1767#endif /* PERL_MAD */
7918f24d 1768 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1769#ifdef PERL_MAD
cd81e915
NC
1770 if (PL_skipwhite) {
1771 sv_free(PL_skipwhite);
f0e67a1d 1772 PL_skipwhite = NULL;
5db06880 1773 }
f0e67a1d 1774#endif /* PERL_MAD */
3280af22 1775 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1776 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1777 s++;
f0e67a1d
Z
1778 } else {
1779 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1780 PL_bufptr = s;
f0998909
Z
1781 lex_read_space(LEX_KEEP_PREVIOUS |
1782 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1783 LEX_NO_NEXT_CHUNK : 0));
3280af22 1784 s = PL_bufptr;
f0e67a1d
Z
1785 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1786 if (PL_linestart > PL_bufptr)
1787 PL_bufptr = PL_linestart;
1788 return s;
463ee0b2 1789 }
5db06880 1790#ifdef PERL_MAD
f0e67a1d
Z
1791 if (PL_madskills)
1792 PL_skipwhite = newSVpvn(start, s-start);
1793#endif /* PERL_MAD */
5db06880 1794 return s;
a687059c 1795}
378cc40b 1796
ffb4593c
NT
1797/*
1798 * S_check_uni
1799 * Check the unary operators to ensure there's no ambiguity in how they're
1800 * used. An ambiguous piece of code would be:
1801 * rand + 5
1802 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1803 * the +5 is its argument.
1804 */
1805
76e3520e 1806STATIC void
cea2e8a9 1807S_check_uni(pTHX)
ba106d47 1808{
97aff369 1809 dVAR;
d4c19fe8
AL
1810 const char *s;
1811 const char *t;
2f3197b3 1812
3280af22 1813 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1814 return;
3280af22
NIS
1815 while (isSPACE(*PL_last_uni))
1816 PL_last_uni++;
c35e046a
AL
1817 s = PL_last_uni;
1818 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1819 s++;
3280af22 1820 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1821 return;
6136c704 1822
9b387841
NC
1823 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1824 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1825 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1826}
1827
ffb4593c
NT
1828/*
1829 * LOP : macro to build a list operator. Its behaviour has been replaced
1830 * with a subroutine, S_lop() for which LOP is just another name.
1831 */
1832
a0d0e21e
LW
1833#define LOP(f,x) return lop(f,x,s)
1834
ffb4593c
NT
1835/*
1836 * S_lop
1837 * Build a list operator (or something that might be one). The rules:
1838 * - if we have a next token, then it's a list operator [why?]
1839 * - if the next thing is an opening paren, then it's a function
1840 * - else it's a list operator
1841 */
1842
76e3520e 1843STATIC I32
a0be28da 1844S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1845{
97aff369 1846 dVAR;
7918f24d
NC
1847
1848 PERL_ARGS_ASSERT_LOP;
1849
6154021b 1850 pl_yylval.ival = f;
35c8bce7 1851 CLINE;
3280af22
NIS
1852 PL_expect = x;
1853 PL_bufptr = s;
1854 PL_last_lop = PL_oldbufptr;
eb160463 1855 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1856#ifdef PERL_MAD
1857 if (PL_lasttoke)
78cdf107 1858 goto lstop;
5db06880 1859#else
3280af22 1860 if (PL_nexttoke)
78cdf107 1861 goto lstop;
5db06880 1862#endif
79072805 1863 if (*s == '(')
bbf60fe6 1864 return REPORT(FUNC);
29595ff2 1865 s = PEEKSPACE(s);
79072805 1866 if (*s == '(')
bbf60fe6 1867 return REPORT(FUNC);
78cdf107
Z
1868 else {
1869 lstop:
1870 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1871 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1872 return REPORT(LSTOP);
78cdf107 1873 }
79072805
LW
1874}
1875
5db06880
NC
1876#ifdef PERL_MAD
1877 /*
1878 * S_start_force
1879 * Sets up for an eventual force_next(). start_force(0) basically does
1880 * an unshift, while start_force(-1) does a push. yylex removes items
1881 * on the "pop" end.
1882 */
1883
1884STATIC void
1885S_start_force(pTHX_ int where)
1886{
1887 int i;
1888
cd81e915 1889 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1890 where = PL_lasttoke;
cd81e915
NC
1891 assert(PL_curforce < 0 || PL_curforce == where);
1892 if (PL_curforce != where) {
5db06880
NC
1893 for (i = PL_lasttoke; i > where; --i) {
1894 PL_nexttoke[i] = PL_nexttoke[i-1];
1895 }
1896 PL_lasttoke++;
1897 }
cd81e915 1898 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1899 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1900 PL_curforce = where;
1901 if (PL_nextwhite) {
5db06880 1902 if (PL_madskills)
6b29d1f5 1903 curmad('^', newSVpvs(""));
cd81e915 1904 CURMAD('_', PL_nextwhite);
5db06880
NC
1905 }
1906}
1907
1908STATIC void
1909S_curmad(pTHX_ char slot, SV *sv)
1910{
1911 MADPROP **where;
1912
1913 if (!sv)
1914 return;
cd81e915
NC
1915 if (PL_curforce < 0)
1916 where = &PL_thismad;
5db06880 1917 else
cd81e915 1918 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1919
cd81e915 1920 if (PL_faketokens)
76f68e9b 1921 sv_setpvs(sv, "");
5db06880
NC
1922 else {
1923 if (!IN_BYTES) {
1924 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1925 SvUTF8_on(sv);
1926 else if (PL_encoding) {
1927 sv_recode_to_utf8(sv, PL_encoding);
1928 }
1929 }
1930 }
1931
1932 /* keep a slot open for the head of the list? */
1933 if (slot != '_' && *where && (*where)->mad_key == '^') {
1934 (*where)->mad_key = slot;
daba3364 1935 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1936 (*where)->mad_val = (void*)sv;
1937 }
1938 else
1939 addmad(newMADsv(slot, sv), where, 0);
1940}
1941#else
b3f24c00
MHM
1942# define start_force(where) NOOP
1943# define curmad(slot, sv) NOOP
5db06880
NC
1944#endif
1945
ffb4593c
NT
1946/*
1947 * S_force_next
9cbb5ea2 1948 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1949 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1950 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1951 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1952 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1953 */
1954
4e553d73 1955STATIC void
cea2e8a9 1956S_force_next(pTHX_ I32 type)
79072805 1957{
97aff369 1958 dVAR;
704d4215
GG
1959#ifdef DEBUGGING
1960 if (DEBUG_T_TEST) {
1961 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1962 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1963 }
1964#endif
5db06880 1965#ifdef PERL_MAD
cd81e915 1966 if (PL_curforce < 0)
5db06880 1967 start_force(PL_lasttoke);
cd81e915 1968 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1969 if (PL_lex_state != LEX_KNOWNEXT)
1970 PL_lex_defer = PL_lex_state;
1971 PL_lex_state = LEX_KNOWNEXT;
1972 PL_lex_expect = PL_expect;
cd81e915 1973 PL_curforce = -1;
5db06880 1974#else
3280af22
NIS
1975 PL_nexttype[PL_nexttoke] = type;
1976 PL_nexttoke++;
1977 if (PL_lex_state != LEX_KNOWNEXT) {
1978 PL_lex_defer = PL_lex_state;
1979 PL_lex_expect = PL_expect;
1980 PL_lex_state = LEX_KNOWNEXT;
79072805 1981 }
5db06880 1982#endif
79072805
LW
1983}
1984
28ac2b49
Z
1985void
1986Perl_yyunlex(pTHX)
1987{
a7aaec61
Z
1988 int yyc = PL_parser->yychar;
1989 if (yyc != YYEMPTY) {
1990 if (yyc) {
1991 start_force(-1);
1992 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1993 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1994 PL_lex_allbrackets--;
a7aaec61 1995 PL_lex_brackets--;
78cdf107
Z
1996 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1997 } else if (yyc == '('/*)*/) {
1998 PL_lex_allbrackets--;
1999 yyc |= (2<<24);
a7aaec61
Z
2000 }
2001 force_next(yyc);
2002 }
28ac2b49
Z
2003 PL_parser->yychar = YYEMPTY;
2004 }
2005}
2006
d0a148a6 2007STATIC SV *
15f169a1 2008S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2009{
97aff369 2010 dVAR;
740cce10 2011 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2012 !IN_BYTES
2013 && UTF
2014 && !is_ascii_string((const U8*)start, len)
740cce10 2015 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2016 return sv;
2017}
2018
ffb4593c
NT
2019/*
2020 * S_force_word
2021 * When the lexer knows the next thing is a word (for instance, it has
2022 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2023 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2024 * lookahead.
ffb4593c
NT
2025 *
2026 * Arguments:
b1b65b59 2027 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2028 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2029 * int check_keyword : if true, Perl checks to make sure the word isn't
2030 * a keyword (do this if the word is a label, e.g. goto FOO)
2031 * int allow_pack : if true, : characters will also be allowed (require,
2032 * use, etc. do this)
9cbb5ea2 2033 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2034 */
2035
76e3520e 2036STATIC char *
cea2e8a9 2037S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2038{
97aff369 2039 dVAR;
463ee0b2
LW
2040 register char *s;
2041 STRLEN len;
4e553d73 2042
7918f24d
NC
2043 PERL_ARGS_ASSERT_FORCE_WORD;
2044
29595ff2 2045 start = SKIPSPACE1(start);
463ee0b2 2046 s = start;
7e2040f0 2047 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2048 (allow_pack && *s == ':') ||
15f0808c 2049 (allow_initial_tick && *s == '\'') )
a0d0e21e 2050 {
3280af22 2051 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2052 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2053 return start;
cd81e915 2054 start_force(PL_curforce);
5db06880
NC
2055 if (PL_madskills)
2056 curmad('X', newSVpvn(start,s-start));
463ee0b2 2057 if (token == METHOD) {
29595ff2 2058 s = SKIPSPACE1(s);
463ee0b2 2059 if (*s == '(')
3280af22 2060 PL_expect = XTERM;
463ee0b2 2061 else {
3280af22 2062 PL_expect = XOPERATOR;
463ee0b2 2063 }
79072805 2064 }
e74e6b3d 2065 if (PL_madskills)
63575281 2066 curmad('g', newSVpvs( "forced" ));
9ded7720 2067 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2068 = (OP*)newSVOP(OP_CONST,0,
2069 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2070 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2071 force_next(token);
2072 }
2073 return s;
2074}
2075
ffb4593c
NT
2076/*
2077 * S_force_ident
9cbb5ea2 2078 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2079 * text only contains the "foo" portion. The first argument is a pointer
2080 * to the "foo", and the second argument is the type symbol to prefix.
2081 * Forces the next token to be a "WORD".
9cbb5ea2 2082 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2083 */
2084
76e3520e 2085STATIC void
bfed75c6 2086S_force_ident(pTHX_ register const char *s, int kind)
79072805 2087{
97aff369 2088 dVAR;
7918f24d
NC
2089
2090 PERL_ARGS_ASSERT_FORCE_IDENT;
2091
c35e046a 2092 if (*s) {
90e5519e 2093 const STRLEN len = strlen(s);
728847b1
BF
2094 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2095 UTF ? SVf_UTF8 : 0));
cd81e915 2096 start_force(PL_curforce);
9ded7720 2097 NEXTVAL_NEXTTOKE.opval = o;
79072805 2098 force_next(WORD);
748a9306 2099 if (kind) {
11343788 2100 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2101 /* XXX see note in pp_entereval() for why we forgo typo
2102 warnings if the symbol must be introduced in an eval.
2103 GSAR 96-10-12 */
90e5519e 2104 gv_fetchpvn_flags(s, len,
728847b1
BF
2105 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2106 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2107 kind == '$' ? SVt_PV :
2108 kind == '@' ? SVt_PVAV :
2109 kind == '%' ? SVt_PVHV :
a0d0e21e 2110 SVt_PVGV
90e5519e 2111 );
748a9306 2112 }
79072805
LW
2113 }
2114}
2115
1571675a
GS
2116NV
2117Perl_str_to_version(pTHX_ SV *sv)
2118{
2119 NV retval = 0.0;
2120 NV nshift = 1.0;
2121 STRLEN len;
cfd0369c 2122 const char *start = SvPV_const(sv,len);
9d4ba2ae 2123 const char * const end = start + len;
504618e9 2124 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2125
2126 PERL_ARGS_ASSERT_STR_TO_VERSION;
2127
1571675a 2128 while (start < end) {
ba210ebe 2129 STRLEN skip;
1571675a
GS
2130 UV n;
2131 if (utf)
9041c2e3 2132 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2133 else {
2134 n = *(U8*)start;
2135 skip = 1;
2136 }
2137 retval += ((NV)n)/nshift;
2138 start += skip;
2139 nshift *= 1000;
2140 }
2141 return retval;
2142}
2143
4e553d73 2144/*
ffb4593c
NT
2145 * S_force_version
2146 * Forces the next token to be a version number.
e759cc13
RGS
2147 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2148 * and if "guessing" is TRUE, then no new token is created (and the caller
2149 * must use an alternative parsing method).
ffb4593c
NT
2150 */
2151
76e3520e 2152STATIC char *
e759cc13 2153S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2154{
97aff369 2155 dVAR;
5f66b61c 2156 OP *version = NULL;
44dcb63b 2157 char *d;
5db06880
NC
2158#ifdef PERL_MAD
2159 I32 startoff = s - SvPVX(PL_linestr);
2160#endif
89bfa8cd 2161
7918f24d
NC
2162 PERL_ARGS_ASSERT_FORCE_VERSION;
2163
29595ff2 2164 s = SKIPSPACE1(s);
89bfa8cd 2165
44dcb63b 2166 d = s;
dd629d5b 2167 if (*d == 'v')
44dcb63b 2168 d++;
44dcb63b 2169 if (isDIGIT(*d)) {
e759cc13
RGS
2170 while (isDIGIT(*d) || *d == '_' || *d == '.')
2171 d++;
5db06880
NC
2172#ifdef PERL_MAD
2173 if (PL_madskills) {
cd81e915 2174 start_force(PL_curforce);
5db06880
NC
2175 curmad('X', newSVpvn(s,d-s));
2176 }
2177#endif
4e4da3ac 2178 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2179 SV *ver;
8d08d9ba
DG
2180#ifdef USE_LOCALE_NUMERIC
2181 char *loc = setlocale(LC_NUMERIC, "C");
2182#endif
6154021b 2183 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2184#ifdef USE_LOCALE_NUMERIC
2185 setlocale(LC_NUMERIC, loc);
2186#endif
6154021b 2187 version = pl_yylval.opval;
dd629d5b
GS
2188 ver = cSVOPx(version)->op_sv;
2189 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2190 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2191 SvNV_set(ver, str_to_version(ver));
1571675a 2192 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2193 }
89bfa8cd 2194 }
5db06880
NC
2195 else if (guessing) {
2196#ifdef PERL_MAD
2197 if (PL_madskills) {
cd81e915
NC
2198 sv_free(PL_nextwhite); /* let next token collect whitespace */
2199 PL_nextwhite = 0;
5db06880
NC
2200 s = SvPVX(PL_linestr) + startoff;
2201 }
2202#endif
e759cc13 2203 return s;
5db06880 2204 }
89bfa8cd
PP
2205 }
2206
5db06880
NC
2207#ifdef PERL_MAD
2208 if (PL_madskills && !version) {
cd81e915
NC
2209 sv_free(PL_nextwhite); /* let next token collect whitespace */
2210 PL_nextwhite = 0;
5db06880
NC
2211 s = SvPVX(PL_linestr) + startoff;
2212 }
2213#endif
89bfa8cd 2214 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2215 start_force(PL_curforce);
9ded7720 2216 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2217 force_next(WORD);
89bfa8cd 2218
e759cc13 2219 return s;
89bfa8cd
PP
2220}
2221
ffb4593c 2222/*
91152fc1
DG
2223 * S_force_strict_version
2224 * Forces the next token to be a version number using strict syntax rules.
2225 */
2226
2227STATIC char *
2228S_force_strict_version(pTHX_ char *s)
2229{
2230 dVAR;
2231 OP *version = NULL;
2232#ifdef PERL_MAD
2233 I32 startoff = s - SvPVX(PL_linestr);
2234#endif
2235 const char *errstr = NULL;
2236
2237 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2238
2239 while (isSPACE(*s)) /* leading whitespace */
2240 s++;
2241
2242 if (is_STRICT_VERSION(s,&errstr)) {
2243 SV *ver = newSV(0);
2244 s = (char *)scan_version(s, ver, 0);
2245 version = newSVOP(OP_CONST, 0, ver);
2246 }
4e4da3ac
Z
2247 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2248 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2249 {
91152fc1
DG
2250 PL_bufptr = s;
2251 if (errstr)
2252 yyerror(errstr); /* version required */
2253 return s;
2254 }
2255
2256#ifdef PERL_MAD
2257 if (PL_madskills && !version) {
2258 sv_free(PL_nextwhite); /* let next token collect whitespace */
2259 PL_nextwhite = 0;
2260 s = SvPVX(PL_linestr) + startoff;
2261 }
2262#endif
2263 /* NOTE: The parser sees the package name and the VERSION swapped */
2264 start_force(PL_curforce);
2265 NEXTVAL_NEXTTOKE.opval = version;
2266 force_next(WORD);
2267
2268 return s;
2269}
2270
2271/*
ffb4593c
NT
2272 * S_tokeq
2273 * Tokenize a quoted string passed in as an SV. It finds the next
2274 * chunk, up to end of string or a backslash. It may make a new
2275 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2276 * turns \\ into \.
2277 */
2278
76e3520e 2279STATIC SV *
cea2e8a9 2280S_tokeq(pTHX_ SV *sv)
79072805 2281{
97aff369 2282 dVAR;
79072805
LW
2283 register char *s;
2284 register char *send;
2285 register char *d;
b3ac6de7
IZ
2286 STRLEN len = 0;
2287 SV *pv = sv;
79072805 2288
7918f24d
NC
2289 PERL_ARGS_ASSERT_TOKEQ;
2290
79072805 2291 if (!SvLEN(sv))
b3ac6de7 2292 goto finish;
79072805 2293
a0d0e21e 2294 s = SvPV_force(sv, len);
21a311ee 2295 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2296 goto finish;
463ee0b2 2297 send = s + len;
dcb21ed6
NC
2298 /* This is relying on the SV being "well formed" with a trailing '\0' */
2299 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2300 s++;
2301 if (s == send)
b3ac6de7 2302 goto finish;
79072805 2303 d = s;
be4731d2 2304 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2305 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2306 }
79072805
LW
2307 while (s < send) {
2308 if (*s == '\\') {
a0d0e21e 2309 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2310 s++; /* all that, just for this */
2311 }
2312 *d++ = *s++;
2313 }
2314 *d = '\0';
95a20fc0 2315 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2316 finish:
3280af22 2317 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2318 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2319 return sv;
2320}
2321
ffb4593c
NT
2322/*
2323 * Now come three functions related to double-quote context,
2324 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2325 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2326 * interact with PL_lex_state, and create fake ( ... ) argument lists
2327 * to handle functions and concatenation.
2328 * They assume that whoever calls them will be setting up a fake
2329 * join call, because each subthing puts a ',' after it. This lets
2330 * "lower \luPpEr"
2331 * become
2332 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2333 *
2334 * (I'm not sure whether the spurious commas at the end of lcfirst's
2335 * arguments and join's arguments are created or not).
2336 */
2337
2338/*
2339 * S_sublex_start
6154021b 2340 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2341 *
2342 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2343 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2344 *
2345 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2346 *
2347 * Everything else becomes a FUNC.
2348 *
2349 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2350 * had an OP_CONST or OP_READLINE). This just sets us up for a
2351 * call to S_sublex_push().
2352 */
2353
76e3520e 2354STATIC I32
cea2e8a9 2355S_sublex_start(pTHX)
79072805 2356{
97aff369 2357 dVAR;
6154021b 2358 register const I32 op_type = pl_yylval.ival;
79072805
LW
2359
2360 if (op_type == OP_NULL) {
6154021b 2361 pl_yylval.opval = PL_lex_op;
5f66b61c 2362 PL_lex_op = NULL;
79072805
LW
2363 return THING;
2364 }
2365 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2366 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2367
2368 if (SvTYPE(sv) == SVt_PVIV) {
2369 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2370 STRLEN len;
96a5add6 2371 const char * const p = SvPV_const(sv, len);
740cce10 2372 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2373 SvREFCNT_dec(sv);
2374 sv = nsv;
4e553d73 2375 }
6154021b 2376 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2377 PL_lex_stuff = NULL;
6f33ba73
RGS
2378 /* Allow <FH> // "foo" */
2379 if (op_type == OP_READLINE)
2380 PL_expect = XTERMORDORDOR;
79072805
LW
2381 return THING;
2382 }
e3f73d4e
RGS
2383 else if (op_type == OP_BACKTICK && PL_lex_op) {
2384 /* readpipe() vas overriden */
2385 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2386 pl_yylval.opval = PL_lex_op;
9b201d7d 2387 PL_lex_op = NULL;
e3f73d4e
RGS
2388 PL_lex_stuff = NULL;
2389 return THING;
2390 }
79072805 2391
3280af22 2392 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2393 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2394 PL_sublex_info.sub_op = PL_lex_op;
2395 PL_lex_state = LEX_INTERPPUSH;
55497cff 2396
3280af22
NIS
2397 PL_expect = XTERM;
2398 if (PL_lex_op) {
6154021b 2399 pl_yylval.opval = PL_lex_op;
5f66b61c 2400 PL_lex_op = NULL;
55497cff
PP
2401 return PMFUNC;
2402 }
2403 else
2404 return FUNC;
2405}
2406
ffb4593c
NT
2407/*
2408 * S_sublex_push
2409 * Create a new scope to save the lexing state. The scope will be
2410 * ended in S_sublex_done. Returns a '(', starting the function arguments
2411 * to the uc, lc, etc. found before.
2412 * Sets PL_lex_state to LEX_INTERPCONCAT.
2413 */
2414
76e3520e 2415STATIC I32
cea2e8a9 2416S_sublex_push(pTHX)
55497cff 2417{
27da23d5 2418 dVAR;
f46d017c 2419 ENTER;
55497cff 2420
3280af22 2421 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2422 SAVEBOOL(PL_lex_dojoin);
3280af22 2423 SAVEI32(PL_lex_brackets);
78cdf107
Z
2424 SAVEI32(PL_lex_allbrackets);
2425 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2426 SAVEI32(PL_lex_casemods);
2427 SAVEI32(PL_lex_starts);
651b5b28 2428 SAVEI8(PL_lex_state);
7766f137 2429 SAVEVPTR(PL_lex_inpat);
98246f1e 2430 SAVEI16(PL_lex_inwhat);
57843af0 2431 SAVECOPLINE(PL_curcop);
3280af22 2432 SAVEPPTR(PL_bufptr);
8452ff4b 2433 SAVEPPTR(PL_bufend);
3280af22
NIS
2434 SAVEPPTR(PL_oldbufptr);
2435 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2436 SAVEPPTR(PL_last_lop);
2437 SAVEPPTR(PL_last_uni);
3280af22
NIS
2438 SAVEPPTR(PL_linestart);
2439 SAVESPTR(PL_linestr);
8edd5f42
RGS
2440 SAVEGENERICPV(PL_lex_brackstack);
2441 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2442
2443 PL_linestr = PL_lex_stuff;
a0714e2c 2444 PL_lex_stuff = NULL;
3280af22 2445
9cbb5ea2
GS
2446 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2447 = SvPVX(PL_linestr);
3280af22 2448 PL_bufend += SvCUR(PL_linestr);
bd61b366 2449 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2450 SAVEFREESV(PL_linestr);
2451
2452 PL_lex_dojoin = FALSE;
2453 PL_lex_brackets = 0;
78cdf107
Z
2454 PL_lex_allbrackets = 0;
2455 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2456 Newx(PL_lex_brackstack, 120, char);
2457 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2458 PL_lex_casemods = 0;
2459 *PL_lex_casestack = '\0';
2460 PL_lex_starts = 0;
2461 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2462 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2463
2464 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2465 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2466 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2467 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2468 else
5f66b61c 2469 PL_lex_inpat = NULL;
79072805 2470
55497cff 2471 return '(';
79072805
LW
2472}
2473
ffb4593c
NT
2474/*
2475 * S_sublex_done
2476 * Restores lexer state after a S_sublex_push.
2477 */
2478
76e3520e 2479STATIC I32
cea2e8a9 2480S_sublex_done(pTHX)
79072805 2481{
27da23d5 2482 dVAR;
3280af22 2483 if (!PL_lex_starts++) {
396482e1 2484 SV * const sv = newSVpvs("");
9aa983d2
JH
2485 if (SvUTF8(PL_linestr))
2486 SvUTF8_on(sv);
3280af22 2487 PL_expect = XOPERATOR;
6154021b 2488 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2489 return THING;
2490 }
2491
3280af22
NIS
2492 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2493 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2494 return yylex();
79072805
LW
2495 }
2496
ffb4593c 2497 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2498 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2499 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2500 PL_linestr = PL_lex_repl;
2501 PL_lex_inpat = 0;
2502 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2503 PL_bufend += SvCUR(PL_linestr);
bd61b366 2504 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2505 SAVEFREESV(PL_linestr);
2506 PL_lex_dojoin = FALSE;
2507 PL_lex_brackets = 0;
78cdf107
Z
2508 PL_lex_allbrackets = 0;
2509 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2510 PL_lex_casemods = 0;
2511 *PL_lex_casestack = '\0';
2512 PL_lex_starts = 0;
25da4f38 2513 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2514 PL_lex_state = LEX_INTERPNORMAL;
2515 PL_lex_starts++;
e9fa98b2
HS
2516 /* we don't clear PL_lex_repl here, so that we can check later
2517 whether this is an evalled subst; that means we rely on the
2518 logic to ensure sublex_done() is called again only via the
2519 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2520 }
e9fa98b2 2521 else {
3280af22 2522 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2523 PL_lex_repl = NULL;
e9fa98b2 2524 }
79072805 2525 return ',';
ffed7fef
LW
2526 }
2527 else {
5db06880
NC
2528#ifdef PERL_MAD
2529 if (PL_madskills) {
cd81e915
NC
2530 if (PL_thiswhite) {
2531 if (!PL_endwhite)
6b29d1f5 2532 PL_endwhite = newSVpvs("");
cd81e915
NC
2533 sv_catsv(PL_endwhite, PL_thiswhite);
2534 PL_thiswhite = 0;
2535 }
2536 if (PL_thistoken)
76f68e9b 2537 sv_setpvs(PL_thistoken,"");
5db06880 2538 else
cd81e915 2539 PL_realtokenstart = -1;
5db06880
NC
2540 }
2541#endif
f46d017c 2542 LEAVE;
3280af22
NIS
2543 PL_bufend = SvPVX(PL_linestr);
2544 PL_bufend += SvCUR(PL_linestr);
2545 PL_expect = XOPERATOR;
09bef843 2546 PL_sublex_info.sub_inwhat = 0;
79072805 2547 return ')';
ffed7fef
LW
2548 }
2549}
2550
02aa26ce
NT
2551/*
2552 scan_const
2553
2554 Extracts a pattern, double-quoted string, or transliteration. This
2555 is terrifying code.
2556
94def140 2557 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2558 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2559 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2560
94def140
ST
2561 Returns a pointer to the character scanned up to. If this is
2562 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2563 successfully parsed), will leave an OP for the substring scanned
6154021b 2564 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2565 by looking at the next characters herself.
2566
02aa26ce
NT
2567 In patterns:
2568 backslashes:
ff3f963a 2569 constants: \N{NAME} only
02aa26ce
NT
2570 case and quoting: \U \Q \E
2571 stops on @ and $, but not for $ as tail anchor
2572
2573 In transliterations:
2574 characters are VERY literal, except for - not at the start or end
94def140
ST
2575 of the string, which indicates a range. If the range is in bytes,
2576 scan_const expands the range to the full set of intermediate
2577 characters. If the range is in utf8, the hyphen is replaced with
2578 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2579
2580 In double-quoted strings:
2581 backslashes:
2582 double-quoted style: \r and \n
ff3f963a 2583 constants: \x31, etc.
94def140 2584 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2585 case and quoting: \U \Q \E
2586 stops on @ and $
2587
2588 scan_const does *not* construct ops to handle interpolated strings.
2589 It stops processing as soon as it finds an embedded $ or @ variable
2590 and leaves it to the caller to work out what's going on.
2591
94def140
ST
2592 embedded arrays (whether in pattern or not) could be:
2593 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2594
2595 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2596
2597 $ in pattern could be $foo or could be tail anchor. Assumption:
2598 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2599 followed by one of "()| \r\n\t"
02aa26ce
NT
2600
2601 \1 (backreferences) are turned into $1
2602
2603 The structure of the code is
2604 while (there's a character to process) {
94def140
ST
2605 handle transliteration ranges
2606 skip regexp comments /(?#comment)/ and codes /(?{code})/
2607 skip #-initiated comments in //x patterns
2608 check for embedded arrays
02aa26ce
NT
2609 check for embedded scalars
2610 if (backslash) {
94def140 2611 deprecate \1 in substitution replacements
02aa26ce
NT
2612 handle string-changing backslashes \l \U \Q \E, etc.
2613 switch (what was escaped) {
94def140 2614 handle \- in a transliteration (becomes a literal -)
ff3f963a 2615 if a pattern and not \N{, go treat as regular character
94def140
ST
2616 handle \132 (octal characters)
2617 handle \x15 and \x{1234} (hex characters)
ff3f963a 2618 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2619 handle \cV (control characters)
2620 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2621 } (end switch)
77a135fe 2622 continue
02aa26ce 2623 } (end if backslash)
77a135fe 2624 handle regular character
02aa26ce 2625 } (end while character to read)
4e553d73 2626
02aa26ce
NT
2627*/
2628
76e3520e 2629STATIC char *
cea2e8a9 2630S_scan_const(pTHX_ char *start)
79072805 2631{
97aff369 2632 dVAR;
3280af22 2633 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2634 SV *sv = newSV(send - start); /* sv for the constant. See
2635 note below on sizing. */
02aa26ce
NT
2636 register char *s = start; /* start of the constant */
2637 register char *d = SvPVX(sv); /* destination for copies */
2638 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2639 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2640 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2641 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2642 to be UTF8? But, this can
2643 show as true when the source
2644 isn't utf8, as for example
2645 when it is entirely composed
2646 of hex constants */
2647
2648 /* Note on sizing: The scanned constant is placed into sv, which is
2649 * initialized by newSV() assuming one byte of output for every byte of
2650 * input. This routine expects newSV() to allocate an extra byte for a
2651 * trailing NUL, which this routine will append if it gets to the end of
2652 * the input. There may be more bytes of input than output (eg., \N{LATIN
2653 * CAPITAL LETTER A}), or more output than input if the constant ends up
2654 * recoded to utf8, but each time a construct is found that might increase
2655 * the needed size, SvGROW() is called. Its size parameter each time is
2656 * based on the best guess estimate at the time, namely the length used so
2657 * far, plus the length the current construct will occupy, plus room for
2658 * the trailing NUL, plus one byte for every input byte still unscanned */
2659
012bcf8d 2660 UV uv;
4c3a8340
ST
2661#ifdef EBCDIC
2662 UV literal_endpoint = 0;
e294cc5d 2663 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2664#endif
012bcf8d 2665
7918f24d
NC
2666 PERL_ARGS_ASSERT_SCAN_CONST;
2667
bb16bae8 2668 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2669 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2670 /* If we are doing a trans and we know we want UTF8 set expectation */
2671 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2672 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2673 }
2674
2675
79072805 2676 while (s < send || dorange) {
ff3f963a 2677
02aa26ce 2678 /* get transliterations out of the way (they're most literal) */
3280af22 2679 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2680 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2681 if (dorange) {
1ba5c669
JH
2682 I32 i; /* current expanded character */
2683 I32 min; /* first character in range */
2684 I32 max; /* last character in range */
02aa26ce 2685
e294cc5d
JH
2686#ifdef EBCDIC
2687 UV uvmax = 0;
2688#endif
2689
2690 if (has_utf8
2691#ifdef EBCDIC
2692 && !native_range
2693#endif
2694 ) {
9d4ba2ae 2695 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2696 char *e = d++;
2697 while (e-- > c)
2698 *(e + 1) = *e;
25716404 2699 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2700 /* mark the range as done, and continue */
2701 dorange = FALSE;
2702 didrange = TRUE;
2703 continue;
2704 }
2b9d42f0 2705
95a20fc0 2706 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2707#ifdef EBCDIC
2708 SvGROW(sv,
2709 SvLEN(sv) + (has_utf8 ?
2710 (512 - UTF_CONTINUATION_MARK +
2711 UNISKIP(0x100))
2712 : 256));
2713 /* How many two-byte within 0..255: 128 in UTF-8,
2714 * 96 in UTF-8-mod. */
2715#else
9cbb5ea2 2716 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2717#endif
9cbb5ea2 2718 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2719#ifdef EBCDIC
2720 if (has_utf8) {
2721 int j;
2722 for (j = 0; j <= 1; j++) {
2723 char * const c = (char*)utf8_hop((U8*)d, -1);
2724 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2725 if (j)
2726 min = (U8)uv;
2727 else if (uv < 256)
2728 max = (U8)uv;
2729 else {
2730 max = (U8)0xff; /* only to \xff */
2731 uvmax = uv; /* \x{100} to uvmax */
2732 }
2733 d = c; /* eat endpoint chars */
2734 }
2735 }
2736 else {
2737#endif
2738 d -= 2; /* eat the first char and the - */
2739 min = (U8)*d; /* first char in range */
2740 max = (U8)d[1]; /* last char in range */
2741#ifdef EBCDIC
2742 }
2743#endif
8ada0baa 2744
c2e66d9e 2745 if (min > max) {
01ec43d0 2746 Perl_croak(aTHX_
d1573ac7 2747 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2748 (char)min, (char)max);
c2e66d9e
GS
2749 }
2750
c7f1f016 2751#ifdef EBCDIC
4c3a8340
ST
2752 if (literal_endpoint == 2 &&
2753 ((isLOWER(min) && isLOWER(max)) ||
2754 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2755 if (isLOWER(min)) {
2756 for (i = min; i <= max; i++)
2757 if (isLOWER(i))
db42d148 2758 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2759 } else {
2760 for (i = min; i <= max; i++)
2761 if (isUPPER(i))
db42d148 2762 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2763 }
2764 }
2765 else
2766#endif
2767 for (i = min; i <= max; i++)
e294cc5d
JH
2768#ifdef EBCDIC
2769 if (has_utf8) {
2770 const U8 ch = (U8)NATIVE_TO_UTF(i);
2771 if (UNI_IS_INVARIANT(ch))
2772 *d++ = (U8)i;
2773 else {
2774 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2775 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2776 }
2777 }
2778 else
2779#endif
2780 *d++ = (char)i;
2781
2782#ifdef EBCDIC
2783 if (uvmax) {
2784 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2785 if (uvmax > 0x101)
2786 *d++ = (char)UTF_TO_NATIVE(0xff);
2787 if (uvmax > 0x100)
2788 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2789 }
2790#endif
02aa26ce
NT
2791
2792 /* mark the range as done, and continue */
79072805 2793 dorange = FALSE;
01ec43d0 2794 didrange = TRUE;
4c3a8340
ST
2795#ifdef EBCDIC
2796 literal_endpoint = 0;
2797#endif
79072805 2798 continue;
4e553d73 2799 }
02aa26ce
NT
2800
2801 /* range begins (ignore - as first or last char) */
79072805 2802 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2803 if (didrange) {
1fafa243 2804 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2805 }
e294cc5d
JH
2806 if (has_utf8
2807#ifdef EBCDIC
2808 && !native_range
2809#endif
2810 ) {
25716404 2811 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2812 s++;
2813 continue;
2814 }
79072805
LW
2815 dorange = TRUE;
2816 s++;
01ec43d0
GS
2817 }
2818 else {
2819 didrange = FALSE;
4c3a8340
ST
2820#ifdef EBCDIC
2821 literal_endpoint = 0;
e294cc5d 2822 native_range = TRUE;
4c3a8340 2823#endif
01ec43d0 2824 }
79072805 2825 }
02aa26ce
NT
2826
2827 /* if we get here, we're not doing a transliteration */
2828
0f5d15d6
IZ
2829 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2830 except for the last char, which will be done separately. */
3280af22 2831 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2832 if (s[2] == '#') {
e994fd66 2833 while (s+1 < send && *s != ')')
db42d148 2834 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2835 }
2836 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2837 || (s[2] == '?' && s[3] == '{'))
155aba94 2838 {
cc6b7395 2839 I32 count = 1;
0f5d15d6 2840 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2841 char c;
2842
d9f97599
GS
2843 while (count && (c = *regparse)) {
2844 if (c == '\\' && regparse[1])
2845 regparse++;
4e553d73 2846 else if (c == '{')
cc6b7395 2847 count++;
4e553d73 2848 else if (c == '}')
cc6b7395 2849 count--;
d9f97599 2850 regparse++;
cc6b7395 2851 }
e994fd66 2852 if (*regparse != ')')
5bdf89e7 2853 regparse--; /* Leave one char for continuation. */
0f5d15d6 2854 while (s < regparse)
db42d148 2855 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2856 }
748a9306 2857 }
02aa26ce
NT
2858
2859 /* likewise skip #-initiated comments in //x patterns */
3280af22 2860 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2861 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2862 while (s+1 < send && *s != '\n')
db42d148 2863 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2864 }
02aa26ce 2865
5d1d4326 2866 /* check for embedded arrays
da6eedaa 2867 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2868 */
1749ea0d
ST
2869 else if (*s == '@' && s[1]) {
2870 if (isALNUM_lazy_if(s+1,UTF))
2871 break;
2872 if (strchr(":'{$", s[1]))
2873 break;
2874 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2875 break; /* in regexp, neither @+ nor @- are interpolated */
2876 }
02aa26ce
NT
2877
2878 /* check for embedded scalars. only stop if we're sure it's a
2879 variable.
2880 */
79072805 2881 else if (*s == '$') {
3280af22 2882 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2883 break;
77772344 2884 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2885 if (s[1] == '\\') {
2886 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2887 "Possible unintended interpolation of $\\ in regex");
77772344 2888 }
79072805 2889 break; /* in regexp, $ might be tail anchor */
77772344 2890 }
79072805 2891 }
02aa26ce 2892
2b9d42f0
NIS
2893 /* End of else if chain - OP_TRANS rejoin rest */
2894
02aa26ce 2895 /* backslashes */
79072805 2896 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2897 char* e; /* Can be used for ending '}', etc. */
2898
79072805 2899 s++;
02aa26ce 2900
7d0fc23c
KW
2901 /* warn on \1 - \9 in substitution replacements, but note that \11
2902 * is an octal; and \19 is \1 followed by '9' */
3280af22 2903 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2904 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2905 {
a2a5de95 2906 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2907 *--s = '$';
2908 break;
2909 }
02aa26ce
NT
2910
2911 /* string-change backslash escapes */
3280af22 2912 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2913 --s;
2914 break;
2915 }
ff3f963a
KW
2916 /* In a pattern, process \N, but skip any other backslash escapes.
2917 * This is because we don't want to translate an escape sequence
2918 * into a meta symbol and have the regex compiler use the meta
2919 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2920 * in spite of this, we do have to process \N here while the proper
2921 * charnames handler is in scope. See bugs #56444 and #62056.
2922 * There is a complication because \N in a pattern may also stand
2923 * for 'match a non-nl', and not mean a charname, in which case its
2924 * processing should be deferred to the regex compiler. To be a
2925 * charname it must be followed immediately by a '{', and not look
2926 * like \N followed by a curly quantifier, i.e., not something like
2927 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2928 * quantifier */
2929 else if (PL_lex_inpat
2930 && (*s != 'N'
2931 || s[1] != '{'
2932 || regcurly(s + 1)))
2933 {
cc74c5bd
ST
2934 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2935 goto default_action;
2936 }
02aa26ce 2937
79072805 2938 switch (*s) {
02aa26ce
NT
2939
2940 /* quoted - in transliterations */
79072805 2941 case '-':
3280af22 2942 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2943 *d++ = *s++;
2944 continue;
2945 }
2946 /* FALL THROUGH */
2947 default:
11b8faa4 2948 {
a2a5de95
NC
2949 if ((isALPHA(*s) || isDIGIT(*s)))
2950 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2951 "Unrecognized escape \\%c passed through",
2952 *s);
11b8faa4 2953 /* default action is to copy the quoted character */
f9a63242 2954 goto default_action;
11b8faa4 2955 }
02aa26ce 2956
632403cc 2957 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2958 case '0': case '1': case '2': case '3':
2959 case '4': case '5': case '6': case '7':
ba210ebe 2960 {
53305cf1
NC
2961 I32 flags = 0;
2962 STRLEN len = 3;
77a135fe 2963 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2964 s += len;
2965 }
012bcf8d 2966 goto NUM_ESCAPE_INSERT;
02aa26ce 2967
f0a2b745
KW
2968 /* eg. \o{24} indicates the octal constant \024 */
2969 case 'o':
2970 {
2971 STRLEN len;
454155d9 2972 const char* error;
f0a2b745 2973
454155d9 2974 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2975 s += len;
454155d9 2976 if (! valid) {
f0a2b745
KW
2977 yyerror(error);
2978 continue;
2979 }
2980 goto NUM_ESCAPE_INSERT;
2981 }
2982
77a135fe 2983 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2984 case 'x':
a0ed51b3
LW
2985 ++s;
2986 if (*s == '{') {
9d4ba2ae 2987 char* const e = strchr(s, '}');
a4c04bdc
NC
2988 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2989 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2990 STRLEN len;
355860ce 2991
53305cf1 2992 ++s;
adaeee49 2993 if (!e) {
a0ed51b3 2994 yyerror("Missing right brace on \\x{}");
355860ce 2995 continue;
ba210ebe 2996 }
53305cf1 2997 len = e - s;
77a135fe 2998 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2999 s = e + 1;
a0ed51b3
LW
3000 }
3001 else {
ba210ebe 3002 {
53305cf1 3003 STRLEN len = 2;
a4c04bdc 3004 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3005 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3006 s += len;
3007 }
012bcf8d
GS
3008 }
3009
3010 NUM_ESCAPE_INSERT:
ff3f963a
KW
3011 /* Insert oct or hex escaped character. There will always be
3012 * enough room in sv since such escapes will be longer than any
3013 * UTF-8 sequence they can end up as, except if they force us
3014 * to recode the rest of the string into utf8 */
ba7cea30 3015
77a135fe 3016 /* Here uv is the ordinal of the next character being added in
ff3f963a 3017 * unicode (converted from native). */
77a135fe 3018 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3019 if (!has_utf8 && uv > 255) {
77a135fe
KW
3020 /* Might need to recode whatever we have accumulated so
3021 * far if it contains any chars variant in utf8 or
3022 * utf-ebcdic. */
3023
3024 SvCUR_set(sv, d - SvPVX_const(sv));
3025 SvPOK_on(sv);
3026 *d = '\0';
77a135fe 3027 /* See Note on sizing above. */
7bf79863
KW
3028 sv_utf8_upgrade_flags_grow(sv,
3029 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3030 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3031 d = SvPVX(sv) + SvCUR(sv);
3032 has_utf8 = TRUE;
012bcf8d
GS
3033 }
3034
77a135fe
KW
3035 if (has_utf8) {
3036 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3037 if (PL_lex_inwhat == OP_TRANS &&
3038 PL_sublex_info.sub_op) {
3039 PL_sublex_info.sub_op->op_private |=
3040 (PL_lex_repl ? OPpTRANS_FROM_UTF
3041 : OPpTRANS_TO_UTF);
f9a63242 3042 }
e294cc5d
JH
3043#ifdef EBCDIC
3044 if (uv > 255 && !dorange)
3045 native_range = FALSE;
3046#endif
012bcf8d 3047 }
a0ed51b3 3048 else {
012bcf8d 3049 *d++ = (char)uv;
a0ed51b3 3050 }
012bcf8d
GS
3051 }
3052 else {
c4d5f83a 3053 *d++ = (char) uv;
a0ed51b3 3054 }
79072805 3055 continue;
02aa26ce 3056
4a2d328f 3057 case 'N':
ff3f963a
KW
3058 /* In a non-pattern \N must be a named character, like \N{LATIN
3059 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3060 * mean to match a non-newline. For non-patterns, named
3061 * characters are converted to their string equivalents. In
3062 * patterns, named characters are not converted to their
3063 * ultimate forms for the same reasons that other escapes
3064 * aren't. Instead, they are converted to the \N{U+...} form
3065 * to get the value from the charnames that is in effect right
3066 * now, while preserving the fact that it was a named character
3067 * so that the regex compiler knows this */
3068
3069 /* This section of code doesn't generally use the
3070 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3071 * a close examination of this macro and determined it is a
3072 * no-op except on utfebcdic variant characters. Every
3073 * character generated by this that would normally need to be
3074 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3075 * needed, and would complicate use of copy(). XXX There are
3076 * other parts of this file where the macro is used
3077 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3078
3079 /* The structure of this section of code (besides checking for
3080 * errors and upgrading to utf8) is:
3081 * Further disambiguate between the two meanings of \N, and if
3082 * not a charname, go process it elsewhere
0a96133f
KW
3083 * If of form \N{U+...}, pass it through if a pattern;
3084 * otherwise convert to utf8
3085 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3086 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3087
3088 /* Here, s points to the 'N'; the test below is guaranteed to
3089 * succeed if we are being called on a pattern as we already
3090 * know from a test above that the next character is a '{'.
3091 * On a non-pattern \N must mean 'named sequence, which
3092 * requires braces */
3093 s++;
3094 if (*s != '{') {
3095 yyerror("Missing braces on \\N{}");
3096 continue;
3097 }
3098 s++;
3099
0a96133f 3100 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3101 if (! (e = strchr(s, '}'))) {
3102 if (! PL_lex_inpat) {
5777a3f7 3103 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3104 } else {
3105 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3106 }
0a96133f 3107 continue;
ff3f963a 3108 }
cddc7ef4 3109
ff3f963a 3110 /* Here it looks like a named character */
cddc7ef4 3111
ff3f963a
KW
3112 if (PL_lex_inpat) {
3113
3114 /* XXX This block is temporary code. \N{} implies that the
3115 * pattern is to have Unicode semantics, and therefore
3116 * currently has to be encoded in utf8. By putting it in
3117 * utf8 now, we save a whole pass in the regular expression
3118 * compiler. Once that code is changed so Unicode
3119 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3120 * block should be removed. However, the code that parses
3121 * the output of this would have to be changed to not
3122 * necessarily expect utf8 */
ff3f963a 3123 if (!has_utf8) {
77a135fe 3124 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3125 SvPOK_on(sv);
e4f3eed8 3126 *d = '\0';
77a135fe 3127 /* See Note on sizing above. */
7bf79863 3128 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3129 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3130 /* 5 = '\N{' + cur char + NUL */
3131 (STRLEN)(send - s) + 5);
f08d6ad9 3132 d = SvPVX(sv) + SvCUR(sv);
89491803 3133 has_utf8 = TRUE;
ff3f963a
KW
3134 }
3135 }
423cee85 3136
ff3f963a
KW
3137 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3138 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3139 | PERL_SCAN_DISALLOW_PREFIX;
3140 STRLEN len;
3141
3142 /* For \N{U+...}, the '...' is a unicode value even on
3143 * EBCDIC machines */
3144 s += 2; /* Skip to next char after the 'U+' */
3145 len = e - s;
3146 uv = grok_hex(s, &len, &flags, NULL);
3147 if (len == 0 || len != (STRLEN)(e - s)) {
3148 yyerror("Invalid hexadecimal number in \\N{U+...}");
3149 s = e + 1;
3150 continue;
3151 }
3152
3153 if (PL_lex_inpat) {
3154
e2a7e165
KW
3155 /* On non-EBCDIC platforms, pass through to the regex
3156 * compiler unchanged. The reason we evaluated the
3157 * number above is to make sure there wasn't a syntax
3158 * error. But on EBCDIC we convert to native so
3159 * downstream code can continue to assume it's native
3160 */
ff3f963a 3161 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3162#ifdef EBCDIC
3163 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3164 and the \0 */
3165 "\\N{U+%X}",
3166 (unsigned int) UNI_TO_NATIVE(uv));
3167#else
ff3f963a
KW
3168 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3169 d += e - s + 1;
e2a7e165 3170#endif
ff3f963a
KW
3171 }
3172 else { /* Not a pattern: convert the hex to string */
3173
3174 /* If destination is not in utf8, unconditionally
3175 * recode it to be so. This is because \N{} implies
3176 * Unicode semantics, and scalars have to be in utf8
3177 * to guarantee those semantics */
3178 if (! has_utf8) {
3179 SvCUR_set(sv, d - SvPVX_const(sv));
3180 SvPOK_on(sv);
3181 *d = '\0';
3182 /* See Note on sizing above. */
3183 sv_utf8_upgrade_flags_grow(
3184 sv,
3185 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3186 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3187 d = SvPVX(sv) + SvCUR(sv);
3188 has_utf8 = TRUE;
3189 }
3190
3191 /* Add the string to the output */
3192 if (UNI_IS_INVARIANT(uv)) {
3193 *d++ = (char) uv;
3194 }
3195 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3196 }
3197 }
3198 else { /* Here is \N{NAME} but not \N{U+...}. */
3199
3200 SV *res; /* result from charnames */
3201 const char *str; /* the string in 'res' */
3202 STRLEN len; /* its length */
3203
3204 /* Get the value for NAME */
3205 res = newSVpvn(s, e - s);
3206 res = new_constant( NULL, 0, "charnames",
3207 /* includes all of: \N{...} */
3208 res, NULL, s - 3, e - s + 4 );
3209
3210 /* Most likely res will be in utf8 already since the
3211 * standard charnames uses pack U, but a custom translator
3212 * can leave it otherwise, so make sure. XXX This can be
3213 * revisited to not have charnames use utf8 for characters
3214 * that don't need it when regexes don't have to be in utf8
3215 * for Unicode semantics. If doing so, remember EBCDIC */
3216 sv_utf8_upgrade(res);
3217 str = SvPV_const(res, len);
3218
3219 /* Don't accept malformed input */
3220 if (! is_utf8_string((U8 *) str, len)) {
3221 yyerror("Malformed UTF-8 returned by \\N");
3222 }
3223 else if (PL_lex_inpat) {
3224
3225 if (! len) { /* The name resolved to an empty string */
3226 Copy("\\N{}", d, 4, char);
3227 d += 4;
3228 }
3229 else {
3230 /* In order to not lose information for the regex
3231 * compiler, pass the result in the specially made
3232 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3233 * the code points in hex of each character
3234 * returned by charnames */
3235
3236 const char *str_end = str + len;
3237 STRLEN char_length; /* cur char's byte length */
3238 STRLEN output_length; /* and the number of bytes
3239 after this is translated
3240 into hex digits */
3241 const STRLEN off = d - SvPVX_const(sv);
3242
3243 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3244 * max('U+', '.'); and 1 for NUL */
3245 char hex_string[2 * UTF8_MAXBYTES + 5];
3246
3247 /* Get the first character of the result. */
3248 U32 uv = utf8n_to_uvuni((U8 *) str,
3249 len,
3250 &char_length,
3251 UTF8_ALLOW_ANYUV);
3252
3253 /* The call to is_utf8_string() above hopefully
3254 * guarantees that there won't be an error. But
3255 * it's easy here to make sure. The function just
3256 * above warns and returns 0 if invalid utf8, but
3257 * it can also return 0 if the input is validly a
3258 * NUL. Disambiguate */
3259 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3260 uv = UNICODE_REPLACEMENT;
3261 }
3262
3263 /* Convert first code point to hex, including the
e2a7e165
KW
3264 * boiler plate before it. For all these, we
3265 * convert to native format so that downstream code
3266 * can continue to assume the input is native */
78c35590 3267 output_length =
3353de27 3268 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3269 "\\N{U+%X",
3270 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3271
3272 /* Make sure there is enough space to hold it */
3273 d = off + SvGROW(sv, off
3274 + output_length
3275 + (STRLEN)(send - e)
3276 + 2); /* '}' + NUL */
3277 /* And output it */
3278 Copy(hex_string, d, output_length, char);
3279 d += output_length;
3280
3281 /* For each subsequent character, append dot and
3282 * its ordinal in hex */
3283 while ((str += char_length) < str_end) {
3284 const STRLEN off = d - SvPVX_const(sv);
3285 U32 uv = utf8n_to_uvuni((U8 *) str,
3286 str_end - str,
3287 &char_length,
3288 UTF8_ALLOW_ANYUV);
3289 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3290 uv = UNICODE_REPLACEMENT;
3291 }
3292
78c35590 3293 output_length =
3353de27 3294 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3295 ".%X",
3296 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3297
3298 d = off + SvGROW(sv, off
3299 + output_length
3300 + (STRLEN)(send - e)
3301 + 2); /* '}' + NUL */
3302 Copy(hex_string, d, output_length, char);
3303 d += output_length;
3304 }
3305
3306 *d++ = '}'; /* Done. Add the trailing brace */
3307 }
3308 }
3309 else { /* Here, not in a pattern. Convert the name to a
3310 * string. */
3311
3312 /* If destination is not in utf8, unconditionally
3313 * recode it to be so. This is because \N{} implies
3314 * Unicode semantics, and scalars have to be in utf8
3315 * to guarantee those semantics */
3316 if (! has_utf8) {
3317 SvCUR_set(sv, d - SvPVX_const(sv));
3318 SvPOK_on(sv);
3319 *d = '\0';
3320 /* See Note on sizing above. */
3321 sv_utf8_upgrade_flags_grow(sv,
3322 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3323 len + (STRLEN)(send - s) + 1);
3324 d = SvPVX(sv) + SvCUR(sv);
3325 has_utf8 = TRUE;
3326 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3327
3328 /* See Note on sizing above. (NOTE: SvCUR() is not
3329 * set correctly here). */
3330 const STRLEN off = d - SvPVX_const(sv);
3331 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3332 }
3333 Copy(str, d, len, char);
3334 d += len;
423cee85 3335 }
423cee85 3336 SvREFCNT_dec(res);
cb233ae3
KW
3337
3338 /* Deprecate non-approved name syntax */
3339 if (ckWARN_d(WARN_DEPRECATED)) {
3340 bool problematic = FALSE;
3341 char* i = s;
3342
3343 /* For non-ut8 input, look to see that the first
3344 * character is an alpha, then loop through the rest
3345 * checking that each is a continuation */
3346 if (! this_utf8) {
3347 if (! isALPHAU(*i)) problematic = TRUE;
3348 else for (i = s + 1; i < e; i++) {
3349 if (isCHARNAME_CONT(*i)) continue;
3350 problematic = TRUE;
3351 break;
3352 }
3353 }
3354 else {
3355 /* Similarly for utf8. For invariants can check
3356 * directly. We accept anything above the latin1
3357 * range because it is immaterial to Perl if it is
3358 * correct or not, and is expensive to check. But
3359 * it is fairly easy in the latin1 range to convert
3360 * the variants into a single character and check
3361 * those */
3362 if (UTF8_IS_INVARIANT(*i)) {
3363 if (! isALPHAU(*i)) problematic = TRUE;
3364 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3365 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3366 *(i+1)))))
3367 {
3368 problematic = TRUE;
3369 }
3370 }
3371 if (! problematic) for (i = s + UTF8SKIP(s);
3372 i < e;
3373 i+= UTF8SKIP(i))
3374 {
3375 if (UTF8_IS_INVARIANT(*i)) {
3376 if (isCHARNAME_CONT(*i)) continue;
3377 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3378 continue;
3379 } else if (isCHARNAME_CONT(
3380 UNI_TO_NATIVE(
81c14aa2 3381 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3382 {
3383 continue;
3384 }
3385 problematic = TRUE;
3386 break;
3387 }
3388 }
3389 if (problematic) {
6e1bad6c
KW
3390 /* The e-i passed to the final %.*s makes sure that
3391 * should the trailing NUL be missing that this
3392 * print won't run off the end of the string */
cb233ae3 3393 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3394 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3395 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3396 }
3397 }
3398 } /* End \N{NAME} */
ff3f963a
KW
3399#ifdef EBCDIC
3400 if (!dorange)
3401 native_range = FALSE; /* \N{} is defined to be Unicode */
3402#endif
3403 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3404 continue;
3405
02aa26ce 3406 /* \c is a control character */
79072805
LW
3407 case 'c':
3408 s++;
961ce445 3409 if (s < send) {
17a3df4c 3410 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3411 }
961ce445
RGS
3412 else {
3413 yyerror("Missing control char name in \\c");
3414 }
79072805 3415 continue;
02aa26ce
NT
3416
3417 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3418 case 'b':
db42d148 3419 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3420 break;
3421 case 'n':
db42d148 3422 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3423 break;
3424 case 'r':
db42d148 3425 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3426 break;
3427 case 'f':
db42d148 3428 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3429 break;
3430 case 't':
db42d148 3431 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3432 break;
34a3fe2a 3433 case 'e':
db42d148 3434 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3435 break;
3436 case 'a':
db42d148 3437 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3438 break;
02aa26ce
NT
3439 } /* end switch */
3440
79072805
LW
3441 s++;
3442 continue;
02aa26ce 3443 } /* end if (backslash) */
4c3a8340
ST
3444#ifdef EBCDIC
3445 else
3446 literal_endpoint++;
3447#endif
02aa26ce 3448
f9a63242 3449 default_action:
77a135fe
KW
3450 /* If we started with encoded form, or already know we want it,
3451 then encode the next character */
3452 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3453 STRLEN len = 1;
77a135fe
KW
3454
3455
3456 /* One might think that it is wasted effort in the case of the
3457 * source being utf8 (this_utf8 == TRUE) to take the next character
3458 * in the source, convert it to an unsigned value, and then convert
3459 * it back again. But the source has not been validated here. The
3460 * routine that does the conversion checks for errors like
3461 * malformed utf8 */
3462
5f66b61c
AL
3463 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3464 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3465 if (!has_utf8) {
3466 SvCUR_set(sv, d - SvPVX_const(sv));
3467 SvPOK_on(sv);
3468 *d = '\0';
77a135fe 3469 /* See Note on sizing above. */
7bf79863
KW
3470 sv_utf8_upgrade_flags_grow(sv,
3471 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3472 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3473 d = SvPVX(sv) + SvCUR(sv);
3474 has_utf8 = TRUE;
3475 } else if (need > len) {
3476 /* encoded value larger than old, may need extra space (NOTE:
3477 * SvCUR() is not set correctly here). See Note on sizing
3478 * above. */
9d4ba2ae 3479 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3480 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3481 }
77a135fe
KW
3482 s += len;
3483
5f66b61c 3484 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3485#ifdef EBCDIC
3486 if (uv > 255 && !dorange)
3487 native_range = FALSE;
3488#endif
2b9d42f0
NIS
3489 }
3490 else {
3491 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3492 }
02aa26ce
NT
3493 } /* while loop to process each character */
3494
3495 /* terminate the string and set up the sv */
79072805 3496 *d = '\0';
95a20fc0 3497 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3498 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3499 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3500
79072805 3501 SvPOK_on(sv);
9f4817db 3502 if (PL_encoding && !has_utf8) {
d0063567
DK
3503 sv_recode_to_utf8(sv, PL_encoding);
3504 if (SvUTF8(sv))
3505 has_utf8 = TRUE;
9f4817db 3506 }
2b9d42f0 3507 if (has_utf8) {
7e2040f0 3508 SvUTF8_on(sv);
2b9d42f0 3509 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3510 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3511 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3512 }
3513 }
79072805 3514
02aa26ce 3515 /* shrink the sv if we allocated more than we used */
79072805 3516 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3517 SvPV_shrink_to_cur(sv);
79072805 3518 }
02aa26ce 3519
6154021b 3520 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3521 if (s > PL_bufptr) {
eb0d8d16
NC
3522 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3523 const char *const key = PL_lex_inpat ? "qr" : "q";
3524 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3525 const char *type;
3526 STRLEN typelen;
3527
3528 if (PL_lex_inwhat == OP_TRANS) {
3529 type = "tr";
3530 typelen = 2;
3531 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3532 type = "s";
3533 typelen = 1;
3534 } else {
3535 type = "qq";
3536 typelen = 2;
3537 }
3538
3539 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3540 type, typelen);
3541 }
6154021b 3542 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3543 } else
8990e307 3544 SvREFCNT_dec(sv);
79072805
LW
3545 return s;
3546}
3547
ffb4593c
NT
3548/* S_intuit_more
3549 * Returns TRUE if there's more to the expression (e.g., a subscript),
3550 * FALSE otherwise.
ffb4593c
NT
3551 *
3552 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3553 *
3554 * ->[ and ->{ return TRUE
3555 * { and [ outside a pattern are always subscripts, so return TRUE
3556 * if we're outside a pattern and it's not { or [, then return FALSE
3557 * if we're in a pattern and the first char is a {
3558 * {4,5} (any digits around the comma) returns FALSE
3559 * if we're in a pattern and the first char is a [
3560 * [] returns FALSE
3561 * [SOMETHING] has a funky algorithm to decide whether it's a
3562 * character class or not. It has to deal with things like
3563 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3564 * anything else returns TRUE
3565 */
3566
9cbb5ea2
GS
3567/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3568
76e3520e 3569STATIC int
cea2e8a9 3570S_intuit_more(pTHX_ register char *s)
79072805 3571{
97aff369 3572 dVAR;
7918f24d
NC
3573
3574 PERL_ARGS_ASSERT_INTUIT_MORE;
3575
3280af22 3576 if (PL_lex_brackets)
79072805
LW
3577 return TRUE;
3578 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3579 return TRUE;
3580 if (*s != '{' && *s != '[')
3581 return FALSE;
3280af22 3582 if (!PL_lex_inpat)
79072805
LW
3583 return TRUE;
3584
3585 /* In a pattern, so maybe we have {n,m}. */
3586 if (*s == '{') {
b3155d95 3587 if (regcurly(s)) {
79072805 3588 return FALSE;
b3155d95 3589 }
79072805 3590 return TRUE;
79072805
LW
3591 }
3592
3593 /* On the other hand, maybe we have a character class */
3594
3595 s++;
3596 if (*s == ']' || *s == '^')
3597 return FALSE;
3598 else {
ffb4593c 3599 /* this is terrifying, and it works */
79072805
LW
3600 int weight = 2; /* let's weigh the evidence */
3601 char seen[256];
f27ffc4a 3602 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3603 const char * const send = strchr(s,']');
3280af22 3604 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3605
3606 if (!send) /* has to be an expression */
3607 return TRUE;
3608
3609 Zero(seen,256,char);
3610 if (*s == '$')
3611 weight -= 3;
3612 else if (isDIGIT(*s)) {
3613 if (s[1] != ']') {
3614 if (isDIGIT(s[1]) && s[2] == ']')
3615 weight -= 10;
3616 }
3617 else
3618 weight -= 100;
3619 }
3620 for (; s < send; s++) {
3621 last_un_char = un_char;
3622 un_char = (unsigned char)*s;
3623 switch (*s) {
3624 case '@':
3625 case '&':
3626 case '$':
3627 weight -= seen[un_char] * 10;
7e2040f0 3628 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3629 int len;
8903cb82 3630 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3631 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3632 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3633 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3634 weight -= 100;
3635 else
3636 weight -= 10;
3637 }
3638 else if (*s == '$' && s[1] &&
93a17b20
LW
3639 strchr("[#!%*<>()-=",s[1])) {
3640 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3641 weight -= 10;
3642 else
3643 weight -= 1;
3644 }
3645 break;
3646 case '\\':
3647 un_char = 254;
3648 if (s[1]) {
93a17b20 3649 if (strchr("wds]",s[1]))
79072805 3650 weight += 100;
10edeb5d 3651 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3652 weight += 1;
93a17b20 3653 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3654 weight += 40;
3655 else if (isDIGIT(s[1])) {
3656 weight += 40;
3657 while (s[1] && isDIGIT(s[1]))
3658 s++;
3659 }
3660 }
3661 else
3662 weight += 100;
3663 break;
3664 case '-':
3665 if (s[1] == '\\')
3666 weight += 50;
93a17b20 3667 if (strchr("aA01! ",last_un_char))
79072805 3668 weight += 30;
93a17b20 3669 if (strchr("zZ79~",s[1]))
79072805 3670 weight += 30;
f27ffc4a
GS
3671 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3672 weight -= 5; /* cope with negative subscript */
79072805
LW
3673 break;
3674 default:
3792a11b
NC
3675 if (!isALNUM(last_un_char)
3676 && !(last_un_char == '$' || last_un_char == '@'
3677 || last_un_char == '&')
3678 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3679 char *d = tmpbuf;
3680 while (isALPHA(*s))
3681 *d++ = *s++;
3682 *d = '\0';
5458a98a 3683 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3684 weight -= 150;
3685 }
3686 if (un_char == last_un_char + 1)
3687 weight += 5;
3688 weight -= seen[un_char];
3689 break;
3690 }
3691 seen[un_char]++;
3692 }
3693 if (weight >= 0) /* probably a character class */
3694 return FALSE;
3695 }
3696
3697 return TRUE;
3698}
ffed7fef 3699
ffb4593c
NT
3700/*
3701 * S_intuit_method
3702 *
3703 * Does all the checking to disambiguate
3704 * foo bar
3705 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3706 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3707 *
3708 * First argument is the stuff after the first token, e.g. "bar".
3709 *
3710 * Not a method if bar is a filehandle.
3711 * Not a method if foo is a subroutine prototyped to take a filehandle.
3712 * Not a method if it's really "Foo $bar"
3713 * Method if it's "foo $bar"
3714 * Not a method if it's really "print foo $bar"
3715 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3716 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3717 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3718 * =>
3719 */
3720
76e3520e 3721STATIC int
62d55b22 3722S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3723{
97aff369 3724 dVAR;
a0d0e21e 3725 char *s = start + (*start == '$');
3280af22 3726 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3727 STRLEN len;
3728 GV* indirgv;
5db06880
NC
3729#ifdef PERL_MAD
3730 int soff;
3731#endif
a0d0e21e 3732
7918f24d
NC
3733 PERL_ARGS_ASSERT_INTUIT_METHOD;
3734
a0d0e21e 3735 if (gv) {
62d55b22 3736 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3737 return 0;
62d55b22
NC
3738 if (cv) {
3739 if (SvPOK(cv)) {
8fa6a409 3740 const char *proto = CvPROTO(cv);
62d55b22
NC
3741 if (proto) {
3742 if (*proto == ';')
3743 proto++;
3744 if (*proto == '*')
3745 return 0;
3746 }
b6c543e3
IZ
3747 }
3748 } else
c35e046a 3749 gv = NULL;
a0d0e21e 3750 }
8903cb82 3751 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3752 /* start is the beginning of the possible filehandle/object,
3753 * and s is the end of it
3754 * tmpbuf is a copy of it
3755 */
3756
a0d0e21e 3757 if (*start == '$') {
3ef1310e
RGS
3758 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3759 isUPPER(*PL_tokenbuf))
a0d0e21e 3760 return 0;
5db06880
NC
3761#ifdef PERL_MAD
3762 len = start - SvPVX(PL_linestr);
3763#endif
29595ff2 3764 s = PEEKSPACE(s);
f0092767 3765#ifdef PERL_MAD
5db06880
NC
3766 start = SvPVX(PL_linestr) + len;
3767#endif
3280af22
NIS
3768 PL_bufptr = start;
3769