This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
eval"" should reset %^H in more cases
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a 153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
376fcdbf 275#define UNI2(f,x) { \
6154021b 276 pl_yylval.ival = f; \
376fcdbf
AL
277 PL_expect = x; \
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
6f33ba73
RGS
286#define UNI(f) UNI2(f,XTERM)
287#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 288
376fcdbf 289#define UNIBRACK(f) { \
6154021b 290 pl_yylval.ival = f; \
376fcdbf
AL
291 PL_bufptr = s; \
292 PL_last_uni = PL_oldbufptr; \
293 if (*s == '(') \
294 return REPORT( (int)FUNC1 ); \
29595ff2 295 s = PEEKSPACE(s); \
376fcdbf
AL
296 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
297 }
79072805 298
9f68db38 299/* grandfather return to old style */
78cdf107
Z
300#define OLDLOP(f) \
301 do { \
302 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
303 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
304 pl_yylval.ival = (f); \
305 PL_expect = XTERM; \
306 PL_bufptr = s; \
307 return (int)LSTOP; \
308 } while(0)
79072805 309
8fa7f367
JH
310#ifdef DEBUGGING
311
6154021b 312/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
313enum token_type {
314 TOKENTYPE_NONE,
315 TOKENTYPE_IVAL,
6154021b 316 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
317 TOKENTYPE_PVAL,
318 TOKENTYPE_OPVAL,
319 TOKENTYPE_GVVAL
320};
321
6d4a66ac
NC
322static struct debug_tokens {
323 const int token;
324 enum token_type type;
325 const char *name;
326} const debug_tokens[] =
9041c2e3 327{
bbf60fe6
DM
328 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
329 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
330 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
331 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
332 { ARROW, TOKENTYPE_NONE, "ARROW" },
333 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
334 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
335 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
336 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
337 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 338 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
339 { DO, TOKENTYPE_NONE, "DO" },
340 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
341 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
342 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
343 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
344 { ELSE, TOKENTYPE_NONE, "ELSE" },
345 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
346 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
347 { FOR, TOKENTYPE_IVAL, "FOR" },
348 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
349 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
350 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 351 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
352 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
353 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
354 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 355 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
356 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
357 { IF, TOKENTYPE_IVAL, "IF" },
358 { LABEL, TOKENTYPE_PVAL, "LABEL" },
359 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
360 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
361 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
362 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
363 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
364 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
365 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
366 { MY, TOKENTYPE_IVAL, "MY" },
367 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
368 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
369 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
370 { OROP, TOKENTYPE_IVAL, "OROP" },
371 { OROR, TOKENTYPE_NONE, "OROR" },
372 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
373 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
374 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
375 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
376 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
377 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
378 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
379 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
380 { PREINC, TOKENTYPE_NONE, "PREINC" },
381 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
382 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
383 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
384 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
385 { SUB, TOKENTYPE_NONE, "SUB" },
386 { THING, TOKENTYPE_OPVAL, "THING" },
387 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
388 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
389 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
390 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
391 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
392 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 393 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
394 { WHILE, TOKENTYPE_IVAL, "WHILE" },
395 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 396 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 397 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
398};
399
6154021b 400/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 401
bbf60fe6 402STATIC int
704d4215 403S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 404{
97aff369 405 dVAR;
7918f24d
NC
406
407 PERL_ARGS_ASSERT_TOKEREPORT;
408
bbf60fe6 409 if (DEBUG_T_TEST) {
bd61b366 410 const char *name = NULL;
bbf60fe6 411 enum token_type type = TOKENTYPE_NONE;
f54cb97a 412 const struct debug_tokens *p;
396482e1 413 SV* const report = newSVpvs("<== ");
bbf60fe6 414
f54cb97a 415 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
416 if (p->token == (int)rv) {
417 name = p->name;
418 type = p->type;
419 break;
420 }
421 }
422 if (name)
54667de8 423 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
424 else if ((char)rv > ' ' && (char)rv < '~')
425 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
426 else if (!rv)
396482e1 427 sv_catpvs(report, "EOF");
bbf60fe6
DM
428 else
429 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
430 switch (type) {
431 case TOKENTYPE_NONE:
432 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
433 break;
434 case TOKENTYPE_IVAL:
704d4215 435 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
436 break;
437 case TOKENTYPE_OPNUM:
438 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 439 PL_op_name[lvalp->ival]);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_PVAL:
704d4215 442 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
443 break;
444 case TOKENTYPE_OPVAL:
704d4215 445 if (lvalp->opval) {
401441c0 446 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
447 PL_op_name[lvalp->opval->op_type]);
448 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 449 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 450 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
451 }
452
453 }
401441c0 454 else
396482e1 455 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
456 break;
457 }
b6007c36 458 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
459 };
460 return (int)rv;
998054bd
SC
461}
462
b6007c36
DM
463
464/* print the buffer with suitable escapes */
465
466STATIC void
15f169a1 467S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 468{
396482e1 469 SV* const tmp = newSVpvs("");
7918f24d
NC
470
471 PERL_ARGS_ASSERT_PRINTBUF;
472
b6007c36
DM
473 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
474 SvREFCNT_dec(tmp);
475}
476
8fa7f367
JH
477#endif
478
8290c323
NC
479static int
480S_deprecate_commaless_var_list(pTHX) {
481 PL_expect = XTERM;
482 deprecate("comma-less variable list");
483 return REPORT(','); /* grandfather non-comma-format format */
484}
485
ffb4593c
NT
486/*
487 * S_ao
488 *
c963b151
BD
489 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
490 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
491 */
492
76e3520e 493STATIC int
cea2e8a9 494S_ao(pTHX_ int toketype)
a0d0e21e 495{
97aff369 496 dVAR;
3280af22
NIS
497 if (*PL_bufptr == '=') {
498 PL_bufptr++;
a0d0e21e 499 if (toketype == ANDAND)
6154021b 500 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 501 else if (toketype == OROR)
6154021b 502 pl_yylval.ival = OP_ORASSIGN;
c963b151 503 else if (toketype == DORDOR)
6154021b 504 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
505 toketype = ASSIGNOP;
506 }
507 return toketype;
508}
509
ffb4593c
NT
510/*
511 * S_no_op
512 * When Perl expects an operator and finds something else, no_op
513 * prints the warning. It always prints "<something> found where
514 * operator expected. It prints "Missing semicolon on previous line?"
515 * if the surprise occurs at the start of the line. "do you need to
516 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
517 * where the compiler doesn't know if foo is a method call or a function.
518 * It prints "Missing operator before end of line" if there's nothing
519 * after the missing operator, or "... before <...>" if there is something
520 * after the missing operator.
521 */
522
76e3520e 523STATIC void
15f169a1 524S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 525{
97aff369 526 dVAR;
9d4ba2ae
AL
527 char * const oldbp = PL_bufptr;
528 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 529
7918f24d
NC
530 PERL_ARGS_ASSERT_NO_OP;
531
1189a94a
GS
532 if (!s)
533 s = oldbp;
07c798fb 534 else
1189a94a 535 PL_bufptr = s;
cea2e8a9 536 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
537 if (ckWARN_d(WARN_SYNTAX)) {
538 if (is_first)
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
540 "\t(Missing semicolon on previous line?)\n");
541 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 542 const char *t;
c35e046a
AL
543 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
544 NOOP;
56da5a46
RGS
545 if (t < PL_bufptr && isSPACE(*t))
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
547 "\t(Do you need to predeclare %.*s?)\n",
551405c4 548 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
549 }
550 else {
551 assert(s >= oldbp);
552 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 553 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 554 }
07c798fb 555 }
3280af22 556 PL_bufptr = oldbp;
8990e307
LW
557}
558
ffb4593c
NT
559/*
560 * S_missingterm
561 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 562 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
563 * If we're in a delimited string and the delimiter is a control
564 * character, it's reformatted into a two-char sequence like ^C.
565 * This is fatal.
566 */
567
76e3520e 568STATIC void
cea2e8a9 569S_missingterm(pTHX_ char *s)
8990e307 570{
97aff369 571 dVAR;
8990e307
LW
572 char tmpbuf[3];
573 char q;
574 if (s) {
9d4ba2ae 575 char * const nl = strrchr(s,'\n');
d2719217 576 if (nl)
8990e307
LW
577 *nl = '\0';
578 }
463559e7 579 else if (isCNTRL(PL_multi_close)) {
8990e307 580 *tmpbuf = '^';
585ec06d 581 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
582 tmpbuf[2] = '\0';
583 s = tmpbuf;
584 }
585 else {
eb160463 586 *tmpbuf = (char)PL_multi_close;
8990e307
LW
587 tmpbuf[1] = '\0';
588 s = tmpbuf;
589 }
590 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 591 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 592}
79072805 593
0d863452 594/*
0d863452
RH
595 * Check whether the named feature is enabled.
596 */
26ea9e12
NC
597bool
598Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 599{
97aff369 600 dVAR;
0d863452 601 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 602 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
603
604 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
605
26ea9e12
NC
606 if (namelen > MAX_FEATURE_LEN)
607 return FALSE;
4a731d7b 608 memcpy(&he_name[8], name, namelen);
d4c19fe8 609
7b9ef140 610 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
611}
612
ffb4593c 613/*
9cbb5ea2
GS
614 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
615 * utf16-to-utf8-reversed.
ffb4593c
NT
616 */
617
c39cd008
GS
618#ifdef PERL_CR_FILTER
619static void
620strip_return(SV *sv)
621{
95a20fc0 622 register const char *s = SvPVX_const(sv);
9d4ba2ae 623 register const char * const e = s + SvCUR(sv);
7918f24d
NC
624
625 PERL_ARGS_ASSERT_STRIP_RETURN;
626
c39cd008
GS
627 /* outer loop optimized to do nothing if there are no CR-LFs */
628 while (s < e) {
629 if (*s++ == '\r' && *s == '\n') {
630 /* hit a CR-LF, need to copy the rest */
631 register char *d = s - 1;
632 *d++ = *s++;
633 while (s < e) {
634 if (*s == '\r' && s[1] == '\n')
635 s++;
636 *d++ = *s++;
637 }
638 SvCUR(sv) -= s - d;
639 return;
640 }
641 }
642}
a868473f 643
76e3520e 644STATIC I32
c39cd008 645S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 646{
f54cb97a 647 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
648 if (count > 0 && !maxlen)
649 strip_return(sv);
650 return count;
a868473f
NIS
651}
652#endif
653
ffb4593c 654/*
8eaa0acf
Z
655=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
656
657Creates and initialises a new lexer/parser state object, supplying
658a context in which to lex and parse from a new source of Perl code.
659A pointer to the new state object is placed in L</PL_parser>. An entry
660is made on the save stack so that upon unwinding the new state object
661will be destroyed and the former value of L</PL_parser> will be restored.
662Nothing else need be done to clean up the parsing context.
663
664The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
665non-null, provides a string (in SV form) containing code to be parsed.
666A copy of the string is made, so subsequent modification of I<line>
667does not affect parsing. I<rsfp>, if non-null, provides an input stream
668from which code will be read to be parsed. If both are non-null, the
669code in I<line> comes first and must consist of complete lines of input,
670and I<rsfp> supplies the remainder of the source.
671
672The I<flags> parameter is reserved for future use, and must always
27fcb6ee
FC
673be zero, except for one flag that is currently reserved for perl's internal
674use.
8eaa0acf
Z
675
676=cut
677*/
ffb4593c 678
27fcb6ee
FC
679/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
680 can share filters with the current parser. */
681
a0d0e21e 682void
8eaa0acf 683Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 684{
97aff369 685 dVAR;
6ef55633 686 const char *s = NULL;
8990e307 687 STRLEN len;
5486870f 688 yy_parser *parser, *oparser;
60d63348 689 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 690 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
691
692 /* create and initialise a parser */
693
199e78b7 694 Newxz(parser, 1, yy_parser);
5486870f 695 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
696 PL_parser = parser;
697
28ac2b49
Z
698 parser->stack = NULL;
699 parser->ps = NULL;
700 parser->stack_size = 0;
acdf0a21 701
e3abe207
DM
702 /* on scope exit, free this parser and restore any outer one */
703 SAVEPARSER(parser);
7c4baf47 704 parser->saved_curcop = PL_curcop;
e3abe207 705
acdf0a21 706 /* initialise lexer state */
8990e307 707
fb205e7a
DM
708#ifdef PERL_MAD
709 parser->curforce = -1;
710#else
711 parser->nexttoke = 0;
712#endif
ca4cfd28 713 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 714 parser->copline = NOLINE;
5afb0a62 715 parser->lex_state = LEX_NORMAL;
c2598295 716 parser->expect = XSTATE;
2f9285f8 717 parser->rsfp = rsfp;
27fcb6ee
FC
718 parser->rsfp_filters =
719 !(flags & LEX_START_SAME_FILTER) || !oparser
720 ? newAV()
721 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 722
199e78b7
DM
723 Newx(parser->lex_brackstack, 120, char);
724 Newx(parser->lex_casestack, 12, char);
725 *parser->lex_casestack = '\0';
02b34bbe 726
10efb74f
NC
727 if (line) {
728 s = SvPV_const(line, len);
729 } else {
730 len = 0;
731 }
bdc0bf6f 732
10efb74f 733 if (!len) {
bdc0bf6f 734 parser->linestr = newSVpvs("\n;");
805700c1 735 } else {
719a9bb0 736 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 737 if (s[len-1] != ';')
bdc0bf6f 738 sv_catpvs(parser->linestr, "\n;");
8990e307 739 }
f06b5848
DM
740 parser->oldoldbufptr =
741 parser->oldbufptr =
742 parser->bufptr =
743 parser->linestart = SvPVX(parser->linestr);
744 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
745 parser->last_lop = parser->last_uni = NULL;
60d63348 746 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES);
737c24fc 747
60d63348 748 parser->in_pod = parser->filtered = 0;
79072805 749}
a687059c 750
e3abe207
DM
751
752/* delete a parser object */
753
754void
755Perl_parser_free(pTHX_ const yy_parser *parser)
756{
7918f24d
NC
757 PERL_ARGS_ASSERT_PARSER_FREE;
758
7c4baf47 759 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
760 SvREFCNT_dec(parser->linestr);
761
2f9285f8
DM
762 if (parser->rsfp == PerlIO_stdin())
763 PerlIO_clearerr(parser->rsfp);
799361c3
SH
764 else if (parser->rsfp && (!parser->old_parser ||
765 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 766 PerlIO_close(parser->rsfp);
5486870f 767 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 768
e3abe207
DM
769 Safefree(parser->lex_brackstack);
770 Safefree(parser->lex_casestack);
771 PL_parser = parser->old_parser;
772 Safefree(parser);
773}
774
775
ffb4593c 776/*
f0e67a1d
Z
777=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
778
779Buffer scalar containing the chunk currently under consideration of the
780text currently being lexed. This is always a plain string scalar (for
781which C<SvPOK> is true). It is not intended to be used as a scalar by
782normal scalar means; instead refer to the buffer directly by the pointer
783variables described below.
784
785The lexer maintains various C<char*> pointers to things in the
786C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
787reallocated, all of these pointers must be updated. Don't attempt to
788do this manually, but rather use L</lex_grow_linestr> if you need to
789reallocate the buffer.
790
791The content of the text chunk in the buffer is commonly exactly one
792complete line of input, up to and including a newline terminator,
793but there are situations where it is otherwise. The octets of the
794buffer may be intended to be interpreted as either UTF-8 or Latin-1.
795The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
796flag on this scalar, which may disagree with it.
797
798For direct examination of the buffer, the variable
799L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
800lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
801of these pointers is usually preferable to examination of the scalar
802through normal scalar means.
803
804=for apidoc AmxU|char *|PL_parser-E<gt>bufend
805
806Direct pointer to the end of the chunk of text currently being lexed, the
807end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
808+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
809always located at the end of the buffer, and does not count as part of
810the buffer's contents.
811
812=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
813
814Points to the current position of lexing inside the lexer buffer.
815Characters around this point may be freely examined, within
816the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
817L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
818interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
819
820Lexing code (whether in the Perl core or not) moves this pointer past
821the characters that it consumes. It is also expected to perform some
822bookkeeping whenever a newline character is consumed. This movement
823can be more conveniently performed by the function L</lex_read_to>,
824which handles newlines appropriately.
825
826Interpretation of the buffer's octets can be abstracted out by
827using the slightly higher-level functions L</lex_peek_unichar> and
828L</lex_read_unichar>.
829
830=for apidoc AmxU|char *|PL_parser-E<gt>linestart
831
832Points to the start of the current line inside the lexer buffer.
833This is useful for indicating at which column an error occurred, and
834not much else. This must be updated by any lexing code that consumes
835a newline; the function L</lex_read_to> handles this detail.
836
837=cut
838*/
839
840/*
841=for apidoc Amx|bool|lex_bufutf8
842
843Indicates whether the octets in the lexer buffer
844(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
845of Unicode characters. If not, they should be interpreted as Latin-1
846characters. This is analogous to the C<SvUTF8> flag for scalars.
847
848In UTF-8 mode, it is not guaranteed that the lexer buffer actually
849contains valid UTF-8. Lexing code must be robust in the face of invalid
850encoding.
851
852The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
853is significant, but not the whole story regarding the input character
854encoding. Normally, when a file is being read, the scalar contains octets
855and its C<SvUTF8> flag is off, but the octets should be interpreted as
856UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
857however, the scalar may have the C<SvUTF8> flag on, and in this case its
858octets should be interpreted as UTF-8 unless the C<use bytes> pragma
859is in effect. This logic may change in the future; use this function
860instead of implementing the logic yourself.
861
862=cut
863*/
864
865bool
866Perl_lex_bufutf8(pTHX)
867{
868 return UTF;
869}
870
871/*
872=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
873
874Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
875at least I<len> octets (including terminating NUL). Returns a
876pointer to the reallocated buffer. This is necessary before making
877any direct modification of the buffer that would increase its length.
878L</lex_stuff_pvn> provides a more convenient way to insert text into
879the buffer.
880
881Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
882this function updates all of the lexer's variables that point directly
883into the buffer.
884
885=cut
886*/
887
888char *
889Perl_lex_grow_linestr(pTHX_ STRLEN len)
890{
891 SV *linestr;
892 char *buf;
893 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
894 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
895 linestr = PL_parser->linestr;
896 buf = SvPVX(linestr);
897 if (len <= SvLEN(linestr))
898 return buf;
899 bufend_pos = PL_parser->bufend - buf;
900 bufptr_pos = PL_parser->bufptr - buf;
901 oldbufptr_pos = PL_parser->oldbufptr - buf;
902 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
903 linestart_pos = PL_parser->linestart - buf;
904 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
905 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
906 buf = sv_grow(linestr, len);
907 PL_parser->bufend = buf + bufend_pos;
908 PL_parser->bufptr = buf + bufptr_pos;
909 PL_parser->oldbufptr = buf + oldbufptr_pos;
910 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
911 PL_parser->linestart = buf + linestart_pos;
912 if (PL_parser->last_uni)
913 PL_parser->last_uni = buf + last_uni_pos;
914 if (PL_parser->last_lop)
915 PL_parser->last_lop = buf + last_lop_pos;
916 return buf;
917}
918
919/*
83aa740e 920=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
921
922Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
923immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
924reallocating the buffer if necessary. This means that lexing code that
925runs later will see the characters as if they had appeared in the input.
926It is not recommended to do this as part of normal parsing, and most
927uses of this facility run the risk of the inserted characters being
928interpreted in an unintended manner.
929
930The string to be inserted is represented by I<len> octets starting
931at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
932according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
933The characters are recoded for the lexer buffer, according to how the
934buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 935to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
936function is more convenient.
937
938=cut
939*/
940
941void
83aa740e 942Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 943{
749123ff 944 dVAR;
f0e67a1d
Z
945 char *bufptr;
946 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
947 if (flags & ~(LEX_STUFF_UTF8))
948 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
949 if (UTF) {
950 if (flags & LEX_STUFF_UTF8) {
951 goto plain_copy;
952 } else {
953 STRLEN highhalf = 0;
83aa740e 954 const char *p, *e = pv+len;
f0e67a1d
Z
955 for (p = pv; p != e; p++)
956 highhalf += !!(((U8)*p) & 0x80);
957 if (!highhalf)
958 goto plain_copy;
959 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
960 bufptr = PL_parser->bufptr;
961 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
962 SvCUR_set(PL_parser->linestr,
963 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
964 PL_parser->bufend += len+highhalf;
965 for (p = pv; p != e; p++) {
966 U8 c = (U8)*p;
967 if (c & 0x80) {
968 *bufptr++ = (char)(0xc0 | (c >> 6));
969 *bufptr++ = (char)(0x80 | (c & 0x3f));
970 } else {
971 *bufptr++ = (char)c;
972 }
973 }
974 }
975 } else {
976 if (flags & LEX_STUFF_UTF8) {
977 STRLEN highhalf = 0;
83aa740e 978 const char *p, *e = pv+len;
f0e67a1d
Z
979 for (p = pv; p != e; p++) {
980 U8 c = (U8)*p;
981 if (c >= 0xc4) {
982 Perl_croak(aTHX_ "Lexing code attempted to stuff "
983 "non-Latin-1 character into Latin-1 input");
984 } else if (c >= 0xc2 && p+1 != e &&
985 (((U8)p[1]) & 0xc0) == 0x80) {
986 p++;
987 highhalf++;
988 } else if (c >= 0x80) {
989 /* malformed UTF-8 */
990 ENTER;
991 SAVESPTR(PL_warnhook);
992 PL_warnhook = PERL_WARNHOOK_FATAL;
993 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
994 LEAVE;
995 }
996 }
997 if (!highhalf)
998 goto plain_copy;
999 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1000 bufptr = PL_parser->bufptr;
1001 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1002 SvCUR_set(PL_parser->linestr,
1003 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1004 PL_parser->bufend += len-highhalf;
1005 for (p = pv; p != e; p++) {
1006 U8 c = (U8)*p;
1007 if (c & 0x80) {
1008 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1009 p++;
1010 } else {
1011 *bufptr++ = (char)c;
1012 }
1013 }
1014 } else {
1015 plain_copy:
1016 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1017 bufptr = PL_parser->bufptr;
1018 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1019 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1020 PL_parser->bufend += len;
1021 Copy(pv, bufptr, len, char);
1022 }
1023 }
1024}
1025
1026/*
9dcc53ea
Z
1027=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1028
1029Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1030immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1031reallocating the buffer if necessary. This means that lexing code that
1032runs later will see the characters as if they had appeared in the input.
1033It is not recommended to do this as part of normal parsing, and most
1034uses of this facility run the risk of the inserted characters being
1035interpreted in an unintended manner.
1036
1037The string to be inserted is represented by octets starting at I<pv>
1038and continuing to the first nul. These octets are interpreted as either
1039UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1040in I<flags>. The characters are recoded for the lexer buffer, according
1041to how the buffer is currently being interpreted (L</lex_bufutf8>).
1042If it is not convenient to nul-terminate a string to be inserted, the
1043L</lex_stuff_pvn> function is more appropriate.
1044
1045=cut
1046*/
1047
1048void
1049Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1050{
1051 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1052 lex_stuff_pvn(pv, strlen(pv), flags);
1053}
1054
1055/*
f0e67a1d
Z
1056=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1057
1058Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1059immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1060reallocating the buffer if necessary. This means that lexing code that
1061runs later will see the characters as if they had appeared in the input.
1062It is not recommended to do this as part of normal parsing, and most
1063uses of this facility run the risk of the inserted characters being
1064interpreted in an unintended manner.
1065
1066The string to be inserted is the string value of I<sv>. The characters
1067are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1068being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1069not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1070need to construct a scalar.
1071
1072=cut
1073*/
1074
1075void
1076Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1077{
1078 char *pv;
1079 STRLEN len;
1080 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1081 if (flags)
1082 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1083 pv = SvPV(sv, len);
1084 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1085}
1086
1087/*
1088=for apidoc Amx|void|lex_unstuff|char *ptr
1089
1090Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1091I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1092This hides the discarded text from any lexing code that runs later,
1093as if the text had never appeared.
1094
1095This is not the normal way to consume lexed text. For that, use
1096L</lex_read_to>.
1097
1098=cut
1099*/
1100
1101void
1102Perl_lex_unstuff(pTHX_ char *ptr)
1103{
1104 char *buf, *bufend;
1105 STRLEN unstuff_len;
1106 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1107 buf = PL_parser->bufptr;
1108 if (ptr < buf)
1109 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1110 if (ptr == buf)
1111 return;
1112 bufend = PL_parser->bufend;
1113 if (ptr > bufend)
1114 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1115 unstuff_len = ptr - buf;
1116 Move(ptr, buf, bufend+1-ptr, char);
1117 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1118 PL_parser->bufend = bufend - unstuff_len;
1119}
1120
1121/*
1122=for apidoc Amx|void|lex_read_to|char *ptr
1123
1124Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1125to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1126performing the correct bookkeeping whenever a newline character is passed.
1127This is the normal way to consume lexed text.
1128
1129Interpretation of the buffer's octets can be abstracted out by
1130using the slightly higher-level functions L</lex_peek_unichar> and
1131L</lex_read_unichar>.
1132
1133=cut
1134*/
1135
1136void
1137Perl_lex_read_to(pTHX_ char *ptr)
1138{
1139 char *s;
1140 PERL_ARGS_ASSERT_LEX_READ_TO;
1141 s = PL_parser->bufptr;
1142 if (ptr < s || ptr > PL_parser->bufend)
1143 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1144 for (; s != ptr; s++)
1145 if (*s == '\n') {
1146 CopLINE_inc(PL_curcop);
1147 PL_parser->linestart = s+1;
1148 }
1149 PL_parser->bufptr = ptr;
1150}
1151
1152/*
1153=for apidoc Amx|void|lex_discard_to|char *ptr
1154
1155Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1156up to I<ptr>. The remaining content of the buffer will be moved, and
1157all pointers into the buffer updated appropriately. I<ptr> must not
1158be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1159it is not permitted to discard text that has yet to be lexed.
1160
1161Normally it is not necessarily to do this directly, because it suffices to
1162use the implicit discarding behaviour of L</lex_next_chunk> and things
1163based on it. However, if a token stretches across multiple lines,
1f317c95 1164and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1165that purpose, then after completion of the token it would be wise to
1166explicitly discard the now-unneeded earlier lines, to avoid future
1167multi-line tokens growing the buffer without bound.
1168
1169=cut
1170*/
1171
1172void
1173Perl_lex_discard_to(pTHX_ char *ptr)
1174{
1175 char *buf;
1176 STRLEN discard_len;
1177 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1178 buf = SvPVX(PL_parser->linestr);
1179 if (ptr < buf)
1180 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1181 if (ptr == buf)
1182 return;
1183 if (ptr > PL_parser->bufptr)
1184 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1185 discard_len = ptr - buf;
1186 if (PL_parser->oldbufptr < ptr)
1187 PL_parser->oldbufptr = ptr;
1188 if (PL_parser->oldoldbufptr < ptr)
1189 PL_parser->oldoldbufptr = ptr;
1190 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1191 PL_parser->last_uni = NULL;
1192 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1193 PL_parser->last_lop = NULL;
1194 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1195 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1196 PL_parser->bufend -= discard_len;
1197 PL_parser->bufptr -= discard_len;
1198 PL_parser->oldbufptr -= discard_len;
1199 PL_parser->oldoldbufptr -= discard_len;
1200 if (PL_parser->last_uni)
1201 PL_parser->last_uni -= discard_len;
1202 if (PL_parser->last_lop)
1203 PL_parser->last_lop -= discard_len;
1204}
1205
1206/*
1207=for apidoc Amx|bool|lex_next_chunk|U32 flags
1208
1209Reads in the next chunk of text to be lexed, appending it to
1210L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1211looked to the end of the current chunk and wants to know more. It is
1212usual, but not necessary, for lexing to have consumed the entirety of
1213the current chunk at this time.
1214
1215If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1216chunk (i.e., the current chunk has been entirely consumed), normally the
1217current chunk will be discarded at the same time that the new chunk is
1218read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1219will not be discarded. If the current chunk has not been entirely
1220consumed, then it will not be discarded regardless of the flag.
1221
1222Returns true if some new text was added to the buffer, or false if the
1223buffer has reached the end of the input text.
1224
1225=cut
1226*/
1227
1228#define LEX_FAKE_EOF 0x80000000
1229
1230bool
1231Perl_lex_next_chunk(pTHX_ U32 flags)
1232{
1233 SV *linestr;
1234 char *buf;
1235 STRLEN old_bufend_pos, new_bufend_pos;
1236 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1237 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1238 bool got_some_for_debugger = 0;
f0e67a1d
Z
1239 bool got_some;
1240 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1241 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1242 linestr = PL_parser->linestr;
1243 buf = SvPVX(linestr);
1244 if (!(flags & LEX_KEEP_PREVIOUS) &&
1245 PL_parser->bufptr == PL_parser->bufend) {
1246 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1247 linestart_pos = 0;
1248 if (PL_parser->last_uni != PL_parser->bufend)
1249 PL_parser->last_uni = NULL;
1250 if (PL_parser->last_lop != PL_parser->bufend)
1251 PL_parser->last_lop = NULL;
1252 last_uni_pos = last_lop_pos = 0;
1253 *buf = 0;
1254 SvCUR(linestr) = 0;
1255 } else {
1256 old_bufend_pos = PL_parser->bufend - buf;
1257 bufptr_pos = PL_parser->bufptr - buf;
1258 oldbufptr_pos = PL_parser->oldbufptr - buf;
1259 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1260 linestart_pos = PL_parser->linestart - buf;
1261 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1262 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1263 }
1264 if (flags & LEX_FAKE_EOF) {
1265 goto eof;
60d63348 1266 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1267 got_some = 0;
1268 } else if (filter_gets(linestr, old_bufend_pos)) {
1269 got_some = 1;
17cc9359 1270 got_some_for_debugger = 1;
f0e67a1d 1271 } else {
580561a3
Z
1272 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1273 sv_setpvs(linestr, "");
f0e67a1d
Z
1274 eof:
1275 /* End of real input. Close filehandle (unless it was STDIN),
1276 * then add implicit termination.
1277 */
1278 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1279 PerlIO_clearerr(PL_parser->rsfp);
1280 else if (PL_parser->rsfp)
1281 (void)PerlIO_close(PL_parser->rsfp);
1282 PL_parser->rsfp = NULL;
60d63348 1283 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1284#ifdef PERL_MAD
1285 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1286 PL_faketokens = 1;
1287#endif
1288 if (!PL_in_eval && PL_minus_p) {
1289 sv_catpvs(linestr,
1290 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1291 PL_minus_n = PL_minus_p = 0;
1292 } else if (!PL_in_eval && PL_minus_n) {
1293 sv_catpvs(linestr, /*{*/";}");
1294 PL_minus_n = 0;
1295 } else
1296 sv_catpvs(linestr, ";");
1297 got_some = 1;
1298 }
1299 buf = SvPVX(linestr);
1300 new_bufend_pos = SvCUR(linestr);
1301 PL_parser->bufend = buf + new_bufend_pos;
1302 PL_parser->bufptr = buf + bufptr_pos;
1303 PL_parser->oldbufptr = buf + oldbufptr_pos;
1304 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1305 PL_parser->linestart = buf + linestart_pos;
1306 if (PL_parser->last_uni)
1307 PL_parser->last_uni = buf + last_uni_pos;
1308 if (PL_parser->last_lop)
1309 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1310 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1311 PL_curstash != PL_debstash) {
1312 /* debugger active and we're not compiling the debugger code,
1313 * so store the line into the debugger's array of lines
1314 */
1315 update_debugger_info(NULL, buf+old_bufend_pos,
1316 new_bufend_pos-old_bufend_pos);
1317 }
1318 return got_some;
1319}
1320
1321/*
1322=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1323
1324Looks ahead one (Unicode) character in the text currently being lexed.
1325Returns the codepoint (unsigned integer value) of the next character,
1326or -1 if lexing has reached the end of the input text. To consume the
1327peeked character, use L</lex_read_unichar>.
1328
1329If the next character is in (or extends into) the next chunk of input
1330text, the next chunk will be read in. Normally the current chunk will be
1331discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1332then the current chunk will not be discarded.
1333
1334If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1335is encountered, an exception is generated.
1336
1337=cut
1338*/
1339
1340I32
1341Perl_lex_peek_unichar(pTHX_ U32 flags)
1342{
749123ff 1343 dVAR;
f0e67a1d
Z
1344 char *s, *bufend;
1345 if (flags & ~(LEX_KEEP_PREVIOUS))
1346 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1347 s = PL_parser->bufptr;
1348 bufend = PL_parser->bufend;
1349 if (UTF) {
1350 U8 head;
1351 I32 unichar;
1352 STRLEN len, retlen;
1353 if (s == bufend) {
1354 if (!lex_next_chunk(flags))
1355 return -1;
1356 s = PL_parser->bufptr;
1357 bufend = PL_parser->bufend;
1358 }
1359 head = (U8)*s;
1360 if (!(head & 0x80))
1361 return head;
1362 if (head & 0x40) {
1363 len = PL_utf8skip[head];
1364 while ((STRLEN)(bufend-s) < len) {
1365 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1366 break;
1367 s = PL_parser->bufptr;
1368 bufend = PL_parser->bufend;
1369 }
1370 }
1371 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1372 if (retlen == (STRLEN)-1) {
1373 /* malformed UTF-8 */
1374 ENTER;
1375 SAVESPTR(PL_warnhook);
1376 PL_warnhook = PERL_WARNHOOK_FATAL;
1377 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1378 LEAVE;
1379 }
1380 return unichar;
1381 } else {
1382 if (s == bufend) {
1383 if (!lex_next_chunk(flags))
1384 return -1;
1385 s = PL_parser->bufptr;
1386 }
1387 return (U8)*s;
1388 }
1389}
1390
1391/*
1392=for apidoc Amx|I32|lex_read_unichar|U32 flags
1393
1394Reads the next (Unicode) character in the text currently being lexed.
1395Returns the codepoint (unsigned integer value) of the character read,
1396and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1397if lexing has reached the end of the input text. To non-destructively
1398examine the next character, use L</lex_peek_unichar> instead.
1399
1400If the next character is in (or extends into) the next chunk of input
1401text, the next chunk will be read in. Normally the current chunk will be
1402discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1403then the current chunk will not be discarded.
1404
1405If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1406is encountered, an exception is generated.
1407
1408=cut
1409*/
1410
1411I32
1412Perl_lex_read_unichar(pTHX_ U32 flags)
1413{
1414 I32 c;
1415 if (flags & ~(LEX_KEEP_PREVIOUS))
1416 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1417 c = lex_peek_unichar(flags);
1418 if (c != -1) {
1419 if (c == '\n')
1420 CopLINE_inc(PL_curcop);
d9018cbe
EB
1421 if (UTF)
1422 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1423 else
1424 ++(PL_parser->bufptr);
f0e67a1d
Z
1425 }
1426 return c;
1427}
1428
1429/*
1430=for apidoc Amx|void|lex_read_space|U32 flags
1431
1432Reads optional spaces, in Perl style, in the text currently being
1433lexed. The spaces may include ordinary whitespace characters and
1434Perl-style comments. C<#line> directives are processed if encountered.
1435L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1436at a non-space character (or the end of the input text).
1437
1438If spaces extend into the next chunk of input text, the next chunk will
1439be read in. Normally the current chunk will be discarded at the same
1440time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1441chunk will not be discarded.
1442
1443=cut
1444*/
1445
f0998909
Z
1446#define LEX_NO_NEXT_CHUNK 0x80000000
1447
f0e67a1d
Z
1448void
1449Perl_lex_read_space(pTHX_ U32 flags)
1450{
1451 char *s, *bufend;
1452 bool need_incline = 0;
f0998909 1453 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1454 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1455#ifdef PERL_MAD
1456 if (PL_skipwhite) {
1457 sv_free(PL_skipwhite);
1458 PL_skipwhite = NULL;
1459 }
1460 if (PL_madskills)
1461 PL_skipwhite = newSVpvs("");
1462#endif /* PERL_MAD */
1463 s = PL_parser->bufptr;
1464 bufend = PL_parser->bufend;
1465 while (1) {
1466 char c = *s;
1467 if (c == '#') {
1468 do {
1469 c = *++s;
1470 } while (!(c == '\n' || (c == 0 && s == bufend)));
1471 } else if (c == '\n') {
1472 s++;
1473 PL_parser->linestart = s;
1474 if (s == bufend)
1475 need_incline = 1;
1476 else
1477 incline(s);
1478 } else if (isSPACE(c)) {
1479 s++;
1480 } else if (c == 0 && s == bufend) {
1481 bool got_more;
1482#ifdef PERL_MAD
1483 if (PL_madskills)
1484 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1485#endif /* PERL_MAD */
f0998909
Z
1486 if (flags & LEX_NO_NEXT_CHUNK)
1487 break;
f0e67a1d
Z
1488 PL_parser->bufptr = s;
1489 CopLINE_inc(PL_curcop);
1490 got_more = lex_next_chunk(flags);
1491 CopLINE_dec(PL_curcop);
1492 s = PL_parser->bufptr;
1493 bufend = PL_parser->bufend;
1494 if (!got_more)
1495 break;
1496 if (need_incline && PL_parser->rsfp) {
1497 incline(s);
1498 need_incline = 0;
1499 }
1500 } else {
1501 break;
1502 }
1503 }
1504#ifdef PERL_MAD
1505 if (PL_madskills)
1506 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1507#endif /* PERL_MAD */
1508 PL_parser->bufptr = s;
1509}
1510
1511/*
ffb4593c
NT
1512 * S_incline
1513 * This subroutine has nothing to do with tilting, whether at windmills
1514 * or pinball tables. Its name is short for "increment line". It
57843af0 1515 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1516 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1517 * # line 500 "foo.pm"
1518 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1519 */
1520
76e3520e 1521STATIC void
d9095cec 1522S_incline(pTHX_ const char *s)
463ee0b2 1523{
97aff369 1524 dVAR;
d9095cec
NC
1525 const char *t;
1526 const char *n;
1527 const char *e;
8818d409 1528 line_t line_num;
463ee0b2 1529
7918f24d
NC
1530 PERL_ARGS_ASSERT_INCLINE;
1531
57843af0 1532 CopLINE_inc(PL_curcop);
463ee0b2
LW
1533 if (*s++ != '#')
1534 return;
d4c19fe8
AL
1535 while (SPACE_OR_TAB(*s))
1536 s++;
73659bf1
GS
1537 if (strnEQ(s, "line", 4))
1538 s += 4;
1539 else
1540 return;
084592ab 1541 if (SPACE_OR_TAB(*s))
73659bf1 1542 s++;
4e553d73 1543 else
73659bf1 1544 return;
d4c19fe8
AL
1545 while (SPACE_OR_TAB(*s))
1546 s++;
463ee0b2
LW
1547 if (!isDIGIT(*s))
1548 return;
d4c19fe8 1549
463ee0b2
LW
1550 n = s;
1551 while (isDIGIT(*s))
1552 s++;
07714eb4 1553 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1554 return;
bf4acbe4 1555 while (SPACE_OR_TAB(*s))
463ee0b2 1556 s++;
73659bf1 1557 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1558 s++;
73659bf1
GS
1559 e = t + 1;
1560 }
463ee0b2 1561 else {
c35e046a
AL
1562 t = s;
1563 while (!isSPACE(*t))
1564 t++;
73659bf1 1565 e = t;
463ee0b2 1566 }
bf4acbe4 1567 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1568 e++;
1569 if (*e != '\n' && *e != '\0')
1570 return; /* false alarm */
1571
8818d409
FC
1572 line_num = atoi(n)-1;
1573
f4dd75d9 1574 if (t - s > 0) {
d9095cec 1575 const STRLEN len = t - s;
19bad673
NC
1576 SV *const temp_sv = CopFILESV(PL_curcop);
1577 const char *cf;
1578 STRLEN tmplen;
1579
1580 if (temp_sv) {
1581 cf = SvPVX(temp_sv);
1582 tmplen = SvCUR(temp_sv);
1583 } else {
1584 cf = NULL;
1585 tmplen = 0;
1586 }
1587
d1299d44 1588 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1589 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1590 * to *{"::_<newfilename"} */
44867030
NC
1591 /* However, the long form of evals is only turned on by the
1592 debugger - usually they're "(eval %lu)" */
1593 char smallbuf[128];
1594 char *tmpbuf;
1595 GV **gvp;
d9095cec 1596 STRLEN tmplen2 = len;
798b63bc 1597 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1598 tmpbuf = smallbuf;
1599 else
2ae0db35 1600 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1601 tmpbuf[0] = '_';
1602 tmpbuf[1] = '<';
2ae0db35 1603 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1604 tmplen += 2;
8a5ee598
RGS
1605 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1606 if (gvp) {
44867030
NC
1607 char *tmpbuf2;
1608 GV *gv2;
1609
1610 if (tmplen2 + 2 <= sizeof smallbuf)
1611 tmpbuf2 = smallbuf;
1612 else
1613 Newx(tmpbuf2, tmplen2 + 2, char);
1614
1615 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1616 /* Either they malloc'd it, or we malloc'd it,
1617 so no prefix is present in ours. */
1618 tmpbuf2[0] = '_';
1619 tmpbuf2[1] = '<';
1620 }
1621
1622 memcpy(tmpbuf2 + 2, s, tmplen2);
1623 tmplen2 += 2;
1624
8a5ee598 1625 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1626 if (!isGV(gv2)) {
8a5ee598 1627 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1628 /* adjust ${"::_<newfilename"} to store the new file name */
1629 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1630 /* The line number may differ. If that is the case,
1631 alias the saved lines that are in the array.
1632 Otherwise alias the whole array. */
1633 if (CopLINE(PL_curcop) == line_num) {
1634 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1635 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1636 }
1637 else if (GvAV(*gvp)) {
1638 AV * const av = GvAV(*gvp);
1639 const I32 start = CopLINE(PL_curcop)+1;
1640 I32 items = AvFILLp(av) - start;
1641 if (items > 0) {
1642 AV * const av2 = GvAVn(gv2);
1643 SV **svp = AvARRAY(av) + start;
1644 I32 l = (I32)line_num+1;
1645 while (items--)
1646 av_store(av2, l++, SvREFCNT_inc(*svp++));
1647 }
1648 }
e5527e4b 1649 }
44867030
NC
1650
1651 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1652 }
e66cf94c 1653 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1654 }
05ec9bb3 1655 CopFILE_free(PL_curcop);
d9095cec 1656 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1657 }
8818d409 1658 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1659}
1660
29595ff2 1661#ifdef PERL_MAD
cd81e915 1662/* skip space before PL_thistoken */
29595ff2
NC
1663
1664STATIC char *
1665S_skipspace0(pTHX_ register char *s)
1666{
7918f24d
NC
1667 PERL_ARGS_ASSERT_SKIPSPACE0;
1668
29595ff2
NC
1669 s = skipspace(s);
1670 if (!PL_madskills)
1671 return s;
cd81e915
NC
1672 if (PL_skipwhite) {
1673 if (!PL_thiswhite)
6b29d1f5 1674 PL_thiswhite = newSVpvs("");
cd81e915
NC
1675 sv_catsv(PL_thiswhite, PL_skipwhite);
1676 sv_free(PL_skipwhite);
1677 PL_skipwhite = 0;
1678 }
1679 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1680 return s;
1681}
1682
cd81e915 1683/* skip space after PL_thistoken */
29595ff2
NC
1684
1685STATIC char *
1686S_skipspace1(pTHX_ register char *s)
1687{
d4c19fe8 1688 const char *start = s;
29595ff2
NC
1689 I32 startoff = start - SvPVX(PL_linestr);
1690
7918f24d
NC
1691 PERL_ARGS_ASSERT_SKIPSPACE1;
1692
29595ff2
NC
1693 s = skipspace(s);
1694 if (!PL_madskills)
1695 return s;
1696 start = SvPVX(PL_linestr) + startoff;
cd81e915 1697 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1698 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1699 PL_thistoken = newSVpvn(tstart, start - tstart);
1700 }
1701 PL_realtokenstart = -1;
1702 if (PL_skipwhite) {
1703 if (!PL_nextwhite)
6b29d1f5 1704 PL_nextwhite = newSVpvs("");
cd81e915
NC
1705 sv_catsv(PL_nextwhite, PL_skipwhite);
1706 sv_free(PL_skipwhite);
1707 PL_skipwhite = 0;
29595ff2
NC
1708 }
1709 return s;
1710}
1711
1712STATIC char *
1713S_skipspace2(pTHX_ register char *s, SV **svp)
1714{
c35e046a
AL
1715 char *start;
1716 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1717 const I32 startoff = s - SvPVX(PL_linestr);
1718
7918f24d
NC
1719 PERL_ARGS_ASSERT_SKIPSPACE2;
1720
29595ff2
NC
1721 s = skipspace(s);
1722 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1723 if (!PL_madskills || !svp)
1724 return s;
1725 start = SvPVX(PL_linestr) + startoff;
cd81e915 1726 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1727 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1728 PL_thistoken = newSVpvn(tstart, start - tstart);
1729 PL_realtokenstart = -1;
29595ff2 1730 }
cd81e915 1731 if (PL_skipwhite) {
29595ff2 1732 if (!*svp)
6b29d1f5 1733 *svp = newSVpvs("");
cd81e915
NC
1734 sv_setsv(*svp, PL_skipwhite);
1735 sv_free(PL_skipwhite);
1736 PL_skipwhite = 0;
29595ff2
NC
1737 }
1738
1739 return s;
1740}
1741#endif
1742
80a702cd 1743STATIC void
15f169a1 1744S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1745{
1746 AV *av = CopFILEAVx(PL_curcop);
1747 if (av) {
b9f83d2f 1748 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1749 if (orig_sv)
1750 sv_setsv(sv, orig_sv);
1751 else
1752 sv_setpvn(sv, buf, len);
80a702cd
RGS
1753 (void)SvIOK_on(sv);
1754 SvIV_set(sv, 0);
1755 av_store(av, (I32)CopLINE(PL_curcop), sv);
1756 }
1757}
1758
ffb4593c
NT
1759/*
1760 * S_skipspace
1761 * Called to gobble the appropriate amount and type of whitespace.
1762 * Skips comments as well.
1763 */
1764
76e3520e 1765STATIC char *
cea2e8a9 1766S_skipspace(pTHX_ register char *s)
a687059c 1767{
5db06880 1768#ifdef PERL_MAD
f0e67a1d
Z
1769 char *start = s;
1770#endif /* PERL_MAD */
7918f24d 1771 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1772#ifdef PERL_MAD
cd81e915
NC
1773 if (PL_skipwhite) {
1774 sv_free(PL_skipwhite);
f0e67a1d 1775 PL_skipwhite = NULL;
5db06880 1776 }
f0e67a1d 1777#endif /* PERL_MAD */
3280af22 1778 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1779 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1780 s++;
f0e67a1d
Z
1781 } else {
1782 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1783 PL_bufptr = s;
f0998909
Z
1784 lex_read_space(LEX_KEEP_PREVIOUS |
1785 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1786 LEX_NO_NEXT_CHUNK : 0));
3280af22 1787 s = PL_bufptr;
f0e67a1d
Z
1788 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1789 if (PL_linestart > PL_bufptr)
1790 PL_bufptr = PL_linestart;
1791 return s;
463ee0b2 1792 }
5db06880 1793#ifdef PERL_MAD
f0e67a1d
Z
1794 if (PL_madskills)
1795 PL_skipwhite = newSVpvn(start, s-start);
1796#endif /* PERL_MAD */
5db06880 1797 return s;
a687059c 1798}
378cc40b 1799
ffb4593c
NT
1800/*
1801 * S_check_uni
1802 * Check the unary operators to ensure there's no ambiguity in how they're
1803 * used. An ambiguous piece of code would be:
1804 * rand + 5
1805 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1806 * the +5 is its argument.
1807 */
1808
76e3520e 1809STATIC void
cea2e8a9 1810S_check_uni(pTHX)
ba106d47 1811{
97aff369 1812 dVAR;
d4c19fe8
AL
1813 const char *s;
1814 const char *t;
2f3197b3 1815
3280af22 1816 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1817 return;
3280af22
NIS
1818 while (isSPACE(*PL_last_uni))
1819 PL_last_uni++;
c35e046a
AL
1820 s = PL_last_uni;
1821 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1822 s++;
3280af22 1823 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1824 return;
6136c704 1825
9b387841
NC
1826 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1827 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1828 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1829}
1830
ffb4593c
NT
1831/*
1832 * LOP : macro to build a list operator. Its behaviour has been replaced
1833 * with a subroutine, S_lop() for which LOP is just another name.
1834 */
1835
a0d0e21e
LW
1836#define LOP(f,x) return lop(f,x,s)
1837
ffb4593c
NT
1838/*
1839 * S_lop
1840 * Build a list operator (or something that might be one). The rules:
1841 * - if we have a next token, then it's a list operator [why?]
1842 * - if the next thing is an opening paren, then it's a function
1843 * - else it's a list operator
1844 */
1845
76e3520e 1846STATIC I32
a0be28da 1847S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1848{
97aff369 1849 dVAR;
7918f24d
NC
1850
1851 PERL_ARGS_ASSERT_LOP;
1852
6154021b 1853 pl_yylval.ival = f;
35c8bce7 1854 CLINE;
3280af22
NIS
1855 PL_expect = x;
1856 PL_bufptr = s;
1857 PL_last_lop = PL_oldbufptr;
eb160463 1858 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1859#ifdef PERL_MAD
1860 if (PL_lasttoke)
78cdf107 1861 goto lstop;
5db06880 1862#else
3280af22 1863 if (PL_nexttoke)
78cdf107 1864 goto lstop;
5db06880 1865#endif
79072805 1866 if (*s == '(')
bbf60fe6 1867 return REPORT(FUNC);
29595ff2 1868 s = PEEKSPACE(s);
79072805 1869 if (*s == '(')
bbf60fe6 1870 return REPORT(FUNC);
78cdf107
Z
1871 else {
1872 lstop:
1873 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1874 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1875 return REPORT(LSTOP);
78cdf107 1876 }
79072805
LW
1877}
1878
5db06880
NC
1879#ifdef PERL_MAD
1880 /*
1881 * S_start_force
1882 * Sets up for an eventual force_next(). start_force(0) basically does
1883 * an unshift, while start_force(-1) does a push. yylex removes items
1884 * on the "pop" end.
1885 */
1886
1887STATIC void
1888S_start_force(pTHX_ int where)
1889{
1890 int i;
1891
cd81e915 1892 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1893 where = PL_lasttoke;
cd81e915
NC
1894 assert(PL_curforce < 0 || PL_curforce == where);
1895 if (PL_curforce != where) {
5db06880
NC
1896 for (i = PL_lasttoke; i > where; --i) {
1897 PL_nexttoke[i] = PL_nexttoke[i-1];
1898 }
1899 PL_lasttoke++;
1900 }
cd81e915 1901 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1902 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1903 PL_curforce = where;
1904 if (PL_nextwhite) {
5db06880 1905 if (PL_madskills)
6b29d1f5 1906 curmad('^', newSVpvs(""));
cd81e915 1907 CURMAD('_', PL_nextwhite);
5db06880
NC
1908 }
1909}
1910
1911STATIC void
1912S_curmad(pTHX_ char slot, SV *sv)
1913{
1914 MADPROP **where;
1915
1916 if (!sv)
1917 return;
cd81e915
NC
1918 if (PL_curforce < 0)
1919 where = &PL_thismad;
5db06880 1920 else
cd81e915 1921 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1922
cd81e915 1923 if (PL_faketokens)
76f68e9b 1924 sv_setpvs(sv, "");
5db06880
NC
1925 else {
1926 if (!IN_BYTES) {
1927 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1928 SvUTF8_on(sv);
1929 else if (PL_encoding) {
1930 sv_recode_to_utf8(sv, PL_encoding);
1931 }
1932 }
1933 }
1934
1935 /* keep a slot open for the head of the list? */
1936 if (slot != '_' && *where && (*where)->mad_key == '^') {
1937 (*where)->mad_key = slot;
daba3364 1938 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1939 (*where)->mad_val = (void*)sv;
1940 }
1941 else
1942 addmad(newMADsv(slot, sv), where, 0);
1943}
1944#else
b3f24c00
MHM
1945# define start_force(where) NOOP
1946# define curmad(slot, sv) NOOP
5db06880
NC
1947#endif
1948
ffb4593c
NT
1949/*
1950 * S_force_next
9cbb5ea2 1951 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1952 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1953 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1954 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1955 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1956 */
1957
4e553d73 1958STATIC void
cea2e8a9 1959S_force_next(pTHX_ I32 type)
79072805 1960{
97aff369 1961 dVAR;
704d4215
GG
1962#ifdef DEBUGGING
1963 if (DEBUG_T_TEST) {
1964 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1965 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1966 }
1967#endif
5db06880 1968#ifdef PERL_MAD
cd81e915 1969 if (PL_curforce < 0)
5db06880 1970 start_force(PL_lasttoke);
cd81e915 1971 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1972 if (PL_lex_state != LEX_KNOWNEXT)
1973 PL_lex_defer = PL_lex_state;
1974 PL_lex_state = LEX_KNOWNEXT;
1975 PL_lex_expect = PL_expect;
cd81e915 1976 PL_curforce = -1;
5db06880 1977#else
3280af22
NIS
1978 PL_nexttype[PL_nexttoke] = type;
1979 PL_nexttoke++;
1980 if (PL_lex_state != LEX_KNOWNEXT) {
1981 PL_lex_defer = PL_lex_state;
1982 PL_lex_expect = PL_expect;
1983 PL_lex_state = LEX_KNOWNEXT;
79072805 1984 }
5db06880 1985#endif
79072805
LW
1986}
1987
28ac2b49
Z
1988void
1989Perl_yyunlex(pTHX)
1990{
a7aaec61
Z
1991 int yyc = PL_parser->yychar;
1992 if (yyc != YYEMPTY) {
1993 if (yyc) {
1994 start_force(-1);
1995 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1996 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1997 PL_lex_allbrackets--;
a7aaec61 1998 PL_lex_brackets--;
78cdf107
Z
1999 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2000 } else if (yyc == '('/*)*/) {
2001 PL_lex_allbrackets--;
2002 yyc |= (2<<24);
a7aaec61
Z
2003 }
2004 force_next(yyc);
2005 }
28ac2b49
Z
2006 PL_parser->yychar = YYEMPTY;
2007 }
2008}
2009
d0a148a6 2010STATIC SV *
15f169a1 2011S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2012{
97aff369 2013 dVAR;
740cce10 2014 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2015 !IN_BYTES
2016 && UTF
2017 && !is_ascii_string((const U8*)start, len)
740cce10 2018 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2019 return sv;
2020}
2021
ffb4593c
NT
2022/*
2023 * S_force_word
2024 * When the lexer knows the next thing is a word (for instance, it has
2025 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2026 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2027 * lookahead.
ffb4593c
NT
2028 *
2029 * Arguments:
b1b65b59 2030 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2031 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2032 * int check_keyword : if true, Perl checks to make sure the word isn't
2033 * a keyword (do this if the word is a label, e.g. goto FOO)
2034 * int allow_pack : if true, : characters will also be allowed (require,
2035 * use, etc. do this)
9cbb5ea2 2036 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2037 */
2038
76e3520e 2039STATIC char *
cea2e8a9 2040S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2041{
97aff369 2042 dVAR;
463ee0b2
LW
2043 register char *s;
2044 STRLEN len;
4e553d73 2045
7918f24d
NC
2046 PERL_ARGS_ASSERT_FORCE_WORD;
2047
29595ff2 2048 start = SKIPSPACE1(start);
463ee0b2 2049 s = start;
7e2040f0 2050 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2051 (allow_pack && *s == ':') ||
15f0808c 2052 (allow_initial_tick && *s == '\'') )
a0d0e21e 2053 {
3280af22 2054 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2055 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2056 return start;
cd81e915 2057 start_force(PL_curforce);
5db06880
NC
2058 if (PL_madskills)
2059 curmad('X', newSVpvn(start,s-start));
463ee0b2 2060 if (token == METHOD) {
29595ff2 2061 s = SKIPSPACE1(s);
463ee0b2 2062 if (*s == '(')
3280af22 2063 PL_expect = XTERM;
463ee0b2 2064 else {
3280af22 2065 PL_expect = XOPERATOR;
463ee0b2 2066 }
79072805 2067 }
e74e6b3d 2068 if (PL_madskills)
63575281 2069 curmad('g', newSVpvs( "forced" ));
9ded7720 2070 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2071 = (OP*)newSVOP(OP_CONST,0,
2072 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2073 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2074 force_next(token);
2075 }
2076 return s;
2077}
2078
ffb4593c
NT
2079/*
2080 * S_force_ident
9cbb5ea2 2081 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2082 * text only contains the "foo" portion. The first argument is a pointer
2083 * to the "foo", and the second argument is the type symbol to prefix.
2084 * Forces the next token to be a "WORD".
9cbb5ea2 2085 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2086 */
2087
76e3520e 2088STATIC void
bfed75c6 2089S_force_ident(pTHX_ register const char *s, int kind)
79072805 2090{
97aff369 2091 dVAR;
7918f24d
NC
2092
2093 PERL_ARGS_ASSERT_FORCE_IDENT;
2094
c35e046a 2095 if (*s) {
90e5519e 2096 const STRLEN len = strlen(s);
728847b1
BF
2097 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2098 UTF ? SVf_UTF8 : 0));
cd81e915 2099 start_force(PL_curforce);
9ded7720 2100 NEXTVAL_NEXTTOKE.opval = o;
79072805 2101 force_next(WORD);
748a9306 2102 if (kind) {
11343788 2103 o->op_private = OPpCONST_ENTERED;
55497cff 2104 /* XXX see note in pp_entereval() for why we forgo typo
2105 warnings if the symbol must be introduced in an eval.
2106 GSAR 96-10-12 */
90e5519e 2107 gv_fetchpvn_flags(s, len,
728847b1
BF
2108 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2109 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2110 kind == '$' ? SVt_PV :
2111 kind == '@' ? SVt_PVAV :
2112 kind == '%' ? SVt_PVHV :
a0d0e21e 2113 SVt_PVGV
90e5519e 2114 );
748a9306 2115 }
79072805
LW
2116 }
2117}
2118
1571675a
GS
2119NV
2120Perl_str_to_version(pTHX_ SV *sv)
2121{
2122 NV retval = 0.0;
2123 NV nshift = 1.0;
2124 STRLEN len;
cfd0369c 2125 const char *start = SvPV_const(sv,len);
9d4ba2ae 2126 const char * const end = start + len;
504618e9 2127 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2128
2129 PERL_ARGS_ASSERT_STR_TO_VERSION;
2130
1571675a 2131 while (start < end) {
ba210ebe 2132 STRLEN skip;
1571675a
GS
2133 UV n;
2134 if (utf)
9041c2e3 2135 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2136 else {
2137 n = *(U8*)start;
2138 skip = 1;
2139 }
2140 retval += ((NV)n)/nshift;
2141 start += skip;
2142 nshift *= 1000;
2143 }
2144 return retval;
2145}
2146
4e553d73 2147/*
ffb4593c
NT
2148 * S_force_version
2149 * Forces the next token to be a version number.
e759cc13
RGS
2150 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2151 * and if "guessing" is TRUE, then no new token is created (and the caller
2152 * must use an alternative parsing method).
ffb4593c
NT
2153 */
2154
76e3520e 2155STATIC char *
e759cc13 2156S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2157{
97aff369 2158 dVAR;
5f66b61c 2159 OP *version = NULL;
44dcb63b 2160 char *d;
5db06880
NC
2161#ifdef PERL_MAD
2162 I32 startoff = s - SvPVX(PL_linestr);
2163#endif
89bfa8cd 2164
7918f24d
NC
2165 PERL_ARGS_ASSERT_FORCE_VERSION;
2166
29595ff2 2167 s = SKIPSPACE1(s);
89bfa8cd 2168
44dcb63b 2169 d = s;
dd629d5b 2170 if (*d == 'v')
44dcb63b 2171 d++;
44dcb63b 2172 if (isDIGIT(*d)) {
e759cc13
RGS
2173 while (isDIGIT(*d) || *d == '_' || *d == '.')
2174 d++;
5db06880
NC
2175#ifdef PERL_MAD
2176 if (PL_madskills) {
cd81e915 2177 start_force(PL_curforce);
5db06880
NC
2178 curmad('X', newSVpvn(s,d-s));
2179 }
2180#endif
4e4da3ac 2181 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2182 SV *ver;
8d08d9ba
DG
2183#ifdef USE_LOCALE_NUMERIC
2184 char *loc = setlocale(LC_NUMERIC, "C");
2185#endif
6154021b 2186 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2187#ifdef USE_LOCALE_NUMERIC
2188 setlocale(LC_NUMERIC, loc);
2189#endif
6154021b 2190 version = pl_yylval.opval;
dd629d5b
GS
2191 ver = cSVOPx(version)->op_sv;
2192 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2193 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2194 SvNV_set(ver, str_to_version(ver));
1571675a 2195 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2196 }
89bfa8cd 2197 }
5db06880
NC
2198 else if (guessing) {
2199#ifdef PERL_MAD
2200 if (PL_madskills) {
cd81e915
NC
2201 sv_free(PL_nextwhite); /* let next token collect whitespace */
2202 PL_nextwhite = 0;
5db06880
NC
2203 s = SvPVX(PL_linestr) + startoff;
2204 }
2205#endif
e759cc13 2206 return s;
5db06880 2207 }
89bfa8cd 2208 }
2209
5db06880
NC
2210#ifdef PERL_MAD
2211 if (PL_madskills && !version) {
cd81e915
NC
2212 sv_free(PL_nextwhite); /* let next token collect whitespace */
2213 PL_nextwhite = 0;
5db06880
NC
2214 s = SvPVX(PL_linestr) + startoff;
2215 }
2216#endif
89bfa8cd 2217 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2218 start_force(PL_curforce);
9ded7720 2219 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2220 force_next(WORD);
89bfa8cd 2221
e759cc13 2222 return s;
89bfa8cd 2223}
2224
ffb4593c 2225/*
91152fc1
DG
2226 * S_force_strict_version
2227 * Forces the next token to be a version number using strict syntax rules.
2228 */
2229
2230STATIC char *
2231S_force_strict_version(pTHX_ char *s)
2232{
2233 dVAR;
2234 OP *version = NULL;
2235#ifdef PERL_MAD
2236 I32 startoff = s - SvPVX(PL_linestr);
2237#endif
2238 const char *errstr = NULL;
2239
2240 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2241
2242 while (isSPACE(*s)) /* leading whitespace */
2243 s++;
2244
2245 if (is_STRICT_VERSION(s,&errstr)) {
2246 SV *ver = newSV(0);
2247 s = (char *)scan_version(s, ver, 0);
2248 version = newSVOP(OP_CONST, 0, ver);
2249 }
4e4da3ac
Z
2250 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2251 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2252 {
91152fc1
DG
2253 PL_bufptr = s;
2254 if (errstr)
2255 yyerror(errstr); /* version required */
2256 return s;
2257 }
2258
2259#ifdef PERL_MAD
2260 if (PL_madskills && !version) {
2261 sv_free(PL_nextwhite); /* let next token collect whitespace */
2262 PL_nextwhite = 0;
2263 s = SvPVX(PL_linestr) + startoff;
2264 }
2265#endif
2266 /* NOTE: The parser sees the package name and the VERSION swapped */
2267 start_force(PL_curforce);
2268 NEXTVAL_NEXTTOKE.opval = version;
2269 force_next(WORD);
2270
2271 return s;
2272}
2273
2274/*
ffb4593c
NT
2275 * S_tokeq
2276 * Tokenize a quoted string passed in as an SV. It finds the next
2277 * chunk, up to end of string or a backslash. It may make a new
2278 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2279 * turns \\ into \.
2280 */
2281
76e3520e 2282STATIC SV *
cea2e8a9 2283S_tokeq(pTHX_ SV *sv)
79072805 2284{
97aff369 2285 dVAR;
79072805
LW
2286 register char *s;
2287 register char *send;
2288 register char *d;
b3ac6de7
IZ
2289 STRLEN len = 0;
2290 SV *pv = sv;
79072805 2291
7918f24d
NC
2292 PERL_ARGS_ASSERT_TOKEQ;
2293
79072805 2294 if (!SvLEN(sv))
b3ac6de7 2295 goto finish;
79072805 2296
a0d0e21e 2297 s = SvPV_force(sv, len);
21a311ee 2298 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2299 goto finish;
463ee0b2 2300 send = s + len;
dcb21ed6
NC
2301 /* This is relying on the SV being "well formed" with a trailing '\0' */
2302 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2303 s++;
2304 if (s == send)
b3ac6de7 2305 goto finish;
79072805 2306 d = s;
be4731d2 2307 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2308 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2309 }
79072805
LW
2310 while (s < send) {
2311 if (*s == '\\') {
a0d0e21e 2312 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2313 s++; /* all that, just for this */
2314 }
2315 *d++ = *s++;
2316 }
2317 *d = '\0';
95a20fc0 2318 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2319 finish:
3280af22 2320 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2321 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2322 return sv;
2323}
2324
ffb4593c
NT
2325/*
2326 * Now come three functions related to double-quote context,
2327 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2328 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2329 * interact with PL_lex_state, and create fake ( ... ) argument lists
2330 * to handle functions and concatenation.
2331 * They assume that whoever calls them will be setting up a fake
2332 * join call, because each subthing puts a ',' after it. This lets
2333 * "lower \luPpEr"
2334 * become
2335 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2336 *
2337 * (I'm not sure whether the spurious commas at the end of lcfirst's
2338 * arguments and join's arguments are created or not).
2339 */
2340
2341/*
2342 * S_sublex_start
6154021b 2343 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2344 *
2345 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2346 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2347 *
2348 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2349 *
2350 * Everything else becomes a FUNC.
2351 *
2352 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2353 * had an OP_CONST or OP_READLINE). This just sets us up for a
2354 * call to S_sublex_push().
2355 */
2356
76e3520e 2357STATIC I32
cea2e8a9 2358S_sublex_start(pTHX)
79072805 2359{
97aff369 2360 dVAR;
6154021b 2361 register const I32 op_type = pl_yylval.ival;
79072805
LW
2362
2363 if (op_type == OP_NULL) {
6154021b 2364 pl_yylval.opval = PL_lex_op;
5f66b61c 2365 PL_lex_op = NULL;
79072805
LW
2366 return THING;
2367 }
2368 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2369 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2370
2371 if (SvTYPE(sv) == SVt_PVIV) {
2372 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2373 STRLEN len;
96a5add6 2374 const char * const p = SvPV_const(sv, len);
740cce10 2375 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2376 SvREFCNT_dec(sv);
2377 sv = nsv;
4e553d73 2378 }
6154021b 2379 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2380 PL_lex_stuff = NULL;
6f33ba73
RGS
2381 /* Allow <FH> // "foo" */
2382 if (op_type == OP_READLINE)
2383 PL_expect = XTERMORDORDOR;
79072805
LW
2384 return THING;
2385 }
e3f73d4e
RGS
2386 else if (op_type == OP_BACKTICK && PL_lex_op) {
2387 /* readpipe() vas overriden */
2388 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2389 pl_yylval.opval = PL_lex_op;
9b201d7d 2390 PL_lex_op = NULL;
e3f73d4e
RGS
2391 PL_lex_stuff = NULL;
2392 return THING;
2393 }
79072805 2394
3280af22 2395 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2396 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2397 PL_sublex_info.sub_op = PL_lex_op;
2398 PL_lex_state = LEX_INTERPPUSH;
55497cff 2399
3280af22
NIS
2400 PL_expect = XTERM;
2401 if (PL_lex_op) {
6154021b 2402 pl_yylval.opval = PL_lex_op;
5f66b61c 2403 PL_lex_op = NULL;
55497cff 2404 return PMFUNC;
2405 }
2406 else
2407 return FUNC;
2408}
2409
ffb4593c
NT
2410/*
2411 * S_sublex_push
2412 * Create a new scope to save the lexing state. The scope will be
2413 * ended in S_sublex_done. Returns a '(', starting the function arguments
2414 * to the uc, lc, etc. found before.
2415 * Sets PL_lex_state to LEX_INTERPCONCAT.
2416 */
2417
76e3520e 2418STATIC I32
cea2e8a9 2419S_sublex_push(pTHX)
55497cff 2420{
27da23d5 2421 dVAR;
f46d017c 2422 ENTER;
55497cff 2423
3280af22 2424 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2425 SAVEBOOL(PL_lex_dojoin);
3280af22 2426 SAVEI32(PL_lex_brackets);
78cdf107
Z
2427 SAVEI32(PL_lex_allbrackets);
2428 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2429 SAVEI32(PL_lex_casemods);
2430 SAVEI32(PL_lex_starts);
651b5b28 2431 SAVEI8(PL_lex_state);
7766f137 2432 SAVEVPTR(PL_lex_inpat);
98246f1e 2433 SAVEI16(PL_lex_inwhat);
57843af0 2434 SAVECOPLINE(PL_curcop);
3280af22 2435 SAVEPPTR(PL_bufptr);
8452ff4b 2436 SAVEPPTR(PL_bufend);
3280af22
NIS
2437 SAVEPPTR(PL_oldbufptr);
2438 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2439 SAVEPPTR(PL_last_lop);
2440 SAVEPPTR(PL_last_uni);
3280af22
NIS
2441 SAVEPPTR(PL_linestart);
2442 SAVESPTR(PL_linestr);
8edd5f42
RGS
2443 SAVEGENERICPV(PL_lex_brackstack);
2444 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2445
2446 PL_linestr = PL_lex_stuff;
a0714e2c 2447 PL_lex_stuff = NULL;
3280af22 2448
9cbb5ea2
GS
2449 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2450 = SvPVX(PL_linestr);
3280af22 2451 PL_bufend += SvCUR(PL_linestr);
bd61b366 2452 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2453 SAVEFREESV(PL_linestr);
2454
2455 PL_lex_dojoin = FALSE;
2456 PL_lex_brackets = 0;
78cdf107
Z
2457 PL_lex_allbrackets = 0;
2458 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2459 Newx(PL_lex_brackstack, 120, char);
2460 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2461 PL_lex_casemods = 0;
2462 *PL_lex_casestack = '\0';
2463 PL_lex_starts = 0;
2464 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2465 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2466
2467 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2468 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2469 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2470 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2471 else
5f66b61c 2472 PL_lex_inpat = NULL;
79072805 2473
55497cff 2474 return '(';
79072805
LW
2475}
2476
ffb4593c
NT
2477/*
2478 * S_sublex_done
2479 * Restores lexer state after a S_sublex_push.
2480 */
2481
76e3520e 2482STATIC I32
cea2e8a9 2483S_sublex_done(pTHX)
79072805 2484{
27da23d5 2485 dVAR;
3280af22 2486 if (!PL_lex_starts++) {
396482e1 2487 SV * const sv = newSVpvs("");
9aa983d2
JH
2488 if (SvUTF8(PL_linestr))
2489 SvUTF8_on(sv);
3280af22 2490 PL_expect = XOPERATOR;
6154021b 2491 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2492 return THING;
2493 }
2494
3280af22
NIS
2495 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2496 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2497 return yylex();
79072805
LW
2498 }
2499
ffb4593c 2500 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2501 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2502 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2503 PL_linestr = PL_lex_repl;
2504 PL_lex_inpat = 0;
2505 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2506 PL_bufend += SvCUR(PL_linestr);
bd61b366 2507 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2508 SAVEFREESV(PL_linestr);
2509 PL_lex_dojoin = FALSE;
2510 PL_lex_brackets = 0;
78cdf107
Z
2511 PL_lex_allbrackets = 0;
2512 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2513 PL_lex_casemods = 0;
2514 *PL_lex_casestack = '\0';
2515 PL_lex_starts = 0;
25da4f38 2516 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2517 PL_lex_state = LEX_INTERPNORMAL;
2518 PL_lex_starts++;
e9fa98b2
HS
2519 /* we don't clear PL_lex_repl here, so that we can check later
2520 whether this is an evalled subst; that means we rely on the
2521 logic to ensure sublex_done() is called again only via the
2522 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2523 }
e9fa98b2 2524 else {
3280af22 2525 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2526 PL_lex_repl = NULL;
e9fa98b2 2527 }
79072805 2528 return ',';
ffed7fef
LW
2529 }
2530 else {
5db06880
NC
2531#ifdef PERL_MAD
2532 if (PL_madskills) {
cd81e915
NC
2533 if (PL_thiswhite) {
2534 if (!PL_endwhite)
6b29d1f5 2535 PL_endwhite = newSVpvs("");
cd81e915
NC
2536 sv_catsv(PL_endwhite, PL_thiswhite);
2537 PL_thiswhite = 0;
2538 }
2539 if (PL_thistoken)
76f68e9b 2540 sv_setpvs(PL_thistoken,"");
5db06880 2541 else
cd81e915 2542 PL_realtokenstart = -1;
5db06880
NC
2543 }
2544#endif
f46d017c 2545 LEAVE;
3280af22
NIS
2546 PL_bufend = SvPVX(PL_linestr);
2547 PL_bufend += SvCUR(PL_linestr);
2548 PL_expect = XOPERATOR;
09bef843 2549 PL_sublex_info.sub_inwhat = 0;
79072805 2550 return ')';
ffed7fef
LW
2551 }
2552}
2553
02aa26ce
NT
2554/*
2555 scan_const
2556
2557 Extracts a pattern, double-quoted string, or transliteration. This
2558 is terrifying code.
2559
94def140 2560 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2561 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2562 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2563
94def140
TS
2564 Returns a pointer to the character scanned up to. If this is
2565 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2566 successfully parsed), will leave an OP for the substring scanned
6154021b 2567 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2568 by looking at the next characters herself.
2569
02aa26ce
NT
2570 In patterns:
2571 backslashes:
ff3f963a 2572 constants: \N{NAME} only
02aa26ce
NT
2573 case and quoting: \U \Q \E
2574 stops on @ and $, but not for $ as tail anchor
2575
2576 In transliterations:
2577 characters are VERY literal, except for - not at the start or end
94def140
TS
2578 of the string, which indicates a range. If the range is in bytes,
2579 scan_const expands the range to the full set of intermediate
2580 characters. If the range is in utf8, the hyphen is replaced with
2581 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2582
2583 In double-quoted strings:
2584 backslashes:
2585 double-quoted style: \r and \n
ff3f963a 2586 constants: \x31, etc.
94def140 2587 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2588 case and quoting: \U \Q \E
2589 stops on @ and $
2590
2591 scan_const does *not* construct ops to handle interpolated strings.
2592 It stops processing as soon as it finds an embedded $ or @ variable
2593 and leaves it to the caller to work out what's going on.
2594
94def140
TS
2595 embedded arrays (whether in pattern or not) could be:
2596 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2597
2598 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2599
2600 $ in pattern could be $foo or could be tail anchor. Assumption:
2601 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2602 followed by one of "()| \r\n\t"
02aa26ce
NT
2603
2604 \1 (backreferences) are turned into $1
2605
2606 The structure of the code is
2607 while (there's a character to process) {
94def140
TS
2608 handle transliteration ranges
2609 skip regexp comments /(?#comment)/ and codes /(?{code})/
2610 skip #-initiated comments in //x patterns
2611 check for embedded arrays
02aa26ce
NT
2612 check for embedded scalars
2613 if (backslash) {
94def140 2614 deprecate \1 in substitution replacements
02aa26ce
NT
2615 handle string-changing backslashes \l \U \Q \E, etc.
2616 switch (what was escaped) {
94def140 2617 handle \- in a transliteration (becomes a literal -)
ff3f963a 2618 if a pattern and not \N{, go treat as regular character
94def140
TS
2619 handle \132 (octal characters)
2620 handle \x15 and \x{1234} (hex characters)
ff3f963a 2621 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2622 handle \cV (control characters)
2623 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2624 } (end switch)
77a135fe 2625 continue
02aa26ce 2626 } (end if backslash)
77a135fe 2627 handle regular character
02aa26ce 2628 } (end while character to read)
4e553d73 2629
02aa26ce
NT
2630*/
2631
76e3520e 2632STATIC char *
cea2e8a9 2633S_scan_const(pTHX_ char *start)
79072805 2634{
97aff369 2635 dVAR;
3280af22 2636 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2637 SV *sv = newSV(send - start); /* sv for the constant. See
2638 note below on sizing. */
02aa26ce
NT
2639 register char *s = start; /* start of the constant */
2640 register char *d = SvPVX(sv); /* destination for copies */
2641 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2642 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2643 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2644 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2645 to be UTF8? But, this can
2646 show as true when the source
2647 isn't utf8, as for example
2648 when it is entirely composed
2649 of hex constants */
2650
2651 /* Note on sizing: The scanned constant is placed into sv, which is
2652 * initialized by newSV() assuming one byte of output for every byte of
2653 * input. This routine expects newSV() to allocate an extra byte for a
2654 * trailing NUL, which this routine will append if it gets to the end of
2655 * the input. There may be more bytes of input than output (eg., \N{LATIN
2656 * CAPITAL LETTER A}), or more output than input if the constant ends up
2657 * recoded to utf8, but each time a construct is found that might increase
2658 * the needed size, SvGROW() is called. Its size parameter each time is
2659 * based on the best guess estimate at the time, namely the length used so
2660 * far, plus the length the current construct will occupy, plus room for
2661 * the trailing NUL, plus one byte for every input byte still unscanned */
2662
012bcf8d 2663 UV uv;
4c3a8340
TS
2664#ifdef EBCDIC
2665 UV literal_endpoint = 0;
e294cc5d 2666 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2667#endif
012bcf8d 2668
7918f24d
NC
2669 PERL_ARGS_ASSERT_SCAN_CONST;
2670
bb16bae8 2671 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2672 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2673 /* If we are doing a trans and we know we want UTF8 set expectation */
2674 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2675 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2676 }
2677
2678
79072805 2679 while (s < send || dorange) {
ff3f963a 2680
02aa26ce 2681 /* get transliterations out of the way (they're most literal) */
3280af22 2682 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2683 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2684 if (dorange) {
1ba5c669
JH
2685 I32 i; /* current expanded character */
2686 I32 min; /* first character in range */
2687 I32 max; /* last character in range */
02aa26ce 2688
e294cc5d
JH
2689#ifdef EBCDIC
2690 UV uvmax = 0;
2691#endif
2692
2693 if (has_utf8
2694#ifdef EBCDIC
2695 && !native_range
2696#endif
2697 ) {
9d4ba2ae 2698 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2699 char *e = d++;
2700 while (e-- > c)
2701 *(e + 1) = *e;
25716404 2702 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2703 /* mark the range as done, and continue */
2704 dorange = FALSE;
2705 didrange = TRUE;
2706 continue;
2707 }
2b9d42f0 2708
95a20fc0 2709 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2710#ifdef EBCDIC
2711 SvGROW(sv,
2712 SvLEN(sv) + (has_utf8 ?
2713 (512 - UTF_CONTINUATION_MARK +
2714 UNISKIP(0x100))
2715 : 256));
2716 /* How many two-byte within 0..255: 128 in UTF-8,
2717 * 96 in UTF-8-mod. */
2718#else
9cbb5ea2 2719 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2720#endif
9cbb5ea2 2721 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2722#ifdef EBCDIC
2723 if (has_utf8) {
2724 int j;
2725 for (j = 0; j <= 1; j++) {
2726 char * const c = (char*)utf8_hop((U8*)d, -1);
2727 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2728 if (j)
2729 min = (U8)uv;
2730 else if (uv < 256)
2731 max = (U8)uv;
2732 else {
2733 max = (U8)0xff; /* only to \xff */
2734 uvmax = uv; /* \x{100} to uvmax */
2735 }
2736 d = c; /* eat endpoint chars */
2737 }
2738 }
2739 else {
2740#endif
2741 d -= 2; /* eat the first char and the - */
2742 min = (U8)*d; /* first char in range */
2743 max = (U8)d[1]; /* last char in range */
2744#ifdef EBCDIC
2745 }
2746#endif
8ada0baa 2747
c2e66d9e 2748 if (min > max) {
01ec43d0 2749 Perl_croak(aTHX_
d1573ac7 2750 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2751 (char)min, (char)max);
c2e66d9e
GS
2752 }
2753
c7f1f016 2754#ifdef EBCDIC
4c3a8340
TS
2755 if (literal_endpoint == 2 &&
2756 ((isLOWER(min) && isLOWER(max)) ||
2757 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2758 if (isLOWER(min)) {
2759 for (i = min; i <= max; i++)
2760 if (isLOWER(i))
db42d148 2761 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2762 } else {
2763 for (i = min; i <= max; i++)
2764 if (isUPPER(i))
db42d148 2765 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2766 }
2767 }
2768 else
2769#endif
2770 for (i = min; i <= max; i++)
e294cc5d
JH
2771#ifdef EBCDIC
2772 if (has_utf8) {
2773 const U8 ch = (U8)NATIVE_TO_UTF(i);
2774 if (UNI_IS_INVARIANT(ch))
2775 *d++ = (U8)i;
2776 else {
2777 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2778 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2779 }
2780 }
2781 else
2782#endif
2783 *d++ = (char)i;
2784
2785#ifdef EBCDIC
2786 if (uvmax) {
2787 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2788 if (uvmax > 0x101)
2789 *d++ = (char)UTF_TO_NATIVE(0xff);
2790 if (uvmax > 0x100)
2791 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2792 }
2793#endif
02aa26ce
NT
2794
2795 /* mark the range as done, and continue */
79072805 2796 dorange = FALSE;
01ec43d0 2797 didrange = TRUE;
4c3a8340
TS
2798#ifdef EBCDIC
2799 literal_endpoint = 0;
2800#endif
79072805 2801 continue;
4e553d73 2802 }
02aa26ce
NT
2803
2804 /* range begins (ignore - as first or last char) */
79072805 2805 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2806 if (didrange) {
1fafa243 2807 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2808 }
e294cc5d
JH
2809 if (has_utf8
2810#ifdef EBCDIC
2811 && !native_range
2812#endif
2813 ) {
25716404 2814 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2815 s++;
2816 continue;
2817 }
79072805
LW
2818 dorange = TRUE;
2819 s++;
01ec43d0
GS
2820 }
2821 else {
2822 didrange = FALSE;
4c3a8340
TS
2823#ifdef EBCDIC
2824 literal_endpoint = 0;
e294cc5d 2825 native_range = TRUE;
4c3a8340 2826#endif
01ec43d0 2827 }
79072805 2828 }
02aa26ce
NT
2829
2830 /* if we get here, we're not doing a transliteration */
2831
0f5d15d6
IZ
2832 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2833 except for the last char, which will be done separately. */
3280af22 2834 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2835 if (s[2] == '#') {
e994fd66 2836 while (s+1 < send && *s != ')')
db42d148 2837 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2838 }
2839 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2840 || (s[2] == '?' && s[3] == '{'))
155aba94 2841 {
cc6b7395 2842 I32 count = 1;
0f5d15d6 2843 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2844 char c;
2845
d9f97599
GS
2846 while (count && (c = *regparse)) {
2847 if (c == '\\' && regparse[1])
2848 regparse++;
4e553d73 2849 else if (c == '{')
cc6b7395 2850 count++;
4e553d73 2851 else if (c == '}')
cc6b7395 2852 count--;
d9f97599 2853 regparse++;
cc6b7395 2854 }
e994fd66 2855 if (*regparse != ')')
5bdf89e7 2856 regparse--; /* Leave one char for continuation. */
0f5d15d6 2857 while (s < regparse)
db42d148 2858 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2859 }
748a9306 2860 }
02aa26ce
NT
2861
2862 /* likewise skip #-initiated comments in //x patterns */
3280af22 2863 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2864 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2865 while (s+1 < send && *s != '\n')
db42d148 2866 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2867 }
02aa26ce 2868
5d1d4326 2869 /* check for embedded arrays
da6eedaa 2870 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2871 */
1749ea0d
TS
2872 else if (*s == '@' && s[1]) {
2873 if (isALNUM_lazy_if(s+1,UTF))
2874 break;
2875 if (strchr(":'{$", s[1]))
2876 break;
2877 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2878 break; /* in regexp, neither @+ nor @- are interpolated */
2879 }
02aa26ce
NT
2880
2881 /* check for embedded scalars. only stop if we're sure it's a
2882 variable.
2883 */
79072805 2884 else if (*s == '$') {
3280af22 2885 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2886 break;
77772344 2887 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2888 if (s[1] == '\\') {
2889 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2890 "Possible unintended interpolation of $\\ in regex");
77772344 2891 }
79072805 2892 break; /* in regexp, $ might be tail anchor */
77772344 2893 }
79072805 2894 }
02aa26ce 2895
2b9d42f0
NIS
2896 /* End of else if chain - OP_TRANS rejoin rest */
2897
02aa26ce 2898 /* backslashes */
79072805 2899 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2900 char* e; /* Can be used for ending '}', etc. */
2901
79072805 2902 s++;
02aa26ce 2903
7d0fc23c
KW
2904 /* warn on \1 - \9 in substitution replacements, but note that \11
2905 * is an octal; and \19 is \1 followed by '9' */
3280af22 2906 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2907 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2908 {
a2a5de95 2909 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2910 *--s = '$';
2911 break;
2912 }
02aa26ce
NT
2913
2914 /* string-change backslash escapes */
3280af22 2915 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2916 --s;
2917 break;
2918 }
ff3f963a
KW
2919 /* In a pattern, process \N, but skip any other backslash escapes.
2920 * This is because we don't want to translate an escape sequence
2921 * into a meta symbol and have the regex compiler use the meta
2922 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2923 * in spite of this, we do have to process \N here while the proper
2924 * charnames handler is in scope. See bugs #56444 and #62056.
2925 * There is a complication because \N in a pattern may also stand
2926 * for 'match a non-nl', and not mean a charname, in which case its
2927 * processing should be deferred to the regex compiler. To be a
2928 * charname it must be followed immediately by a '{', and not look
2929 * like \N followed by a curly quantifier, i.e., not something like
2930 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2931 * quantifier */
2932 else if (PL_lex_inpat
2933 && (*s != 'N'
2934 || s[1] != '{'
2935 || regcurly(s + 1)))
2936 {
cc74c5bd
TS
2937 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2938 goto default_action;
2939 }
02aa26ce 2940
79072805 2941 switch (*s) {
02aa26ce
NT
2942
2943 /* quoted - in transliterations */
79072805 2944 case '-':
3280af22 2945 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2946 *d++ = *s++;
2947 continue;
2948 }
2949 /* FALL THROUGH */
2950 default:
11b8faa4 2951 {
a2a5de95
NC
2952 if ((isALPHA(*s) || isDIGIT(*s)))
2953 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2954 "Unrecognized escape \\%c passed through",
2955 *s);
11b8faa4 2956 /* default action is to copy the quoted character */
f9a63242 2957 goto default_action;
11b8faa4 2958 }
02aa26ce 2959
632403cc 2960 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2961 case '0': case '1': case '2': case '3':
2962 case '4': case '5': case '6': case '7':
ba210ebe 2963 {
53305cf1
NC
2964 I32 flags = 0;
2965 STRLEN len = 3;
77a135fe 2966 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2967 s += len;
2968 }
012bcf8d 2969 goto NUM_ESCAPE_INSERT;
02aa26ce 2970
f0a2b745
KW
2971 /* eg. \o{24} indicates the octal constant \024 */
2972 case 'o':
2973 {
2974 STRLEN len;
454155d9 2975 const char* error;
f0a2b745 2976
454155d9 2977 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2978 s += len;
454155d9 2979 if (! valid) {
f0a2b745
KW
2980 yyerror(error);
2981 continue;
2982 }
2983 goto NUM_ESCAPE_INSERT;
2984 }
2985
77a135fe 2986 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2987 case 'x':
a0ed51b3
LW
2988 ++s;
2989 if (*s == '{') {
9d4ba2ae 2990 char* const e = strchr(s, '}');
a4c04bdc
NC
2991 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2992 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2993 STRLEN len;
355860ce 2994
53305cf1 2995 ++s;
adaeee49 2996 if (!e) {
a0ed51b3 2997 yyerror("Missing right brace on \\x{}");
355860ce 2998 continue;
ba210ebe 2999 }
53305cf1 3000 len = e - s;
77a135fe 3001 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 3002 s = e + 1;
a0ed51b3
LW
3003 }
3004 else {
ba210ebe 3005 {
53305cf1 3006 STRLEN len = 2;
a4c04bdc 3007 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3008 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3009 s += len;
3010 }
012bcf8d
GS
3011 }
3012
3013 NUM_ESCAPE_INSERT:
ff3f963a
KW
3014 /* Insert oct or hex escaped character. There will always be
3015 * enough room in sv since such escapes will be longer than any
3016 * UTF-8 sequence they can end up as, except if they force us
3017 * to recode the rest of the string into utf8 */
ba7cea30 3018
77a135fe 3019 /* Here uv is the ordinal of the next character being added in
ff3f963a 3020 * unicode (converted from native). */
77a135fe 3021 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3022 if (!has_utf8 && uv > 255) {
77a135fe
KW
3023 /* Might need to recode whatever we have accumulated so
3024 * far if it contains any chars variant in utf8 or
3025 * utf-ebcdic. */
3026
3027 SvCUR_set(sv, d - SvPVX_const(sv));
3028 SvPOK_on(sv);
3029 *d = '\0';
77a135fe 3030 /* See Note on sizing above. */
7bf79863
KW
3031 sv_utf8_upgrade_flags_grow(sv,
3032 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3033 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3034 d = SvPVX(sv) + SvCUR(sv);
3035 has_utf8 = TRUE;
012bcf8d
GS
3036 }
3037
77a135fe
KW
3038 if (has_utf8) {
3039 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3040 if (PL_lex_inwhat == OP_TRANS &&
3041 PL_sublex_info.sub_op) {
3042 PL_sublex_info.sub_op->op_private |=
3043 (PL_lex_repl ? OPpTRANS_FROM_UTF
3044 : OPpTRANS_TO_UTF);
f9a63242 3045 }
e294cc5d
JH
3046#ifdef EBCDIC
3047 if (uv > 255 && !dorange)
3048 native_range = FALSE;
3049#endif
012bcf8d 3050 }
a0ed51b3 3051 else {
012bcf8d 3052 *d++ = (char)uv;
a0ed51b3 3053 }
012bcf8d
GS
3054 }
3055 else {
c4d5f83a 3056 *d++ = (char) uv;
a0ed51b3 3057 }
79072805 3058 continue;
02aa26ce 3059
4a2d328f 3060 case 'N':
ff3f963a
KW
3061 /* In a non-pattern \N must be a named character, like \N{LATIN
3062 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3063 * mean to match a non-newline. For non-patterns, named
3064 * characters are converted to their string equivalents. In
3065 * patterns, named characters are not converted to their
3066 * ultimate forms for the same reasons that other escapes
3067 * aren't. Instead, they are converted to the \N{U+...} form
3068 * to get the value from the charnames that is in effect right
3069 * now, while preserving the fact that it was a named character
3070 * so that the regex compiler knows this */
3071
3072 /* This section of code doesn't generally use the
3073 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3074 * a close examination of this macro and determined it is a
3075 * no-op except on utfebcdic variant characters. Every
3076 * character generated by this that would normally need to be
3077 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3078 * needed, and would complicate use of copy(). XXX There are
3079 * other parts of this file where the macro is used
3080 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3081
3082 /* The structure of this section of code (besides checking for
3083 * errors and upgrading to utf8) is:
3084 * Further disambiguate between the two meanings of \N, and if
3085 * not a charname, go process it elsewhere
0a96133f
KW
3086 * If of form \N{U+...}, pass it through if a pattern;
3087 * otherwise convert to utf8
3088 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3089 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3090
3091 /* Here, s points to the 'N'; the test below is guaranteed to
3092 * succeed if we are being called on a pattern as we already
3093 * know from a test above that the next character is a '{'.
3094 * On a non-pattern \N must mean 'named sequence, which
3095 * requires braces */
3096 s++;
3097 if (*s != '{') {
3098 yyerror("Missing braces on \\N{}");
3099 continue;
3100 }
3101 s++;
3102
0a96133f 3103 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3104 if (! (e = strchr(s, '}'))) {
3105 if (! PL_lex_inpat) {
5777a3f7 3106 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3107 } else {
3108 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3109 }
0a96133f 3110 continue;
ff3f963a 3111 }
cddc7ef4 3112
ff3f963a 3113 /* Here it looks like a named character */
cddc7ef4 3114
ff3f963a
KW
3115 if (PL_lex_inpat) {
3116
3117 /* XXX This block is temporary code. \N{} implies that the
3118 * pattern is to have Unicode semantics, and therefore
3119 * currently has to be encoded in utf8. By putting it in
3120 * utf8 now, we save a whole pass in the regular expression
3121 * compiler. Once that code is changed so Unicode
3122 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3123 * block should be removed. However, the code that parses
3124 * the output of this would have to be changed to not
3125 * necessarily expect utf8 */
ff3f963a 3126 if (!has_utf8) {
77a135fe 3127 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3128 SvPOK_on(sv);
e4f3eed8 3129 *d = '\0';
77a135fe 3130 /* See Note on sizing above. */
7bf79863 3131 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3132 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3133 /* 5 = '\N{' + cur char + NUL */
3134 (STRLEN)(send - s) + 5);
f08d6ad9 3135 d = SvPVX(sv) + SvCUR(sv);
89491803 3136 has_utf8 = TRUE;
ff3f963a
KW
3137 }
3138 }
423cee85 3139
ff3f963a
KW
3140 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3141 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3142 | PERL_SCAN_DISALLOW_PREFIX;
3143 STRLEN len;
3144
3145 /* For \N{U+...}, the '...' is a unicode value even on
3146 * EBCDIC machines */
3147 s += 2; /* Skip to next char after the 'U+' */
3148 len = e - s;
3149 uv = grok_hex(s, &len, &flags, NULL);
3150 if (len == 0 || len != (STRLEN)(e - s)) {
3151 yyerror("Invalid hexadecimal number in \\N{U+...}");
3152 s = e + 1;
3153 continue;
3154 }
3155
3156 if (PL_lex_inpat) {
3157
e2a7e165
KW
3158 /* On non-EBCDIC platforms, pass through to the regex
3159 * compiler unchanged. The reason we evaluated the
3160 * number above is to make sure there wasn't a syntax
3161 * error. But on EBCDIC we convert to native so
3162 * downstream code can continue to assume it's native
3163 */
ff3f963a 3164 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3165#ifdef EBCDIC
3166 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3167 and the \0 */
3168 "\\N{U+%X}",
3169 (unsigned int) UNI_TO_NATIVE(uv));
3170#else
ff3f963a
KW
3171 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3172 d += e - s + 1;
e2a7e165 3173#endif
ff3f963a
KW
3174 }
3175 else { /* Not a pattern: convert the hex to string */
3176
3177 /* If destination is not in utf8, unconditionally
3178 * recode it to be so. This is because \N{} implies
3179 * Unicode semantics, and scalars have to be in utf8
3180 * to guarantee those semantics */
3181 if (! has_utf8) {
3182 SvCUR_set(sv, d - SvPVX_const(sv));
3183 SvPOK_on(sv);
3184 *d = '\0';
3185 /* See Note on sizing above. */
3186 sv_utf8_upgrade_flags_grow(
3187 sv,
3188 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3189 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3190 d = SvPVX(sv) + SvCUR(sv);
3191 has_utf8 = TRUE;
3192 }
3193
3194 /* Add the string to the output */
3195 if (UNI_IS_INVARIANT(uv)) {
3196 *d++ = (char) uv;
3197 }
3198 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3199 }
3200 }
3201 else { /* Here is \N{NAME} but not \N{U+...}. */
3202
3203 SV *res; /* result from charnames */
3204 const char *str; /* the string in 'res' */
3205 STRLEN len; /* its length */
3206
3207 /* Get the value for NAME */
3208 res = newSVpvn(s, e - s);
3209 res = new_constant( NULL, 0, "charnames",
3210 /* includes all of: \N{...} */
3211 res, NULL, s - 3, e - s + 4 );
3212
3213 /* Most likely res will be in utf8 already since the
3214 * standard charnames uses pack U, but a custom translator
3215 * can leave it otherwise, so make sure. XXX This can be
3216 * revisited to not have charnames use utf8 for characters
3217 * that don't need it when regexes don't have to be in utf8
3218 * for Unicode semantics. If doing so, remember EBCDIC */
3219 sv_utf8_upgrade(res);
3220 str = SvPV_const(res, len);
3221
3222 /* Don't accept malformed input */
3223 if (! is_utf8_string((U8 *) str, len)) {
3224 yyerror("Malformed UTF-8 returned by \\N");
3225 }
3226 else if (PL_lex_inpat) {
3227
3228 if (! len) { /* The name resolved to an empty string */
3229 Copy("\\N{}", d, 4, char);
3230 d += 4;
3231 }
3232 else {
3233 /* In order to not lose information for the regex
3234 * compiler, pass the result in the specially made
3235 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3236 * the code points in hex of each character
3237 * returned by charnames */
3238
3239 const char *str_end = str + len;
3240 STRLEN char_length; /* cur char's byte length */
3241 STRLEN output_length; /* and the number of bytes
3242 after this is translated
3243 into hex digits */
3244 const STRLEN off = d - SvPVX_const(sv);
3245
3246 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3247 * max('U+', '.'); and 1 for NUL */
3248 char hex_string[2 * UTF8_MAXBYTES + 5];
3249
3250 /* Get the first character of the result. */
3251 U32 uv = utf8n_to_uvuni((U8 *) str,
3252 len,
3253 &char_length,
3254 UTF8_ALLOW_ANYUV);
3255
3256 /* The call to is_utf8_string() above hopefully
3257 * guarantees that there won't be an error. But
3258 * it's easy here to make sure. The function just
3259 * above warns and returns 0 if invalid utf8, but
3260 * it can also return 0 if the input is validly a
3261 * NUL. Disambiguate */
3262 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3263 uv = UNICODE_REPLACEMENT;
3264 }
3265
3266 /* Convert first code point to hex, including the
e2a7e165
KW
3267 * boiler plate before it. For all these, we
3268 * convert to native format so that downstream code
3269 * can continue to assume the input is native */
78c35590 3270 output_length =
3353de27 3271 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3272 "\\N{U+%X",
3273 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3274
3275 /* Make sure there is enough space to hold it */
3276 d = off + SvGROW(sv, off
3277 + output_length
3278 + (STRLEN)(send - e)
3279 + 2); /* '}' + NUL */
3280 /* And output it */
3281 Copy(hex_string, d, output_length, char);
3282 d += output_length;
3283
3284 /* For each subsequent character, append dot and
3285 * its ordinal in hex */
3286 while ((str += char_length) < str_end) {
3287 const STRLEN off = d - SvPVX_const(sv);
3288 U32 uv = utf8n_to_uvuni((U8 *) str,
3289 str_end - str,
3290 &char_length,
3291 UTF8_ALLOW_ANYUV);
3292 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3293 uv = UNICODE_REPLACEMENT;
3294 }
3295
78c35590 3296 output_length =
3353de27 3297 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3298 ".%X",
3299 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3300
3301 d = off + SvGROW(sv, off
3302 + output_length
3303 + (STRLEN)(send - e)
3304 + 2); /* '}' + NUL */
3305 Copy(hex_string, d, output_length, char);
3306 d += output_length;
3307 }
3308
3309 *d++ = '}'; /* Done. Add the trailing brace */
3310 }
3311 }
3312 else { /* Here, not in a pattern. Convert the name to a
3313 * string. */
3314
3315 /* If destination is not in utf8, unconditionally
3316 * recode it to be so. This is because \N{} implies
3317 * Unicode semantics, and scalars have to be in utf8
3318 * to guarantee those semantics */
3319 if (! has_utf8) {
3320 SvCUR_set(sv, d - SvPVX_const(sv));
3321 SvPOK_on(sv);
3322 *d = '\0';
3323 /* See Note on sizing above. */
3324 sv_utf8_upgrade_flags_grow(sv,
3325 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3326 len + (STRLEN)(send - s) + 1);
3327 d = SvPVX(sv) + SvCUR(sv);
3328 has_utf8 = TRUE;
3329 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3330
3331 /* See Note on sizing above. (NOTE: SvCUR() is not
3332 * set correctly here). */
3333 const STRLEN off = d - SvPVX_const(sv);
3334 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3335 }
3336 Copy(str, d, len, char);
3337 d += len;
423cee85 3338 }
423cee85 3339 SvREFCNT_dec(res);
cb233ae3
KW
3340
3341 /* Deprecate non-approved name syntax */
3342 if (ckWARN_d(WARN_DEPRECATED)) {
3343 bool problematic = FALSE;
3344 char* i = s;
3345
3346 /* For non-ut8 input, look to see that the first
3347 * character is an alpha, then loop through the rest
3348 * checking that each is a continuation */
3349 if (! this_utf8) {
3350 if (! isALPHAU(*i)) problematic = TRUE;
3351 else for (i = s + 1; i < e; i++) {
3352 if (isCHARNAME_CONT(*i)) continue;
3353 problematic = TRUE;
3354 break;
3355 }
3356 }
3357 else {
3358 /* Similarly for utf8. For invariants can check
3359 * directly. We accept anything above the latin1
3360 * range because it is immaterial to Perl if it is
3361 * correct or not, and is expensive to check. But
3362 * it is fairly easy in the latin1 range to convert
3363 * the variants into a single character and check
3364 * those */
3365 if (UTF8_IS_INVARIANT(*i)) {
3366 if (! isALPHAU(*i)) problematic = TRUE;
3367 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3368 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3369 *(i+1)))))
3370 {
3371 problematic = TRUE;
3372 }
3373 }
3374 if (! problematic) for (i = s + UTF8SKIP(s);
3375 i < e;
3376 i+= UTF8SKIP(i))
3377 {
3378 if (UTF8_IS_INVARIANT(*i)) {
3379 if (isCHARNAME_CONT(*i)) continue;
3380 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3381 continue;
3382 } else if (isCHARNAME_CONT(
3383 UNI_TO_NATIVE(
81c14aa2 3384 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3385 {
3386 continue;
3387 }
3388 problematic = TRUE;
3389 break;
3390 }
3391 }
3392 if (problematic) {
6e1bad6c
KW
3393 /* The e-i passed to the final %.*s makes sure that
3394 * should the trailing NUL be missing that this
3395 * print won't run off the end of the string */
cb233ae3 3396 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3397 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3398 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3399 }
3400 }
3401 } /* End \N{NAME} */
ff3f963a
KW
3402#ifdef EBCDIC
3403 if (!dorange)
3404 native_range = FALSE; /* \N{} is defined to be Unicode */
3405#endif
3406 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3407 continue;
3408
02aa26ce 3409 /* \c is a control character */
79072805
LW
3410 case 'c':
3411 s++;
961ce445 3412 if (s < send) {
17a3df4c 3413 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3414 }
961ce445
RGS
3415 else {
3416 yyerror("Missing control char name in \\c");
3417 }
79072805 3418 continue;
02aa26ce
NT
3419
3420 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3421 case 'b':
db42d148 3422 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3423 break;
3424 case 'n':
db42d148 3425 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3426 break;
3427 case 'r':
db42d148 3428 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3429 break;
3430 case 'f':
db42d148 3431 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3432 break;
3433 case 't':
db42d148 3434 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3435 break;
34a3fe2a 3436 case 'e':
db42d148 3437 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3438 break;
3439 case 'a':
db42d148 3440 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3441 break;
02aa26ce
NT
3442 } /* end switch */
3443
79072805
LW
3444 s++;
3445 continue;
02aa26ce 3446 } /* end if (backslash) */
4c3a8340
TS
3447#ifdef EBCDIC
3448 else
3449 literal_endpoint++;
3450#endif
02aa26ce 3451
f9a63242 3452 default_action:
77a135fe
KW
3453 /* If we started with encoded form, or already know we want it,
3454 then encode the next character */
3455 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3456 STRLEN len = 1;
77a135fe
KW
3457
3458
3459 /* One might think that it is wasted effort in the case of the
3460 * source being utf8 (this_utf8 == TRUE) to take the next character
3461 * in the source, convert it to an unsigned value, and then convert
3462 * it back again. But the source has not been validated here. The
3463 * routine that does the conversion checks for errors like
3464 * malformed utf8 */
3465
5f66b61c
AL
3466 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3467 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3468 if (!has_utf8) {
3469 SvCUR_set(sv, d - SvPVX_const(sv));
3470 SvPOK_on(sv);
3471 *d = '\0';
77a135fe 3472 /* See Note on sizing above. */
7bf79863
KW
3473 sv_utf8_upgrade_flags_grow(sv,
3474 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3475 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3476 d = SvPVX(sv) + SvCUR(sv);
3477 has_utf8 = TRUE;
3478 } else if (need > len) {
3479 /* encoded value larger than old, may need extra space (NOTE:
3480 * SvCUR() is not set correctly here). See Note on sizing
3481 * above. */
9d4ba2ae 3482 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3483 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3484 }
77a135fe
KW
3485 s += len;
3486
5f66b61c 3487 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3488#ifdef EBCDIC
3489 if (uv > 255 && !dorange)
3490 native_range = FALSE;
3491#endif
2b9d42f0
NIS
3492 }
3493 else {
3494 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3495 }
02aa26ce
NT
3496 } /* while loop to process each character */
3497
3498 /* terminate the string and set up the sv */
79072805 3499 *d = '\0';
95a20fc0 3500 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3501 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3502 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3503
79072805 3504 SvPOK_on(sv);
9f4817db 3505 if (PL_encoding && !has_utf8) {
d0063567
DK
3506 sv_recode_to_utf8(sv, PL_encoding);
3507 if (SvUTF8(sv))
3508 has_utf8 = TRUE;
9f4817db 3509 }
2b9d42f0 3510 if (has_utf8) {
7e2040f0 3511 SvUTF8_on(sv);
2b9d42f0 3512 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3513 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3514 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3515 }
3516 }
79072805 3517
02aa26ce 3518 /* shrink the sv if we allocated more than we used */
79072805 3519 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3520 SvPV_shrink_to_cur(sv);
79072805 3521 }
02aa26ce 3522
6154021b 3523 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3524 if (s > PL_bufptr) {
eb0d8d16
NC
3525 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3526 const char *const key = PL_lex_inpat ? "qr" : "q";
3527 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3528 const char *type;
3529 STRLEN typelen;
3530
3531 if (PL_lex_inwhat == OP_TRANS) {
3532 type = "tr";
3533 typelen = 2;
3534 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3535 type = "s";
3536 typelen = 1;
3537 } else {
3538 type = "qq";
3539 typelen = 2;
3540 }
3541
3542 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3543 type, typelen);
3544 }
6154021b 3545 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3546 } else
8990e307 3547 SvREFCNT_dec(sv);
79072805
LW
3548 return s;
3549}
3550
ffb4593c
NT
3551/* S_intuit_more
3552 * Returns TRUE if there's more to the expression (e.g., a subscript),
3553 * FALSE otherwise.
ffb4593c
NT
3554 *
3555 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3556 *
3557 * ->[ and ->{ return TRUE
3558 * { and [ outside a pattern are always subscripts, so return TRUE
3559 * if we're outside a pattern and it's not { or [, then return FALSE
3560 * if we're in a pattern and the first char is a {
3561 * {4,5} (any digits around the comma) returns FALSE
3562 * if we're in a pattern and the first char is a [
3563 * [] returns FALSE
3564 * [SOMETHING] has a funky algorithm to decide whether it's a
3565 * character class or not. It has to deal with things like
3566 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3567 * anything else returns TRUE
3568 */
3569
9cbb5ea2
GS
3570/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3571
76e3520e 3572STATIC int
cea2e8a9 3573S_intuit_more(pTHX_ register char *s)
79072805 3574{
97aff369 3575 dVAR;
7918f24d
NC
3576
3577 PERL_ARGS_ASSERT_INTUIT_MORE;
3578
3280af22 3579 if (PL_lex_brackets)
79072805
LW
3580 return TRUE;
3581 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3582 return TRUE;
3583 if (*s != '{' && *s != '[')
3584 return FALSE;
3280af22 3585 if (!PL_lex_inpat)
79072805
LW
3586 return TRUE;
3587
3588 /* In a pattern, so maybe we have {n,m}. */
3589 if (*s == '{') {
b3155d95 3590 if (regcurly(s)) {
79072805 3591 return FALSE;
b3155d95 3592 }
79072805 3593 return TRUE;
79072805
LW
3594 }
3595
3596 /* On the other hand, maybe we have a character class */
3597
3598 s++;
3599 if (*s == ']' || *s == '^')
3600 return FALSE;
3601 else {
ffb4593c 3602 /* this is terrifying, and it works */
79072805
LW
3603 int weight = 2; /* let's weigh the evidence */
3604 char seen[256];
f27ffc4a 3605 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3606 const char * const send = strchr(s,']');
3280af22 3607 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3608
3609 if (!send) /* has to be an expression */
3610 return TRUE;
3611
3612 Zero(seen,256,char);
3613 if (*s == '$')
3614 weight -= 3;
3615 else if (isDIGIT(*s)) {
3616 if (s[1] != ']') {
3617 if (isDIGIT(s[1]) && s[2] == ']')
3618 weight -= 10;
3619 }
3620 else
3621 weight -= 100;
3622 }
3623 for (; s < send; s++) {
3624 last_un_char = un_char;
3625 un_char = (unsigned char)*s;
3626 switch (*s) {
3627 case '@':
3628 case '&':
3629 case '$':
3630 weight -= seen[un_char] * 10;
7e2040f0 3631 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3632 int len;
8903cb82 3633 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3634 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3635 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3636 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3637 weight -= 100;
3638 else
3639 weight -= 10;
3640 }
3641 else if (*s == '$' && s[1] &&
93a17b20
LW
3642 strchr("[#!%*<>()-=",s[1])) {
3643 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3644 weight -= 10;
3645 else
3646 weight -= 1;
3647 }
3648 break;
3649 case '\\':
3650 un_char = 254;
3651 if (s[1]) {
93a17b20 3652 if (strchr("wds]",s[1]))
79072805 3653 weight += 100;
10edeb5d 3654 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3655 weight += 1;
93a17b20 3656 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3657 weight += 40;
3658 else if (isDIGIT(s[1])) {
3659 weight += 40;
3660 while (s[1] && isDIGIT(s[1]))
3661 s++;
3662 }
3663 }
3664 else
3665 weight += 100;
3666 break;
3667 case '-':
3668 if (s[1] == '\\')
3669 weight += 50;
93a17b20 3670 if (strchr("aA01! ",last_un_char))
79072805 3671 weight += 30;
93a17b20 3672 if (strchr("zZ79~",s[1]))
79072805 3673 weight += 30;
f27ffc4a
GS
3674 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3675 weight -= 5; /* cope with negative subscript */
79072805
LW
3676 break;
3677 default:
3792a11b
NC
3678 if (!isALNUM(last_un_char)
3679 && !(last_un_char == '$' || last_un_char == '@'
3680 || last_un_char == '&')
3681 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3682 char *d = tmpbuf;
3683 while (isALPHA(*s))
3684 *d++ = *s++;
3685 *d = '\0';
5458a98a 3686 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3687 weight -= 150;
3688 }
3689 if (un_char == last_un_char + 1)
3690 weight += 5;
3691 weight -= seen[un_char];
3692 break;
3693 }
3694 seen[un_char]++;
3695 }
3696 if (weight >= 0) /* probably a character class */
3697 return FALSE;
3698 }
3699
3700 return TRUE;
3701}
ffed7fef 3702
ffb4593c
NT
3703/*
3704 * S_intuit_method
3705 *
3706 * Does all the checking to disambiguate
3707 * foo bar
3708 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3709 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3710 *
3711 * First argument is the stuff after the first token, e.g. "bar".
3712 *
3713 * Not a method if bar is a filehandle.
3714 * Not a method if foo is a subroutine prototyped to take a filehandle.
3715 * Not a method if it's really "Foo $bar"
3716 * Method if it's "foo $bar"
3717 * Not a method if it's really "print foo $bar"
3718 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3719 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3720 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3721 * =>
3722 */
3723
76e3520e 3724STATIC int
62d55b22 3725S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3726{
97aff369 3727 dVAR;
a0d0e21e 3728 char *s = start + (*start == '$');
3280af22 3729 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3730 STRLEN len;
3731 GV* indirgv;
5db06880
NC
3732#ifdef PERL_MAD
3733 int soff;
3734#endif
a0d0e21e 3735
7918f24d
NC
3736 PERL_ARGS_ASSERT_INTUIT_METHOD;
3737
a0d0e21e 3738 if (gv) {
62d55b22 3739 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3740 return 0;
62d55b22
NC
3741 if (cv) {
3742 if (SvPOK(cv)) {
8fa6a409 3743 const char *proto = CvPROTO(cv);
62d55b22
NC
3744 if (proto) {
3745 if (*proto == ';')
3746 proto++;
3747 if (*proto == '*')
3748 return 0;
3749 }
b6c543e3
IZ
3750 }
3751 } else
c35e046a 3752 gv = NULL;
a0d0e21e 3753 }
8903cb82 3754 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3755 /* start is the beginning of the possible filehandle/object,
3756 * and s is the end of it
3757 * tmpbuf is a copy of it
3758 */
3759
a0d0e21e 3760 if (*start == '$') {
3ef1310e
RGS
3761 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3762 isUPPER(*PL_tokenbuf))
a0d0e21e 3763 return 0;
5db06880
NC
3764#ifdef PERL_MAD
3765 len = start - SvPVX(PL_linestr);
3766#endif
29595ff2 3767 s = PEEKSPACE(s);
f0092767 3768#ifdef PERL_MAD
5db06880
NC
3769 start = SvPVX(PL_linestr) + len;
3770#endif