This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Cast to signed before negating, to avoid compiler warnings
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
746b446a 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a 153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
376fcdbf 275#define UNI2(f,x) { \
6154021b 276 pl_yylval.ival = f; \
376fcdbf
AL
277 PL_expect = x; \
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
6f33ba73
RGS
286#define UNI(f) UNI2(f,XTERM)
287#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 288
376fcdbf 289#define UNIBRACK(f) { \
6154021b 290 pl_yylval.ival = f; \
376fcdbf
AL
291 PL_bufptr = s; \
292 PL_last_uni = PL_oldbufptr; \
293 if (*s == '(') \
294 return REPORT( (int)FUNC1 ); \
29595ff2 295 s = PEEKSPACE(s); \
376fcdbf
AL
296 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
297 }
79072805 298
9f68db38 299/* grandfather return to old style */
78cdf107
Z
300#define OLDLOP(f) \
301 do { \
302 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
303 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
304 pl_yylval.ival = (f); \
305 PL_expect = XTERM; \
306 PL_bufptr = s; \
307 return (int)LSTOP; \
308 } while(0)
79072805 309
8fa7f367
JH
310#ifdef DEBUGGING
311
6154021b 312/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
313enum token_type {
314 TOKENTYPE_NONE,
315 TOKENTYPE_IVAL,
6154021b 316 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
317 TOKENTYPE_PVAL,
318 TOKENTYPE_OPVAL,
319 TOKENTYPE_GVVAL
320};
321
6d4a66ac
NC
322static struct debug_tokens {
323 const int token;
324 enum token_type type;
325 const char *name;
326} const debug_tokens[] =
9041c2e3 327{
bbf60fe6
DM
328 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
329 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
330 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
331 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
332 { ARROW, TOKENTYPE_NONE, "ARROW" },
333 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
334 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
335 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
336 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
337 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 338 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
339 { DO, TOKENTYPE_NONE, "DO" },
340 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
341 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
342 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
343 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
344 { ELSE, TOKENTYPE_NONE, "ELSE" },
345 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
346 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
347 { FOR, TOKENTYPE_IVAL, "FOR" },
348 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
349 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
350 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 351 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
352 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
353 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
354 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 355 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
356 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
357 { IF, TOKENTYPE_IVAL, "IF" },
358 { LABEL, TOKENTYPE_PVAL, "LABEL" },
359 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
360 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
361 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
362 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
363 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
364 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
365 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
366 { MY, TOKENTYPE_IVAL, "MY" },
367 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
368 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
369 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
370 { OROP, TOKENTYPE_IVAL, "OROP" },
371 { OROR, TOKENTYPE_NONE, "OROR" },
372 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
373 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
374 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
375 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
376 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
377 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
378 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
379 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
380 { PREINC, TOKENTYPE_NONE, "PREINC" },
381 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
382 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
383 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
384 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
385 { SUB, TOKENTYPE_NONE, "SUB" },
386 { THING, TOKENTYPE_OPVAL, "THING" },
387 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
388 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
389 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
390 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
391 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
392 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 393 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
394 { WHILE, TOKENTYPE_IVAL, "WHILE" },
395 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 396 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 397 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
398};
399
6154021b 400/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 401
bbf60fe6 402STATIC int
704d4215 403S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 404{
97aff369 405 dVAR;
7918f24d
NC
406
407 PERL_ARGS_ASSERT_TOKEREPORT;
408
bbf60fe6 409 if (DEBUG_T_TEST) {
bd61b366 410 const char *name = NULL;
bbf60fe6 411 enum token_type type = TOKENTYPE_NONE;
f54cb97a 412 const struct debug_tokens *p;
396482e1 413 SV* const report = newSVpvs("<== ");
bbf60fe6 414
f54cb97a 415 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
416 if (p->token == (int)rv) {
417 name = p->name;
418 type = p->type;
419 break;
420 }
421 }
422 if (name)
54667de8 423 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
424 else if ((char)rv > ' ' && (char)rv < '~')
425 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
426 else if (!rv)
396482e1 427 sv_catpvs(report, "EOF");
bbf60fe6
DM
428 else
429 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
430 switch (type) {
431 case TOKENTYPE_NONE:
432 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
433 break;
434 case TOKENTYPE_IVAL:
704d4215 435 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
436 break;
437 case TOKENTYPE_OPNUM:
438 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 439 PL_op_name[lvalp->ival]);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_PVAL:
704d4215 442 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
443 break;
444 case TOKENTYPE_OPVAL:
704d4215 445 if (lvalp->opval) {
401441c0 446 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
447 PL_op_name[lvalp->opval->op_type]);
448 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 449 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 450 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
451 }
452
453 }
401441c0 454 else
396482e1 455 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
456 break;
457 }
b6007c36 458 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
459 };
460 return (int)rv;
998054bd
SC
461}
462
b6007c36
DM
463
464/* print the buffer with suitable escapes */
465
466STATIC void
15f169a1 467S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 468{
396482e1 469 SV* const tmp = newSVpvs("");
7918f24d
NC
470
471 PERL_ARGS_ASSERT_PRINTBUF;
472
b6007c36
DM
473 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
474 SvREFCNT_dec(tmp);
475}
476
8fa7f367
JH
477#endif
478
8290c323
NC
479static int
480S_deprecate_commaless_var_list(pTHX) {
481 PL_expect = XTERM;
482 deprecate("comma-less variable list");
483 return REPORT(','); /* grandfather non-comma-format format */
484}
485
ffb4593c
NT
486/*
487 * S_ao
488 *
c963b151
BD
489 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
490 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
491 */
492
76e3520e 493STATIC int
cea2e8a9 494S_ao(pTHX_ int toketype)
a0d0e21e 495{
97aff369 496 dVAR;
3280af22
NIS
497 if (*PL_bufptr == '=') {
498 PL_bufptr++;
a0d0e21e 499 if (toketype == ANDAND)
6154021b 500 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 501 else if (toketype == OROR)
6154021b 502 pl_yylval.ival = OP_ORASSIGN;
c963b151 503 else if (toketype == DORDOR)
6154021b 504 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
505 toketype = ASSIGNOP;
506 }
507 return toketype;
508}
509
ffb4593c
NT
510/*
511 * S_no_op
512 * When Perl expects an operator and finds something else, no_op
513 * prints the warning. It always prints "<something> found where
514 * operator expected. It prints "Missing semicolon on previous line?"
515 * if the surprise occurs at the start of the line. "do you need to
516 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
517 * where the compiler doesn't know if foo is a method call or a function.
518 * It prints "Missing operator before end of line" if there's nothing
519 * after the missing operator, or "... before <...>" if there is something
520 * after the missing operator.
521 */
522
76e3520e 523STATIC void
15f169a1 524S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 525{
97aff369 526 dVAR;
9d4ba2ae
AL
527 char * const oldbp = PL_bufptr;
528 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 529
7918f24d
NC
530 PERL_ARGS_ASSERT_NO_OP;
531
1189a94a
GS
532 if (!s)
533 s = oldbp;
07c798fb 534 else
1189a94a 535 PL_bufptr = s;
cea2e8a9 536 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
537 if (ckWARN_d(WARN_SYNTAX)) {
538 if (is_first)
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
540 "\t(Missing semicolon on previous line?)\n");
541 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 542 const char *t;
c35e046a
AL
543 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
544 NOOP;
56da5a46
RGS
545 if (t < PL_bufptr && isSPACE(*t))
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
547 "\t(Do you need to predeclare %.*s?)\n",
551405c4 548 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
549 }
550 else {
551 assert(s >= oldbp);
552 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 553 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 554 }
07c798fb 555 }
3280af22 556 PL_bufptr = oldbp;
8990e307
LW
557}
558
ffb4593c
NT
559/*
560 * S_missingterm
561 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 562 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
563 * If we're in a delimited string and the delimiter is a control
564 * character, it's reformatted into a two-char sequence like ^C.
565 * This is fatal.
566 */
567
76e3520e 568STATIC void
cea2e8a9 569S_missingterm(pTHX_ char *s)
8990e307 570{
97aff369 571 dVAR;
8990e307
LW
572 char tmpbuf[3];
573 char q;
574 if (s) {
9d4ba2ae 575 char * const nl = strrchr(s,'\n');
d2719217 576 if (nl)
8990e307
LW
577 *nl = '\0';
578 }
463559e7 579 else if (isCNTRL(PL_multi_close)) {
8990e307 580 *tmpbuf = '^';
585ec06d 581 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
582 tmpbuf[2] = '\0';
583 s = tmpbuf;
584 }
585 else {
eb160463 586 *tmpbuf = (char)PL_multi_close;
8990e307
LW
587 tmpbuf[1] = '\0';
588 s = tmpbuf;
589 }
590 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 591 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 592}
79072805 593
0d863452 594/*
0d863452
RH
595 * Check whether the named feature is enabled.
596 */
26ea9e12
NC
597bool
598Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 599{
97aff369 600 dVAR;
0d863452 601 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 602 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
603
604 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
605
26ea9e12
NC
606 if (namelen > MAX_FEATURE_LEN)
607 return FALSE;
4a731d7b 608 memcpy(&he_name[8], name, namelen);
d4c19fe8 609
7b9ef140 610 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
611}
612
ffb4593c 613/*
9cbb5ea2
GS
614 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
615 * utf16-to-utf8-reversed.
ffb4593c
NT
616 */
617
c39cd008
GS
618#ifdef PERL_CR_FILTER
619static void
620strip_return(SV *sv)
621{
95a20fc0 622 register const char *s = SvPVX_const(sv);
9d4ba2ae 623 register const char * const e = s + SvCUR(sv);
7918f24d
NC
624
625 PERL_ARGS_ASSERT_STRIP_RETURN;
626
c39cd008
GS
627 /* outer loop optimized to do nothing if there are no CR-LFs */
628 while (s < e) {
629 if (*s++ == '\r' && *s == '\n') {
630 /* hit a CR-LF, need to copy the rest */
631 register char *d = s - 1;
632 *d++ = *s++;
633 while (s < e) {
634 if (*s == '\r' && s[1] == '\n')
635 s++;
636 *d++ = *s++;
637 }
638 SvCUR(sv) -= s - d;
639 return;
640 }
641 }
642}
a868473f 643
76e3520e 644STATIC I32
c39cd008 645S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 646{
f54cb97a 647 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
648 if (count > 0 && !maxlen)
649 strip_return(sv);
650 return count;
a868473f
NIS
651}
652#endif
653
ffb4593c 654/*
8eaa0acf
Z
655=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
656
657Creates and initialises a new lexer/parser state object, supplying
658a context in which to lex and parse from a new source of Perl code.
659A pointer to the new state object is placed in L</PL_parser>. An entry
660is made on the save stack so that upon unwinding the new state object
661will be destroyed and the former value of L</PL_parser> will be restored.
662Nothing else need be done to clean up the parsing context.
663
664The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
665non-null, provides a string (in SV form) containing code to be parsed.
666A copy of the string is made, so subsequent modification of I<line>
667does not affect parsing. I<rsfp>, if non-null, provides an input stream
668from which code will be read to be parsed. If both are non-null, the
669code in I<line> comes first and must consist of complete lines of input,
670and I<rsfp> supplies the remainder of the source.
671
672The I<flags> parameter is reserved for future use, and must always
27fcb6ee
FC
673be zero, except for one flag that is currently reserved for perl's internal
674use.
8eaa0acf
Z
675
676=cut
677*/
ffb4593c 678
27fcb6ee
FC
679/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
680 can share filters with the current parser. */
681
a0d0e21e 682void
8eaa0acf 683Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 684{
97aff369 685 dVAR;
6ef55633 686 const char *s = NULL;
8990e307 687 STRLEN len;
5486870f 688 yy_parser *parser, *oparser;
27fcb6ee 689 if (flags && flags != LEX_START_SAME_FILTER)
8eaa0acf 690 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
691
692 /* create and initialise a parser */
693
199e78b7 694 Newxz(parser, 1, yy_parser);
5486870f 695 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
696 PL_parser = parser;
697
28ac2b49
Z
698 parser->stack = NULL;
699 parser->ps = NULL;
700 parser->stack_size = 0;
acdf0a21 701
e3abe207
DM
702 /* on scope exit, free this parser and restore any outer one */
703 SAVEPARSER(parser);
7c4baf47 704 parser->saved_curcop = PL_curcop;
e3abe207 705
acdf0a21 706 /* initialise lexer state */
8990e307 707
fb205e7a
DM
708#ifdef PERL_MAD
709 parser->curforce = -1;
710#else
711 parser->nexttoke = 0;
712#endif
ca4cfd28 713 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 714 parser->copline = NOLINE;
5afb0a62 715 parser->lex_state = LEX_NORMAL;
c2598295 716 parser->expect = XSTATE;
2f9285f8 717 parser->rsfp = rsfp;
27fcb6ee
FC
718 parser->rsfp_filters =
719 !(flags & LEX_START_SAME_FILTER) || !oparser
720 ? newAV()
721 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 722
199e78b7
DM
723 Newx(parser->lex_brackstack, 120, char);
724 Newx(parser->lex_casestack, 12, char);
725 *parser->lex_casestack = '\0';
02b34bbe 726
10efb74f
NC
727 if (line) {
728 s = SvPV_const(line, len);
729 } else {
730 len = 0;
731 }
bdc0bf6f 732
10efb74f 733 if (!len) {
bdc0bf6f 734 parser->linestr = newSVpvs("\n;");
805700c1 735 } else {
719a9bb0 736 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 737 if (s[len-1] != ';')
bdc0bf6f 738 sv_catpvs(parser->linestr, "\n;");
8990e307 739 }
f06b5848
DM
740 parser->oldoldbufptr =
741 parser->oldbufptr =
742 parser->bufptr =
743 parser->linestart = SvPVX(parser->linestr);
744 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
745 parser->last_lop = parser->last_uni = NULL;
737c24fc
Z
746
747 parser->in_pod = 0;
79072805 748}
a687059c 749
e3abe207
DM
750
751/* delete a parser object */
752
753void
754Perl_parser_free(pTHX_ const yy_parser *parser)
755{
7918f24d
NC
756 PERL_ARGS_ASSERT_PARSER_FREE;
757
7c4baf47 758 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
759 SvREFCNT_dec(parser->linestr);
760
2f9285f8
DM
761 if (parser->rsfp == PerlIO_stdin())
762 PerlIO_clearerr(parser->rsfp);
799361c3
SH
763 else if (parser->rsfp && (!parser->old_parser ||
764 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 765 PerlIO_close(parser->rsfp);
5486870f 766 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 767
e3abe207
DM
768 Safefree(parser->lex_brackstack);
769 Safefree(parser->lex_casestack);
770 PL_parser = parser->old_parser;
771 Safefree(parser);
772}
773
774
ffb4593c 775/*
f0e67a1d
Z
776=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
777
778Buffer scalar containing the chunk currently under consideration of the
779text currently being lexed. This is always a plain string scalar (for
780which C<SvPOK> is true). It is not intended to be used as a scalar by
781normal scalar means; instead refer to the buffer directly by the pointer
782variables described below.
783
784The lexer maintains various C<char*> pointers to things in the
785C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
786reallocated, all of these pointers must be updated. Don't attempt to
787do this manually, but rather use L</lex_grow_linestr> if you need to
788reallocate the buffer.
789
790The content of the text chunk in the buffer is commonly exactly one
791complete line of input, up to and including a newline terminator,
792but there are situations where it is otherwise. The octets of the
793buffer may be intended to be interpreted as either UTF-8 or Latin-1.
794The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
795flag on this scalar, which may disagree with it.
796
797For direct examination of the buffer, the variable
798L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
799lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
800of these pointers is usually preferable to examination of the scalar
801through normal scalar means.
802
803=for apidoc AmxU|char *|PL_parser-E<gt>bufend
804
805Direct pointer to the end of the chunk of text currently being lexed, the
806end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
807+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
808always located at the end of the buffer, and does not count as part of
809the buffer's contents.
810
811=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
812
813Points to the current position of lexing inside the lexer buffer.
814Characters around this point may be freely examined, within
815the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
816L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
817interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
818
819Lexing code (whether in the Perl core or not) moves this pointer past
820the characters that it consumes. It is also expected to perform some
821bookkeeping whenever a newline character is consumed. This movement
822can be more conveniently performed by the function L</lex_read_to>,
823which handles newlines appropriately.
824
825Interpretation of the buffer's octets can be abstracted out by
826using the slightly higher-level functions L</lex_peek_unichar> and
827L</lex_read_unichar>.
828
829=for apidoc AmxU|char *|PL_parser-E<gt>linestart
830
831Points to the start of the current line inside the lexer buffer.
832This is useful for indicating at which column an error occurred, and
833not much else. This must be updated by any lexing code that consumes
834a newline; the function L</lex_read_to> handles this detail.
835
836=cut
837*/
838
839/*
840=for apidoc Amx|bool|lex_bufutf8
841
842Indicates whether the octets in the lexer buffer
843(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
844of Unicode characters. If not, they should be interpreted as Latin-1
845characters. This is analogous to the C<SvUTF8> flag for scalars.
846
847In UTF-8 mode, it is not guaranteed that the lexer buffer actually
848contains valid UTF-8. Lexing code must be robust in the face of invalid
849encoding.
850
851The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
852is significant, but not the whole story regarding the input character
853encoding. Normally, when a file is being read, the scalar contains octets
854and its C<SvUTF8> flag is off, but the octets should be interpreted as
855UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
856however, the scalar may have the C<SvUTF8> flag on, and in this case its
857octets should be interpreted as UTF-8 unless the C<use bytes> pragma
858is in effect. This logic may change in the future; use this function
859instead of implementing the logic yourself.
860
861=cut
862*/
863
864bool
865Perl_lex_bufutf8(pTHX)
866{
867 return UTF;
868}
869
870/*
871=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
872
873Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
874at least I<len> octets (including terminating NUL). Returns a
875pointer to the reallocated buffer. This is necessary before making
876any direct modification of the buffer that would increase its length.
877L</lex_stuff_pvn> provides a more convenient way to insert text into
878the buffer.
879
880Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
881this function updates all of the lexer's variables that point directly
882into the buffer.
883
884=cut
885*/
886
887char *
888Perl_lex_grow_linestr(pTHX_ STRLEN len)
889{
890 SV *linestr;
891 char *buf;
892 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
893 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
894 linestr = PL_parser->linestr;
895 buf = SvPVX(linestr);
896 if (len <= SvLEN(linestr))
897 return buf;
898 bufend_pos = PL_parser->bufend - buf;
899 bufptr_pos = PL_parser->bufptr - buf;
900 oldbufptr_pos = PL_parser->oldbufptr - buf;
901 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
902 linestart_pos = PL_parser->linestart - buf;
903 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
904 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
905 buf = sv_grow(linestr, len);
906 PL_parser->bufend = buf + bufend_pos;
907 PL_parser->bufptr = buf + bufptr_pos;
908 PL_parser->oldbufptr = buf + oldbufptr_pos;
909 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
910 PL_parser->linestart = buf + linestart_pos;
911 if (PL_parser->last_uni)
912 PL_parser->last_uni = buf + last_uni_pos;
913 if (PL_parser->last_lop)
914 PL_parser->last_lop = buf + last_lop_pos;
915 return buf;
916}
917
918/*
83aa740e 919=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
920
921Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
922immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
923reallocating the buffer if necessary. This means that lexing code that
924runs later will see the characters as if they had appeared in the input.
925It is not recommended to do this as part of normal parsing, and most
926uses of this facility run the risk of the inserted characters being
927interpreted in an unintended manner.
928
929The string to be inserted is represented by I<len> octets starting
930at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
931according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
932The characters are recoded for the lexer buffer, according to how the
933buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 934to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
935function is more convenient.
936
937=cut
938*/
939
940void
83aa740e 941Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 942{
749123ff 943 dVAR;
f0e67a1d
Z
944 char *bufptr;
945 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
946 if (flags & ~(LEX_STUFF_UTF8))
947 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
948 if (UTF) {
949 if (flags & LEX_STUFF_UTF8) {
950 goto plain_copy;
951 } else {
952 STRLEN highhalf = 0;
83aa740e 953 const char *p, *e = pv+len;
f0e67a1d
Z
954 for (p = pv; p != e; p++)
955 highhalf += !!(((U8)*p) & 0x80);
956 if (!highhalf)
957 goto plain_copy;
958 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
959 bufptr = PL_parser->bufptr;
960 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
961 SvCUR_set(PL_parser->linestr,
962 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
963 PL_parser->bufend += len+highhalf;
964 for (p = pv; p != e; p++) {
965 U8 c = (U8)*p;
966 if (c & 0x80) {
967 *bufptr++ = (char)(0xc0 | (c >> 6));
968 *bufptr++ = (char)(0x80 | (c & 0x3f));
969 } else {
970 *bufptr++ = (char)c;
971 }
972 }
973 }
974 } else {
975 if (flags & LEX_STUFF_UTF8) {
976 STRLEN highhalf = 0;
83aa740e 977 const char *p, *e = pv+len;
f0e67a1d
Z
978 for (p = pv; p != e; p++) {
979 U8 c = (U8)*p;
980 if (c >= 0xc4) {
981 Perl_croak(aTHX_ "Lexing code attempted to stuff "
982 "non-Latin-1 character into Latin-1 input");
983 } else if (c >= 0xc2 && p+1 != e &&
984 (((U8)p[1]) & 0xc0) == 0x80) {
985 p++;
986 highhalf++;
987 } else if (c >= 0x80) {
988 /* malformed UTF-8 */
989 ENTER;
990 SAVESPTR(PL_warnhook);
991 PL_warnhook = PERL_WARNHOOK_FATAL;
992 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
993 LEAVE;
994 }
995 }
996 if (!highhalf)
997 goto plain_copy;
998 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
999 bufptr = PL_parser->bufptr;
1000 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1001 SvCUR_set(PL_parser->linestr,
1002 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1003 PL_parser->bufend += len-highhalf;
1004 for (p = pv; p != e; p++) {
1005 U8 c = (U8)*p;
1006 if (c & 0x80) {
1007 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1008 p++;
1009 } else {
1010 *bufptr++ = (char)c;
1011 }
1012 }
1013 } else {
1014 plain_copy:
1015 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1016 bufptr = PL_parser->bufptr;
1017 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1018 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1019 PL_parser->bufend += len;
1020 Copy(pv, bufptr, len, char);
1021 }
1022 }
1023}
1024
1025/*
9dcc53ea
Z
1026=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1027
1028Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1029immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1030reallocating the buffer if necessary. This means that lexing code that
1031runs later will see the characters as if they had appeared in the input.
1032It is not recommended to do this as part of normal parsing, and most
1033uses of this facility run the risk of the inserted characters being
1034interpreted in an unintended manner.
1035
1036The string to be inserted is represented by octets starting at I<pv>
1037and continuing to the first nul. These octets are interpreted as either
1038UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1039in I<flags>. The characters are recoded for the lexer buffer, according
1040to how the buffer is currently being interpreted (L</lex_bufutf8>).
1041If it is not convenient to nul-terminate a string to be inserted, the
1042L</lex_stuff_pvn> function is more appropriate.
1043
1044=cut
1045*/
1046
1047void
1048Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1049{
1050 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1051 lex_stuff_pvn(pv, strlen(pv), flags);
1052}
1053
1054/*
f0e67a1d
Z
1055=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1056
1057Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1058immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1059reallocating the buffer if necessary. This means that lexing code that
1060runs later will see the characters as if they had appeared in the input.
1061It is not recommended to do this as part of normal parsing, and most
1062uses of this facility run the risk of the inserted characters being
1063interpreted in an unintended manner.
1064
1065The string to be inserted is the string value of I<sv>. The characters
1066are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1067being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1068not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1069need to construct a scalar.
1070
1071=cut
1072*/
1073
1074void
1075Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1076{
1077 char *pv;
1078 STRLEN len;
1079 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1080 if (flags)
1081 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1082 pv = SvPV(sv, len);
1083 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1084}
1085
1086/*
1087=for apidoc Amx|void|lex_unstuff|char *ptr
1088
1089Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1090I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1091This hides the discarded text from any lexing code that runs later,
1092as if the text had never appeared.
1093
1094This is not the normal way to consume lexed text. For that, use
1095L</lex_read_to>.
1096
1097=cut
1098*/
1099
1100void
1101Perl_lex_unstuff(pTHX_ char *ptr)
1102{
1103 char *buf, *bufend;
1104 STRLEN unstuff_len;
1105 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1106 buf = PL_parser->bufptr;
1107 if (ptr < buf)
1108 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1109 if (ptr == buf)
1110 return;
1111 bufend = PL_parser->bufend;
1112 if (ptr > bufend)
1113 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1114 unstuff_len = ptr - buf;
1115 Move(ptr, buf, bufend+1-ptr, char);
1116 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1117 PL_parser->bufend = bufend - unstuff_len;
1118}
1119
1120/*
1121=for apidoc Amx|void|lex_read_to|char *ptr
1122
1123Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1124to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1125performing the correct bookkeeping whenever a newline character is passed.
1126This is the normal way to consume lexed text.
1127
1128Interpretation of the buffer's octets can be abstracted out by
1129using the slightly higher-level functions L</lex_peek_unichar> and
1130L</lex_read_unichar>.
1131
1132=cut
1133*/
1134
1135void
1136Perl_lex_read_to(pTHX_ char *ptr)
1137{
1138 char *s;
1139 PERL_ARGS_ASSERT_LEX_READ_TO;
1140 s = PL_parser->bufptr;
1141 if (ptr < s || ptr > PL_parser->bufend)
1142 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1143 for (; s != ptr; s++)
1144 if (*s == '\n') {
1145 CopLINE_inc(PL_curcop);
1146 PL_parser->linestart = s+1;
1147 }
1148 PL_parser->bufptr = ptr;
1149}
1150
1151/*
1152=for apidoc Amx|void|lex_discard_to|char *ptr
1153
1154Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1155up to I<ptr>. The remaining content of the buffer will be moved, and
1156all pointers into the buffer updated appropriately. I<ptr> must not
1157be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1158it is not permitted to discard text that has yet to be lexed.
1159
1160Normally it is not necessarily to do this directly, because it suffices to
1161use the implicit discarding behaviour of L</lex_next_chunk> and things
1162based on it. However, if a token stretches across multiple lines,
1f317c95 1163and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1164that purpose, then after completion of the token it would be wise to
1165explicitly discard the now-unneeded earlier lines, to avoid future
1166multi-line tokens growing the buffer without bound.
1167
1168=cut
1169*/
1170
1171void
1172Perl_lex_discard_to(pTHX_ char *ptr)
1173{
1174 char *buf;
1175 STRLEN discard_len;
1176 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1177 buf = SvPVX(PL_parser->linestr);
1178 if (ptr < buf)
1179 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1180 if (ptr == buf)
1181 return;
1182 if (ptr > PL_parser->bufptr)
1183 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1184 discard_len = ptr - buf;
1185 if (PL_parser->oldbufptr < ptr)
1186 PL_parser->oldbufptr = ptr;
1187 if (PL_parser->oldoldbufptr < ptr)
1188 PL_parser->oldoldbufptr = ptr;
1189 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1190 PL_parser->last_uni = NULL;
1191 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1192 PL_parser->last_lop = NULL;
1193 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1194 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1195 PL_parser->bufend -= discard_len;
1196 PL_parser->bufptr -= discard_len;
1197 PL_parser->oldbufptr -= discard_len;
1198 PL_parser->oldoldbufptr -= discard_len;
1199 if (PL_parser->last_uni)
1200 PL_parser->last_uni -= discard_len;
1201 if (PL_parser->last_lop)
1202 PL_parser->last_lop -= discard_len;
1203}
1204
1205/*
1206=for apidoc Amx|bool|lex_next_chunk|U32 flags
1207
1208Reads in the next chunk of text to be lexed, appending it to
1209L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1210looked to the end of the current chunk and wants to know more. It is
1211usual, but not necessary, for lexing to have consumed the entirety of
1212the current chunk at this time.
1213
1214If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1215chunk (i.e., the current chunk has been entirely consumed), normally the
1216current chunk will be discarded at the same time that the new chunk is
1217read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1218will not be discarded. If the current chunk has not been entirely
1219consumed, then it will not be discarded regardless of the flag.
1220
1221Returns true if some new text was added to the buffer, or false if the
1222buffer has reached the end of the input text.
1223
1224=cut
1225*/
1226
1227#define LEX_FAKE_EOF 0x80000000
1228
1229bool
1230Perl_lex_next_chunk(pTHX_ U32 flags)
1231{
1232 SV *linestr;
1233 char *buf;
1234 STRLEN old_bufend_pos, new_bufend_pos;
1235 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1236 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1237 bool got_some_for_debugger = 0;
f0e67a1d
Z
1238 bool got_some;
1239 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1240 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1241 linestr = PL_parser->linestr;
1242 buf = SvPVX(linestr);
1243 if (!(flags & LEX_KEEP_PREVIOUS) &&
1244 PL_parser->bufptr == PL_parser->bufend) {
1245 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1246 linestart_pos = 0;
1247 if (PL_parser->last_uni != PL_parser->bufend)
1248 PL_parser->last_uni = NULL;
1249 if (PL_parser->last_lop != PL_parser->bufend)
1250 PL_parser->last_lop = NULL;
1251 last_uni_pos = last_lop_pos = 0;
1252 *buf = 0;
1253 SvCUR(linestr) = 0;
1254 } else {
1255 old_bufend_pos = PL_parser->bufend - buf;
1256 bufptr_pos = PL_parser->bufptr - buf;
1257 oldbufptr_pos = PL_parser->oldbufptr - buf;
1258 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1259 linestart_pos = PL_parser->linestart - buf;
1260 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1261 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1262 }
1263 if (flags & LEX_FAKE_EOF) {
1264 goto eof;
1265 } else if (!PL_parser->rsfp) {
1266 got_some = 0;
1267 } else if (filter_gets(linestr, old_bufend_pos)) {
1268 got_some = 1;
17cc9359 1269 got_some_for_debugger = 1;
f0e67a1d 1270 } else {
580561a3
Z
1271 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1272 sv_setpvs(linestr, "");
f0e67a1d
Z
1273 eof:
1274 /* End of real input. Close filehandle (unless it was STDIN),
1275 * then add implicit termination.
1276 */
1277 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1278 PerlIO_clearerr(PL_parser->rsfp);
1279 else if (PL_parser->rsfp)
1280 (void)PerlIO_close(PL_parser->rsfp);
1281 PL_parser->rsfp = NULL;
737c24fc 1282 PL_parser->in_pod = 0;
f0e67a1d
Z
1283#ifdef PERL_MAD
1284 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1285 PL_faketokens = 1;
1286#endif
1287 if (!PL_in_eval && PL_minus_p) {
1288 sv_catpvs(linestr,
1289 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1290 PL_minus_n = PL_minus_p = 0;
1291 } else if (!PL_in_eval && PL_minus_n) {
1292 sv_catpvs(linestr, /*{*/";}");
1293 PL_minus_n = 0;
1294 } else
1295 sv_catpvs(linestr, ";");
1296 got_some = 1;
1297 }
1298 buf = SvPVX(linestr);
1299 new_bufend_pos = SvCUR(linestr);
1300 PL_parser->bufend = buf + new_bufend_pos;
1301 PL_parser->bufptr = buf + bufptr_pos;
1302 PL_parser->oldbufptr = buf + oldbufptr_pos;
1303 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1304 PL_parser->linestart = buf + linestart_pos;
1305 if (PL_parser->last_uni)
1306 PL_parser->last_uni = buf + last_uni_pos;
1307 if (PL_parser->last_lop)
1308 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1309 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1310 PL_curstash != PL_debstash) {
1311 /* debugger active and we're not compiling the debugger code,
1312 * so store the line into the debugger's array of lines
1313 */
1314 update_debugger_info(NULL, buf+old_bufend_pos,
1315 new_bufend_pos-old_bufend_pos);
1316 }
1317 return got_some;
1318}
1319
1320/*
1321=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1322
1323Looks ahead one (Unicode) character in the text currently being lexed.
1324Returns the codepoint (unsigned integer value) of the next character,
1325or -1 if lexing has reached the end of the input text. To consume the
1326peeked character, use L</lex_read_unichar>.
1327
1328If the next character is in (or extends into) the next chunk of input
1329text, the next chunk will be read in. Normally the current chunk will be
1330discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1331then the current chunk will not be discarded.
1332
1333If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1334is encountered, an exception is generated.
1335
1336=cut
1337*/
1338
1339I32
1340Perl_lex_peek_unichar(pTHX_ U32 flags)
1341{
749123ff 1342 dVAR;
f0e67a1d
Z
1343 char *s, *bufend;
1344 if (flags & ~(LEX_KEEP_PREVIOUS))
1345 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1346 s = PL_parser->bufptr;
1347 bufend = PL_parser->bufend;
1348 if (UTF) {
1349 U8 head;
1350 I32 unichar;
1351 STRLEN len, retlen;
1352 if (s == bufend) {
1353 if (!lex_next_chunk(flags))
1354 return -1;
1355 s = PL_parser->bufptr;
1356 bufend = PL_parser->bufend;
1357 }
1358 head = (U8)*s;
1359 if (!(head & 0x80))
1360 return head;
1361 if (head & 0x40) {
1362 len = PL_utf8skip[head];
1363 while ((STRLEN)(bufend-s) < len) {
1364 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1365 break;
1366 s = PL_parser->bufptr;
1367 bufend = PL_parser->bufend;
1368 }
1369 }
1370 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1371 if (retlen == (STRLEN)-1) {
1372 /* malformed UTF-8 */
1373 ENTER;
1374 SAVESPTR(PL_warnhook);
1375 PL_warnhook = PERL_WARNHOOK_FATAL;
1376 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1377 LEAVE;
1378 }
1379 return unichar;
1380 } else {
1381 if (s == bufend) {
1382 if (!lex_next_chunk(flags))
1383 return -1;
1384 s = PL_parser->bufptr;
1385 }
1386 return (U8)*s;
1387 }
1388}
1389
1390/*
1391=for apidoc Amx|I32|lex_read_unichar|U32 flags
1392
1393Reads the next (Unicode) character in the text currently being lexed.
1394Returns the codepoint (unsigned integer value) of the character read,
1395and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1396if lexing has reached the end of the input text. To non-destructively
1397examine the next character, use L</lex_peek_unichar> instead.
1398
1399If the next character is in (or extends into) the next chunk of input
1400text, the next chunk will be read in. Normally the current chunk will be
1401discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1402then the current chunk will not be discarded.
1403
1404If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1405is encountered, an exception is generated.
1406
1407=cut
1408*/
1409
1410I32
1411Perl_lex_read_unichar(pTHX_ U32 flags)
1412{
1413 I32 c;
1414 if (flags & ~(LEX_KEEP_PREVIOUS))
1415 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1416 c = lex_peek_unichar(flags);
1417 if (c != -1) {
1418 if (c == '\n')
1419 CopLINE_inc(PL_curcop);
d9018cbe
EB
1420 if (UTF)
1421 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1422 else
1423 ++(PL_parser->bufptr);
f0e67a1d
Z
1424 }
1425 return c;
1426}
1427
1428/*
1429=for apidoc Amx|void|lex_read_space|U32 flags
1430
1431Reads optional spaces, in Perl style, in the text currently being
1432lexed. The spaces may include ordinary whitespace characters and
1433Perl-style comments. C<#line> directives are processed if encountered.
1434L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1435at a non-space character (or the end of the input text).
1436
1437If spaces extend into the next chunk of input text, the next chunk will
1438be read in. Normally the current chunk will be discarded at the same
1439time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1440chunk will not be discarded.
1441
1442=cut
1443*/
1444
f0998909
Z
1445#define LEX_NO_NEXT_CHUNK 0x80000000
1446
f0e67a1d
Z
1447void
1448Perl_lex_read_space(pTHX_ U32 flags)
1449{
1450 char *s, *bufend;
1451 bool need_incline = 0;
f0998909 1452 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1453 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1454#ifdef PERL_MAD
1455 if (PL_skipwhite) {
1456 sv_free(PL_skipwhite);
1457 PL_skipwhite = NULL;
1458 }
1459 if (PL_madskills)
1460 PL_skipwhite = newSVpvs("");
1461#endif /* PERL_MAD */
1462 s = PL_parser->bufptr;
1463 bufend = PL_parser->bufend;
1464 while (1) {
1465 char c = *s;
1466 if (c == '#') {
1467 do {
1468 c = *++s;
1469 } while (!(c == '\n' || (c == 0 && s == bufend)));
1470 } else if (c == '\n') {
1471 s++;
1472 PL_parser->linestart = s;
1473 if (s == bufend)
1474 need_incline = 1;
1475 else
1476 incline(s);
1477 } else if (isSPACE(c)) {
1478 s++;
1479 } else if (c == 0 && s == bufend) {
1480 bool got_more;
1481#ifdef PERL_MAD
1482 if (PL_madskills)
1483 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1484#endif /* PERL_MAD */
f0998909
Z
1485 if (flags & LEX_NO_NEXT_CHUNK)
1486 break;
f0e67a1d
Z
1487 PL_parser->bufptr = s;
1488 CopLINE_inc(PL_curcop);
1489 got_more = lex_next_chunk(flags);
1490 CopLINE_dec(PL_curcop);
1491 s = PL_parser->bufptr;
1492 bufend = PL_parser->bufend;
1493 if (!got_more)
1494 break;
1495 if (need_incline && PL_parser->rsfp) {
1496 incline(s);
1497 need_incline = 0;
1498 }
1499 } else {
1500 break;
1501 }
1502 }
1503#ifdef PERL_MAD
1504 if (PL_madskills)
1505 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1506#endif /* PERL_MAD */
1507 PL_parser->bufptr = s;
1508}
1509
1510/*
ffb4593c
NT
1511 * S_incline
1512 * This subroutine has nothing to do with tilting, whether at windmills
1513 * or pinball tables. Its name is short for "increment line". It
57843af0 1514 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1515 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1516 * # line 500 "foo.pm"
1517 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1518 */
1519
76e3520e 1520STATIC void
d9095cec 1521S_incline(pTHX_ const char *s)
463ee0b2 1522{
97aff369 1523 dVAR;
d9095cec
NC
1524 const char *t;
1525 const char *n;
1526 const char *e;
8818d409 1527 line_t line_num;
463ee0b2 1528
7918f24d
NC
1529 PERL_ARGS_ASSERT_INCLINE;
1530
57843af0 1531 CopLINE_inc(PL_curcop);
463ee0b2
LW
1532 if (*s++ != '#')
1533 return;
d4c19fe8
AL
1534 while (SPACE_OR_TAB(*s))
1535 s++;
73659bf1
GS
1536 if (strnEQ(s, "line", 4))
1537 s += 4;
1538 else
1539 return;
084592ab 1540 if (SPACE_OR_TAB(*s))
73659bf1 1541 s++;
4e553d73 1542 else
73659bf1 1543 return;
d4c19fe8
AL
1544 while (SPACE_OR_TAB(*s))
1545 s++;
463ee0b2
LW
1546 if (!isDIGIT(*s))
1547 return;
d4c19fe8 1548
463ee0b2
LW
1549 n = s;
1550 while (isDIGIT(*s))
1551 s++;
07714eb4 1552 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1553 return;
bf4acbe4 1554 while (SPACE_OR_TAB(*s))
463ee0b2 1555 s++;
73659bf1 1556 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1557 s++;
73659bf1
GS
1558 e = t + 1;
1559 }
463ee0b2 1560 else {
c35e046a
AL
1561 t = s;
1562 while (!isSPACE(*t))
1563 t++;
73659bf1 1564 e = t;
463ee0b2 1565 }
bf4acbe4 1566 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1567 e++;
1568 if (*e != '\n' && *e != '\0')
1569 return; /* false alarm */
1570
8818d409
FC
1571 line_num = atoi(n)-1;
1572
f4dd75d9 1573 if (t - s > 0) {
d9095cec 1574 const STRLEN len = t - s;
19bad673
NC
1575 SV *const temp_sv = CopFILESV(PL_curcop);
1576 const char *cf;
1577 STRLEN tmplen;
1578
1579 if (temp_sv) {
1580 cf = SvPVX(temp_sv);
1581 tmplen = SvCUR(temp_sv);
1582 } else {
1583 cf = NULL;
1584 tmplen = 0;
1585 }
1586
42d9b98d 1587 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1588 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1589 * to *{"::_<newfilename"} */
44867030
NC
1590 /* However, the long form of evals is only turned on by the
1591 debugger - usually they're "(eval %lu)" */
1592 char smallbuf[128];
1593 char *tmpbuf;
1594 GV **gvp;
d9095cec 1595 STRLEN tmplen2 = len;
798b63bc 1596 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1597 tmpbuf = smallbuf;
1598 else
2ae0db35 1599 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1600 tmpbuf[0] = '_';
1601 tmpbuf[1] = '<';
2ae0db35 1602 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1603 tmplen += 2;
8a5ee598
RGS
1604 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1605 if (gvp) {
44867030
NC
1606 char *tmpbuf2;
1607 GV *gv2;
1608
1609 if (tmplen2 + 2 <= sizeof smallbuf)
1610 tmpbuf2 = smallbuf;
1611 else
1612 Newx(tmpbuf2, tmplen2 + 2, char);
1613
1614 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1615 /* Either they malloc'd it, or we malloc'd it,
1616 so no prefix is present in ours. */
1617 tmpbuf2[0] = '_';
1618 tmpbuf2[1] = '<';
1619 }
1620
1621 memcpy(tmpbuf2 + 2, s, tmplen2);
1622 tmplen2 += 2;
1623
8a5ee598 1624 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1625 if (!isGV(gv2)) {
8a5ee598 1626 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1627 /* adjust ${"::_<newfilename"} to store the new file name */
1628 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1629 /* The line number may differ. If that is the case,
1630 alias the saved lines that are in the array.
1631 Otherwise alias the whole array. */
1632 if (CopLINE(PL_curcop) == line_num) {
1633 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1634 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1635 }
1636 else if (GvAV(*gvp)) {
1637 AV * const av = GvAV(*gvp);
1638 const I32 start = CopLINE(PL_curcop)+1;
1639 I32 items = AvFILLp(av) - start;
1640 if (items > 0) {
1641 AV * const av2 = GvAVn(gv2);
1642 SV **svp = AvARRAY(av) + start;
1643 I32 l = (I32)line_num+1;
1644 while (items--)
1645 av_store(av2, l++, SvREFCNT_inc(*svp++));
1646 }
1647 }
e5527e4b 1648 }
44867030
NC
1649
1650 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1651 }
e66cf94c 1652 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1653 }
05ec9bb3 1654 CopFILE_free(PL_curcop);
d9095cec 1655 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1656 }
8818d409 1657 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1658}
1659
29595ff2 1660#ifdef PERL_MAD
cd81e915 1661/* skip space before PL_thistoken */
29595ff2
NC
1662
1663STATIC char *
1664S_skipspace0(pTHX_ register char *s)
1665{
7918f24d
NC
1666 PERL_ARGS_ASSERT_SKIPSPACE0;
1667
29595ff2
NC
1668 s = skipspace(s);
1669 if (!PL_madskills)
1670 return s;
cd81e915
NC
1671 if (PL_skipwhite) {
1672 if (!PL_thiswhite)
6b29d1f5 1673 PL_thiswhite = newSVpvs("");
cd81e915
NC
1674 sv_catsv(PL_thiswhite, PL_skipwhite);
1675 sv_free(PL_skipwhite);
1676 PL_skipwhite = 0;
1677 }
1678 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1679 return s;
1680}
1681
cd81e915 1682/* skip space after PL_thistoken */
29595ff2
NC
1683
1684STATIC char *
1685S_skipspace1(pTHX_ register char *s)
1686{
d4c19fe8 1687 const char *start = s;
29595ff2
NC
1688 I32 startoff = start - SvPVX(PL_linestr);
1689
7918f24d
NC
1690 PERL_ARGS_ASSERT_SKIPSPACE1;
1691
29595ff2
NC
1692 s = skipspace(s);
1693 if (!PL_madskills)
1694 return s;
1695 start = SvPVX(PL_linestr) + startoff;
cd81e915 1696 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1697 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1698 PL_thistoken = newSVpvn(tstart, start - tstart);
1699 }
1700 PL_realtokenstart = -1;
1701 if (PL_skipwhite) {
1702 if (!PL_nextwhite)
6b29d1f5 1703 PL_nextwhite = newSVpvs("");
cd81e915
NC
1704 sv_catsv(PL_nextwhite, PL_skipwhite);
1705 sv_free(PL_skipwhite);
1706 PL_skipwhite = 0;
29595ff2
NC
1707 }
1708 return s;
1709}
1710
1711STATIC char *
1712S_skipspace2(pTHX_ register char *s, SV **svp)
1713{
c35e046a
AL
1714 char *start;
1715 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1716 const I32 startoff = s - SvPVX(PL_linestr);
1717
7918f24d
NC
1718 PERL_ARGS_ASSERT_SKIPSPACE2;
1719
29595ff2
NC
1720 s = skipspace(s);
1721 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1722 if (!PL_madskills || !svp)
1723 return s;
1724 start = SvPVX(PL_linestr) + startoff;
cd81e915 1725 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1726 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1727 PL_thistoken = newSVpvn(tstart, start - tstart);
1728 PL_realtokenstart = -1;
29595ff2 1729 }
cd81e915 1730 if (PL_skipwhite) {
29595ff2 1731 if (!*svp)
6b29d1f5 1732 *svp = newSVpvs("");
cd81e915
NC
1733 sv_setsv(*svp, PL_skipwhite);
1734 sv_free(PL_skipwhite);
1735 PL_skipwhite = 0;
29595ff2
NC
1736 }
1737
1738 return s;
1739}
1740#endif
1741
80a702cd 1742STATIC void
15f169a1 1743S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1744{
1745 AV *av = CopFILEAVx(PL_curcop);
1746 if (av) {
b9f83d2f 1747 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1748 if (orig_sv)
1749 sv_setsv(sv, orig_sv);
1750 else
1751 sv_setpvn(sv, buf, len);
80a702cd
RGS
1752 (void)SvIOK_on(sv);
1753 SvIV_set(sv, 0);
1754 av_store(av, (I32)CopLINE(PL_curcop), sv);
1755 }
1756}
1757
ffb4593c
NT
1758/*
1759 * S_skipspace
1760 * Called to gobble the appropriate amount and type of whitespace.
1761 * Skips comments as well.
1762 */
1763
76e3520e 1764STATIC char *
cea2e8a9 1765S_skipspace(pTHX_ register char *s)
a687059c 1766{
5db06880 1767#ifdef PERL_MAD
f0e67a1d
Z
1768 char *start = s;
1769#endif /* PERL_MAD */
7918f24d 1770 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1771#ifdef PERL_MAD
cd81e915
NC
1772 if (PL_skipwhite) {
1773 sv_free(PL_skipwhite);
f0e67a1d 1774 PL_skipwhite = NULL;
5db06880 1775 }
f0e67a1d 1776#endif /* PERL_MAD */
3280af22 1777 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1778 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1779 s++;
f0e67a1d
Z
1780 } else {
1781 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1782 PL_bufptr = s;
f0998909
Z
1783 lex_read_space(LEX_KEEP_PREVIOUS |
1784 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1785 LEX_NO_NEXT_CHUNK : 0));
3280af22 1786 s = PL_bufptr;
f0e67a1d
Z
1787 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1788 if (PL_linestart > PL_bufptr)
1789 PL_bufptr = PL_linestart;
1790 return s;
463ee0b2 1791 }
5db06880 1792#ifdef PERL_MAD
f0e67a1d
Z
1793 if (PL_madskills)
1794 PL_skipwhite = newSVpvn(start, s-start);
1795#endif /* PERL_MAD */
5db06880 1796 return s;
a687059c 1797}
378cc40b 1798
ffb4593c
NT
1799/*
1800 * S_check_uni
1801 * Check the unary operators to ensure there's no ambiguity in how they're
1802 * used. An ambiguous piece of code would be:
1803 * rand + 5
1804 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1805 * the +5 is its argument.
1806 */
1807
76e3520e 1808STATIC void
cea2e8a9 1809S_check_uni(pTHX)
ba106d47 1810{
97aff369 1811 dVAR;
d4c19fe8
AL
1812 const char *s;
1813 const char *t;
2f3197b3 1814
3280af22 1815 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1816 return;
3280af22
NIS
1817 while (isSPACE(*PL_last_uni))
1818 PL_last_uni++;
c35e046a
AL
1819 s = PL_last_uni;
1820 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1821 s++;
3280af22 1822 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1823 return;
6136c704 1824
9b387841
NC
1825 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1826 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1827 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1828}
1829
ffb4593c
NT
1830/*
1831 * LOP : macro to build a list operator. Its behaviour has been replaced
1832 * with a subroutine, S_lop() for which LOP is just another name.
1833 */
1834
a0d0e21e
LW
1835#define LOP(f,x) return lop(f,x,s)
1836
ffb4593c
NT
1837/*
1838 * S_lop
1839 * Build a list operator (or something that might be one). The rules:
1840 * - if we have a next token, then it's a list operator [why?]
1841 * - if the next thing is an opening paren, then it's a function
1842 * - else it's a list operator
1843 */
1844
76e3520e 1845STATIC I32
a0be28da 1846S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1847{
97aff369 1848 dVAR;
7918f24d
NC
1849
1850 PERL_ARGS_ASSERT_LOP;
1851
6154021b 1852 pl_yylval.ival = f;
35c8bce7 1853 CLINE;
3280af22
NIS
1854 PL_expect = x;
1855 PL_bufptr = s;
1856 PL_last_lop = PL_oldbufptr;
eb160463 1857 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1858#ifdef PERL_MAD
1859 if (PL_lasttoke)
78cdf107 1860 goto lstop;
5db06880 1861#else
3280af22 1862 if (PL_nexttoke)
78cdf107 1863 goto lstop;
5db06880 1864#endif
79072805 1865 if (*s == '(')
bbf60fe6 1866 return REPORT(FUNC);
29595ff2 1867 s = PEEKSPACE(s);
79072805 1868 if (*s == '(')
bbf60fe6 1869 return REPORT(FUNC);
78cdf107
Z
1870 else {
1871 lstop:
1872 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1873 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1874 return REPORT(LSTOP);
78cdf107 1875 }
79072805
LW
1876}
1877
5db06880
NC
1878#ifdef PERL_MAD
1879 /*
1880 * S_start_force
1881 * Sets up for an eventual force_next(). start_force(0) basically does
1882 * an unshift, while start_force(-1) does a push. yylex removes items
1883 * on the "pop" end.
1884 */
1885
1886STATIC void
1887S_start_force(pTHX_ int where)
1888{
1889 int i;
1890
cd81e915 1891 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1892 where = PL_lasttoke;
cd81e915
NC
1893 assert(PL_curforce < 0 || PL_curforce == where);
1894 if (PL_curforce != where) {
5db06880
NC
1895 for (i = PL_lasttoke; i > where; --i) {
1896 PL_nexttoke[i] = PL_nexttoke[i-1];
1897 }
1898 PL_lasttoke++;
1899 }
cd81e915 1900 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1901 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1902 PL_curforce = where;
1903 if (PL_nextwhite) {
5db06880 1904 if (PL_madskills)
6b29d1f5 1905 curmad('^', newSVpvs(""));
cd81e915 1906 CURMAD('_', PL_nextwhite);
5db06880
NC
1907 }
1908}
1909
1910STATIC void
1911S_curmad(pTHX_ char slot, SV *sv)
1912{
1913 MADPROP **where;
1914
1915 if (!sv)
1916 return;
cd81e915
NC
1917 if (PL_curforce < 0)
1918 where = &PL_thismad;
5db06880 1919 else
cd81e915 1920 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1921
cd81e915 1922 if (PL_faketokens)
76f68e9b 1923 sv_setpvs(sv, "");
5db06880
NC
1924 else {
1925 if (!IN_BYTES) {
1926 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1927 SvUTF8_on(sv);
1928 else if (PL_encoding) {
1929 sv_recode_to_utf8(sv, PL_encoding);
1930 }
1931 }
1932 }
1933
1934 /* keep a slot open for the head of the list? */
1935 if (slot != '_' && *where && (*where)->mad_key == '^') {
1936 (*where)->mad_key = slot;
daba3364 1937 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1938 (*where)->mad_val = (void*)sv;
1939 }
1940 else
1941 addmad(newMADsv(slot, sv), where, 0);
1942}
1943#else
b3f24c00
MHM
1944# define start_force(where) NOOP
1945# define curmad(slot, sv) NOOP
5db06880
NC
1946#endif
1947
ffb4593c
NT
1948/*
1949 * S_force_next
9cbb5ea2 1950 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1951 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1952 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1953 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1954 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1955 */
1956
4e553d73 1957STATIC void
cea2e8a9 1958S_force_next(pTHX_ I32 type)
79072805 1959{
97aff369 1960 dVAR;
704d4215
GG
1961#ifdef DEBUGGING
1962 if (DEBUG_T_TEST) {
1963 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1964 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1965 }
1966#endif
5db06880 1967#ifdef PERL_MAD
cd81e915 1968 if (PL_curforce < 0)
5db06880 1969 start_force(PL_lasttoke);
cd81e915 1970 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1971 if (PL_lex_state != LEX_KNOWNEXT)
1972 PL_lex_defer = PL_lex_state;
1973 PL_lex_state = LEX_KNOWNEXT;
1974 PL_lex_expect = PL_expect;
cd81e915 1975 PL_curforce = -1;
5db06880 1976#else
3280af22
NIS
1977 PL_nexttype[PL_nexttoke] = type;
1978 PL_nexttoke++;
1979 if (PL_lex_state != LEX_KNOWNEXT) {
1980 PL_lex_defer = PL_lex_state;
1981 PL_lex_expect = PL_expect;
1982 PL_lex_state = LEX_KNOWNEXT;
79072805 1983 }
5db06880 1984#endif
79072805
LW
1985}
1986
28ac2b49
Z
1987void
1988Perl_yyunlex(pTHX)
1989{
a7aaec61
Z
1990 int yyc = PL_parser->yychar;
1991 if (yyc != YYEMPTY) {
1992 if (yyc) {
1993 start_force(-1);
1994 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1995 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1996 PL_lex_allbrackets--;
a7aaec61 1997 PL_lex_brackets--;
78cdf107
Z
1998 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1999 } else if (yyc == '('/*)*/) {
2000 PL_lex_allbrackets--;
2001 yyc |= (2<<24);
a7aaec61
Z
2002 }
2003 force_next(yyc);
2004 }
28ac2b49
Z
2005 PL_parser->yychar = YYEMPTY;
2006 }
2007}
2008
d0a148a6 2009STATIC SV *
15f169a1 2010S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2011{
97aff369 2012 dVAR;
740cce10 2013 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2014 !IN_BYTES
2015 && UTF
2016 && !is_ascii_string((const U8*)start, len)
740cce10 2017 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2018 return sv;
2019}
2020
ffb4593c
NT
2021/*
2022 * S_force_word
2023 * When the lexer knows the next thing is a word (for instance, it has
2024 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2025 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2026 * lookahead.
ffb4593c
NT
2027 *
2028 * Arguments:
b1b65b59 2029 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2030 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2031 * int check_keyword : if true, Perl checks to make sure the word isn't
2032 * a keyword (do this if the word is a label, e.g. goto FOO)
2033 * int allow_pack : if true, : characters will also be allowed (require,
2034 * use, etc. do this)
9cbb5ea2 2035 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2036 */
2037
76e3520e 2038STATIC char *
cea2e8a9 2039S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2040{
97aff369 2041 dVAR;
463ee0b2
LW
2042 register char *s;
2043 STRLEN len;
4e553d73 2044
7918f24d
NC
2045 PERL_ARGS_ASSERT_FORCE_WORD;
2046
29595ff2 2047 start = SKIPSPACE1(start);
463ee0b2 2048 s = start;
7e2040f0 2049 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2050 (allow_pack && *s == ':') ||
15f0808c 2051 (allow_initial_tick && *s == '\'') )
a0d0e21e 2052 {
3280af22 2053 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2054 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2055 return start;
cd81e915 2056 start_force(PL_curforce);
5db06880
NC
2057 if (PL_madskills)
2058 curmad('X', newSVpvn(start,s-start));
463ee0b2 2059 if (token == METHOD) {
29595ff2 2060 s = SKIPSPACE1(s);
463ee0b2 2061 if (*s == '(')
3280af22 2062 PL_expect = XTERM;
463ee0b2 2063 else {
3280af22 2064 PL_expect = XOPERATOR;
463ee0b2 2065 }
79072805 2066 }
e74e6b3d 2067 if (PL_madskills)
63575281 2068 curmad('g', newSVpvs( "forced" ));
9ded7720 2069 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2070 = (OP*)newSVOP(OP_CONST,0,
2071 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2072 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2073 force_next(token);
2074 }
2075 return s;
2076}
2077
ffb4593c
NT
2078/*
2079 * S_force_ident
9cbb5ea2 2080 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2081 * text only contains the "foo" portion. The first argument is a pointer
2082 * to the "foo", and the second argument is the type symbol to prefix.
2083 * Forces the next token to be a "WORD".
9cbb5ea2 2084 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2085 */
2086
76e3520e 2087STATIC void
bfed75c6 2088S_force_ident(pTHX_ register const char *s, int kind)
79072805 2089{
97aff369 2090 dVAR;
7918f24d
NC
2091
2092 PERL_ARGS_ASSERT_FORCE_IDENT;
2093
c35e046a 2094 if (*s) {
90e5519e 2095 const STRLEN len = strlen(s);
728847b1
BF
2096 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2097 UTF ? SVf_UTF8 : 0));
cd81e915 2098 start_force(PL_curforce);
9ded7720 2099 NEXTVAL_NEXTTOKE.opval = o;
79072805 2100 force_next(WORD);
748a9306 2101 if (kind) {
11343788 2102 o->op_private = OPpCONST_ENTERED;
55497cff 2103 /* XXX see note in pp_entereval() for why we forgo typo
2104 warnings if the symbol must be introduced in an eval.
2105 GSAR 96-10-12 */
90e5519e 2106 gv_fetchpvn_flags(s, len,
728847b1
BF
2107 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2108 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2109 kind == '$' ? SVt_PV :
2110 kind == '@' ? SVt_PVAV :
2111 kind == '%' ? SVt_PVHV :
a0d0e21e 2112 SVt_PVGV
90e5519e 2113 );
748a9306 2114 }
79072805
LW
2115 }
2116}
2117
1571675a
GS
2118NV
2119Perl_str_to_version(pTHX_ SV *sv)
2120{
2121 NV retval = 0.0;
2122 NV nshift = 1.0;
2123 STRLEN len;
cfd0369c 2124 const char *start = SvPV_const(sv,len);
9d4ba2ae 2125 const char * const end = start + len;
504618e9 2126 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2127
2128 PERL_ARGS_ASSERT_STR_TO_VERSION;
2129
1571675a 2130 while (start < end) {
ba210ebe 2131 STRLEN skip;
1571675a
GS
2132 UV n;
2133 if (utf)
9041c2e3 2134 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2135 else {
2136 n = *(U8*)start;
2137 skip = 1;
2138 }
2139 retval += ((NV)n)/nshift;
2140 start += skip;
2141 nshift *= 1000;
2142 }
2143 return retval;
2144}
2145
4e553d73 2146/*
ffb4593c
NT
2147 * S_force_version
2148 * Forces the next token to be a version number.
e759cc13
RGS
2149 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2150 * and if "guessing" is TRUE, then no new token is created (and the caller
2151 * must use an alternative parsing method).
ffb4593c
NT
2152 */
2153
76e3520e 2154STATIC char *
e759cc13 2155S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2156{
97aff369 2157 dVAR;
5f66b61c 2158 OP *version = NULL;
44dcb63b 2159 char *d;
5db06880
NC
2160#ifdef PERL_MAD
2161 I32 startoff = s - SvPVX(PL_linestr);
2162#endif
89bfa8cd 2163
7918f24d
NC
2164 PERL_ARGS_ASSERT_FORCE_VERSION;
2165
29595ff2 2166 s = SKIPSPACE1(s);
89bfa8cd 2167
44dcb63b 2168 d = s;
dd629d5b 2169 if (*d == 'v')
44dcb63b 2170 d++;
44dcb63b 2171 if (isDIGIT(*d)) {
e759cc13
RGS
2172 while (isDIGIT(*d) || *d == '_' || *d == '.')
2173 d++;
5db06880
NC
2174#ifdef PERL_MAD
2175 if (PL_madskills) {
cd81e915 2176 start_force(PL_curforce);
5db06880
NC
2177 curmad('X', newSVpvn(s,d-s));
2178 }
2179#endif
4e4da3ac 2180 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2181 SV *ver;
8d08d9ba
DG
2182#ifdef USE_LOCALE_NUMERIC
2183 char *loc = setlocale(LC_NUMERIC, "C");
2184#endif
6154021b 2185 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2186#ifdef USE_LOCALE_NUMERIC
2187 setlocale(LC_NUMERIC, loc);
2188#endif
6154021b 2189 version = pl_yylval.opval;
dd629d5b
GS
2190 ver = cSVOPx(version)->op_sv;
2191 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2192 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2193 SvNV_set(ver, str_to_version(ver));
1571675a 2194 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2195 }
89bfa8cd 2196 }
5db06880
NC
2197 else if (guessing) {
2198#ifdef PERL_MAD
2199 if (PL_madskills) {
cd81e915
NC
2200 sv_free(PL_nextwhite); /* let next token collect whitespace */
2201 PL_nextwhite = 0;
5db06880
NC
2202 s = SvPVX(PL_linestr) + startoff;
2203 }
2204#endif
e759cc13 2205 return s;
5db06880 2206 }
89bfa8cd 2207 }
2208
5db06880
NC
2209#ifdef PERL_MAD
2210 if (PL_madskills && !version) {
cd81e915
NC
2211 sv_free(PL_nextwhite); /* let next token collect whitespace */
2212 PL_nextwhite = 0;
5db06880
NC
2213 s = SvPVX(PL_linestr) + startoff;
2214 }
2215#endif
89bfa8cd 2216 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2217 start_force(PL_curforce);
9ded7720 2218 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2219 force_next(WORD);
89bfa8cd 2220
e759cc13 2221 return s;
89bfa8cd 2222}
2223
ffb4593c 2224/*
91152fc1
DG
2225 * S_force_strict_version
2226 * Forces the next token to be a version number using strict syntax rules.
2227 */
2228
2229STATIC char *
2230S_force_strict_version(pTHX_ char *s)
2231{
2232 dVAR;
2233 OP *version = NULL;
2234#ifdef PERL_MAD
2235 I32 startoff = s - SvPVX(PL_linestr);
2236#endif
2237 const char *errstr = NULL;
2238
2239 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2240
2241 while (isSPACE(*s)) /* leading whitespace */
2242 s++;
2243
2244 if (is_STRICT_VERSION(s,&errstr)) {
2245 SV *ver = newSV(0);
2246 s = (char *)scan_version(s, ver, 0);
2247 version = newSVOP(OP_CONST, 0, ver);
2248 }
4e4da3ac
Z
2249 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2250 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2251 {
91152fc1
DG
2252 PL_bufptr = s;
2253 if (errstr)
2254 yyerror(errstr); /* version required */
2255 return s;
2256 }
2257
2258#ifdef PERL_MAD
2259 if (PL_madskills && !version) {
2260 sv_free(PL_nextwhite); /* let next token collect whitespace */
2261 PL_nextwhite = 0;
2262 s = SvPVX(PL_linestr) + startoff;
2263 }
2264#endif
2265 /* NOTE: The parser sees the package name and the VERSION swapped */
2266 start_force(PL_curforce);
2267 NEXTVAL_NEXTTOKE.opval = version;
2268 force_next(WORD);
2269
2270 return s;
2271}
2272
2273/*
ffb4593c
NT
2274 * S_tokeq
2275 * Tokenize a quoted string passed in as an SV. It finds the next
2276 * chunk, up to end of string or a backslash. It may make a new
2277 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2278 * turns \\ into \.
2279 */
2280
76e3520e 2281STATIC SV *
cea2e8a9 2282S_tokeq(pTHX_ SV *sv)
79072805 2283{
97aff369 2284 dVAR;
79072805
LW
2285 register char *s;
2286 register char *send;
2287 register char *d;
b3ac6de7
IZ
2288 STRLEN len = 0;
2289 SV *pv = sv;
79072805 2290
7918f24d
NC
2291 PERL_ARGS_ASSERT_TOKEQ;
2292
79072805 2293 if (!SvLEN(sv))
b3ac6de7 2294 goto finish;
79072805 2295
a0d0e21e 2296 s = SvPV_force(sv, len);
21a311ee 2297 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2298 goto finish;
463ee0b2 2299 send = s + len;
dcb21ed6
NC
2300 /* This is relying on the SV being "well formed" with a trailing '\0' */
2301 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2302 s++;
2303 if (s == send)
b3ac6de7 2304 goto finish;
79072805 2305 d = s;
be4731d2 2306 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2307 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2308 }
79072805
LW
2309 while (s < send) {
2310 if (*s == '\\') {
a0d0e21e 2311 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2312 s++; /* all that, just for this */
2313 }
2314 *d++ = *s++;
2315 }
2316 *d = '\0';
95a20fc0 2317 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2318 finish:
3280af22 2319 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2320 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2321 return sv;
2322}
2323
ffb4593c
NT
2324/*
2325 * Now come three functions related to double-quote context,
2326 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2327 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2328 * interact with PL_lex_state, and create fake ( ... ) argument lists
2329 * to handle functions and concatenation.
2330 * They assume that whoever calls them will be setting up a fake
2331 * join call, because each subthing puts a ',' after it. This lets
2332 * "lower \luPpEr"
2333 * become
2334 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2335 *
2336 * (I'm not sure whether the spurious commas at the end of lcfirst's
2337 * arguments and join's arguments are created or not).
2338 */
2339
2340/*
2341 * S_sublex_start
6154021b 2342 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2343 *
2344 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2345 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2346 *
2347 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2348 *
2349 * Everything else becomes a FUNC.
2350 *
2351 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2352 * had an OP_CONST or OP_READLINE). This just sets us up for a
2353 * call to S_sublex_push().
2354 */
2355
76e3520e 2356STATIC I32
cea2e8a9 2357S_sublex_start(pTHX)
79072805 2358{
97aff369 2359 dVAR;
6154021b 2360 register const I32 op_type = pl_yylval.ival;
79072805
LW
2361
2362 if (op_type == OP_NULL) {
6154021b 2363 pl_yylval.opval = PL_lex_op;
5f66b61c 2364 PL_lex_op = NULL;
79072805
LW
2365 return THING;
2366 }
2367 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2368 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2369
2370 if (SvTYPE(sv) == SVt_PVIV) {
2371 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2372 STRLEN len;
96a5add6 2373 const char * const p = SvPV_const(sv, len);
740cce10 2374 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2375 SvREFCNT_dec(sv);
2376 sv = nsv;
4e553d73 2377 }
6154021b 2378 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2379 PL_lex_stuff = NULL;
6f33ba73
RGS
2380 /* Allow <FH> // "foo" */
2381 if (op_type == OP_READLINE)
2382 PL_expect = XTERMORDORDOR;
79072805
LW
2383 return THING;
2384 }
e3f73d4e
RGS
2385 else if (op_type == OP_BACKTICK && PL_lex_op) {
2386 /* readpipe() vas overriden */
2387 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2388 pl_yylval.opval = PL_lex_op;
9b201d7d 2389 PL_lex_op = NULL;
e3f73d4e
RGS
2390 PL_lex_stuff = NULL;
2391 return THING;
2392 }
79072805 2393
3280af22 2394 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2395 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2396 PL_sublex_info.sub_op = PL_lex_op;
2397 PL_lex_state = LEX_INTERPPUSH;
55497cff 2398
3280af22
NIS
2399 PL_expect = XTERM;
2400 if (PL_lex_op) {
6154021b 2401 pl_yylval.opval = PL_lex_op;
5f66b61c 2402 PL_lex_op = NULL;
55497cff 2403 return PMFUNC;
2404 }
2405 else
2406 return FUNC;
2407}
2408
ffb4593c
NT
2409/*
2410 * S_sublex_push
2411 * Create a new scope to save the lexing state. The scope will be
2412 * ended in S_sublex_done. Returns a '(', starting the function arguments
2413 * to the uc, lc, etc. found before.
2414 * Sets PL_lex_state to LEX_INTERPCONCAT.
2415 */
2416
76e3520e 2417STATIC I32
cea2e8a9 2418S_sublex_push(pTHX)
55497cff 2419{
27da23d5 2420 dVAR;
f46d017c 2421 ENTER;
55497cff 2422
3280af22 2423 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2424 SAVEBOOL(PL_lex_dojoin);
3280af22 2425 SAVEI32(PL_lex_brackets);
78cdf107
Z
2426 SAVEI32(PL_lex_allbrackets);
2427 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2428 SAVEI32(PL_lex_casemods);
2429 SAVEI32(PL_lex_starts);
651b5b28 2430 SAVEI8(PL_lex_state);
7766f137 2431 SAVEVPTR(PL_lex_inpat);
98246f1e 2432 SAVEI16(PL_lex_inwhat);
57843af0 2433 SAVECOPLINE(PL_curcop);
3280af22 2434 SAVEPPTR(PL_bufptr);
8452ff4b 2435 SAVEPPTR(PL_bufend);
3280af22
NIS
2436 SAVEPPTR(PL_oldbufptr);
2437 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2438 SAVEPPTR(PL_last_lop);
2439 SAVEPPTR(PL_last_uni);
3280af22
NIS
2440 SAVEPPTR(PL_linestart);
2441 SAVESPTR(PL_linestr);
8edd5f42
RGS
2442 SAVEGENERICPV(PL_lex_brackstack);
2443 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2444
2445 PL_linestr = PL_lex_stuff;
a0714e2c 2446 PL_lex_stuff = NULL;
3280af22 2447
9cbb5ea2
GS
2448 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2449 = SvPVX(PL_linestr);
3280af22 2450 PL_bufend += SvCUR(PL_linestr);
bd61b366 2451 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2452 SAVEFREESV(PL_linestr);
2453
2454 PL_lex_dojoin = FALSE;
2455 PL_lex_brackets = 0;
78cdf107
Z
2456 PL_lex_allbrackets = 0;
2457 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2458 Newx(PL_lex_brackstack, 120, char);
2459 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2460 PL_lex_casemods = 0;
2461 *PL_lex_casestack = '\0';
2462 PL_lex_starts = 0;
2463 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2464 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2465
2466 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2467 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2468 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2469 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2470 else
5f66b61c 2471 PL_lex_inpat = NULL;
79072805 2472
55497cff 2473 return '(';
79072805
LW
2474}
2475
ffb4593c
NT
2476/*
2477 * S_sublex_done
2478 * Restores lexer state after a S_sublex_push.
2479 */
2480
76e3520e 2481STATIC I32
cea2e8a9 2482S_sublex_done(pTHX)
79072805 2483{
27da23d5 2484 dVAR;
3280af22 2485 if (!PL_lex_starts++) {
396482e1 2486 SV * const sv = newSVpvs("");
9aa983d2
JH
2487 if (SvUTF8(PL_linestr))
2488 SvUTF8_on(sv);
3280af22 2489 PL_expect = XOPERATOR;
6154021b 2490 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2491 return THING;
2492 }
2493
3280af22
NIS
2494 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2495 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2496 return yylex();
79072805
LW
2497 }
2498
ffb4593c 2499 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2500 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2501 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2502 PL_linestr = PL_lex_repl;
2503 PL_lex_inpat = 0;
2504 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2505 PL_bufend += SvCUR(PL_linestr);
bd61b366 2506 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2507 SAVEFREESV(PL_linestr);
2508 PL_lex_dojoin = FALSE;
2509 PL_lex_brackets = 0;
78cdf107
Z
2510 PL_lex_allbrackets = 0;
2511 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2512 PL_lex_casemods = 0;
2513 *PL_lex_casestack = '\0';
2514 PL_lex_starts = 0;
25da4f38 2515 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2516 PL_lex_state = LEX_INTERPNORMAL;
2517 PL_lex_starts++;
e9fa98b2
HS
2518 /* we don't clear PL_lex_repl here, so that we can check later
2519 whether this is an evalled subst; that means we rely on the
2520 logic to ensure sublex_done() is called again only via the
2521 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2522 }
e9fa98b2 2523 else {
3280af22 2524 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2525 PL_lex_repl = NULL;
e9fa98b2 2526 }
79072805 2527 return ',';
ffed7fef
LW
2528 }
2529 else {
5db06880
NC
2530#ifdef PERL_MAD
2531 if (PL_madskills) {
cd81e915
NC
2532 if (PL_thiswhite) {
2533 if (!PL_endwhite)
6b29d1f5 2534 PL_endwhite = newSVpvs("");
cd81e915
NC
2535 sv_catsv(PL_endwhite, PL_thiswhite);
2536 PL_thiswhite = 0;
2537 }
2538 if (PL_thistoken)
76f68e9b 2539 sv_setpvs(PL_thistoken,"");
5db06880 2540 else
cd81e915 2541 PL_realtokenstart = -1;
5db06880
NC
2542 }
2543#endif
f46d017c 2544 LEAVE;
3280af22
NIS
2545 PL_bufend = SvPVX(PL_linestr);
2546 PL_bufend += SvCUR(PL_linestr);
2547 PL_expect = XOPERATOR;
09bef843 2548 PL_sublex_info.sub_inwhat = 0;
79072805 2549 return ')';
ffed7fef
LW
2550 }
2551}
2552
02aa26ce
NT
2553/*
2554 scan_const
2555
2556 Extracts a pattern, double-quoted string, or transliteration. This
2557 is terrifying code.
2558
94def140 2559 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2560 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2561 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2562
94def140
TS
2563 Returns a pointer to the character scanned up to. If this is
2564 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2565 successfully parsed), will leave an OP for the substring scanned
6154021b 2566 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2567 by looking at the next characters herself.
2568
02aa26ce
NT
2569 In patterns:
2570 backslashes:
ff3f963a 2571 constants: \N{NAME} only
02aa26ce
NT
2572 case and quoting: \U \Q \E
2573 stops on @ and $, but not for $ as tail anchor
2574
2575 In transliterations:
2576 characters are VERY literal, except for - not at the start or end
94def140
TS
2577 of the string, which indicates a range. If the range is in bytes,
2578 scan_const expands the range to the full set of intermediate
2579 characters. If the range is in utf8, the hyphen is replaced with
2580 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2581
2582 In double-quoted strings:
2583 backslashes:
2584 double-quoted style: \r and \n
ff3f963a 2585 constants: \x31, etc.
94def140 2586 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2587 case and quoting: \U \Q \E
2588 stops on @ and $
2589
2590 scan_const does *not* construct ops to handle interpolated strings.
2591 It stops processing as soon as it finds an embedded $ or @ variable
2592 and leaves it to the caller to work out what's going on.
2593
94def140
TS
2594 embedded arrays (whether in pattern or not) could be:
2595 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2596
2597 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2598
2599 $ in pattern could be $foo or could be tail anchor. Assumption:
2600 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2601 followed by one of "()| \r\n\t"
02aa26ce
NT
2602
2603 \1 (backreferences) are turned into $1
2604
2605 The structure of the code is
2606 while (there's a character to process) {
94def140
TS
2607 handle transliteration ranges
2608 skip regexp comments /(?#comment)/ and codes /(?{code})/
2609 skip #-initiated comments in //x patterns
2610 check for embedded arrays
02aa26ce
NT
2611 check for embedded scalars
2612 if (backslash) {
94def140 2613 deprecate \1 in substitution replacements
02aa26ce
NT
2614 handle string-changing backslashes \l \U \Q \E, etc.
2615 switch (what was escaped) {
94def140 2616 handle \- in a transliteration (becomes a literal -)
ff3f963a 2617 if a pattern and not \N{, go treat as regular character
94def140
TS
2618 handle \132 (octal characters)
2619 handle \x15 and \x{1234} (hex characters)
ff3f963a 2620 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2621 handle \cV (control characters)
2622 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2623 } (end switch)
77a135fe 2624 continue
02aa26ce 2625 } (end if backslash)
77a135fe 2626 handle regular character
02aa26ce 2627 } (end while character to read)
4e553d73 2628
02aa26ce
NT
2629*/
2630
76e3520e 2631STATIC char *
cea2e8a9 2632S_scan_const(pTHX_ char *start)
79072805 2633{
97aff369 2634 dVAR;
3280af22 2635 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2636 SV *sv = newSV(send - start); /* sv for the constant. See
2637 note below on sizing. */
02aa26ce
NT
2638 register char *s = start; /* start of the constant */
2639 register char *d = SvPVX(sv); /* destination for copies */
2640 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2641 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2642 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2643 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2644 to be UTF8? But, this can
2645 show as true when the source
2646 isn't utf8, as for example
2647 when it is entirely composed
2648 of hex constants */
2649
2650 /* Note on sizing: The scanned constant is placed into sv, which is
2651 * initialized by newSV() assuming one byte of output for every byte of
2652 * input. This routine expects newSV() to allocate an extra byte for a
2653 * trailing NUL, which this routine will append if it gets to the end of
2654 * the input. There may be more bytes of input than output (eg., \N{LATIN
2655 * CAPITAL LETTER A}), or more output than input if the constant ends up
2656 * recoded to utf8, but each time a construct is found that might increase
2657 * the needed size, SvGROW() is called. Its size parameter each time is
2658 * based on the best guess estimate at the time, namely the length used so
2659 * far, plus the length the current construct will occupy, plus room for
2660 * the trailing NUL, plus one byte for every input byte still unscanned */
2661
012bcf8d 2662 UV uv;
4c3a8340
TS
2663#ifdef EBCDIC
2664 UV literal_endpoint = 0;
e294cc5d 2665 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2666#endif
012bcf8d 2667
7918f24d
NC
2668 PERL_ARGS_ASSERT_SCAN_CONST;
2669
bb16bae8 2670 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2671 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2672 /* If we are doing a trans and we know we want UTF8 set expectation */
2673 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2674 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2675 }
2676
2677
79072805 2678 while (s < send || dorange) {
ff3f963a 2679
02aa26ce 2680 /* get transliterations out of the way (they're most literal) */
3280af22 2681 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2682 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2683 if (dorange) {
1ba5c669
JH
2684 I32 i; /* current expanded character */
2685 I32 min; /* first character in range */
2686 I32 max; /* last character in range */
02aa26ce 2687
e294cc5d
JH
2688#ifdef EBCDIC
2689 UV uvmax = 0;
2690#endif
2691
2692 if (has_utf8
2693#ifdef EBCDIC
2694 && !native_range
2695#endif
2696 ) {
9d4ba2ae 2697 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2698 char *e = d++;
2699 while (e-- > c)
2700 *(e + 1) = *e;
25716404 2701 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2702 /* mark the range as done, and continue */
2703 dorange = FALSE;
2704 didrange = TRUE;
2705 continue;
2706 }
2b9d42f0 2707
95a20fc0 2708 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2709#ifdef EBCDIC
2710 SvGROW(sv,
2711 SvLEN(sv) + (has_utf8 ?
2712 (512 - UTF_CONTINUATION_MARK +
2713 UNISKIP(0x100))
2714 : 256));
2715 /* How many two-byte within 0..255: 128 in UTF-8,
2716 * 96 in UTF-8-mod. */
2717#else
9cbb5ea2 2718 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2719#endif
9cbb5ea2 2720 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2721#ifdef EBCDIC
2722 if (has_utf8) {
2723 int j;
2724 for (j = 0; j <= 1; j++) {
2725 char * const c = (char*)utf8_hop((U8*)d, -1);
2726 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2727 if (j)
2728 min = (U8)uv;
2729 else if (uv < 256)
2730 max = (U8)uv;
2731 else {
2732 max = (U8)0xff; /* only to \xff */
2733 uvmax = uv; /* \x{100} to uvmax */
2734 }
2735 d = c; /* eat endpoint chars */
2736 }
2737 }
2738 else {
2739#endif
2740 d -= 2; /* eat the first char and the - */
2741 min = (U8)*d; /* first char in range */
2742 max = (U8)d[1]; /* last char in range */
2743#ifdef EBCDIC
2744 }
2745#endif
8ada0baa 2746
c2e66d9e 2747 if (min > max) {
01ec43d0 2748 Perl_croak(aTHX_
d1573ac7 2749 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2750 (char)min, (char)max);
c2e66d9e
GS
2751 }
2752
c7f1f016 2753#ifdef EBCDIC
4c3a8340
TS
2754 if (literal_endpoint == 2 &&
2755 ((isLOWER(min) && isLOWER(max)) ||
2756 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2757 if (isLOWER(min)) {
2758 for (i = min; i <= max; i++)
2759 if (isLOWER(i))
db42d148 2760 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2761 } else {
2762 for (i = min; i <= max; i++)
2763 if (isUPPER(i))
db42d148 2764 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2765 }
2766 }
2767 else
2768#endif
2769 for (i = min; i <= max; i++)
e294cc5d
JH
2770#ifdef EBCDIC
2771 if (has_utf8) {
2772 const U8 ch = (U8)NATIVE_TO_UTF(i);
2773 if (UNI_IS_INVARIANT(ch))
2774 *d++ = (U8)i;
2775 else {
2776 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2777 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2778 }
2779 }
2780 else
2781#endif
2782 *d++ = (char)i;
2783
2784#ifdef EBCDIC
2785 if (uvmax) {
2786 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2787 if (uvmax > 0x101)
2788 *d++ = (char)UTF_TO_NATIVE(0xff);
2789 if (uvmax > 0x100)
2790 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2791 }
2792#endif
02aa26ce
NT
2793
2794 /* mark the range as done, and continue */
79072805 2795 dorange = FALSE;
01ec43d0 2796 didrange = TRUE;
4c3a8340
TS
2797#ifdef EBCDIC
2798 literal_endpoint = 0;
2799#endif
79072805 2800 continue;
4e553d73 2801 }
02aa26ce
NT
2802
2803 /* range begins (ignore - as first or last char) */
79072805 2804 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2805 if (didrange) {
1fafa243 2806 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2807 }
e294cc5d
JH
2808 if (has_utf8
2809#ifdef EBCDIC
2810 && !native_range
2811#endif
2812 ) {
25716404 2813 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2814 s++;
2815 continue;
2816 }
79072805
LW
2817 dorange = TRUE;
2818 s++;
01ec43d0
GS
2819 }
2820 else {
2821 didrange = FALSE;
4c3a8340
TS
2822#ifdef EBCDIC
2823 literal_endpoint = 0;
e294cc5d 2824 native_range = TRUE;
4c3a8340 2825#endif
01ec43d0 2826 }
79072805 2827 }
02aa26ce
NT
2828
2829 /* if we get here, we're not doing a transliteration */
2830
0f5d15d6
IZ
2831 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2832 except for the last char, which will be done separately. */
3280af22 2833 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2834 if (s[2] == '#') {
e994fd66 2835 while (s+1 < send && *s != ')')
db42d148 2836 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2837 }
2838 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2839 || (s[2] == '?' && s[3] == '{'))
155aba94 2840 {
cc6b7395 2841 I32 count = 1;
0f5d15d6 2842 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2843 char c;
2844
d9f97599
GS
2845 while (count && (c = *regparse)) {
2846 if (c == '\\' && regparse[1])
2847 regparse++;
4e553d73 2848 else if (c == '{')
cc6b7395 2849 count++;
4e553d73 2850 else if (c == '}')
cc6b7395 2851 count--;
d9f97599 2852 regparse++;
cc6b7395 2853 }
e994fd66 2854 if (*regparse != ')')
5bdf89e7 2855 regparse--; /* Leave one char for continuation. */
0f5d15d6 2856 while (s < regparse)
db42d148 2857 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2858 }
748a9306 2859 }
02aa26ce
NT
2860
2861 /* likewise skip #-initiated comments in //x patterns */
3280af22 2862 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2863 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2864 while (s+1 < send && *s != '\n')
db42d148 2865 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2866 }
02aa26ce 2867
5d1d4326 2868 /* check for embedded arrays
da6eedaa 2869 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2870 */
1749ea0d
TS
2871 else if (*s == '@' && s[1]) {
2872 if (isALNUM_lazy_if(s+1,UTF))
2873 break;
2874 if (strchr(":'{$", s[1]))
2875 break;
2876 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2877 break; /* in regexp, neither @+ nor @- are interpolated */
2878 }
02aa26ce
NT
2879
2880 /* check for embedded scalars. only stop if we're sure it's a
2881 variable.
2882 */
79072805 2883 else if (*s == '$') {
3280af22 2884 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2885 break;
77772344 2886 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2887 if (s[1] == '\\') {
2888 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2889 "Possible unintended interpolation of $\\ in regex");
77772344 2890 }
79072805 2891 break; /* in regexp, $ might be tail anchor */
77772344 2892 }
79072805 2893 }
02aa26ce 2894
2b9d42f0
NIS
2895 /* End of else if chain - OP_TRANS rejoin rest */
2896
02aa26ce 2897 /* backslashes */
79072805 2898 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2899 char* e; /* Can be used for ending '}', etc. */
2900
79072805 2901 s++;
02aa26ce 2902
7d0fc23c
KW
2903 /* warn on \1 - \9 in substitution replacements, but note that \11
2904 * is an octal; and \19 is \1 followed by '9' */
3280af22 2905 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2906 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2907 {
a2a5de95 2908 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2909 *--s = '$';
2910 break;
2911 }
02aa26ce
NT
2912
2913 /* string-change backslash escapes */
3280af22 2914 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2915 --s;
2916 break;
2917 }
ff3f963a
KW
2918 /* In a pattern, process \N, but skip any other backslash escapes.
2919 * This is because we don't want to translate an escape sequence
2920 * into a meta symbol and have the regex compiler use the meta
2921 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2922 * in spite of this, we do have to process \N here while the proper
2923 * charnames handler is in scope. See bugs #56444 and #62056.
2924 * There is a complication because \N in a pattern may also stand
2925 * for 'match a non-nl', and not mean a charname, in which case its
2926 * processing should be deferred to the regex compiler. To be a
2927 * charname it must be followed immediately by a '{', and not look
2928 * like \N followed by a curly quantifier, i.e., not something like
2929 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2930 * quantifier */
2931 else if (PL_lex_inpat
2932 && (*s != 'N'
2933 || s[1] != '{'
2934 || regcurly(s + 1)))
2935 {
cc74c5bd
TS
2936 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2937 goto default_action;
2938 }
02aa26ce 2939
79072805 2940 switch (*s) {
02aa26ce
NT
2941
2942 /* quoted - in transliterations */
79072805 2943 case '-':
3280af22 2944 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2945 *d++ = *s++;
2946 continue;
2947 }
2948 /* FALL THROUGH */
2949 default:
11b8faa4 2950 {
a2a5de95
NC
2951 if ((isALPHA(*s) || isDIGIT(*s)))
2952 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2953 "Unrecognized escape \\%c passed through",
2954 *s);
11b8faa4 2955 /* default action is to copy the quoted character */
f9a63242 2956 goto default_action;
11b8faa4 2957 }
02aa26ce 2958
632403cc 2959 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2960 case '0': case '1': case '2': case '3':
2961 case '4': case '5': case '6': case '7':
ba210ebe 2962 {
53305cf1
NC
2963 I32 flags = 0;
2964 STRLEN len = 3;
77a135fe 2965 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2966 s += len;
2967 }
012bcf8d 2968 goto NUM_ESCAPE_INSERT;
02aa26ce 2969
f0a2b745
KW
2970 /* eg. \o{24} indicates the octal constant \024 */
2971 case 'o':
2972 {
2973 STRLEN len;
454155d9 2974 const char* error;
f0a2b745 2975
454155d9 2976 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2977 s += len;
454155d9 2978 if (! valid) {
f0a2b745
KW
2979 yyerror(error);
2980 continue;
2981 }
2982 goto NUM_ESCAPE_INSERT;
2983 }
2984
77a135fe 2985 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2986 case 'x':
a0ed51b3
LW
2987 ++s;
2988 if (*s == '{') {
9d4ba2ae 2989 char* const e = strchr(s, '}');
a4c04bdc
NC
2990 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2991 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2992 STRLEN len;
355860ce 2993
53305cf1 2994 ++s;
adaeee49 2995 if (!e) {
a0ed51b3 2996 yyerror("Missing right brace on \\x{}");
355860ce 2997 continue;
ba210ebe 2998 }
53305cf1 2999 len = e - s;
77a135fe 3000 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 3001 s = e + 1;
a0ed51b3
LW
3002 }
3003 else {
ba210ebe 3004 {
53305cf1 3005 STRLEN len = 2;
a4c04bdc 3006 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3007 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3008 s += len;
3009 }
012bcf8d
GS
3010 }
3011
3012 NUM_ESCAPE_INSERT:
ff3f963a
KW
3013 /* Insert oct or hex escaped character. There will always be
3014 * enough room in sv since such escapes will be longer than any
3015 * UTF-8 sequence they can end up as, except if they force us
3016 * to recode the rest of the string into utf8 */
ba7cea30 3017
77a135fe 3018 /* Here uv is the ordinal of the next character being added in
ff3f963a 3019 * unicode (converted from native). */
77a135fe 3020 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3021 if (!has_utf8 && uv > 255) {
77a135fe
KW
3022 /* Might need to recode whatever we have accumulated so
3023 * far if it contains any chars variant in utf8 or
3024 * utf-ebcdic. */
3025
3026 SvCUR_set(sv, d - SvPVX_const(sv));
3027 SvPOK_on(sv);
3028 *d = '\0';
77a135fe 3029 /* See Note on sizing above. */
7bf79863
KW
3030 sv_utf8_upgrade_flags_grow(sv,
3031 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3032 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3033 d = SvPVX(sv) + SvCUR(sv);
3034 has_utf8 = TRUE;
012bcf8d
GS
3035 }
3036
77a135fe
KW
3037 if (has_utf8) {
3038 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3039 if (PL_lex_inwhat == OP_TRANS &&
3040 PL_sublex_info.sub_op) {
3041 PL_sublex_info.sub_op->op_private |=
3042 (PL_lex_repl ? OPpTRANS_FROM_UTF
3043 : OPpTRANS_TO_UTF);
f9a63242 3044 }
e294cc5d
JH
3045#ifdef EBCDIC
3046 if (uv > 255 && !dorange)
3047 native_range = FALSE;
3048#endif
012bcf8d 3049 }
a0ed51b3 3050 else {
012bcf8d 3051 *d++ = (char)uv;
a0ed51b3 3052 }
012bcf8d
GS
3053 }
3054 else {
c4d5f83a 3055 *d++ = (char) uv;
a0ed51b3 3056 }
79072805 3057 continue;
02aa26ce 3058
4a2d328f 3059 case 'N':
ff3f963a
KW
3060 /* In a non-pattern \N must be a named character, like \N{LATIN
3061 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3062 * mean to match a non-newline. For non-patterns, named
3063 * characters are converted to their string equivalents. In
3064 * patterns, named characters are not converted to their
3065 * ultimate forms for the same reasons that other escapes
3066 * aren't. Instead, they are converted to the \N{U+...} form
3067 * to get the value from the charnames that is in effect right
3068 * now, while preserving the fact that it was a named character
3069 * so that the regex compiler knows this */
3070
3071 /* This section of code doesn't generally use the
3072 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3073 * a close examination of this macro and determined it is a
3074 * no-op except on utfebcdic variant characters. Every
3075 * character generated by this that would normally need to be
3076 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3077 * needed, and would complicate use of copy(). XXX There are
3078 * other parts of this file where the macro is used
3079 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3080
3081 /* The structure of this section of code (besides checking for
3082 * errors and upgrading to utf8) is:
3083 * Further disambiguate between the two meanings of \N, and if
3084 * not a charname, go process it elsewhere
0a96133f
KW
3085 * If of form \N{U+...}, pass it through if a pattern;
3086 * otherwise convert to utf8
3087 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3088 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3089
3090 /* Here, s points to the 'N'; the test below is guaranteed to
3091 * succeed if we are being called on a pattern as we already
3092 * know from a test above that the next character is a '{'.
3093 * On a non-pattern \N must mean 'named sequence, which
3094 * requires braces */
3095 s++;
3096 if (*s != '{') {
3097 yyerror("Missing braces on \\N{}");
3098 continue;
3099 }
3100 s++;
3101
0a96133f 3102 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3103 if (! (e = strchr(s, '}'))) {
3104 if (! PL_lex_inpat) {
5777a3f7 3105 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3106 } else {
3107 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3108 }
0a96133f 3109 continue;
ff3f963a 3110 }
cddc7ef4 3111
ff3f963a 3112 /* Here it looks like a named character */
cddc7ef4 3113
ff3f963a
KW
3114 if (PL_lex_inpat) {
3115
3116 /* XXX This block is temporary code. \N{} implies that the
3117 * pattern is to have Unicode semantics, and therefore
3118 * currently has to be encoded in utf8. By putting it in
3119 * utf8 now, we save a whole pass in the regular expression
3120 * compiler. Once that code is changed so Unicode
3121 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3122 * block should be removed. However, the code that parses
3123 * the output of this would have to be changed to not
3124 * necessarily expect utf8 */
ff3f963a 3125 if (!has_utf8) {
77a135fe 3126 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3127 SvPOK_on(sv);
e4f3eed8 3128 *d = '\0';
77a135fe 3129 /* See Note on sizing above. */
7bf79863 3130 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3131 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3132 /* 5 = '\N{' + cur char + NUL */
3133 (STRLEN)(send - s) + 5);
f08d6ad9 3134 d = SvPVX(sv) + SvCUR(sv);
89491803 3135 has_utf8 = TRUE;
ff3f963a
KW
3136 }
3137 }
423cee85 3138
ff3f963a
KW
3139 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3140 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3141 | PERL_SCAN_DISALLOW_PREFIX;
3142 STRLEN len;
3143
3144 /* For \N{U+...}, the '...' is a unicode value even on
3145 * EBCDIC machines */
3146 s += 2; /* Skip to next char after the 'U+' */
3147 len = e - s;
3148 uv = grok_hex(s, &len, &flags, NULL);
3149 if (len == 0 || len != (STRLEN)(e - s)) {
3150 yyerror("Invalid hexadecimal number in \\N{U+...}");
3151 s = e + 1;
3152 continue;
3153 }
3154
3155 if (PL_lex_inpat) {
3156
e2a7e165
KW
3157 /* On non-EBCDIC platforms, pass through to the regex
3158 * compiler unchanged. The reason we evaluated the
3159 * number above is to make sure there wasn't a syntax
3160 * error. But on EBCDIC we convert to native so
3161 * downstream code can continue to assume it's native
3162 */
ff3f963a 3163 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3164#ifdef EBCDIC
3165 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3166 and the \0 */
3167 "\\N{U+%X}",
3168 (unsigned int) UNI_TO_NATIVE(uv));
3169#else
ff3f963a
KW
3170 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3171 d += e - s + 1;
e2a7e165 3172#endif
ff3f963a
KW
3173 }
3174 else { /* Not a pattern: convert the hex to string */
3175
3176 /* If destination is not in utf8, unconditionally
3177 * recode it to be so. This is because \N{} implies
3178 * Unicode semantics, and scalars have to be in utf8
3179 * to guarantee those semantics */
3180 if (! has_utf8) {
3181 SvCUR_set(sv, d - SvPVX_const(sv));
3182 SvPOK_on(sv);
3183 *d = '\0';
3184 /* See Note on sizing above. */
3185 sv_utf8_upgrade_flags_grow(
3186 sv,
3187 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3188 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3189 d = SvPVX(sv) + SvCUR(sv);
3190 has_utf8 = TRUE;
3191 }
3192
3193 /* Add the string to the output */
3194 if (UNI_IS_INVARIANT(uv)) {
3195 *d++ = (char) uv;
3196 }
3197 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3198 }
3199 }
3200 else { /* Here is \N{NAME} but not \N{U+...}. */
3201
3202 SV *res; /* result from charnames */
3203 const char *str; /* the string in 'res' */
3204 STRLEN len; /* its length */
3205
3206 /* Get the value for NAME */
3207 res = newSVpvn(s, e - s);
3208 res = new_constant( NULL, 0, "charnames",
3209 /* includes all of: \N{...} */
3210 res, NULL, s - 3, e - s + 4 );
3211
3212 /* Most likely res will be in utf8 already since the
3213 * standard charnames uses pack U, but a custom translator
3214 * can leave it otherwise, so make sure. XXX This can be
3215 * revisited to not have charnames use utf8 for characters
3216 * that don't need it when regexes don't have to be in utf8
3217 * for Unicode semantics. If doing so, remember EBCDIC */
3218 sv_utf8_upgrade(res);
3219 str = SvPV_const(res, len);
3220
3221 /* Don't accept malformed input */
3222 if (! is_utf8_string((U8 *) str, len)) {
3223 yyerror("Malformed UTF-8 returned by \\N");
3224 }
3225 else if (PL_lex_inpat) {
3226
3227 if (! len) { /* The name resolved to an empty string */
3228 Copy("\\N{}", d, 4, char);
3229 d += 4;
3230 }
3231 else {
3232 /* In order to not lose information for the regex
3233 * compiler, pass the result in the specially made
3234 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3235 * the code points in hex of each character
3236 * returned by charnames */
3237
3238 const char *str_end = str + len;
3239 STRLEN char_length; /* cur char's byte length */
3240 STRLEN output_length; /* and the number of bytes
3241 after this is translated
3242 into hex digits */
3243 const STRLEN off = d - SvPVX_const(sv);
3244
3245 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3246 * max('U+', '.'); and 1 for NUL */
3247 char hex_string[2 * UTF8_MAXBYTES + 5];
3248
3249 /* Get the first character of the result. */
3250 U32 uv = utf8n_to_uvuni((U8 *) str,
3251 len,
3252 &char_length,
3253 UTF8_ALLOW_ANYUV);
3254
3255 /* The call to is_utf8_string() above hopefully
3256 * guarantees that there won't be an error. But
3257 * it's easy here to make sure. The function just
3258 * above warns and returns 0 if invalid utf8, but
3259 * it can also return 0 if the input is validly a
3260 * NUL. Disambiguate */
3261 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3262 uv = UNICODE_REPLACEMENT;
3263 }
3264
3265 /* Convert first code point to hex, including the
e2a7e165
KW
3266 * boiler plate before it. For all these, we
3267 * convert to native format so that downstream code
3268 * can continue to assume the input is native */
78c35590 3269 output_length =
3353de27 3270 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3271 "\\N{U+%X",
3272 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3273
3274 /* Make sure there is enough space to hold it */
3275 d = off + SvGROW(sv, off
3276 + output_length
3277 + (STRLEN)(send - e)
3278 + 2); /* '}' + NUL */
3279 /* And output it */
3280 Copy(hex_string, d, output_length, char);
3281 d += output_length;
3282
3283 /* For each subsequent character, append dot and
3284 * its ordinal in hex */
3285 while ((str += char_length) < str_end) {
3286 const STRLEN off = d - SvPVX_const(sv);
3287 U32 uv = utf8n_to_uvuni((U8 *) str,
3288 str_end - str,
3289 &char_length,
3290 UTF8_ALLOW_ANYUV);
3291 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3292 uv = UNICODE_REPLACEMENT;
3293 }
3294
78c35590 3295 output_length =
3353de27 3296 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3297 ".%X",
3298 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3299
3300 d = off + SvGROW(sv, off
3301 + output_length
3302 + (STRLEN)(send - e)
3303 + 2); /* '}' + NUL */
3304 Copy(hex_string, d, output_length, char);
3305 d += output_length;
3306 }
3307
3308 *d++ = '}'; /* Done. Add the trailing brace */
3309 }
3310 }
3311 else { /* Here, not in a pattern. Convert the name to a
3312 * string. */
3313
3314 /* If destination is not in utf8, unconditionally
3315 * recode it to be so. This is because \N{} implies
3316 * Unicode semantics, and scalars have to be in utf8
3317 * to guarantee those semantics */
3318 if (! has_utf8) {
3319 SvCUR_set(sv, d - SvPVX_const(sv));
3320 SvPOK_on(sv);
3321 *d = '\0';
3322 /* See Note on sizing above. */
3323 sv_utf8_upgrade_flags_grow(sv,
3324 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3325 len + (STRLEN)(send - s) + 1);
3326 d = SvPVX(sv) + SvCUR(sv);
3327 has_utf8 = TRUE;
3328 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3329
3330 /* See Note on sizing above. (NOTE: SvCUR() is not
3331 * set correctly here). */
3332 const STRLEN off = d - SvPVX_const(sv);
3333 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3334 }
3335 Copy(str, d, len, char);
3336 d += len;
423cee85 3337 }
423cee85 3338 SvREFCNT_dec(res);
cb233ae3
KW
3339
3340 /* Deprecate non-approved name syntax */
3341 if (ckWARN_d(WARN_DEPRECATED)) {
3342 bool problematic = FALSE;
3343 char* i = s;
3344
3345 /* For non-ut8 input, look to see that the first
3346 * character is an alpha, then loop through the rest
3347 * checking that each is a continuation */
3348 if (! this_utf8) {
3349 if (! isALPHAU(*i)) problematic = TRUE;
3350 else for (i = s + 1; i < e; i++) {
3351 if (isCHARNAME_CONT(*i)) continue;
3352 problematic = TRUE;
3353 break;
3354 }
3355 }
3356 else {
3357 /* Similarly for utf8. For invariants can check
3358 * directly. We accept anything above the latin1
3359 * range because it is immaterial to Perl if it is
3360 * correct or not, and is expensive to check. But
3361 * it is fairly easy in the latin1 range to convert
3362 * the variants into a single character and check
3363 * those */
3364 if (UTF8_IS_INVARIANT(*i)) {
3365 if (! isALPHAU(*i)) problematic = TRUE;
3366 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3367 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3368 *(i+1)))))
3369 {
3370 problematic = TRUE;
3371 }
3372 }
3373 if (! problematic) for (i = s + UTF8SKIP(s);
3374 i < e;
3375 i+= UTF8SKIP(i))
3376 {
3377 if (UTF8_IS_INVARIANT(*i)) {
3378 if (isCHARNAME_CONT(*i)) continue;
3379 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3380 continue;
3381 } else if (isCHARNAME_CONT(
3382 UNI_TO_NATIVE(
81c14aa2 3383 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3384 {
3385 continue;
3386 }
3387 problematic = TRUE;
3388 break;
3389 }
3390 }
3391 if (problematic) {
6e1bad6c
KW
3392 /* The e-i passed to the final %.*s makes sure that
3393 * should the trailing NUL be missing that this
3394 * print won't run off the end of the string */
cb233ae3 3395 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3396 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3397 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3398 }
3399 }
3400 } /* End \N{NAME} */
ff3f963a
KW
3401#ifdef EBCDIC
3402 if (!dorange)
3403 native_range = FALSE; /* \N{} is defined to be Unicode */
3404#endif
3405 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3406 continue;
3407
02aa26ce 3408 /* \c is a control character */
79072805
LW
3409 case 'c':
3410 s++;
961ce445 3411 if (s < send) {
17a3df4c 3412 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3413 }
961ce445
RGS
3414 else {
3415 yyerror("Missing control char name in \\c");
3416 }
79072805 3417 continue;
02aa26ce
NT
3418
3419 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3420 case 'b':
db42d148 3421 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3422 break;
3423 case 'n':
db42d148 3424 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3425 break;
3426 case 'r':
db42d148 3427 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3428 break;
3429 case 'f':
db42d148 3430 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3431 break;
3432 case 't':
db42d148 3433 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3434 break;
34a3fe2a 3435 case 'e':
db42d148 3436 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3437 break;
3438 case 'a':
db42d148 3439 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3440 break;
02aa26ce
NT
3441 } /* end switch */
3442
79072805
LW
3443 s++;
3444 continue;
02aa26ce 3445 } /* end if (backslash) */
4c3a8340
TS
3446#ifdef EBCDIC
3447 else
3448 literal_endpoint++;
3449#endif
02aa26ce 3450
f9a63242 3451 default_action:
77a135fe
KW
3452 /* If we started with encoded form, or already know we want it,
3453 then encode the next character */
3454 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3455 STRLEN len = 1;
77a135fe
KW
3456
3457
3458 /* One might think that it is wasted effort in the case of the
3459 * source being utf8 (this_utf8 == TRUE) to take the next character
3460 * in the source, convert it to an unsigned value, and then convert
3461 * it back again. But the source has not been validated here. The
3462 * routine that does the conversion checks for errors like
3463 * malformed utf8 */
3464
5f66b61c
AL
3465 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3466 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3467 if (!has_utf8) {
3468 SvCUR_set(sv, d - SvPVX_const(sv));
3469 SvPOK_on(sv);
3470 *d = '\0';
77a135fe 3471 /* See Note on sizing above. */
7bf79863
KW
3472 sv_utf8_upgrade_flags_grow(sv,
3473 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3474 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3475 d = SvPVX(sv) + SvCUR(sv);
3476 has_utf8 = TRUE;
3477 } else if (need > len) {
3478 /* encoded value larger than old, may need extra space (NOTE:
3479 * SvCUR() is not set correctly here). See Note on sizing
3480 * above. */
9d4ba2ae 3481 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3482 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3483 }
77a135fe
KW
3484 s += len;
3485
5f66b61c 3486 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3487#ifdef EBCDIC
3488 if (uv > 255 && !dorange)
3489 native_range = FALSE;
3490#endif
2b9d42f0
NIS
3491 }
3492 else {
3493 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3494 }
02aa26ce
NT
3495 } /* while loop to process each character */
3496
3497 /* terminate the string and set up the sv */
79072805 3498 *d = '\0';
95a20fc0 3499 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3500 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3501 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3502
79072805 3503 SvPOK_on(sv);
9f4817db 3504 if (PL_encoding && !has_utf8) {
d0063567
DK
3505 sv_recode_to_utf8(sv, PL_encoding);
3506 if (SvUTF8(sv))
3507 has_utf8 = TRUE;
9f4817db 3508 }
2b9d42f0 3509 if (has_utf8) {
7e2040f0 3510 SvUTF8_on(sv);
2b9d42f0 3511 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3512 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3513 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3514 }
3515 }
79072805 3516
02aa26ce 3517 /* shrink the sv if we allocated more than we used */
79072805 3518 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3519 SvPV_shrink_to_cur(sv);
79072805 3520 }
02aa26ce 3521
6154021b 3522 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3523 if (s > PL_bufptr) {
eb0d8d16
NC
3524 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3525 const char *const key = PL_lex_inpat ? "qr" : "q";
3526 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3527 const char *type;
3528 STRLEN typelen;
3529
3530 if (PL_lex_inwhat == OP_TRANS) {
3531 type = "tr";
3532 typelen = 2;
3533 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3534 type = "s";
3535 typelen = 1;
3536 } else {
3537 type = "qq";
3538 typelen = 2;
3539 }
3540
3541 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3542 type, typelen);
3543 }
6154021b 3544 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3545 } else
8990e307 3546 SvREFCNT_dec(sv);
79072805
LW
3547 return s;
3548}
3549
ffb4593c
NT
3550/* S_intuit_more
3551 * Returns TRUE if there's more to the expression (e.g., a subscript),
3552 * FALSE otherwise.
ffb4593c
NT
3553 *
3554 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3555 *
3556 * ->[ and ->{ return TRUE
3557 * { and [ outside a pattern are always subscripts, so return TRUE
3558 * if we're outside a pattern and it's not { or [, then return FALSE
3559 * if we're in a pattern and the first char is a {
3560 * {4,5} (any digits around the comma) returns FALSE
3561 * if we're in a pattern and the first char is a [
3562 * [] returns FALSE
3563 * [SOMETHING] has a funky algorithm to decide whether it's a
3564 * character class or not. It has to deal with things like
3565 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3566 * anything else returns TRUE
3567 */
3568
9cbb5ea2
GS
3569/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3570
76e3520e 3571STATIC int
cea2e8a9 3572S_intuit_more(pTHX_ register char *s)
79072805 3573{
97aff369 3574 dVAR;
7918f24d
NC
3575
3576 PERL_ARGS_ASSERT_INTUIT_MORE;
3577
3280af22 3578 if (PL_lex_brackets)
79072805
LW
3579 return TRUE;
3580 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3581 return TRUE;
3582 if (*s != '{' && *s != '[')
3583 return FALSE;
3280af22 3584 if (!PL_lex_inpat)
79072805
LW
3585 return TRUE;
3586
3587 /* In a pattern, so maybe we have {n,m}. */
3588 if (*s == '{') {
b3155d95 3589 if (regcurly(s)) {
79072805 3590 return FALSE;
b3155d95 3591 }
79072805 3592 return TRUE;
79072805
LW
3593 }
3594
3595 /* On the other hand, maybe we have a character class */
3596
3597 s++;
3598 if (*s == ']' || *s == '^')
3599 return FALSE;
3600 else {
ffb4593c 3601 /* this is terrifying, and it works */
79072805
LW
3602 int weight = 2; /* let's weigh the evidence */
3603 char seen[256];
f27ffc4a 3604 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3605 const char * const send = strchr(s,']');
3280af22 3606 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3607
3608 if (!send) /* has to be an expression */
3609 return TRUE;
3610
3611 Zero(seen,256,char);
3612 if (*s == '$')
3613 weight -= 3;
3614 else if (isDIGIT(*s)) {
3615 if (s[1] != ']') {
3616 if (isDIGIT(s[1]) && s[2] == ']')
3617 weight -= 10;
3618 }
3619 else
3620 weight -= 100;
3621 }
3622 for (; s < send; s++) {
3623 last_un_char = un_char;
3624 un_char = (unsigned char)*s;
3625 switch (*s) {
3626 case '@':
3627 case '&':
3628 case '$':
3629 weight -= seen[un_char] * 10;
7e2040f0 3630 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3631 int len;
8903cb82 3632 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3633 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3634 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3635 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3636 weight -= 100;
3637 else
3638 weight -= 10;
3639 }
3640 else if (*s == '$' && s[1] &&
93a17b20
LW
3641 strchr("[#!%*<>()-=",s[1])) {
3642 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3643 weight -= 10;
3644 else
3645 weight -= 1;
3646 }
3647 break;
3648 case '\\':
3649 un_char = 254;
3650 if (s[1]) {
93a17b20 3651 if (strchr("wds]",s[1]))
79072805 3652 weight += 100;
10edeb5d 3653 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3654 weight += 1;
93a17b20 3655 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3656 weight += 40;
3657 else if (isDIGIT(s[1])) {
3658 weight += 40;
3659 while (s[1] && isDIGIT(s[1]))
3660 s++;
3661 }
3662 }
3663 else
3664 weight += 100;
3665 break;
3666 case '-':
3667 if (s[1] == '\\')
3668 weight += 50;
93a17b20 3669 if (strchr("aA01! ",last_un_char))
79072805 3670 weight += 30;
93a17b20 3671 if (strchr("zZ79~",s[1]))
79072805 3672 weight += 30;
f27ffc4a
GS
3673 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3674 weight -= 5; /* cope with negative subscript */
79072805
LW
3675 break;
3676 default:
3792a11b
NC
3677 if (!isALNUM(last_un_char)
3678 && !(last_un_char == '$' || last_un_char == '@'
3679 || last_un_char == '&')
3680 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3681 char *d = tmpbuf;
3682 while (isALPHA(*s))
3683 *d++ = *s++;
3684 *d = '\0';
5458a98a 3685 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3686 weight -= 150;
3687 }
3688 if (un_char == last_un_char + 1)
3689 weight += 5;
3690 weight -= seen[un_char];
3691 break;
3692 }
3693 seen[un_char]++;
3694 }
3695 if (weight >= 0) /* probably a character class */
3696 return FALSE;
3697 }
3698
3699 return TRUE;
3700}
ffed7fef 3701
ffb4593c
NT
3702/*
3703 * S_intuit_method
3704 *
3705 * Does all the checking to disambiguate
3706 * foo bar
3707 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3708 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3709 *
3710 * First argument is the stuff after the first token, e.g. "bar".
3711 *
3712 * Not a method if bar is a filehandle.
3713 * Not a method if foo is a subroutine prototyped to take a filehandle.
3714 * Not a method if it's really "Foo $bar"
3715 * Method if it's "foo $bar"
3716 * Not a method if it's really "print foo $bar"
3717 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3718 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3719 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3720 * =>
3721 */
3722
76e3520e 3723STATIC int
62d55b22 3724S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3725{
97aff369 3726 dVAR;
a0d0e21e 3727 char *s = start + (*start == '$');
3280af22 3728 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3729 STRLEN len;
3730 GV* indirgv;
5db06880
NC
3731#ifdef PERL_MAD
3732 int soff;
3733#endif
a0d0e21e 3734
7918f24d
NC
3735 PERL_ARGS_ASSERT_INTUIT_METHOD;
3736
a0d0e21e 3737 if (gv) {
62d55b22 3738 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3739 return 0;
62d55b22
NC
3740 if (cv) {
3741 if (SvPOK(cv)) {
3742 const char *proto = SvPVX_const(cv);
3743 if (proto) {
3744 if (*proto == ';')
3745 proto++;
3746 if (*proto == '*')
3747 return 0;
3748 }
b6c543e3
IZ
3749 }
3750 } else
c35e046a 3751 gv = NULL;
a0d0e21e 3752 }
8903cb82 3753 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3754 /* start is the beginning of the possible filehandle/object,
3755 * and s is the end of it
3756 * tmpbuf is a copy of it
3757 */
3758
a0d0e21e 3759 if (*start == '$') {
3ef1310e
RGS
3760 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3761 isUPPER(*PL_tokenbuf))
a0d0e21e 3762 return 0;
5db06880
NC
3763#ifdef PERL_MAD
3764 len = start - SvPVX(PL_linestr);
3765#endif
29595ff2 3766 s = PEEKSPACE(s);
f0092767 3767#ifdef PERL_MAD
5db06880
NC
3768 start = SvPVX(PL_linestr) + len;
3769#endif
3280af22
NIS
3770 PL_bufptr = start;
3771 PL_expect = XREF;
a0d0e21e
LW
3772 return *s == '(' ? FUNCMETH : METHOD;
3773 }
5458a98a 3774 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS