This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
pp.c: pp_rv2gv UTF8 cleanup.
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
746b446a 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a 153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
376fcdbf 275#define UNI2(f,x) { \
6154021b 276 pl_yylval.ival = f; \
376fcdbf
AL
277 PL_expect = x; \
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
6f33ba73
RGS
286#define UNI(f) UNI2(f,XTERM)
287#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 288
376fcdbf 289#define UNIBRACK(f) { \
6154021b 290 pl_yylval.ival = f; \
376fcdbf
AL
291 PL_bufptr = s; \
292 PL_last_uni = PL_oldbufptr; \
293 if (*s == '(') \
294 return REPORT( (int)FUNC1 ); \
29595ff2 295 s = PEEKSPACE(s); \
376fcdbf
AL
296 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
297 }
79072805 298
9f68db38 299/* grandfather return to old style */
78cdf107
Z
300#define OLDLOP(f) \
301 do { \
302 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
303 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
304 pl_yylval.ival = (f); \
305 PL_expect = XTERM; \
306 PL_bufptr = s; \
307 return (int)LSTOP; \
308 } while(0)
79072805 309
8fa7f367
JH
310#ifdef DEBUGGING
311
6154021b 312/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
313enum token_type {
314 TOKENTYPE_NONE,
315 TOKENTYPE_IVAL,
6154021b 316 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
317 TOKENTYPE_PVAL,
318 TOKENTYPE_OPVAL,
319 TOKENTYPE_GVVAL
320};
321
6d4a66ac
NC
322static struct debug_tokens {
323 const int token;
324 enum token_type type;
325 const char *name;
326} const debug_tokens[] =
9041c2e3 327{
bbf60fe6
DM
328 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
329 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
330 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
331 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
332 { ARROW, TOKENTYPE_NONE, "ARROW" },
333 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
334 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
335 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
336 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
337 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 338 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
339 { DO, TOKENTYPE_NONE, "DO" },
340 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
341 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
342 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
343 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
344 { ELSE, TOKENTYPE_NONE, "ELSE" },
345 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
346 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
347 { FOR, TOKENTYPE_IVAL, "FOR" },
348 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
349 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
350 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 351 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
352 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
353 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
354 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 355 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
356 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
357 { IF, TOKENTYPE_IVAL, "IF" },
358 { LABEL, TOKENTYPE_PVAL, "LABEL" },
359 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
360 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
361 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
362 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
363 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
364 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
365 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
366 { MY, TOKENTYPE_IVAL, "MY" },
367 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
368 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
369 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
370 { OROP, TOKENTYPE_IVAL, "OROP" },
371 { OROR, TOKENTYPE_NONE, "OROR" },
372 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
373 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
374 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
375 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
376 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
377 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
378 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
379 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
380 { PREINC, TOKENTYPE_NONE, "PREINC" },
381 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
382 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
383 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
384 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
385 { SUB, TOKENTYPE_NONE, "SUB" },
386 { THING, TOKENTYPE_OPVAL, "THING" },
387 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
388 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
389 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
390 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
391 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
392 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 393 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
394 { WHILE, TOKENTYPE_IVAL, "WHILE" },
395 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 396 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 397 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
398};
399
6154021b 400/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 401
bbf60fe6 402STATIC int
704d4215 403S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 404{
97aff369 405 dVAR;
7918f24d
NC
406
407 PERL_ARGS_ASSERT_TOKEREPORT;
408
bbf60fe6 409 if (DEBUG_T_TEST) {
bd61b366 410 const char *name = NULL;
bbf60fe6 411 enum token_type type = TOKENTYPE_NONE;
f54cb97a 412 const struct debug_tokens *p;
396482e1 413 SV* const report = newSVpvs("<== ");
bbf60fe6 414
f54cb97a 415 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
416 if (p->token == (int)rv) {
417 name = p->name;
418 type = p->type;
419 break;
420 }
421 }
422 if (name)
54667de8 423 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
424 else if ((char)rv > ' ' && (char)rv < '~')
425 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
426 else if (!rv)
396482e1 427 sv_catpvs(report, "EOF");
bbf60fe6
DM
428 else
429 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
430 switch (type) {
431 case TOKENTYPE_NONE:
432 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
433 break;
434 case TOKENTYPE_IVAL:
704d4215 435 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
436 break;
437 case TOKENTYPE_OPNUM:
438 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 439 PL_op_name[lvalp->ival]);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_PVAL:
704d4215 442 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
443 break;
444 case TOKENTYPE_OPVAL:
704d4215 445 if (lvalp->opval) {
401441c0 446 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
447 PL_op_name[lvalp->opval->op_type]);
448 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 449 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 450 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
451 }
452
453 }
401441c0 454 else
396482e1 455 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
456 break;
457 }
b6007c36 458 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
459 };
460 return (int)rv;
998054bd
SC
461}
462
b6007c36
DM
463
464/* print the buffer with suitable escapes */
465
466STATIC void
15f169a1 467S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 468{
396482e1 469 SV* const tmp = newSVpvs("");
7918f24d
NC
470
471 PERL_ARGS_ASSERT_PRINTBUF;
472
b6007c36
DM
473 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
474 SvREFCNT_dec(tmp);
475}
476
8fa7f367
JH
477#endif
478
8290c323
NC
479static int
480S_deprecate_commaless_var_list(pTHX) {
481 PL_expect = XTERM;
482 deprecate("comma-less variable list");
483 return REPORT(','); /* grandfather non-comma-format format */
484}
485
ffb4593c
NT
486/*
487 * S_ao
488 *
c963b151
BD
489 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
490 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
491 */
492
76e3520e 493STATIC int
cea2e8a9 494S_ao(pTHX_ int toketype)
a0d0e21e 495{
97aff369 496 dVAR;
3280af22
NIS
497 if (*PL_bufptr == '=') {
498 PL_bufptr++;
a0d0e21e 499 if (toketype == ANDAND)
6154021b 500 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 501 else if (toketype == OROR)
6154021b 502 pl_yylval.ival = OP_ORASSIGN;
c963b151 503 else if (toketype == DORDOR)
6154021b 504 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
505 toketype = ASSIGNOP;
506 }
507 return toketype;
508}
509
ffb4593c
NT
510/*
511 * S_no_op
512 * When Perl expects an operator and finds something else, no_op
513 * prints the warning. It always prints "<something> found where
514 * operator expected. It prints "Missing semicolon on previous line?"
515 * if the surprise occurs at the start of the line. "do you need to
516 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
517 * where the compiler doesn't know if foo is a method call or a function.
518 * It prints "Missing operator before end of line" if there's nothing
519 * after the missing operator, or "... before <...>" if there is something
520 * after the missing operator.
521 */
522
76e3520e 523STATIC void
15f169a1 524S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 525{
97aff369 526 dVAR;
9d4ba2ae
AL
527 char * const oldbp = PL_bufptr;
528 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 529
7918f24d
NC
530 PERL_ARGS_ASSERT_NO_OP;
531
1189a94a
GS
532 if (!s)
533 s = oldbp;
07c798fb 534 else
1189a94a 535 PL_bufptr = s;
cea2e8a9 536 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
537 if (ckWARN_d(WARN_SYNTAX)) {
538 if (is_first)
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
540 "\t(Missing semicolon on previous line?)\n");
541 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 542 const char *t;
c35e046a
AL
543 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
544 NOOP;
56da5a46
RGS
545 if (t < PL_bufptr && isSPACE(*t))
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
547 "\t(Do you need to predeclare %.*s?)\n",
551405c4 548 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
549 }
550 else {
551 assert(s >= oldbp);
552 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 553 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 554 }
07c798fb 555 }
3280af22 556 PL_bufptr = oldbp;
8990e307
LW
557}
558
ffb4593c
NT
559/*
560 * S_missingterm
561 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 562 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
563 * If we're in a delimited string and the delimiter is a control
564 * character, it's reformatted into a two-char sequence like ^C.
565 * This is fatal.
566 */
567
76e3520e 568STATIC void
cea2e8a9 569S_missingterm(pTHX_ char *s)
8990e307 570{
97aff369 571 dVAR;
8990e307
LW
572 char tmpbuf[3];
573 char q;
574 if (s) {
9d4ba2ae 575 char * const nl = strrchr(s,'\n');
d2719217 576 if (nl)
8990e307
LW
577 *nl = '\0';
578 }
463559e7 579 else if (isCNTRL(PL_multi_close)) {
8990e307 580 *tmpbuf = '^';
585ec06d 581 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
582 tmpbuf[2] = '\0';
583 s = tmpbuf;
584 }
585 else {
eb160463 586 *tmpbuf = (char)PL_multi_close;
8990e307
LW
587 tmpbuf[1] = '\0';
588 s = tmpbuf;
589 }
590 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 591 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 592}
79072805 593
0d863452 594/*
0d863452
RH
595 * Check whether the named feature is enabled.
596 */
26ea9e12
NC
597bool
598Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 599{
97aff369 600 dVAR;
0d863452 601 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 602 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
603
604 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
605
26ea9e12
NC
606 if (namelen > MAX_FEATURE_LEN)
607 return FALSE;
4a731d7b 608 memcpy(&he_name[8], name, namelen);
d4c19fe8 609
7b9ef140 610 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
611}
612
ffb4593c 613/*
9cbb5ea2
GS
614 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
615 * utf16-to-utf8-reversed.
ffb4593c
NT
616 */
617
c39cd008
GS
618#ifdef PERL_CR_FILTER
619static void
620strip_return(SV *sv)
621{
95a20fc0 622 register const char *s = SvPVX_const(sv);
9d4ba2ae 623 register const char * const e = s + SvCUR(sv);
7918f24d
NC
624
625 PERL_ARGS_ASSERT_STRIP_RETURN;
626
c39cd008
GS
627 /* outer loop optimized to do nothing if there are no CR-LFs */
628 while (s < e) {
629 if (*s++ == '\r' && *s == '\n') {
630 /* hit a CR-LF, need to copy the rest */
631 register char *d = s - 1;
632 *d++ = *s++;
633 while (s < e) {
634 if (*s == '\r' && s[1] == '\n')
635 s++;
636 *d++ = *s++;
637 }
638 SvCUR(sv) -= s - d;
639 return;
640 }
641 }
642}
a868473f 643
76e3520e 644STATIC I32
c39cd008 645S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 646{
f54cb97a 647 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
648 if (count > 0 && !maxlen)
649 strip_return(sv);
650 return count;
a868473f
NIS
651}
652#endif
653
ffb4593c 654/*
8eaa0acf
Z
655=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
656
657Creates and initialises a new lexer/parser state object, supplying
658a context in which to lex and parse from a new source of Perl code.
659A pointer to the new state object is placed in L</PL_parser>. An entry
660is made on the save stack so that upon unwinding the new state object
661will be destroyed and the former value of L</PL_parser> will be restored.
662Nothing else need be done to clean up the parsing context.
663
664The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
665non-null, provides a string (in SV form) containing code to be parsed.
666A copy of the string is made, so subsequent modification of I<line>
667does not affect parsing. I<rsfp>, if non-null, provides an input stream
668from which code will be read to be parsed. If both are non-null, the
669code in I<line> comes first and must consist of complete lines of input,
670and I<rsfp> supplies the remainder of the source.
671
672The I<flags> parameter is reserved for future use, and must always
27fcb6ee
FC
673be zero, except for one flag that is currently reserved for perl's internal
674use.
8eaa0acf
Z
675
676=cut
677*/
ffb4593c 678
27fcb6ee
FC
679/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
680 can share filters with the current parser. */
681
a0d0e21e 682void
8eaa0acf 683Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 684{
97aff369 685 dVAR;
6ef55633 686 const char *s = NULL;
8990e307 687 STRLEN len;
5486870f 688 yy_parser *parser, *oparser;
27fcb6ee 689 if (flags && flags != LEX_START_SAME_FILTER)
8eaa0acf 690 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
691
692 /* create and initialise a parser */
693
199e78b7 694 Newxz(parser, 1, yy_parser);
5486870f 695 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
696 PL_parser = parser;
697
28ac2b49
Z
698 parser->stack = NULL;
699 parser->ps = NULL;
700 parser->stack_size = 0;
acdf0a21 701
e3abe207
DM
702 /* on scope exit, free this parser and restore any outer one */
703 SAVEPARSER(parser);
7c4baf47 704 parser->saved_curcop = PL_curcop;
e3abe207 705
acdf0a21 706 /* initialise lexer state */
8990e307 707
fb205e7a
DM
708#ifdef PERL_MAD
709 parser->curforce = -1;
710#else
711 parser->nexttoke = 0;
712#endif
ca4cfd28 713 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 714 parser->copline = NOLINE;
5afb0a62 715 parser->lex_state = LEX_NORMAL;
c2598295 716 parser->expect = XSTATE;
2f9285f8 717 parser->rsfp = rsfp;
27fcb6ee
FC
718 parser->rsfp_filters =
719 !(flags & LEX_START_SAME_FILTER) || !oparser
720 ? newAV()
721 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 722
199e78b7
DM
723 Newx(parser->lex_brackstack, 120, char);
724 Newx(parser->lex_casestack, 12, char);
725 *parser->lex_casestack = '\0';
02b34bbe 726
10efb74f
NC
727 if (line) {
728 s = SvPV_const(line, len);
729 } else {
730 len = 0;
731 }
bdc0bf6f 732
10efb74f 733 if (!len) {
bdc0bf6f 734 parser->linestr = newSVpvs("\n;");
805700c1 735 } else {
719a9bb0 736 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 737 if (s[len-1] != ';')
bdc0bf6f 738 sv_catpvs(parser->linestr, "\n;");
8990e307 739 }
f06b5848
DM
740 parser->oldoldbufptr =
741 parser->oldbufptr =
742 parser->bufptr =
743 parser->linestart = SvPVX(parser->linestr);
744 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
745 parser->last_lop = parser->last_uni = NULL;
737c24fc
Z
746
747 parser->in_pod = 0;
79072805 748}
a687059c 749
e3abe207
DM
750
751/* delete a parser object */
752
753void
754Perl_parser_free(pTHX_ const yy_parser *parser)
755{
7918f24d
NC
756 PERL_ARGS_ASSERT_PARSER_FREE;
757
7c4baf47 758 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
759 SvREFCNT_dec(parser->linestr);
760
2f9285f8
DM
761 if (parser->rsfp == PerlIO_stdin())
762 PerlIO_clearerr(parser->rsfp);
799361c3
SH
763 else if (parser->rsfp && (!parser->old_parser ||
764 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 765 PerlIO_close(parser->rsfp);
5486870f 766 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 767
e3abe207
DM
768 Safefree(parser->lex_brackstack);
769 Safefree(parser->lex_casestack);
770 PL_parser = parser->old_parser;
771 Safefree(parser);
772}
773
774
ffb4593c 775/*
f0e67a1d
Z
776=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
777
778Buffer scalar containing the chunk currently under consideration of the
779text currently being lexed. This is always a plain string scalar (for
780which C<SvPOK> is true). It is not intended to be used as a scalar by
781normal scalar means; instead refer to the buffer directly by the pointer
782variables described below.
783
784The lexer maintains various C<char*> pointers to things in the
785C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
786reallocated, all of these pointers must be updated. Don't attempt to
787do this manually, but rather use L</lex_grow_linestr> if you need to
788reallocate the buffer.
789
790The content of the text chunk in the buffer is commonly exactly one
791complete line of input, up to and including a newline terminator,
792but there are situations where it is otherwise. The octets of the
793buffer may be intended to be interpreted as either UTF-8 or Latin-1.
794The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
795flag on this scalar, which may disagree with it.
796
797For direct examination of the buffer, the variable
798L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
799lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
800of these pointers is usually preferable to examination of the scalar
801through normal scalar means.
802
803=for apidoc AmxU|char *|PL_parser-E<gt>bufend
804
805Direct pointer to the end of the chunk of text currently being lexed, the
806end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
807+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
808always located at the end of the buffer, and does not count as part of
809the buffer's contents.
810
811=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
812
813Points to the current position of lexing inside the lexer buffer.
814Characters around this point may be freely examined, within
815the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
816L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
817interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
818
819Lexing code (whether in the Perl core or not) moves this pointer past
820the characters that it consumes. It is also expected to perform some
821bookkeeping whenever a newline character is consumed. This movement
822can be more conveniently performed by the function L</lex_read_to>,
823which handles newlines appropriately.
824
825Interpretation of the buffer's octets can be abstracted out by
826using the slightly higher-level functions L</lex_peek_unichar> and
827L</lex_read_unichar>.
828
829=for apidoc AmxU|char *|PL_parser-E<gt>linestart
830
831Points to the start of the current line inside the lexer buffer.
832This is useful for indicating at which column an error occurred, and
833not much else. This must be updated by any lexing code that consumes
834a newline; the function L</lex_read_to> handles this detail.
835
836=cut
837*/
838
839/*
840=for apidoc Amx|bool|lex_bufutf8
841
842Indicates whether the octets in the lexer buffer
843(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
844of Unicode characters. If not, they should be interpreted as Latin-1
845characters. This is analogous to the C<SvUTF8> flag for scalars.
846
847In UTF-8 mode, it is not guaranteed that the lexer buffer actually
848contains valid UTF-8. Lexing code must be robust in the face of invalid
849encoding.
850
851The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
852is significant, but not the whole story regarding the input character
853encoding. Normally, when a file is being read, the scalar contains octets
854and its C<SvUTF8> flag is off, but the octets should be interpreted as
855UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
856however, the scalar may have the C<SvUTF8> flag on, and in this case its
857octets should be interpreted as UTF-8 unless the C<use bytes> pragma
858is in effect. This logic may change in the future; use this function
859instead of implementing the logic yourself.
860
861=cut
862*/
863
864bool
865Perl_lex_bufutf8(pTHX)
866{
867 return UTF;
868}
869
870/*
871=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
872
873Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
874at least I<len> octets (including terminating NUL). Returns a
875pointer to the reallocated buffer. This is necessary before making
876any direct modification of the buffer that would increase its length.
877L</lex_stuff_pvn> provides a more convenient way to insert text into
878the buffer.
879
880Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
881this function updates all of the lexer's variables that point directly
882into the buffer.
883
884=cut
885*/
886
887char *
888Perl_lex_grow_linestr(pTHX_ STRLEN len)
889{
890 SV *linestr;
891 char *buf;
892 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
893 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
894 linestr = PL_parser->linestr;
895 buf = SvPVX(linestr);
896 if (len <= SvLEN(linestr))
897 return buf;
898 bufend_pos = PL_parser->bufend - buf;
899 bufptr_pos = PL_parser->bufptr - buf;
900 oldbufptr_pos = PL_parser->oldbufptr - buf;
901 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
902 linestart_pos = PL_parser->linestart - buf;
903 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
904 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
905 buf = sv_grow(linestr, len);
906 PL_parser->bufend = buf + bufend_pos;
907 PL_parser->bufptr = buf + bufptr_pos;
908 PL_parser->oldbufptr = buf + oldbufptr_pos;
909 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
910 PL_parser->linestart = buf + linestart_pos;
911 if (PL_parser->last_uni)
912 PL_parser->last_uni = buf + last_uni_pos;
913 if (PL_parser->last_lop)
914 PL_parser->last_lop = buf + last_lop_pos;
915 return buf;
916}
917
918/*
83aa740e 919=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
920
921Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
922immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
923reallocating the buffer if necessary. This means that lexing code that
924runs later will see the characters as if they had appeared in the input.
925It is not recommended to do this as part of normal parsing, and most
926uses of this facility run the risk of the inserted characters being
927interpreted in an unintended manner.
928
929The string to be inserted is represented by I<len> octets starting
930at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
931according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
932The characters are recoded for the lexer buffer, according to how the
933buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 934to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
935function is more convenient.
936
937=cut
938*/
939
940void
83aa740e 941Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 942{
749123ff 943 dVAR;
f0e67a1d
Z
944 char *bufptr;
945 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
946 if (flags & ~(LEX_STUFF_UTF8))
947 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
948 if (UTF) {
949 if (flags & LEX_STUFF_UTF8) {
950 goto plain_copy;
951 } else {
952 STRLEN highhalf = 0;
83aa740e 953 const char *p, *e = pv+len;
f0e67a1d
Z
954 for (p = pv; p != e; p++)
955 highhalf += !!(((U8)*p) & 0x80);
956 if (!highhalf)
957 goto plain_copy;
958 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
959 bufptr = PL_parser->bufptr;
960 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
961 SvCUR_set(PL_parser->linestr,
962 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
963 PL_parser->bufend += len+highhalf;
964 for (p = pv; p != e; p++) {
965 U8 c = (U8)*p;
966 if (c & 0x80) {
967 *bufptr++ = (char)(0xc0 | (c >> 6));
968 *bufptr++ = (char)(0x80 | (c & 0x3f));
969 } else {
970 *bufptr++ = (char)c;
971 }
972 }
973 }
974 } else {
975 if (flags & LEX_STUFF_UTF8) {
976 STRLEN highhalf = 0;
83aa740e 977 const char *p, *e = pv+len;
f0e67a1d
Z
978 for (p = pv; p != e; p++) {
979 U8 c = (U8)*p;
980 if (c >= 0xc4) {
981 Perl_croak(aTHX_ "Lexing code attempted to stuff "
982 "non-Latin-1 character into Latin-1 input");
983 } else if (c >= 0xc2 && p+1 != e &&
984 (((U8)p[1]) & 0xc0) == 0x80) {
985 p++;
986 highhalf++;
987 } else if (c >= 0x80) {
988 /* malformed UTF-8 */
989 ENTER;
990 SAVESPTR(PL_warnhook);
991 PL_warnhook = PERL_WARNHOOK_FATAL;
992 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
993 LEAVE;
994 }
995 }
996 if (!highhalf)
997 goto plain_copy;
998 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
999 bufptr = PL_parser->bufptr;
1000 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1001 SvCUR_set(PL_parser->linestr,
1002 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1003 PL_parser->bufend += len-highhalf;
1004 for (p = pv; p != e; p++) {
1005 U8 c = (U8)*p;
1006 if (c & 0x80) {
1007 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1008 p++;
1009 } else {
1010 *bufptr++ = (char)c;
1011 }
1012 }
1013 } else {
1014 plain_copy:
1015 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1016 bufptr = PL_parser->bufptr;
1017 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1018 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1019 PL_parser->bufend += len;
1020 Copy(pv, bufptr, len, char);
1021 }
1022 }
1023}
1024
1025/*
9dcc53ea
Z
1026=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1027
1028Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1029immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1030reallocating the buffer if necessary. This means that lexing code that
1031runs later will see the characters as if they had appeared in the input.
1032It is not recommended to do this as part of normal parsing, and most
1033uses of this facility run the risk of the inserted characters being
1034interpreted in an unintended manner.
1035
1036The string to be inserted is represented by octets starting at I<pv>
1037and continuing to the first nul. These octets are interpreted as either
1038UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1039in I<flags>. The characters are recoded for the lexer buffer, according
1040to how the buffer is currently being interpreted (L</lex_bufutf8>).
1041If it is not convenient to nul-terminate a string to be inserted, the
1042L</lex_stuff_pvn> function is more appropriate.
1043
1044=cut
1045*/
1046
1047void
1048Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1049{
1050 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1051 lex_stuff_pvn(pv, strlen(pv), flags);
1052}
1053
1054/*
f0e67a1d
Z
1055=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1056
1057Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1058immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1059reallocating the buffer if necessary. This means that lexing code that
1060runs later will see the characters as if they had appeared in the input.
1061It is not recommended to do this as part of normal parsing, and most
1062uses of this facility run the risk of the inserted characters being
1063interpreted in an unintended manner.
1064
1065The string to be inserted is the string value of I<sv>. The characters
1066are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1067being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1068not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1069need to construct a scalar.
1070
1071=cut
1072*/
1073
1074void
1075Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1076{
1077 char *pv;
1078 STRLEN len;
1079 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1080 if (flags)
1081 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1082 pv = SvPV(sv, len);
1083 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1084}
1085
1086/*
1087=for apidoc Amx|void|lex_unstuff|char *ptr
1088
1089Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1090I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1091This hides the discarded text from any lexing code that runs later,
1092as if the text had never appeared.
1093
1094This is not the normal way to consume lexed text. For that, use
1095L</lex_read_to>.
1096
1097=cut
1098*/
1099
1100void
1101Perl_lex_unstuff(pTHX_ char *ptr)
1102{
1103 char *buf, *bufend;
1104 STRLEN unstuff_len;
1105 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1106 buf = PL_parser->bufptr;
1107 if (ptr < buf)
1108 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1109 if (ptr == buf)
1110 return;
1111 bufend = PL_parser->bufend;
1112 if (ptr > bufend)
1113 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1114 unstuff_len = ptr - buf;
1115 Move(ptr, buf, bufend+1-ptr, char);
1116 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1117 PL_parser->bufend = bufend - unstuff_len;
1118}
1119
1120/*
1121=for apidoc Amx|void|lex_read_to|char *ptr
1122
1123Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1124to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1125performing the correct bookkeeping whenever a newline character is passed.
1126This is the normal way to consume lexed text.
1127
1128Interpretation of the buffer's octets can be abstracted out by
1129using the slightly higher-level functions L</lex_peek_unichar> and
1130L</lex_read_unichar>.
1131
1132=cut
1133*/
1134
1135void
1136Perl_lex_read_to(pTHX_ char *ptr)
1137{
1138 char *s;
1139 PERL_ARGS_ASSERT_LEX_READ_TO;
1140 s = PL_parser->bufptr;
1141 if (ptr < s || ptr > PL_parser->bufend)
1142 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1143 for (; s != ptr; s++)
1144 if (*s == '\n') {
1145 CopLINE_inc(PL_curcop);
1146 PL_parser->linestart = s+1;
1147 }
1148 PL_parser->bufptr = ptr;
1149}
1150
1151/*
1152=for apidoc Amx|void|lex_discard_to|char *ptr
1153
1154Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1155up to I<ptr>. The remaining content of the buffer will be moved, and
1156all pointers into the buffer updated appropriately. I<ptr> must not
1157be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1158it is not permitted to discard text that has yet to be lexed.
1159
1160Normally it is not necessarily to do this directly, because it suffices to
1161use the implicit discarding behaviour of L</lex_next_chunk> and things
1162based on it. However, if a token stretches across multiple lines,
1f317c95 1163and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1164that purpose, then after completion of the token it would be wise to
1165explicitly discard the now-unneeded earlier lines, to avoid future
1166multi-line tokens growing the buffer without bound.
1167
1168=cut
1169*/
1170
1171void
1172Perl_lex_discard_to(pTHX_ char *ptr)
1173{
1174 char *buf;
1175 STRLEN discard_len;
1176 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1177 buf = SvPVX(PL_parser->linestr);
1178 if (ptr < buf)
1179 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1180 if (ptr == buf)
1181 return;
1182 if (ptr > PL_parser->bufptr)
1183 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1184 discard_len = ptr - buf;
1185 if (PL_parser->oldbufptr < ptr)
1186 PL_parser->oldbufptr = ptr;
1187 if (PL_parser->oldoldbufptr < ptr)
1188 PL_parser->oldoldbufptr = ptr;
1189 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1190 PL_parser->last_uni = NULL;
1191 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1192 PL_parser->last_lop = NULL;
1193 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1194 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1195 PL_parser->bufend -= discard_len;
1196 PL_parser->bufptr -= discard_len;
1197 PL_parser->oldbufptr -= discard_len;
1198 PL_parser->oldoldbufptr -= discard_len;
1199 if (PL_parser->last_uni)
1200 PL_parser->last_uni -= discard_len;
1201 if (PL_parser->last_lop)
1202 PL_parser->last_lop -= discard_len;
1203}
1204
1205/*
1206=for apidoc Amx|bool|lex_next_chunk|U32 flags
1207
1208Reads in the next chunk of text to be lexed, appending it to
1209L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1210looked to the end of the current chunk and wants to know more. It is
1211usual, but not necessary, for lexing to have consumed the entirety of
1212the current chunk at this time.
1213
1214If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1215chunk (i.e., the current chunk has been entirely consumed), normally the
1216current chunk will be discarded at the same time that the new chunk is
1217read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1218will not be discarded. If the current chunk has not been entirely
1219consumed, then it will not be discarded regardless of the flag.
1220
1221Returns true if some new text was added to the buffer, or false if the
1222buffer has reached the end of the input text.
1223
1224=cut
1225*/
1226
1227#define LEX_FAKE_EOF 0x80000000
1228
1229bool
1230Perl_lex_next_chunk(pTHX_ U32 flags)
1231{
1232 SV *linestr;
1233 char *buf;
1234 STRLEN old_bufend_pos, new_bufend_pos;
1235 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1236 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1237 bool got_some_for_debugger = 0;
f0e67a1d
Z
1238 bool got_some;
1239 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1240 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1241 linestr = PL_parser->linestr;
1242 buf = SvPVX(linestr);
1243 if (!(flags & LEX_KEEP_PREVIOUS) &&
1244 PL_parser->bufptr == PL_parser->bufend) {
1245 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1246 linestart_pos = 0;
1247 if (PL_parser->last_uni != PL_parser->bufend)
1248 PL_parser->last_uni = NULL;
1249 if (PL_parser->last_lop != PL_parser->bufend)
1250 PL_parser->last_lop = NULL;
1251 last_uni_pos = last_lop_pos = 0;
1252 *buf = 0;
1253 SvCUR(linestr) = 0;
1254 } else {
1255 old_bufend_pos = PL_parser->bufend - buf;
1256 bufptr_pos = PL_parser->bufptr - buf;
1257 oldbufptr_pos = PL_parser->oldbufptr - buf;
1258 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1259 linestart_pos = PL_parser->linestart - buf;
1260 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1261 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1262 }
1263 if (flags & LEX_FAKE_EOF) {
1264 goto eof;
1265 } else if (!PL_parser->rsfp) {
1266 got_some = 0;
1267 } else if (filter_gets(linestr, old_bufend_pos)) {
1268 got_some = 1;
17cc9359 1269 got_some_for_debugger = 1;
f0e67a1d 1270 } else {
580561a3
Z
1271 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1272 sv_setpvs(linestr, "");
f0e67a1d
Z
1273 eof:
1274 /* End of real input. Close filehandle (unless it was STDIN),
1275 * then add implicit termination.
1276 */
1277 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1278 PerlIO_clearerr(PL_parser->rsfp);
1279 else if (PL_parser->rsfp)
1280 (void)PerlIO_close(PL_parser->rsfp);
1281 PL_parser->rsfp = NULL;
737c24fc 1282 PL_parser->in_pod = 0;
f0e67a1d
Z
1283#ifdef PERL_MAD
1284 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1285 PL_faketokens = 1;
1286#endif
1287 if (!PL_in_eval && PL_minus_p) {
1288 sv_catpvs(linestr,
1289 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1290 PL_minus_n = PL_minus_p = 0;
1291 } else if (!PL_in_eval && PL_minus_n) {
1292 sv_catpvs(linestr, /*{*/";}");
1293 PL_minus_n = 0;
1294 } else
1295 sv_catpvs(linestr, ";");
1296 got_some = 1;
1297 }
1298 buf = SvPVX(linestr);
1299 new_bufend_pos = SvCUR(linestr);
1300 PL_parser->bufend = buf + new_bufend_pos;
1301 PL_parser->bufptr = buf + bufptr_pos;
1302 PL_parser->oldbufptr = buf + oldbufptr_pos;
1303 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1304 PL_parser->linestart = buf + linestart_pos;
1305 if (PL_parser->last_uni)
1306 PL_parser->last_uni = buf + last_uni_pos;
1307 if (PL_parser->last_lop)
1308 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1309 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1310 PL_curstash != PL_debstash) {
1311 /* debugger active and we're not compiling the debugger code,
1312 * so store the line into the debugger's array of lines
1313 */
1314 update_debugger_info(NULL, buf+old_bufend_pos,
1315 new_bufend_pos-old_bufend_pos);
1316 }
1317 return got_some;
1318}
1319
1320/*
1321=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1322
1323Looks ahead one (Unicode) character in the text currently being lexed.
1324Returns the codepoint (unsigned integer value) of the next character,
1325or -1 if lexing has reached the end of the input text. To consume the
1326peeked character, use L</lex_read_unichar>.
1327
1328If the next character is in (or extends into) the next chunk of input
1329text, the next chunk will be read in. Normally the current chunk will be
1330discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1331then the current chunk will not be discarded.
1332
1333If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1334is encountered, an exception is generated.
1335
1336=cut
1337*/
1338
1339I32
1340Perl_lex_peek_unichar(pTHX_ U32 flags)
1341{
749123ff 1342 dVAR;
f0e67a1d
Z
1343 char *s, *bufend;
1344 if (flags & ~(LEX_KEEP_PREVIOUS))
1345 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1346 s = PL_parser->bufptr;
1347 bufend = PL_parser->bufend;
1348 if (UTF) {
1349 U8 head;
1350 I32 unichar;
1351 STRLEN len, retlen;
1352 if (s == bufend) {
1353 if (!lex_next_chunk(flags))
1354 return -1;
1355 s = PL_parser->bufptr;
1356 bufend = PL_parser->bufend;
1357 }
1358 head = (U8)*s;
1359 if (!(head & 0x80))
1360 return head;
1361 if (head & 0x40) {
1362 len = PL_utf8skip[head];
1363 while ((STRLEN)(bufend-s) < len) {
1364 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1365 break;
1366 s = PL_parser->bufptr;
1367 bufend = PL_parser->bufend;
1368 }
1369 }
1370 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1371 if (retlen == (STRLEN)-1) {
1372 /* malformed UTF-8 */
1373 ENTER;
1374 SAVESPTR(PL_warnhook);
1375 PL_warnhook = PERL_WARNHOOK_FATAL;
1376 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1377 LEAVE;
1378 }
1379 return unichar;
1380 } else {
1381 if (s == bufend) {
1382 if (!lex_next_chunk(flags))
1383 return -1;
1384 s = PL_parser->bufptr;
1385 }
1386 return (U8)*s;
1387 }
1388}
1389
1390/*
1391=for apidoc Amx|I32|lex_read_unichar|U32 flags
1392
1393Reads the next (Unicode) character in the text currently being lexed.
1394Returns the codepoint (unsigned integer value) of the character read,
1395and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1396if lexing has reached the end of the input text. To non-destructively
1397examine the next character, use L</lex_peek_unichar> instead.
1398
1399If the next character is in (or extends into) the next chunk of input
1400text, the next chunk will be read in. Normally the current chunk will be
1401discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1402then the current chunk will not be discarded.
1403
1404If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1405is encountered, an exception is generated.
1406
1407=cut
1408*/
1409
1410I32
1411Perl_lex_read_unichar(pTHX_ U32 flags)
1412{
1413 I32 c;
1414 if (flags & ~(LEX_KEEP_PREVIOUS))
1415 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1416 c = lex_peek_unichar(flags);
1417 if (c != -1) {
1418 if (c == '\n')
1419 CopLINE_inc(PL_curcop);
d9018cbe
EB
1420 if (UTF)
1421 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1422 else
1423 ++(PL_parser->bufptr);
f0e67a1d
Z
1424 }
1425 return c;
1426}
1427
1428/*
1429=for apidoc Amx|void|lex_read_space|U32 flags
1430
1431Reads optional spaces, in Perl style, in the text currently being
1432lexed. The spaces may include ordinary whitespace characters and
1433Perl-style comments. C<#line> directives are processed if encountered.
1434L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1435at a non-space character (or the end of the input text).
1436
1437If spaces extend into the next chunk of input text, the next chunk will
1438be read in. Normally the current chunk will be discarded at the same
1439time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1440chunk will not be discarded.
1441
1442=cut
1443*/
1444
f0998909
Z
1445#define LEX_NO_NEXT_CHUNK 0x80000000
1446
f0e67a1d
Z
1447void
1448Perl_lex_read_space(pTHX_ U32 flags)
1449{
1450 char *s, *bufend;
1451 bool need_incline = 0;
f0998909 1452 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1453 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1454#ifdef PERL_MAD
1455 if (PL_skipwhite) {
1456 sv_free(PL_skipwhite);
1457 PL_skipwhite = NULL;
1458 }
1459 if (PL_madskills)
1460 PL_skipwhite = newSVpvs("");
1461#endif /* PERL_MAD */
1462 s = PL_parser->bufptr;
1463 bufend = PL_parser->bufend;
1464 while (1) {
1465 char c = *s;
1466 if (c == '#') {
1467 do {
1468 c = *++s;
1469 } while (!(c == '\n' || (c == 0 && s == bufend)));
1470 } else if (c == '\n') {
1471 s++;
1472 PL_parser->linestart = s;
1473 if (s == bufend)
1474 need_incline = 1;
1475 else
1476 incline(s);
1477 } else if (isSPACE(c)) {
1478 s++;
1479 } else if (c == 0 && s == bufend) {
1480 bool got_more;
1481#ifdef PERL_MAD
1482 if (PL_madskills)
1483 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1484#endif /* PERL_MAD */
f0998909
Z
1485 if (flags & LEX_NO_NEXT_CHUNK)
1486 break;
f0e67a1d
Z
1487 PL_parser->bufptr = s;
1488 CopLINE_inc(PL_curcop);
1489 got_more = lex_next_chunk(flags);
1490 CopLINE_dec(PL_curcop);
1491 s = PL_parser->bufptr;
1492 bufend = PL_parser->bufend;
1493 if (!got_more)
1494 break;
1495 if (need_incline && PL_parser->rsfp) {
1496 incline(s);
1497 need_incline = 0;
1498 }
1499 } else {
1500 break;
1501 }
1502 }
1503#ifdef PERL_MAD
1504 if (PL_madskills)
1505 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1506#endif /* PERL_MAD */
1507 PL_parser->bufptr = s;
1508}
1509
1510/*
ffb4593c
NT
1511 * S_incline
1512 * This subroutine has nothing to do with tilting, whether at windmills
1513 * or pinball tables. Its name is short for "increment line". It
57843af0 1514 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1515 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1516 * # line 500 "foo.pm"
1517 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1518 */
1519
76e3520e 1520STATIC void
d9095cec 1521S_incline(pTHX_ const char *s)
463ee0b2 1522{
97aff369 1523 dVAR;
d9095cec
NC
1524 const char *t;
1525 const char *n;
1526 const char *e;
8818d409 1527 line_t line_num;
463ee0b2 1528
7918f24d
NC
1529 PERL_ARGS_ASSERT_INCLINE;
1530
57843af0 1531 CopLINE_inc(PL_curcop);
463ee0b2
LW
1532 if (*s++ != '#')
1533 return;
d4c19fe8
AL
1534 while (SPACE_OR_TAB(*s))
1535 s++;
73659bf1
GS
1536 if (strnEQ(s, "line", 4))
1537 s += 4;
1538 else
1539 return;
084592ab 1540 if (SPACE_OR_TAB(*s))
73659bf1 1541 s++;
4e553d73 1542 else
73659bf1 1543 return;
d4c19fe8
AL
1544 while (SPACE_OR_TAB(*s))
1545 s++;
463ee0b2
LW
1546 if (!isDIGIT(*s))
1547 return;
d4c19fe8 1548
463ee0b2
LW
1549 n = s;
1550 while (isDIGIT(*s))
1551 s++;
07714eb4 1552 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1553 return;
bf4acbe4 1554 while (SPACE_OR_TAB(*s))
463ee0b2 1555 s++;
73659bf1 1556 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1557 s++;
73659bf1
GS
1558 e = t + 1;
1559 }
463ee0b2 1560 else {
c35e046a
AL
1561 t = s;
1562 while (!isSPACE(*t))
1563 t++;
73659bf1 1564 e = t;
463ee0b2 1565 }
bf4acbe4 1566 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1567 e++;
1568 if (*e != '\n' && *e != '\0')
1569 return; /* false alarm */
1570
8818d409
FC
1571 line_num = atoi(n)-1;
1572
f4dd75d9 1573 if (t - s > 0) {
d9095cec 1574 const STRLEN len = t - s;
19bad673
NC
1575 SV *const temp_sv = CopFILESV(PL_curcop);
1576 const char *cf;
1577 STRLEN tmplen;
1578
1579 if (temp_sv) {
1580 cf = SvPVX(temp_sv);
1581 tmplen = SvCUR(temp_sv);
1582 } else {
1583 cf = NULL;
1584 tmplen = 0;
1585 }
1586
42d9b98d 1587 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1588 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1589 * to *{"::_<newfilename"} */
44867030
NC
1590 /* However, the long form of evals is only turned on by the
1591 debugger - usually they're "(eval %lu)" */
1592 char smallbuf[128];
1593 char *tmpbuf;
1594 GV **gvp;
d9095cec 1595 STRLEN tmplen2 = len;
798b63bc 1596 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1597 tmpbuf = smallbuf;
1598 else
2ae0db35 1599 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1600 tmpbuf[0] = '_';
1601 tmpbuf[1] = '<';
2ae0db35 1602 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1603 tmplen += 2;
8a5ee598
RGS
1604 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1605 if (gvp) {
44867030
NC
1606 char *tmpbuf2;
1607 GV *gv2;
1608
1609 if (tmplen2 + 2 <= sizeof smallbuf)
1610 tmpbuf2 = smallbuf;
1611 else
1612 Newx(tmpbuf2, tmplen2 + 2, char);
1613
1614 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1615 /* Either they malloc'd it, or we malloc'd it,
1616 so no prefix is present in ours. */
1617 tmpbuf2[0] = '_';
1618 tmpbuf2[1] = '<';
1619 }
1620
1621 memcpy(tmpbuf2 + 2, s, tmplen2);
1622 tmplen2 += 2;
1623
8a5ee598 1624 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1625 if (!isGV(gv2)) {
8a5ee598 1626 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1627 /* adjust ${"::_<newfilename"} to store the new file name */
1628 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1629 /* The line number may differ. If that is the case,
1630 alias the saved lines that are in the array.
1631 Otherwise alias the whole array. */
1632 if (CopLINE(PL_curcop) == line_num) {
1633 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1634 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1635 }
1636 else if (GvAV(*gvp)) {
1637 AV * const av = GvAV(*gvp);
1638 const I32 start = CopLINE(PL_curcop)+1;
1639 I32 items = AvFILLp(av) - start;
1640 if (items > 0) {
1641 AV * const av2 = GvAVn(gv2);
1642 SV **svp = AvARRAY(av) + start;
1643 I32 l = (I32)line_num+1;
1644 while (items--)
1645 av_store(av2, l++, SvREFCNT_inc(*svp++));
1646 }
1647 }
e5527e4b 1648 }
44867030
NC
1649
1650 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1651 }
e66cf94c 1652 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1653 }
05ec9bb3 1654 CopFILE_free(PL_curcop);
d9095cec 1655 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1656 }
8818d409 1657 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1658}
1659
29595ff2 1660#ifdef PERL_MAD
cd81e915 1661/* skip space before PL_thistoken */
29595ff2
NC
1662
1663STATIC char *
1664S_skipspace0(pTHX_ register char *s)
1665{
7918f24d
NC
1666 PERL_ARGS_ASSERT_SKIPSPACE0;
1667
29595ff2
NC
1668 s = skipspace(s);
1669 if (!PL_madskills)
1670 return s;
cd81e915
NC
1671 if (PL_skipwhite) {
1672 if (!PL_thiswhite)
6b29d1f5 1673 PL_thiswhite = newSVpvs("");
cd81e915
NC
1674 sv_catsv(PL_thiswhite, PL_skipwhite);
1675 sv_free(PL_skipwhite);
1676 PL_skipwhite = 0;
1677 }
1678 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1679 return s;
1680}
1681
cd81e915 1682/* skip space after PL_thistoken */
29595ff2
NC
1683
1684STATIC char *
1685S_skipspace1(pTHX_ register char *s)
1686{
d4c19fe8 1687 const char *start = s;
29595ff2
NC
1688 I32 startoff = start - SvPVX(PL_linestr);
1689
7918f24d
NC
1690 PERL_ARGS_ASSERT_SKIPSPACE1;
1691
29595ff2
NC
1692 s = skipspace(s);
1693 if (!PL_madskills)
1694 return s;
1695 start = SvPVX(PL_linestr) + startoff;
cd81e915 1696 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1697 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1698 PL_thistoken = newSVpvn(tstart, start - tstart);
1699 }
1700 PL_realtokenstart = -1;
1701 if (PL_skipwhite) {
1702 if (!PL_nextwhite)
6b29d1f5 1703 PL_nextwhite = newSVpvs("");
cd81e915
NC
1704 sv_catsv(PL_nextwhite, PL_skipwhite);
1705 sv_free(PL_skipwhite);
1706 PL_skipwhite = 0;
29595ff2
NC
1707 }
1708 return s;
1709}
1710
1711STATIC char *
1712S_skipspace2(pTHX_ register char *s, SV **svp)
1713{
c35e046a
AL
1714 char *start;
1715 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1716 const I32 startoff = s - SvPVX(PL_linestr);
1717
7918f24d
NC
1718 PERL_ARGS_ASSERT_SKIPSPACE2;
1719
29595ff2
NC
1720 s = skipspace(s);
1721 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1722 if (!PL_madskills || !svp)
1723 return s;
1724 start = SvPVX(PL_linestr) + startoff;
cd81e915 1725 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1726 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1727 PL_thistoken = newSVpvn(tstart, start - tstart);
1728 PL_realtokenstart = -1;
29595ff2 1729 }
cd81e915 1730 if (PL_skipwhite) {
29595ff2 1731 if (!*svp)
6b29d1f5 1732 *svp = newSVpvs("");
cd81e915
NC
1733 sv_setsv(*svp, PL_skipwhite);
1734 sv_free(PL_skipwhite);
1735 PL_skipwhite = 0;
29595ff2
NC
1736 }
1737
1738 return s;
1739}
1740#endif
1741
80a702cd 1742STATIC void
15f169a1 1743S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1744{
1745 AV *av = CopFILEAVx(PL_curcop);
1746 if (av) {
b9f83d2f 1747 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1748 if (orig_sv)
1749 sv_setsv(sv, orig_sv);
1750 else
1751 sv_setpvn(sv, buf, len);
80a702cd
RGS
1752 (void)SvIOK_on(sv);
1753 SvIV_set(sv, 0);
1754 av_store(av, (I32)CopLINE(PL_curcop), sv);
1755 }
1756}
1757
ffb4593c
NT
1758/*
1759 * S_skipspace
1760 * Called to gobble the appropriate amount and type of whitespace.
1761 * Skips comments as well.
1762 */
1763
76e3520e 1764STATIC char *
cea2e8a9 1765S_skipspace(pTHX_ register char *s)
a687059c 1766{
5db06880 1767#ifdef PERL_MAD
f0e67a1d
Z
1768 char *start = s;
1769#endif /* PERL_MAD */
7918f24d 1770 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1771#ifdef PERL_MAD
cd81e915
NC
1772 if (PL_skipwhite) {
1773 sv_free(PL_skipwhite);
f0e67a1d 1774 PL_skipwhite = NULL;
5db06880 1775 }
f0e67a1d 1776#endif /* PERL_MAD */
3280af22 1777 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1778 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1779 s++;
f0e67a1d
Z
1780 } else {
1781 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1782 PL_bufptr = s;
f0998909
Z
1783 lex_read_space(LEX_KEEP_PREVIOUS |
1784 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1785 LEX_NO_NEXT_CHUNK : 0));
3280af22 1786 s = PL_bufptr;
f0e67a1d
Z
1787 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1788 if (PL_linestart > PL_bufptr)
1789 PL_bufptr = PL_linestart;
1790 return s;
463ee0b2 1791 }
5db06880 1792#ifdef PERL_MAD
f0e67a1d
Z
1793 if (PL_madskills)
1794 PL_skipwhite = newSVpvn(start, s-start);
1795#endif /* PERL_MAD */
5db06880 1796 return s;
a687059c 1797}
378cc40b 1798
ffb4593c
NT
1799/*
1800 * S_check_uni
1801 * Check the unary operators to ensure there's no ambiguity in how they're
1802 * used. An ambiguous piece of code would be:
1803 * rand + 5
1804 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1805 * the +5 is its argument.
1806 */
1807
76e3520e 1808STATIC void
cea2e8a9 1809S_check_uni(pTHX)
ba106d47 1810{
97aff369 1811 dVAR;
d4c19fe8
AL
1812 const char *s;
1813 const char *t;
2f3197b3 1814
3280af22 1815 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1816 return;
3280af22
NIS
1817 while (isSPACE(*PL_last_uni))
1818 PL_last_uni++;
c35e046a
AL
1819 s = PL_last_uni;
1820 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1821 s++;
3280af22 1822 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1823 return;
6136c704 1824
9b387841
NC
1825 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1826 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1827 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1828}
1829
ffb4593c
NT
1830/*
1831 * LOP : macro to build a list operator. Its behaviour has been replaced
1832 * with a subroutine, S_lop() for which LOP is just another name.
1833 */
1834
a0d0e21e
LW
1835#define LOP(f,x) return lop(f,x,s)
1836
ffb4593c
NT
1837/*
1838 * S_lop
1839 * Build a list operator (or something that might be one). The rules:
1840 * - if we have a next token, then it's a list operator [why?]
1841 * - if the next thing is an opening paren, then it's a function
1842 * - else it's a list operator
1843 */
1844
76e3520e 1845STATIC I32
a0be28da 1846S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1847{
97aff369 1848 dVAR;
7918f24d
NC
1849
1850 PERL_ARGS_ASSERT_LOP;
1851
6154021b 1852 pl_yylval.ival = f;
35c8bce7 1853 CLINE;
3280af22
NIS
1854 PL_expect = x;
1855 PL_bufptr = s;
1856 PL_last_lop = PL_oldbufptr;
eb160463 1857 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1858#ifdef PERL_MAD
1859 if (PL_lasttoke)
78cdf107 1860 goto lstop;
5db06880 1861#else
3280af22 1862 if (PL_nexttoke)
78cdf107 1863 goto lstop;
5db06880 1864#endif
79072805 1865 if (*s == '(')
bbf60fe6 1866 return REPORT(FUNC);
29595ff2 1867 s = PEEKSPACE(s);
79072805 1868 if (*s == '(')
bbf60fe6 1869 return REPORT(FUNC);
78cdf107
Z
1870 else {
1871 lstop:
1872 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1873 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1874 return REPORT(LSTOP);
78cdf107 1875 }
79072805
LW
1876}
1877
5db06880
NC
1878#ifdef PERL_MAD
1879 /*
1880 * S_start_force
1881 * Sets up for an eventual force_next(). start_force(0) basically does
1882 * an unshift, while start_force(-1) does a push. yylex removes items
1883 * on the "pop" end.
1884 */
1885
1886STATIC void
1887S_start_force(pTHX_ int where)
1888{
1889 int i;
1890
cd81e915 1891 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1892 where = PL_lasttoke;
cd81e915
NC
1893 assert(PL_curforce < 0 || PL_curforce == where);
1894 if (PL_curforce != where) {
5db06880
NC
1895 for (i = PL_lasttoke; i > where; --i) {
1896 PL_nexttoke[i] = PL_nexttoke[i-1];
1897 }
1898 PL_lasttoke++;
1899 }
cd81e915 1900 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1901 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1902 PL_curforce = where;
1903 if (PL_nextwhite) {
5db06880 1904 if (PL_madskills)
6b29d1f5 1905 curmad('^', newSVpvs(""));
cd81e915 1906 CURMAD('_', PL_nextwhite);
5db06880
NC
1907 }
1908}
1909
1910STATIC void
1911S_curmad(pTHX_ char slot, SV *sv)
1912{
1913 MADPROP **where;
1914
1915 if (!sv)
1916 return;
cd81e915
NC
1917 if (PL_curforce < 0)
1918 where = &PL_thismad;
5db06880 1919 else
cd81e915 1920 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1921
cd81e915 1922 if (PL_faketokens)
76f68e9b 1923 sv_setpvs(sv, "");
5db06880
NC
1924 else {
1925 if (!IN_BYTES) {
1926 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1927 SvUTF8_on(sv);
1928 else if (PL_encoding) {
1929 sv_recode_to_utf8(sv, PL_encoding);
1930 }
1931 }
1932 }
1933
1934 /* keep a slot open for the head of the list? */
1935 if (slot != '_' && *where && (*where)->mad_key == '^') {
1936 (*where)->mad_key = slot;
daba3364 1937 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1938 (*where)->mad_val = (void*)sv;
1939 }
1940 else
1941 addmad(newMADsv(slot, sv), where, 0);
1942}
1943#else
b3f24c00
MHM
1944# define start_force(where) NOOP
1945# define curmad(slot, sv) NOOP
5db06880
NC
1946#endif
1947
ffb4593c
NT
1948/*
1949 * S_force_next
9cbb5ea2 1950 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1951 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1952 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1953 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1954 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1955 */
1956
4e553d73 1957STATIC void
cea2e8a9 1958S_force_next(pTHX_ I32 type)
79072805 1959{
97aff369 1960 dVAR;
704d4215
GG
1961#ifdef DEBUGGING
1962 if (DEBUG_T_TEST) {
1963 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1964 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1965 }
1966#endif
5db06880 1967#ifdef PERL_MAD
cd81e915 1968 if (PL_curforce < 0)
5db06880 1969 start_force(PL_lasttoke);
cd81e915 1970 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1971 if (PL_lex_state != LEX_KNOWNEXT)
1972 PL_lex_defer = PL_lex_state;
1973 PL_lex_state = LEX_KNOWNEXT;
1974 PL_lex_expect = PL_expect;
cd81e915 1975 PL_curforce = -1;
5db06880 1976#else
3280af22
NIS
1977 PL_nexttype[PL_nexttoke] = type;
1978 PL_nexttoke++;
1979 if (PL_lex_state != LEX_KNOWNEXT) {
1980 PL_lex_defer = PL_lex_state;
1981 PL_lex_expect = PL_expect;
1982 PL_lex_state = LEX_KNOWNEXT;
79072805 1983 }
5db06880 1984#endif
79072805
LW
1985}
1986
28ac2b49
Z
1987void
1988Perl_yyunlex(pTHX)
1989{
a7aaec61
Z
1990 int yyc = PL_parser->yychar;
1991 if (yyc != YYEMPTY) {
1992 if (yyc) {
1993 start_force(-1);
1994 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1995 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1996 PL_lex_allbrackets--;
a7aaec61 1997 PL_lex_brackets--;
78cdf107
Z
1998 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1999 } else if (yyc == '('/*)*/) {
2000 PL_lex_allbrackets--;
2001 yyc |= (2<<24);
a7aaec61
Z
2002 }
2003 force_next(yyc);
2004 }
28ac2b49
Z
2005 PL_parser->yychar = YYEMPTY;
2006 }
2007}
2008
d0a148a6 2009STATIC SV *
15f169a1 2010S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2011{
97aff369 2012 dVAR;
740cce10 2013 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2014 !IN_BYTES
2015 && UTF
2016 && !is_ascii_string((const U8*)start, len)
740cce10 2017 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2018 return sv;
2019}
2020
ffb4593c
NT
2021/*
2022 * S_force_word
2023 * When the lexer knows the next thing is a word (for instance, it has
2024 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2025 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2026 * lookahead.
ffb4593c
NT
2027 *
2028 * Arguments:
b1b65b59 2029 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2030 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2031 * int check_keyword : if true, Perl checks to make sure the word isn't
2032 * a keyword (do this if the word is a label, e.g. goto FOO)
2033 * int allow_pack : if true, : characters will also be allowed (require,
2034 * use, etc. do this)
9cbb5ea2 2035 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2036 */
2037
76e3520e 2038STATIC char *
cea2e8a9 2039S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2040{
97aff369 2041 dVAR;
463ee0b2
LW
2042 register char *s;
2043 STRLEN len;
4e553d73 2044
7918f24d
NC
2045 PERL_ARGS_ASSERT_FORCE_WORD;
2046
29595ff2 2047 start = SKIPSPACE1(start);
463ee0b2 2048 s = start;
7e2040f0 2049 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2050 (allow_pack && *s == ':') ||
15f0808c 2051 (allow_initial_tick && *s == '\'') )
a0d0e21e 2052 {
3280af22 2053 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2054 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2055 return start;
cd81e915 2056 start_force(PL_curforce);
5db06880
NC
2057 if (PL_madskills)
2058 curmad('X', newSVpvn(start,s-start));
463ee0b2 2059 if (token == METHOD) {
29595ff2 2060 s = SKIPSPACE1(s);
463ee0b2 2061 if (*s == '(')
3280af22 2062 PL_expect = XTERM;
463ee0b2 2063 else {
3280af22 2064 PL_expect = XOPERATOR;
463ee0b2 2065 }
79072805 2066 }
e74e6b3d 2067 if (PL_madskills)
63575281 2068 curmad('g', newSVpvs( "forced" ));
9ded7720 2069 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2070 = (OP*)newSVOP(OP_CONST,0,
2071 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2072 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2073 force_next(token);
2074 }
2075 return s;
2076}
2077
ffb4593c
NT
2078/*
2079 * S_force_ident
9cbb5ea2 2080 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2081 * text only contains the "foo" portion. The first argument is a pointer
2082 * to the "foo", and the second argument is the type symbol to prefix.
2083 * Forces the next token to be a "WORD".
9cbb5ea2 2084 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2085 */
2086
76e3520e 2087STATIC void
bfed75c6 2088S_force_ident(pTHX_ register const char *s, int kind)
79072805 2089{
97aff369 2090 dVAR;
7918f24d
NC
2091
2092 PERL_ARGS_ASSERT_FORCE_IDENT;
2093
c35e046a 2094 if (*s) {
90e5519e
NC
2095 const STRLEN len = strlen(s);
2096 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2097 start_force(PL_curforce);
9ded7720 2098 NEXTVAL_NEXTTOKE.opval = o;
79072805 2099 force_next(WORD);
748a9306 2100 if (kind) {
11343788 2101 o->op_private = OPpCONST_ENTERED;
55497cff 2102 /* XXX see note in pp_entereval() for why we forgo typo
2103 warnings if the symbol must be introduced in an eval.
2104 GSAR 96-10-12 */
90e5519e
NC
2105 gv_fetchpvn_flags(s, len,
2106 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2107 : GV_ADD,
2108 kind == '$' ? SVt_PV :
2109 kind == '@' ? SVt_PVAV :
2110 kind == '%' ? SVt_PVHV :
a0d0e21e 2111 SVt_PVGV
90e5519e 2112 );
748a9306 2113 }
79072805
LW
2114 }
2115}
2116
1571675a
GS
2117NV
2118Perl_str_to_version(pTHX_ SV *sv)
2119{
2120 NV retval = 0.0;
2121 NV nshift = 1.0;
2122 STRLEN len;
cfd0369c 2123 const char *start = SvPV_const(sv,len);
9d4ba2ae 2124 const char * const end = start + len;
504618e9 2125 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2126
2127 PERL_ARGS_ASSERT_STR_TO_VERSION;
2128
1571675a 2129 while (start < end) {
ba210ebe 2130 STRLEN skip;
1571675a
GS
2131 UV n;
2132 if (utf)
9041c2e3 2133 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2134 else {
2135 n = *(U8*)start;
2136 skip = 1;
2137 }
2138 retval += ((NV)n)/nshift;
2139 start += skip;
2140 nshift *= 1000;
2141 }
2142 return retval;
2143}
2144
4e553d73 2145/*
ffb4593c
NT
2146 * S_force_version
2147 * Forces the next token to be a version number.
e759cc13
RGS
2148 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2149 * and if "guessing" is TRUE, then no new token is created (and the caller
2150 * must use an alternative parsing method).
ffb4593c
NT
2151 */
2152
76e3520e 2153STATIC char *
e759cc13 2154S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2155{
97aff369 2156 dVAR;
5f66b61c 2157 OP *version = NULL;
44dcb63b 2158 char *d;
5db06880
NC
2159#ifdef PERL_MAD
2160 I32 startoff = s - SvPVX(PL_linestr);
2161#endif
89bfa8cd 2162
7918f24d
NC
2163 PERL_ARGS_ASSERT_FORCE_VERSION;
2164
29595ff2 2165 s = SKIPSPACE1(s);
89bfa8cd 2166
44dcb63b 2167 d = s;
dd629d5b 2168 if (*d == 'v')
44dcb63b 2169 d++;
44dcb63b 2170 if (isDIGIT(*d)) {
e759cc13
RGS
2171 while (isDIGIT(*d) || *d == '_' || *d == '.')
2172 d++;
5db06880
NC
2173#ifdef PERL_MAD
2174 if (PL_madskills) {
cd81e915 2175 start_force(PL_curforce);
5db06880
NC
2176 curmad('X', newSVpvn(s,d-s));
2177 }
2178#endif
4e4da3ac 2179 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2180 SV *ver;
8d08d9ba
DG
2181#ifdef USE_LOCALE_NUMERIC
2182 char *loc = setlocale(LC_NUMERIC, "C");
2183#endif
6154021b 2184 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2185#ifdef USE_LOCALE_NUMERIC
2186 setlocale(LC_NUMERIC, loc);
2187#endif
6154021b 2188 version = pl_yylval.opval;
dd629d5b
GS
2189 ver = cSVOPx(version)->op_sv;
2190 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2191 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2192 SvNV_set(ver, str_to_version(ver));
1571675a 2193 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2194 }
89bfa8cd 2195 }
5db06880
NC
2196 else if (guessing) {
2197#ifdef PERL_MAD
2198 if (PL_madskills) {
cd81e915
NC
2199 sv_free(PL_nextwhite); /* let next token collect whitespace */
2200 PL_nextwhite = 0;
5db06880
NC
2201 s = SvPVX(PL_linestr) + startoff;
2202 }
2203#endif
e759cc13 2204 return s;
5db06880 2205 }
89bfa8cd 2206 }
2207
5db06880
NC
2208#ifdef PERL_MAD
2209 if (PL_madskills && !version) {
cd81e915
NC
2210 sv_free(PL_nextwhite); /* let next token collect whitespace */
2211 PL_nextwhite = 0;
5db06880
NC
2212 s = SvPVX(PL_linestr) + startoff;
2213 }
2214#endif
89bfa8cd 2215 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2216 start_force(PL_curforce);
9ded7720 2217 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2218 force_next(WORD);
89bfa8cd 2219
e759cc13 2220 return s;
89bfa8cd 2221}
2222
ffb4593c 2223/*
91152fc1
DG
2224 * S_force_strict_version
2225 * Forces the next token to be a version number using strict syntax rules.
2226 */
2227
2228STATIC char *
2229S_force_strict_version(pTHX_ char *s)
2230{
2231 dVAR;
2232 OP *version = NULL;
2233#ifdef PERL_MAD
2234 I32 startoff = s - SvPVX(PL_linestr);
2235#endif
2236 const char *errstr = NULL;
2237
2238 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2239
2240 while (isSPACE(*s)) /* leading whitespace */
2241 s++;
2242
2243 if (is_STRICT_VERSION(s,&errstr)) {
2244 SV *ver = newSV(0);
2245 s = (char *)scan_version(s, ver, 0);
2246 version = newSVOP(OP_CONST, 0, ver);
2247 }
4e4da3ac
Z
2248 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2249 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2250 {
91152fc1
DG
2251 PL_bufptr = s;
2252 if (errstr)
2253 yyerror(errstr); /* version required */
2254 return s;
2255 }
2256
2257#ifdef PERL_MAD
2258 if (PL_madskills && !version) {
2259 sv_free(PL_nextwhite); /* let next token collect whitespace */
2260 PL_nextwhite = 0;
2261 s = SvPVX(PL_linestr) + startoff;
2262 }
2263#endif
2264 /* NOTE: The parser sees the package name and the VERSION swapped */
2265 start_force(PL_curforce);
2266 NEXTVAL_NEXTTOKE.opval = version;
2267 force_next(WORD);
2268
2269 return s;
2270}
2271
2272/*
ffb4593c
NT
2273 * S_tokeq
2274 * Tokenize a quoted string passed in as an SV. It finds the next
2275 * chunk, up to end of string or a backslash. It may make a new
2276 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2277 * turns \\ into \.
2278 */
2279
76e3520e 2280STATIC SV *
cea2e8a9 2281S_tokeq(pTHX_ SV *sv)
79072805 2282{
97aff369 2283 dVAR;
79072805
LW
2284 register char *s;
2285 register char *send;
2286 register char *d;
b3ac6de7
IZ
2287 STRLEN len = 0;
2288 SV *pv = sv;
79072805 2289
7918f24d
NC
2290 PERL_ARGS_ASSERT_TOKEQ;
2291
79072805 2292 if (!SvLEN(sv))
b3ac6de7 2293 goto finish;
79072805 2294
a0d0e21e 2295 s = SvPV_force(sv, len);
21a311ee 2296 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2297 goto finish;
463ee0b2 2298 send = s + len;
dcb21ed6
NC
2299 /* This is relying on the SV being "well formed" with a trailing '\0' */
2300 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2301 s++;
2302 if (s == send)
b3ac6de7 2303 goto finish;
79072805 2304 d = s;
be4731d2 2305 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2306 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2307 }
79072805
LW
2308 while (s < send) {
2309 if (*s == '\\') {
a0d0e21e 2310 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2311 s++; /* all that, just for this */
2312 }
2313 *d++ = *s++;
2314 }
2315 *d = '\0';
95a20fc0 2316 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2317 finish:
3280af22 2318 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2319 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2320 return sv;
2321}
2322
ffb4593c
NT
2323/*
2324 * Now come three functions related to double-quote context,
2325 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2326 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2327 * interact with PL_lex_state, and create fake ( ... ) argument lists
2328 * to handle functions and concatenation.
2329 * They assume that whoever calls them will be setting up a fake
2330 * join call, because each subthing puts a ',' after it. This lets
2331 * "lower \luPpEr"
2332 * become
2333 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2334 *
2335 * (I'm not sure whether the spurious commas at the end of lcfirst's
2336 * arguments and join's arguments are created or not).
2337 */
2338
2339/*
2340 * S_sublex_start
6154021b 2341 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2342 *
2343 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2344 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2345 *
2346 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2347 *
2348 * Everything else becomes a FUNC.
2349 *
2350 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2351 * had an OP_CONST or OP_READLINE). This just sets us up for a
2352 * call to S_sublex_push().
2353 */
2354
76e3520e 2355STATIC I32
cea2e8a9 2356S_sublex_start(pTHX)
79072805 2357{
97aff369 2358 dVAR;
6154021b 2359 register const I32 op_type = pl_yylval.ival;
79072805
LW
2360
2361 if (op_type == OP_NULL) {
6154021b 2362 pl_yylval.opval = PL_lex_op;
5f66b61c 2363 PL_lex_op = NULL;
79072805
LW
2364 return THING;
2365 }
2366 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2367 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2368
2369 if (SvTYPE(sv) == SVt_PVIV) {
2370 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2371 STRLEN len;
96a5add6 2372 const char * const p = SvPV_const(sv, len);
740cce10 2373 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2374 SvREFCNT_dec(sv);
2375 sv = nsv;
4e553d73 2376 }
6154021b 2377 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2378 PL_lex_stuff = NULL;
6f33ba73
RGS
2379 /* Allow <FH> // "foo" */
2380 if (op_type == OP_READLINE)
2381 PL_expect = XTERMORDORDOR;
79072805
LW
2382 return THING;
2383 }
e3f73d4e
RGS
2384 else if (op_type == OP_BACKTICK && PL_lex_op) {
2385 /* readpipe() vas overriden */
2386 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2387 pl_yylval.opval = PL_lex_op;
9b201d7d 2388 PL_lex_op = NULL;
e3f73d4e
RGS
2389 PL_lex_stuff = NULL;
2390 return THING;
2391 }
79072805 2392
3280af22 2393 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2394 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2395 PL_sublex_info.sub_op = PL_lex_op;
2396 PL_lex_state = LEX_INTERPPUSH;
55497cff 2397
3280af22
NIS
2398 PL_expect = XTERM;
2399 if (PL_lex_op) {
6154021b 2400 pl_yylval.opval = PL_lex_op;
5f66b61c 2401 PL_lex_op = NULL;
55497cff 2402 return PMFUNC;
2403 }
2404 else
2405 return FUNC;
2406}
2407
ffb4593c
NT
2408/*
2409 * S_sublex_push
2410 * Create a new scope to save the lexing state. The scope will be
2411 * ended in S_sublex_done. Returns a '(', starting the function arguments
2412 * to the uc, lc, etc. found before.
2413 * Sets PL_lex_state to LEX_INTERPCONCAT.
2414 */
2415
76e3520e 2416STATIC I32
cea2e8a9 2417S_sublex_push(pTHX)
55497cff 2418{
27da23d5 2419 dVAR;
f46d017c 2420 ENTER;
55497cff 2421
3280af22 2422 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2423 SAVEBOOL(PL_lex_dojoin);
3280af22 2424 SAVEI32(PL_lex_brackets);
78cdf107
Z
2425 SAVEI32(PL_lex_allbrackets);
2426 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2427 SAVEI32(PL_lex_casemods);
2428 SAVEI32(PL_lex_starts);
651b5b28 2429 SAVEI8(PL_lex_state);
7766f137 2430 SAVEVPTR(PL_lex_inpat);
98246f1e 2431 SAVEI16(PL_lex_inwhat);
57843af0 2432 SAVECOPLINE(PL_curcop);
3280af22 2433 SAVEPPTR(PL_bufptr);
8452ff4b 2434 SAVEPPTR(PL_bufend);
3280af22
NIS
2435 SAVEPPTR(PL_oldbufptr);
2436 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2437 SAVEPPTR(PL_last_lop);
2438 SAVEPPTR(PL_last_uni);
3280af22
NIS
2439 SAVEPPTR(PL_linestart);
2440 SAVESPTR(PL_linestr);
8edd5f42
RGS
2441 SAVEGENERICPV(PL_lex_brackstack);
2442 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2443
2444 PL_linestr = PL_lex_stuff;
a0714e2c 2445 PL_lex_stuff = NULL;
3280af22 2446
9cbb5ea2
GS
2447 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2448 = SvPVX(PL_linestr);
3280af22 2449 PL_bufend += SvCUR(PL_linestr);
bd61b366 2450 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2451 SAVEFREESV(PL_linestr);
2452
2453 PL_lex_dojoin = FALSE;
2454 PL_lex_brackets = 0;
78cdf107
Z
2455 PL_lex_allbrackets = 0;
2456 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2457 Newx(PL_lex_brackstack, 120, char);
2458 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2459 PL_lex_casemods = 0;
2460 *PL_lex_casestack = '\0';
2461 PL_lex_starts = 0;
2462 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2463 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2464
2465 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2466 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2467 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2468 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2469 else
5f66b61c 2470 PL_lex_inpat = NULL;
79072805 2471
55497cff 2472 return '(';
79072805
LW
2473}
2474
ffb4593c
NT
2475/*
2476 * S_sublex_done
2477 * Restores lexer state after a S_sublex_push.
2478 */
2479
76e3520e 2480STATIC I32
cea2e8a9 2481S_sublex_done(pTHX)
79072805 2482{
27da23d5 2483 dVAR;
3280af22 2484 if (!PL_lex_starts++) {
396482e1 2485 SV * const sv = newSVpvs("");
9aa983d2
JH
2486 if (SvUTF8(PL_linestr))
2487 SvUTF8_on(sv);
3280af22 2488 PL_expect = XOPERATOR;
6154021b 2489 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2490 return THING;
2491 }
2492
3280af22
NIS
2493 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2494 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2495 return yylex();
79072805
LW
2496 }
2497
ffb4593c 2498 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2499 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2500 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2501 PL_linestr = PL_lex_repl;
2502 PL_lex_inpat = 0;
2503 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2504 PL_bufend += SvCUR(PL_linestr);
bd61b366 2505 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2506 SAVEFREESV(PL_linestr);
2507 PL_lex_dojoin = FALSE;
2508 PL_lex_brackets = 0;
78cdf107
Z
2509 PL_lex_allbrackets = 0;
2510 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2511 PL_lex_casemods = 0;
2512 *PL_lex_casestack = '\0';
2513 PL_lex_starts = 0;
25da4f38 2514 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2515 PL_lex_state = LEX_INTERPNORMAL;
2516 PL_lex_starts++;
e9fa98b2
HS
2517 /* we don't clear PL_lex_repl here, so that we can check later
2518 whether this is an evalled subst; that means we rely on the
2519 logic to ensure sublex_done() is called again only via the
2520 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2521 }
e9fa98b2 2522 else {
3280af22 2523 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2524 PL_lex_repl = NULL;
e9fa98b2 2525 }
79072805 2526 return ',';
ffed7fef
LW
2527 }
2528 else {
5db06880
NC
2529#ifdef PERL_MAD
2530 if (PL_madskills) {
cd81e915
NC
2531 if (PL_thiswhite) {
2532 if (!PL_endwhite)
6b29d1f5 2533 PL_endwhite = newSVpvs("");
cd81e915
NC
2534 sv_catsv(PL_endwhite, PL_thiswhite);
2535 PL_thiswhite = 0;
2536 }
2537 if (PL_thistoken)
76f68e9b 2538 sv_setpvs(PL_thistoken,"");
5db06880 2539 else
cd81e915 2540 PL_realtokenstart = -1;
5db06880
NC
2541 }
2542#endif
f46d017c 2543 LEAVE;
3280af22
NIS
2544 PL_bufend = SvPVX(PL_linestr);
2545 PL_bufend += SvCUR(PL_linestr);
2546 PL_expect = XOPERATOR;
09bef843 2547 PL_sublex_info.sub_inwhat = 0;
79072805 2548 return ')';
ffed7fef
LW
2549 }
2550}
2551
02aa26ce
NT
2552/*
2553 scan_const
2554
2555 Extracts a pattern, double-quoted string, or transliteration. This
2556 is terrifying code.
2557
94def140 2558 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2559 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2560 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2561
94def140
TS
2562 Returns a pointer to the character scanned up to. If this is
2563 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2564 successfully parsed), will leave an OP for the substring scanned
6154021b 2565 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2566 by looking at the next characters herself.
2567
02aa26ce
NT
2568 In patterns:
2569 backslashes:
ff3f963a 2570 constants: \N{NAME} only
02aa26ce
NT
2571 case and quoting: \U \Q \E
2572 stops on @ and $, but not for $ as tail anchor
2573
2574 In transliterations:
2575 characters are VERY literal, except for - not at the start or end
94def140
TS
2576 of the string, which indicates a range. If the range is in bytes,
2577 scan_const expands the range to the full set of intermediate
2578 characters. If the range is in utf8, the hyphen is replaced with
2579 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2580
2581 In double-quoted strings:
2582 backslashes:
2583 double-quoted style: \r and \n
ff3f963a 2584 constants: \x31, etc.
94def140 2585 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2586 case and quoting: \U \Q \E
2587 stops on @ and $
2588
2589 scan_const does *not* construct ops to handle interpolated strings.
2590 It stops processing as soon as it finds an embedded $ or @ variable
2591 and leaves it to the caller to work out what's going on.
2592
94def140
TS
2593 embedded arrays (whether in pattern or not) could be:
2594 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2595
2596 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2597
2598 $ in pattern could be $foo or could be tail anchor. Assumption:
2599 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2600 followed by one of "()| \r\n\t"
02aa26ce
NT
2601
2602 \1 (backreferences) are turned into $1
2603
2604 The structure of the code is
2605 while (there's a character to process) {
94def140
TS
2606 handle transliteration ranges
2607 skip regexp comments /(?#comment)/ and codes /(?{code})/
2608 skip #-initiated comments in //x patterns
2609 check for embedded arrays
02aa26ce
NT
2610 check for embedded scalars
2611 if (backslash) {
94def140 2612 deprecate \1 in substitution replacements
02aa26ce
NT
2613 handle string-changing backslashes \l \U \Q \E, etc.
2614 switch (what was escaped) {
94def140 2615 handle \- in a transliteration (becomes a literal -)
ff3f963a 2616 if a pattern and not \N{, go treat as regular character
94def140
TS
2617 handle \132 (octal characters)
2618 handle \x15 and \x{1234} (hex characters)
ff3f963a 2619 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2620 handle \cV (control characters)
2621 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2622 } (end switch)
77a135fe 2623 continue
02aa26ce 2624 } (end if backslash)
77a135fe 2625 handle regular character
02aa26ce 2626 } (end while character to read)
4e553d73 2627
02aa26ce
NT
2628*/
2629
76e3520e 2630STATIC char *
cea2e8a9 2631S_scan_const(pTHX_ char *start)
79072805 2632{
97aff369 2633 dVAR;
3280af22 2634 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2635 SV *sv = newSV(send - start); /* sv for the constant. See
2636 note below on sizing. */
02aa26ce
NT
2637 register char *s = start; /* start of the constant */
2638 register char *d = SvPVX(sv); /* destination for copies */
2639 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2640 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2641 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2642 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2643 to be UTF8? But, this can
2644 show as true when the source
2645 isn't utf8, as for example
2646 when it is entirely composed
2647 of hex constants */
2648
2649 /* Note on sizing: The scanned constant is placed into sv, which is
2650 * initialized by newSV() assuming one byte of output for every byte of
2651 * input. This routine expects newSV() to allocate an extra byte for a
2652 * trailing NUL, which this routine will append if it gets to the end of
2653 * the input. There may be more bytes of input than output (eg., \N{LATIN
2654 * CAPITAL LETTER A}), or more output than input if the constant ends up
2655 * recoded to utf8, but each time a construct is found that might increase
2656 * the needed size, SvGROW() is called. Its size parameter each time is
2657 * based on the best guess estimate at the time, namely the length used so
2658 * far, plus the length the current construct will occupy, plus room for
2659 * the trailing NUL, plus one byte for every input byte still unscanned */
2660
012bcf8d 2661 UV uv;
4c3a8340
TS
2662#ifdef EBCDIC
2663 UV literal_endpoint = 0;
e294cc5d 2664 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2665#endif
012bcf8d 2666
7918f24d
NC
2667 PERL_ARGS_ASSERT_SCAN_CONST;
2668
bb16bae8 2669 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2670 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2671 /* If we are doing a trans and we know we want UTF8 set expectation */
2672 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2673 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2674 }
2675
2676
79072805 2677 while (s < send || dorange) {
ff3f963a 2678
02aa26ce 2679 /* get transliterations out of the way (they're most literal) */
3280af22 2680 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2681 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2682 if (dorange) {
1ba5c669
JH
2683 I32 i; /* current expanded character */
2684 I32 min; /* first character in range */
2685 I32 max; /* last character in range */
02aa26ce 2686
e294cc5d
JH
2687#ifdef EBCDIC
2688 UV uvmax = 0;
2689#endif
2690
2691 if (has_utf8
2692#ifdef EBCDIC
2693 && !native_range
2694#endif
2695 ) {
9d4ba2ae 2696 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2697 char *e = d++;
2698 while (e-- > c)
2699 *(e + 1) = *e;
25716404 2700 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2701 /* mark the range as done, and continue */
2702 dorange = FALSE;
2703 didrange = TRUE;
2704 continue;
2705 }
2b9d42f0 2706
95a20fc0 2707 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2708#ifdef EBCDIC
2709 SvGROW(sv,
2710 SvLEN(sv) + (has_utf8 ?
2711 (512 - UTF_CONTINUATION_MARK +
2712 UNISKIP(0x100))
2713 : 256));
2714 /* How many two-byte within 0..255: 128 in UTF-8,
2715 * 96 in UTF-8-mod. */
2716#else
9cbb5ea2 2717 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2718#endif
9cbb5ea2 2719 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2720#ifdef EBCDIC
2721 if (has_utf8) {
2722 int j;
2723 for (j = 0; j <= 1; j++) {
2724 char * const c = (char*)utf8_hop((U8*)d, -1);
2725 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2726 if (j)
2727 min = (U8)uv;
2728 else if (uv < 256)
2729 max = (U8)uv;
2730 else {
2731 max = (U8)0xff; /* only to \xff */
2732 uvmax = uv; /* \x{100} to uvmax */
2733 }
2734 d = c; /* eat endpoint chars */
2735 }
2736 }
2737 else {
2738#endif
2739 d -= 2; /* eat the first char and the - */
2740 min = (U8)*d; /* first char in range */
2741 max = (U8)d[1]; /* last char in range */
2742#ifdef EBCDIC
2743 }
2744#endif
8ada0baa 2745
c2e66d9e 2746 if (min > max) {
01ec43d0 2747 Perl_croak(aTHX_
d1573ac7 2748 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2749 (char)min, (char)max);
c2e66d9e
GS
2750 }
2751
c7f1f016 2752#ifdef EBCDIC
4c3a8340
TS
2753 if (literal_endpoint == 2 &&
2754 ((isLOWER(min) && isLOWER(max)) ||
2755 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2756 if (isLOWER(min)) {
2757 for (i = min; i <= max; i++)
2758 if (isLOWER(i))
db42d148 2759 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2760 } else {
2761 for (i = min; i <= max; i++)
2762 if (isUPPER(i))
db42d148 2763 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2764 }
2765 }
2766 else
2767#endif
2768 for (i = min; i <= max; i++)
e294cc5d
JH
2769#ifdef EBCDIC
2770 if (has_utf8) {
2771 const U8 ch = (U8)NATIVE_TO_UTF(i);
2772 if (UNI_IS_INVARIANT(ch))
2773 *d++ = (U8)i;
2774 else {
2775 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2776 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2777 }
2778 }
2779 else
2780#endif
2781 *d++ = (char)i;
2782
2783#ifdef EBCDIC
2784 if (uvmax) {
2785 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2786 if (uvmax > 0x101)
2787 *d++ = (char)UTF_TO_NATIVE(0xff);
2788 if (uvmax > 0x100)
2789 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2790 }
2791#endif
02aa26ce
NT
2792
2793 /* mark the range as done, and continue */
79072805 2794 dorange = FALSE;
01ec43d0 2795 didrange = TRUE;
4c3a8340
TS
2796#ifdef EBCDIC
2797 literal_endpoint = 0;
2798#endif
79072805 2799 continue;
4e553d73 2800 }
02aa26ce
NT
2801
2802 /* range begins (ignore - as first or last char) */
79072805 2803 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2804 if (didrange) {
1fafa243 2805 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2806 }
e294cc5d
JH
2807 if (has_utf8
2808#ifdef EBCDIC
2809 && !native_range
2810#endif
2811 ) {
25716404 2812 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2813 s++;
2814 continue;
2815 }
79072805
LW
2816 dorange = TRUE;
2817 s++;
01ec43d0
GS
2818 }
2819 else {
2820 didrange = FALSE;
4c3a8340
TS
2821#ifdef EBCDIC
2822 literal_endpoint = 0;
e294cc5d 2823 native_range = TRUE;
4c3a8340 2824#endif
01ec43d0 2825 }
79072805 2826 }
02aa26ce
NT
2827
2828 /* if we get here, we're not doing a transliteration */
2829
0f5d15d6
IZ
2830 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2831 except for the last char, which will be done separately. */
3280af22 2832 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2833 if (s[2] == '#') {
e994fd66 2834 while (s+1 < send && *s != ')')
db42d148 2835 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2836 }
2837 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2838 || (s[2] == '?' && s[3] == '{'))
155aba94 2839 {
cc6b7395 2840 I32 count = 1;
0f5d15d6 2841 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2842 char c;
2843
d9f97599
GS
2844 while (count && (c = *regparse)) {
2845 if (c == '\\' && regparse[1])
2846 regparse++;
4e553d73 2847 else if (c == '{')
cc6b7395 2848 count++;
4e553d73 2849 else if (c == '}')
cc6b7395 2850 count--;
d9f97599 2851 regparse++;
cc6b7395 2852 }
e994fd66 2853 if (*regparse != ')')
5bdf89e7 2854 regparse--; /* Leave one char for continuation. */
0f5d15d6 2855 while (s < regparse)
db42d148 2856 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2857 }
748a9306 2858 }
02aa26ce
NT
2859
2860 /* likewise skip #-initiated comments in //x patterns */
3280af22 2861 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2862 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2863 while (s+1 < send && *s != '\n')
db42d148 2864 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2865 }
02aa26ce 2866
5d1d4326 2867 /* check for embedded arrays
da6eedaa 2868 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2869 */
1749ea0d
TS
2870 else if (*s == '@' && s[1]) {
2871 if (isALNUM_lazy_if(s+1,UTF))
2872 break;
2873 if (strchr(":'{$", s[1]))
2874 break;
2875 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2876 break; /* in regexp, neither @+ nor @- are interpolated */
2877 }
02aa26ce
NT
2878
2879 /* check for embedded scalars. only stop if we're sure it's a
2880 variable.
2881 */
79072805 2882 else if (*s == '$') {
3280af22 2883 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2884 break;
77772344 2885 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2886 if (s[1] == '\\') {
2887 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2888 "Possible unintended interpolation of $\\ in regex");
77772344 2889 }
79072805 2890 break; /* in regexp, $ might be tail anchor */
77772344 2891 }
79072805 2892 }
02aa26ce 2893
2b9d42f0
NIS
2894 /* End of else if chain - OP_TRANS rejoin rest */
2895
02aa26ce 2896 /* backslashes */
79072805 2897 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2898 char* e; /* Can be used for ending '}', etc. */
2899
79072805 2900 s++;
02aa26ce 2901
7d0fc23c
KW
2902 /* warn on \1 - \9 in substitution replacements, but note that \11
2903 * is an octal; and \19 is \1 followed by '9' */
3280af22 2904 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2905 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2906 {
a2a5de95 2907 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2908 *--s = '$';
2909 break;
2910 }
02aa26ce
NT
2911
2912 /* string-change backslash escapes */
3280af22 2913 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2914 --s;
2915 break;
2916 }
ff3f963a
KW
2917 /* In a pattern, process \N, but skip any other backslash escapes.
2918 * This is because we don't want to translate an escape sequence
2919 * into a meta symbol and have the regex compiler use the meta
2920 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2921 * in spite of this, we do have to process \N here while the proper
2922 * charnames handler is in scope. See bugs #56444 and #62056.
2923 * There is a complication because \N in a pattern may also stand
2924 * for 'match a non-nl', and not mean a charname, in which case its
2925 * processing should be deferred to the regex compiler. To be a
2926 * charname it must be followed immediately by a '{', and not look
2927 * like \N followed by a curly quantifier, i.e., not something like
2928 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2929 * quantifier */
2930 else if (PL_lex_inpat
2931 && (*s != 'N'
2932 || s[1] != '{'
2933 || regcurly(s + 1)))
2934 {
cc74c5bd
TS
2935 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2936 goto default_action;
2937 }
02aa26ce 2938
79072805 2939 switch (*s) {
02aa26ce
NT
2940
2941 /* quoted - in transliterations */
79072805 2942 case '-':
3280af22 2943 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2944 *d++ = *s++;
2945 continue;
2946 }
2947 /* FALL THROUGH */
2948 default:
11b8faa4 2949 {
a2a5de95
NC
2950 if ((isALPHA(*s) || isDIGIT(*s)))
2951 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2952 "Unrecognized escape \\%c passed through",
2953 *s);
11b8faa4 2954 /* default action is to copy the quoted character */
f9a63242 2955 goto default_action;
11b8faa4 2956 }
02aa26ce 2957
632403cc 2958 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2959 case '0': case '1': case '2': case '3':
2960 case '4': case '5': case '6': case '7':
ba210ebe 2961 {
53305cf1
NC
2962 I32 flags = 0;
2963 STRLEN len = 3;
77a135fe 2964 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2965 s += len;
2966 }
012bcf8d 2967 goto NUM_ESCAPE_INSERT;
02aa26ce 2968
f0a2b745
KW
2969 /* eg. \o{24} indicates the octal constant \024 */
2970 case 'o':
2971 {
2972 STRLEN len;
454155d9 2973 const char* error;
f0a2b745 2974
454155d9 2975 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2976 s += len;
454155d9 2977 if (! valid) {
f0a2b745
KW
2978 yyerror(error);
2979 continue;
2980 }
2981 goto NUM_ESCAPE_INSERT;
2982 }
2983
77a135fe 2984 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2985 case 'x':
a0ed51b3
LW
2986 ++s;
2987 if (*s == '{') {
9d4ba2ae 2988 char* const e = strchr(s, '}');
a4c04bdc
NC
2989 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2990 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2991 STRLEN len;
355860ce 2992
53305cf1 2993 ++s;
adaeee49 2994 if (!e) {
a0ed51b3 2995 yyerror("Missing right brace on \\x{}");
355860ce 2996 continue;
ba210ebe 2997 }
53305cf1 2998 len = e - s;
77a135fe 2999 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 3000 s = e + 1;
a0ed51b3
LW
3001 }
3002 else {
ba210ebe 3003 {
53305cf1 3004 STRLEN len = 2;
a4c04bdc 3005 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3006 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3007 s += len;
3008 }
012bcf8d
GS
3009 }
3010
3011 NUM_ESCAPE_INSERT:
ff3f963a
KW
3012 /* Insert oct or hex escaped character. There will always be
3013 * enough room in sv since such escapes will be longer than any
3014 * UTF-8 sequence they can end up as, except if they force us
3015 * to recode the rest of the string into utf8 */
ba7cea30 3016
77a135fe 3017 /* Here uv is the ordinal of the next character being added in
ff3f963a 3018 * unicode (converted from native). */
77a135fe 3019 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3020 if (!has_utf8 && uv > 255) {
77a135fe
KW
3021 /* Might need to recode whatever we have accumulated so
3022 * far if it contains any chars variant in utf8 or
3023 * utf-ebcdic. */
3024
3025 SvCUR_set(sv, d - SvPVX_const(sv));
3026 SvPOK_on(sv);
3027 *d = '\0';
77a135fe 3028 /* See Note on sizing above. */
7bf79863
KW
3029 sv_utf8_upgrade_flags_grow(sv,
3030 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3031 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3032 d = SvPVX(sv) + SvCUR(sv);
3033 has_utf8 = TRUE;
012bcf8d
GS
3034 }
3035
77a135fe
KW
3036 if (has_utf8) {
3037 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3038 if (PL_lex_inwhat == OP_TRANS &&
3039 PL_sublex_info.sub_op) {
3040 PL_sublex_info.sub_op->op_private |=
3041 (PL_lex_repl ? OPpTRANS_FROM_UTF
3042 : OPpTRANS_TO_UTF);
f9a63242 3043 }
e294cc5d
JH
3044#ifdef EBCDIC
3045 if (uv > 255 && !dorange)
3046 native_range = FALSE;
3047#endif
012bcf8d 3048 }
a0ed51b3 3049 else {
012bcf8d 3050 *d++ = (char)uv;
a0ed51b3 3051 }
012bcf8d
GS
3052 }
3053 else {
c4d5f83a 3054 *d++ = (char) uv;
a0ed51b3 3055 }
79072805 3056 continue;
02aa26ce 3057
4a2d328f 3058 case 'N':
ff3f963a
KW
3059 /* In a non-pattern \N must be a named character, like \N{LATIN
3060 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3061 * mean to match a non-newline. For non-patterns, named
3062 * characters are converted to their string equivalents. In
3063 * patterns, named characters are not converted to their
3064 * ultimate forms for the same reasons that other escapes
3065 * aren't. Instead, they are converted to the \N{U+...} form
3066 * to get the value from the charnames that is in effect right
3067 * now, while preserving the fact that it was a named character
3068 * so that the regex compiler knows this */
3069
3070 /* This section of code doesn't generally use the
3071 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3072 * a close examination of this macro and determined it is a
3073 * no-op except on utfebcdic variant characters. Every
3074 * character generated by this that would normally need to be
3075 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3076 * needed, and would complicate use of copy(). XXX There are
3077 * other parts of this file where the macro is used
3078 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3079
3080 /* The structure of this section of code (besides checking for
3081 * errors and upgrading to utf8) is:
3082 * Further disambiguate between the two meanings of \N, and if
3083 * not a charname, go process it elsewhere
0a96133f
KW
3084 * If of form \N{U+...}, pass it through if a pattern;
3085 * otherwise convert to utf8
3086 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3087 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3088
3089 /* Here, s points to the 'N'; the test below is guaranteed to
3090 * succeed if we are being called on a pattern as we already
3091 * know from a test above that the next character is a '{'.
3092 * On a non-pattern \N must mean 'named sequence, which
3093 * requires braces */
3094 s++;
3095 if (*s != '{') {
3096 yyerror("Missing braces on \\N{}");
3097 continue;
3098 }
3099 s++;
3100
0a96133f 3101 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3102 if (! (e = strchr(s, '}'))) {
3103 if (! PL_lex_inpat) {
5777a3f7 3104 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3105 } else {
3106 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3107 }
0a96133f 3108 continue;
ff3f963a 3109 }
cddc7ef4 3110
ff3f963a 3111 /* Here it looks like a named character */
cddc7ef4 3112
ff3f963a
KW
3113 if (PL_lex_inpat) {
3114
3115 /* XXX This block is temporary code. \N{} implies that the
3116 * pattern is to have Unicode semantics, and therefore
3117 * currently has to be encoded in utf8. By putting it in
3118 * utf8 now, we save a whole pass in the regular expression
3119 * compiler. Once that code is changed so Unicode
3120 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3121 * block should be removed. However, the code that parses
3122 * the output of this would have to be changed to not
3123 * necessarily expect utf8 */
ff3f963a 3124 if (!has_utf8) {
77a135fe 3125 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3126 SvPOK_on(sv);
e4f3eed8 3127 *d = '\0';
77a135fe 3128 /* See Note on sizing above. */
7bf79863 3129 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3130 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3131 /* 5 = '\N{' + cur char + NUL */
3132 (STRLEN)(send - s) + 5);
f08d6ad9 3133 d = SvPVX(sv) + SvCUR(sv);
89491803 3134 has_utf8 = TRUE;
ff3f963a
KW
3135 }
3136 }
423cee85 3137
ff3f963a
KW
3138 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3139 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3140 | PERL_SCAN_DISALLOW_PREFIX;
3141 STRLEN len;
3142
3143 /* For \N{U+...}, the '...' is a unicode value even on
3144 * EBCDIC machines */
3145 s += 2; /* Skip to next char after the 'U+' */
3146 len = e - s;
3147 uv = grok_hex(s, &len, &flags, NULL);
3148 if (len == 0 || len != (STRLEN)(e - s)) {
3149 yyerror("Invalid hexadecimal number in \\N{U+...}");
3150 s = e + 1;
3151 continue;
3152 }
3153
3154 if (PL_lex_inpat) {
3155
e2a7e165
KW
3156 /* On non-EBCDIC platforms, pass through to the regex
3157 * compiler unchanged. The reason we evaluated the
3158 * number above is to make sure there wasn't a syntax
3159 * error. But on EBCDIC we convert to native so
3160 * downstream code can continue to assume it's native
3161 */
ff3f963a 3162 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3163#ifdef EBCDIC
3164 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3165 and the \0 */
3166 "\\N{U+%X}",
3167 (unsigned int) UNI_TO_NATIVE(uv));
3168#else
ff3f963a
KW
3169 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3170 d += e - s + 1;
e2a7e165 3171#endif
ff3f963a
KW
3172 }
3173 else { /* Not a pattern: convert the hex to string */
3174
3175 /* If destination is not in utf8, unconditionally
3176 * recode it to be so. This is because \N{} implies
3177 * Unicode semantics, and scalars have to be in utf8
3178 * to guarantee those semantics */
3179 if (! has_utf8) {
3180 SvCUR_set(sv, d - SvPVX_const(sv));
3181 SvPOK_on(sv);
3182 *d = '\0';
3183 /* See Note on sizing above. */
3184 sv_utf8_upgrade_flags_grow(
3185 sv,
3186 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3187 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3188 d = SvPVX(sv) + SvCUR(sv);
3189 has_utf8 = TRUE;
3190 }
3191
3192 /* Add the string to the output */
3193 if (UNI_IS_INVARIANT(uv)) {
3194 *d++ = (char) uv;
3195 }
3196 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3197 }
3198 }
3199 else { /* Here is \N{NAME} but not \N{U+...}. */
3200
3201 SV *res; /* result from charnames */
3202 const char *str; /* the string in 'res' */
3203 STRLEN len; /* its length */
3204
3205 /* Get the value for NAME */
3206 res = newSVpvn(s, e - s);
3207 res = new_constant( NULL, 0, "charnames",
3208 /* includes all of: \N{...} */
3209 res, NULL, s - 3, e - s + 4 );
3210
3211 /* Most likely res will be in utf8 already since the
3212 * standard charnames uses pack U, but a custom translator
3213 * can leave it otherwise, so make sure. XXX This can be
3214 * revisited to not have charnames use utf8 for characters
3215 * that don't need it when regexes don't have to be in utf8
3216 * for Unicode semantics. If doing so, remember EBCDIC */
3217 sv_utf8_upgrade(res);
3218 str = SvPV_const(res, len);
3219
3220 /* Don't accept malformed input */
3221 if (! is_utf8_string((U8 *) str, len)) {
3222 yyerror("Malformed UTF-8 returned by \\N");
3223 }
3224 else if (PL_lex_inpat) {
3225
3226 if (! len) { /* The name resolved to an empty string */
3227 Copy("\\N{}", d, 4, char);
3228 d += 4;
3229 }
3230 else {
3231 /* In order to not lose information for the regex
3232 * compiler, pass the result in the specially made
3233 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3234 * the code points in hex of each character
3235 * returned by charnames */
3236
3237 const char *str_end = str + len;
3238 STRLEN char_length; /* cur char's byte length */
3239 STRLEN output_length; /* and the number of bytes
3240 after this is translated
3241 into hex digits */
3242 const STRLEN off = d - SvPVX_const(sv);
3243
3244 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3245 * max('U+', '.'); and 1 for NUL */
3246 char hex_string[2 * UTF8_MAXBYTES + 5];
3247
3248 /* Get the first character of the result. */
3249 U32 uv = utf8n_to_uvuni((U8 *) str,
3250 len,
3251 &char_length,
3252 UTF8_ALLOW_ANYUV);
3253
3254 /* The call to is_utf8_string() above hopefully
3255 * guarantees that there won't be an error. But
3256 * it's easy here to make sure. The function just
3257 * above warns and returns 0 if invalid utf8, but
3258 * it can also return 0 if the input is validly a
3259 * NUL. Disambiguate */
3260 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3261 uv = UNICODE_REPLACEMENT;
3262 }
3263
3264 /* Convert first code point to hex, including the
e2a7e165
KW
3265 * boiler plate before it. For all these, we
3266 * convert to native format so that downstream code
3267 * can continue to assume the input is native */
78c35590 3268 output_length =
3353de27 3269 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3270 "\\N{U+%X",
3271 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3272
3273 /* Make sure there is enough space to hold it */
3274 d = off + SvGROW(sv, off
3275 + output_length
3276 + (STRLEN)(send - e)
3277 + 2); /* '}' + NUL */
3278 /* And output it */
3279 Copy(hex_string, d, output_length, char);
3280 d += output_length;
3281
3282 /* For each subsequent character, append dot and
3283 * its ordinal in hex */
3284 while ((str += char_length) < str_end) {
3285 const STRLEN off = d - SvPVX_const(sv);
3286 U32 uv = utf8n_to_uvuni((U8 *) str,
3287 str_end - str,
3288 &char_length,
3289 UTF8_ALLOW_ANYUV);
3290 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3291 uv = UNICODE_REPLACEMENT;
3292 }
3293
78c35590 3294 output_length =
3353de27 3295 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3296 ".%X",
3297 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3298
3299 d = off + SvGROW(sv, off
3300 + output_length
3301 + (STRLEN)(send - e)
3302 + 2); /* '}' + NUL */
3303 Copy(hex_string, d, output_length, char);
3304 d += output_length;
3305 }
3306
3307 *d++ = '}'; /* Done. Add the trailing brace */
3308 }
3309 }
3310 else { /* Here, not in a pattern. Convert the name to a
3311 * string. */
3312
3313 /* If destination is not in utf8, unconditionally
3314 * recode it to be so. This is because \N{} implies
3315 * Unicode semantics, and scalars have to be in utf8
3316 * to guarantee those semantics */
3317 if (! has_utf8) {
3318 SvCUR_set(sv, d - SvPVX_const(sv));
3319 SvPOK_on(sv);
3320 *d = '\0';
3321 /* See Note on sizing above. */
3322 sv_utf8_upgrade_flags_grow(sv,
3323 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3324 len + (STRLEN)(send - s) + 1);
3325 d = SvPVX(sv) + SvCUR(sv);
3326 has_utf8 = TRUE;
3327 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3328
3329 /* See Note on sizing above. (NOTE: SvCUR() is not
3330 * set correctly here). */
3331 const STRLEN off = d - SvPVX_const(sv);
3332 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3333 }
3334 Copy(str, d, len, char);
3335 d += len;
423cee85 3336 }
423cee85 3337 SvREFCNT_dec(res);
cb233ae3
KW
3338
3339 /* Deprecate non-approved name syntax */
3340 if (ckWARN_d(WARN_DEPRECATED)) {
3341 bool problematic = FALSE;
3342 char* i = s;
3343
3344 /* For non-ut8 input, look to see that the first
3345 * character is an alpha, then loop through the rest
3346 * checking that each is a continuation */
3347 if (! this_utf8) {
3348 if (! isALPHAU(*i)) problematic = TRUE;
3349 else for (i = s + 1; i < e; i++) {
3350 if (isCHARNAME_CONT(*i)) continue;
3351 problematic = TRUE;
3352 break;
3353 }
3354 }
3355 else {
3356 /* Similarly for utf8. For invariants can check
3357 * directly. We accept anything above the latin1
3358 * range because it is immaterial to Perl if it is
3359 * correct or not, and is expensive to check. But
3360 * it is fairly easy in the latin1 range to convert
3361 * the variants into a single character and check
3362 * those */
3363 if (UTF8_IS_INVARIANT(*i)) {
3364 if (! isALPHAU(*i)) problematic = TRUE;
3365 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3366 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3367 *(i+1)))))
3368 {
3369 problematic = TRUE;
3370 }
3371 }
3372 if (! problematic) for (i = s + UTF8SKIP(s);
3373 i < e;
3374 i+= UTF8SKIP(i))
3375 {
3376 if (UTF8_IS_INVARIANT(*i)) {
3377 if (isCHARNAME_CONT(*i)) continue;
3378 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3379 continue;
3380 } else if (isCHARNAME_CONT(
3381 UNI_TO_NATIVE(
81c14aa2 3382 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3383 {
3384 continue;
3385 }
3386 problematic = TRUE;
3387 break;
3388 }
3389 }
3390 if (problematic) {
6e1bad6c
KW
3391 /* The e-i passed to the final %.*s makes sure that
3392 * should the trailing NUL be missing that this
3393 * print won't run off the end of the string */
cb233ae3 3394 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3395 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3396 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3397 }
3398 }
3399 } /* End \N{NAME} */
ff3f963a
KW
3400#ifdef EBCDIC
3401 if (!dorange)
3402 native_range = FALSE; /* \N{} is defined to be Unicode */
3403#endif
3404 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3405 continue;
3406
02aa26ce 3407 /* \c is a control character */
79072805
LW
3408 case 'c':
3409 s++;
961ce445 3410 if (s < send) {
17a3df4c 3411 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3412 }
961ce445
RGS
3413 else {
3414 yyerror("Missing control char name in \\c");
3415 }
79072805 3416 continue;
02aa26ce
NT
3417
3418 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3419 case 'b':
db42d148 3420 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3421 break;
3422 case 'n':
db42d148 3423 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3424 break;
3425 case 'r':
db42d148 3426 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3427 break;
3428 case 'f':
db42d148 3429 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3430 break;
3431 case 't':
db42d148 3432 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3433 break;
34a3fe2a 3434 case 'e':
db42d148 3435 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3436 break;
3437 case 'a':
db42d148 3438 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3439 break;
02aa26ce
NT
3440 } /* end switch */
3441
79072805
LW
3442 s++;
3443 continue;
02aa26ce 3444 } /* end if (backslash) */
4c3a8340
TS
3445#ifdef EBCDIC
3446 else
3447 literal_endpoint++;
3448#endif
02aa26ce 3449
f9a63242 3450 default_action:
77a135fe
KW
3451 /* If we started with encoded form, or already know we want it,
3452 then encode the next character */
3453 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3454 STRLEN len = 1;
77a135fe
KW
3455
3456
3457 /* One might think that it is wasted effort in the case of the
3458 * source being utf8 (this_utf8 == TRUE) to take the next character
3459 * in the source, convert it to an unsigned value, and then convert
3460 * it back again. But the source has not been validated here. The
3461 * routine that does the conversion checks for errors like
3462 * malformed utf8 */
3463
5f66b61c
AL
3464 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3465 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3466 if (!has_utf8) {
3467 SvCUR_set(sv, d - SvPVX_const(sv));
3468 SvPOK_on(sv);
3469 *d = '\0';
77a135fe 3470 /* See Note on sizing above. */
7bf79863
KW
3471 sv_utf8_upgrade_flags_grow(sv,
3472 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3473 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3474 d = SvPVX(sv) + SvCUR(sv);
3475 has_utf8 = TRUE;
3476 } else if (need > len) {
3477 /* encoded value larger than old, may need extra space (NOTE:
3478 * SvCUR() is not set correctly here). See Note on sizing
3479 * above. */
9d4ba2ae 3480 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3481 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3482 }
77a135fe
KW
3483 s += len;
3484
5f66b61c 3485 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3486#ifdef EBCDIC
3487 if (uv > 255 && !dorange)
3488 native_range = FALSE;
3489#endif
2b9d42f0
NIS
3490 }
3491 else {
3492 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3493 }
02aa26ce
NT
3494 } /* while loop to process each character */
3495
3496 /* terminate the string and set up the sv */
79072805 3497 *d = '\0';
95a20fc0 3498 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3499 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3500 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3501
79072805 3502 SvPOK_on(sv);
9f4817db 3503 if (PL_encoding && !has_utf8) {
d0063567
DK
3504 sv_recode_to_utf8(sv, PL_encoding);
3505 if (SvUTF8(sv))
3506 has_utf8 = TRUE;
9f4817db 3507 }
2b9d42f0 3508 if (has_utf8) {
7e2040f0 3509 SvUTF8_on(sv);
2b9d42f0 3510 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3511 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3512 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3513 }
3514 }
79072805 3515
02aa26ce 3516 /* shrink the sv if we allocated more than we used */
79072805 3517 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3518 SvPV_shrink_to_cur(sv);
79072805 3519 }
02aa26ce 3520
6154021b 3521 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3522 if (s > PL_bufptr) {
eb0d8d16
NC
3523 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3524 const char *const key = PL_lex_inpat ? "qr" : "q";
3525 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3526 const char *type;
3527 STRLEN typelen;
3528
3529 if (PL_lex_inwhat == OP_TRANS) {
3530 type = "tr";
3531 typelen = 2;
3532 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3533 type = "s";
3534 typelen = 1;
3535 } else {
3536 type = "qq";
3537 typelen = 2;
3538 }
3539
3540 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3541 type, typelen);
3542 }
6154021b 3543 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3544 } else
8990e307 3545 SvREFCNT_dec(sv);
79072805
LW
3546 return s;
3547}
3548
ffb4593c
NT
3549/* S_intuit_more
3550 * Returns TRUE if there's more to the expression (e.g., a subscript),
3551 * FALSE otherwise.
ffb4593c
NT
3552 *
3553 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3554 *
3555 * ->[ and ->{ return TRUE
3556 * { and [ outside a pattern are always subscripts, so return TRUE
3557 * if we're outside a pattern and it's not { or [, then return FALSE
3558 * if we're in a pattern and the first char is a {
3559 * {4,5} (any digits around the comma) returns FALSE
3560 * if we're in a pattern and the first char is a [
3561 * [] returns FALSE
3562 * [SOMETHING] has a funky algorithm to decide whether it's a
3563 * character class or not. It has to deal with things like
3564 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3565 * anything else returns TRUE
3566 */
3567
9cbb5ea2
GS
3568/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3569
76e3520e 3570STATIC int
cea2e8a9 3571S_intuit_more(pTHX_ register char *s)
79072805 3572{
97aff369 3573 dVAR;
7918f24d
NC
3574
3575 PERL_ARGS_ASSERT_INTUIT_MORE;
3576
3280af22 3577 if (PL_lex_brackets)
79072805
LW
3578 return TRUE;
3579 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3580 return TRUE;
3581 if (*s != '{' && *s != '[')
3582 return FALSE;
3280af22 3583 if (!PL_lex_inpat)
79072805
LW
3584 return TRUE;
3585
3586 /* In a pattern, so maybe we have {n,m}. */
3587 if (*s == '{') {
b3155d95 3588 if (regcurly(s)) {
79072805 3589 return FALSE;
b3155d95 3590 }
79072805 3591 return TRUE;
79072805
LW
3592 }
3593
3594 /* On the other hand, maybe we have a character class */
3595
3596 s++;
3597 if (*s == ']' || *s == '^')
3598 return FALSE;
3599 else {
ffb4593c 3600 /* this is terrifying, and it works */
79072805
LW
3601 int weight = 2; /* let's weigh the evidence */
3602 char seen[256];
f27ffc4a 3603 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3604 const char * const send = strchr(s,']');
3280af22 3605 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3606
3607 if (!send) /* has to be an expression */
3608 return TRUE;
3609
3610 Zero(seen,256,char);
3611 if (*s == '$')
3612 weight -= 3;
3613 else if (isDIGIT(*s)) {
3614 if (s[1] != ']') {
3615 if (isDIGIT(s[1]) && s[2] == ']')
3616 weight -= 10;
3617 }
3618 else
3619 weight -= 100;
3620 }
3621 for (; s < send; s++) {
3622 last_un_char = un_char;
3623 un_char = (unsigned char)*s;
3624 switch (*s) {
3625 case '@':
3626 case '&':
3627 case '$':
3628 weight -= seen[un_char] * 10;
7e2040f0 3629 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3630 int len;
8903cb82 3631 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3632 len = (int)strlen(tmpbuf);
3633 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3634 weight -= 100;
3635 else
3636 weight -= 10;
3637 }
3638 else if (*s == '$' && s[1] &&
93a17b20
LW
3639 strchr("[#!%*<>()-=",s[1])) {
3640 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3641 weight -= 10;
3642 else
3643 weight -= 1;
3644 }
3645 break;
3646 case '\\':
3647 un_char = 254;
3648 if (s[1]) {
93a17b20 3649 if (strchr("wds]",s[1]))
79072805 3650 weight += 100;
10edeb5d 3651 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3652 weight += 1;
93a17b20 3653 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3654 weight += 40;
3655 else if (isDIGIT(s[1])) {
3656 weight += 40;
3657 while (s[1] && isDIGIT(s[1]))
3658 s++;
3659 }
3660 }
3661 else
3662 weight += 100;
3663 break;
3664 case '-':
3665 if (s[1] == '\\')
3666 weight += 50;
93a17b20 3667 if (strchr("aA01! ",last_un_char))
79072805 3668 weight += 30;
93a17b20 3669 if (strchr("zZ79~",s[1]))
79072805 3670 weight += 30;
f27ffc4a
GS
3671 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3672 weight -= 5; /* cope with negative subscript */
79072805
LW
3673 break;
3674 default:
3792a11b
NC
3675 if (!isALNUM(last_un_char)
3676 && !(last_un_char == '$' || last_un_char == '@'
3677 || last_un_char == '&')
3678 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3679 char *d = tmpbuf;
3680 while (isALPHA(*s))
3681 *d++ = *s++;
3682 *d = '\0';
5458a98a 3683 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3684 weight -= 150;
3685 }
3686 if (un_char == last_un_char + 1)
3687 weight += 5;
3688 weight -= seen[un_char];
3689 break;
3690 }
3691 seen[un_char]++;
3692 }
3693 if (weight >= 0) /* probably a character class */
3694 return FALSE;
3695 }
3696
3697 return TRUE;
3698}
ffed7fef 3699
ffb4593c
NT
3700/*
3701 * S_intuit_method
3702 *
3703 * Does all the checking to disambiguate
3704 * foo bar
3705 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3706 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3707 *
3708 * First argument is the stuff after the first token, e.g. "bar".
3709 *
3710 * Not a method if bar is a filehandle.
3711 * Not a method if foo is a subroutine prototyped to take a filehandle.
3712 * Not a method if it's really "Foo $bar"
3713 * Method if it's "foo $bar"
3714 * Not a method if it's really "print foo $bar"
3715 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3716 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3717 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3718 * =>
3719 */
3720
76e3520e 3721STATIC int
62d55b22 3722S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3723{
97aff369 3724 dVAR;
a0d0e21e 3725 char *s = start + (*start == '$');
3280af22 3726 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3727 STRLEN len;
3728 GV* indirgv;
5db06880
NC
3729#ifdef PERL_MAD
3730 int soff;
3731#endif
a0d0e21e 3732
7918f24d
NC
3733 PERL_ARGS_ASSERT_INTUIT_METHOD;
3734
a0d0e21e 3735 if (gv) {
62d55b22 3736 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3737 return 0;
62d55b22
NC
3738 if (cv) {
3739 if (SvPOK(cv)) {
3740 const char *proto = SvPVX_const(cv);
3741 if (proto) {
3742 if (*proto == ';')
3743 proto++;
3744 if (*proto == '*')
3745 return 0;
3746 }
b6c543e3
IZ
3747 }
3748 } else
c35e046a 3749 gv = NULL;
a0d0e21e 3750 }
8903cb82 3751 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3752 /* start is the beginning of the possible filehandle/object,
3753 * and s is the end of it
3754 * tmpbuf is a copy of it
3755 */
3756
a0d0e21e 3757 if (*start == '$') {
3ef1310e
RGS
3758 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3759 isUPPER(*PL_tokenbuf))
a0d0e21e 3760 return 0;
5db06880
NC
3761#ifdef PERL_MAD
3762 len = start - SvPVX(PL_linestr);
3763#endif
29595ff2 3764 s = PEEKSPACE(s);
f0092767 3765#ifdef PERL_MAD
5db06880
NC
3766 start = SvPVX(PL_linestr) + len;
3767#endif
3280af22
NIS
3768 PL_bufptr = start;
3769 PL_expect = XREF;
a0d0e21e
LW
3770 return *s == '(' ? FUNCMETH : METHOD;
3771 }
5458a98a 3772 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3773 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3774 len -= 2;
3775 tmpbuf[len] = '\0';