This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Tweaks to find_uninit_var docs
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a 153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
376fcdbf 275#define UNI2(f,x) { \
6154021b 276 pl_yylval.ival = f; \
376fcdbf
AL
277 PL_expect = x; \
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
6f33ba73
RGS
286#define UNI(f) UNI2(f,XTERM)
287#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 288
376fcdbf 289#define UNIBRACK(f) { \
6154021b 290 pl_yylval.ival = f; \
376fcdbf
AL
291 PL_bufptr = s; \
292 PL_last_uni = PL_oldbufptr; \
293 if (*s == '(') \
294 return REPORT( (int)FUNC1 ); \
29595ff2 295 s = PEEKSPACE(s); \
376fcdbf
AL
296 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
297 }
79072805 298
9f68db38 299/* grandfather return to old style */
78cdf107
Z
300#define OLDLOP(f) \
301 do { \
302 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
303 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
304 pl_yylval.ival = (f); \
305 PL_expect = XTERM; \
306 PL_bufptr = s; \
307 return (int)LSTOP; \
308 } while(0)
79072805 309
8fa7f367
JH
310#ifdef DEBUGGING
311
6154021b 312/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
313enum token_type {
314 TOKENTYPE_NONE,
315 TOKENTYPE_IVAL,
6154021b 316 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
317 TOKENTYPE_PVAL,
318 TOKENTYPE_OPVAL,
319 TOKENTYPE_GVVAL
320};
321
6d4a66ac
NC
322static struct debug_tokens {
323 const int token;
324 enum token_type type;
325 const char *name;
326} const debug_tokens[] =
9041c2e3 327{
bbf60fe6
DM
328 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
329 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
330 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
331 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
332 { ARROW, TOKENTYPE_NONE, "ARROW" },
333 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
334 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
335 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
336 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
337 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 338 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
339 { DO, TOKENTYPE_NONE, "DO" },
340 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
341 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
342 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
343 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
344 { ELSE, TOKENTYPE_NONE, "ELSE" },
345 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
346 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
347 { FOR, TOKENTYPE_IVAL, "FOR" },
348 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
349 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
350 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 351 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
352 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
353 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
354 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 355 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
356 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
357 { IF, TOKENTYPE_IVAL, "IF" },
358 { LABEL, TOKENTYPE_PVAL, "LABEL" },
359 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
360 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
361 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
362 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
363 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
364 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
365 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
366 { MY, TOKENTYPE_IVAL, "MY" },
367 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
368 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
369 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
370 { OROP, TOKENTYPE_IVAL, "OROP" },
371 { OROR, TOKENTYPE_NONE, "OROR" },
372 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
373 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
374 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
375 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
376 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
377 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
378 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
379 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
380 { PREINC, TOKENTYPE_NONE, "PREINC" },
381 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
382 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
383 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
384 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
385 { SUB, TOKENTYPE_NONE, "SUB" },
386 { THING, TOKENTYPE_OPVAL, "THING" },
387 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
388 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
389 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
390 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
391 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
392 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 393 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
394 { WHILE, TOKENTYPE_IVAL, "WHILE" },
395 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 396 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 397 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
398};
399
6154021b 400/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 401
bbf60fe6 402STATIC int
704d4215 403S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 404{
97aff369 405 dVAR;
7918f24d
NC
406
407 PERL_ARGS_ASSERT_TOKEREPORT;
408
bbf60fe6 409 if (DEBUG_T_TEST) {
bd61b366 410 const char *name = NULL;
bbf60fe6 411 enum token_type type = TOKENTYPE_NONE;
f54cb97a 412 const struct debug_tokens *p;
396482e1 413 SV* const report = newSVpvs("<== ");
bbf60fe6 414
f54cb97a 415 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
416 if (p->token == (int)rv) {
417 name = p->name;
418 type = p->type;
419 break;
420 }
421 }
422 if (name)
54667de8 423 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
424 else if ((char)rv > ' ' && (char)rv < '~')
425 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
426 else if (!rv)
396482e1 427 sv_catpvs(report, "EOF");
bbf60fe6
DM
428 else
429 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
430 switch (type) {
431 case TOKENTYPE_NONE:
432 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
433 break;
434 case TOKENTYPE_IVAL:
704d4215 435 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
436 break;
437 case TOKENTYPE_OPNUM:
438 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 439 PL_op_name[lvalp->ival]);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_PVAL:
704d4215 442 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
443 break;
444 case TOKENTYPE_OPVAL:
704d4215 445 if (lvalp->opval) {
401441c0 446 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
447 PL_op_name[lvalp->opval->op_type]);
448 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 449 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 450 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
451 }
452
453 }
401441c0 454 else
396482e1 455 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
456 break;
457 }
b6007c36 458 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
459 };
460 return (int)rv;
998054bd
SC
461}
462
b6007c36
DM
463
464/* print the buffer with suitable escapes */
465
466STATIC void
15f169a1 467S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 468{
396482e1 469 SV* const tmp = newSVpvs("");
7918f24d
NC
470
471 PERL_ARGS_ASSERT_PRINTBUF;
472
b6007c36
DM
473 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
474 SvREFCNT_dec(tmp);
475}
476
8fa7f367
JH
477#endif
478
8290c323
NC
479static int
480S_deprecate_commaless_var_list(pTHX) {
481 PL_expect = XTERM;
482 deprecate("comma-less variable list");
483 return REPORT(','); /* grandfather non-comma-format format */
484}
485
ffb4593c
NT
486/*
487 * S_ao
488 *
c963b151
BD
489 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
490 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
491 */
492
76e3520e 493STATIC int
cea2e8a9 494S_ao(pTHX_ int toketype)
a0d0e21e 495{
97aff369 496 dVAR;
3280af22
NIS
497 if (*PL_bufptr == '=') {
498 PL_bufptr++;
a0d0e21e 499 if (toketype == ANDAND)
6154021b 500 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 501 else if (toketype == OROR)
6154021b 502 pl_yylval.ival = OP_ORASSIGN;
c963b151 503 else if (toketype == DORDOR)
6154021b 504 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
505 toketype = ASSIGNOP;
506 }
507 return toketype;
508}
509
ffb4593c
NT
510/*
511 * S_no_op
512 * When Perl expects an operator and finds something else, no_op
513 * prints the warning. It always prints "<something> found where
514 * operator expected. It prints "Missing semicolon on previous line?"
515 * if the surprise occurs at the start of the line. "do you need to
516 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
517 * where the compiler doesn't know if foo is a method call or a function.
518 * It prints "Missing operator before end of line" if there's nothing
519 * after the missing operator, or "... before <...>" if there is something
520 * after the missing operator.
521 */
522
76e3520e 523STATIC void
15f169a1 524S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 525{
97aff369 526 dVAR;
9d4ba2ae
AL
527 char * const oldbp = PL_bufptr;
528 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 529
7918f24d
NC
530 PERL_ARGS_ASSERT_NO_OP;
531
1189a94a
GS
532 if (!s)
533 s = oldbp;
07c798fb 534 else
1189a94a 535 PL_bufptr = s;
cea2e8a9 536 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
537 if (ckWARN_d(WARN_SYNTAX)) {
538 if (is_first)
539 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
540 "\t(Missing semicolon on previous line?)\n");
541 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 542 const char *t;
c35e046a
AL
543 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
544 NOOP;
56da5a46
RGS
545 if (t < PL_bufptr && isSPACE(*t))
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
547 "\t(Do you need to predeclare %.*s?)\n",
551405c4 548 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
549 }
550 else {
551 assert(s >= oldbp);
552 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 553 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 554 }
07c798fb 555 }
3280af22 556 PL_bufptr = oldbp;
8990e307
LW
557}
558
ffb4593c
NT
559/*
560 * S_missingterm
561 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 562 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
563 * If we're in a delimited string and the delimiter is a control
564 * character, it's reformatted into a two-char sequence like ^C.
565 * This is fatal.
566 */
567
76e3520e 568STATIC void
cea2e8a9 569S_missingterm(pTHX_ char *s)
8990e307 570{
97aff369 571 dVAR;
8990e307
LW
572 char tmpbuf[3];
573 char q;
574 if (s) {
9d4ba2ae 575 char * const nl = strrchr(s,'\n');
d2719217 576 if (nl)
8990e307
LW
577 *nl = '\0';
578 }
463559e7 579 else if (isCNTRL(PL_multi_close)) {
8990e307 580 *tmpbuf = '^';
585ec06d 581 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
582 tmpbuf[2] = '\0';
583 s = tmpbuf;
584 }
585 else {
eb160463 586 *tmpbuf = (char)PL_multi_close;
8990e307
LW
587 tmpbuf[1] = '\0';
588 s = tmpbuf;
589 }
590 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 591 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 592}
79072805 593
0d863452 594/*
0d863452
RH
595 * Check whether the named feature is enabled.
596 */
26ea9e12
NC
597bool
598Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 599{
97aff369 600 dVAR;
0d863452 601 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 602 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
603
604 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
605
26ea9e12
NC
606 if (namelen > MAX_FEATURE_LEN)
607 return FALSE;
4a731d7b 608 memcpy(&he_name[8], name, namelen);
d4c19fe8 609
7b9ef140 610 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
611}
612
ffb4593c 613/*
9cbb5ea2
GS
614 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
615 * utf16-to-utf8-reversed.
ffb4593c
NT
616 */
617
c39cd008
GS
618#ifdef PERL_CR_FILTER
619static void
620strip_return(SV *sv)
621{
95a20fc0 622 register const char *s = SvPVX_const(sv);
9d4ba2ae 623 register const char * const e = s + SvCUR(sv);
7918f24d
NC
624
625 PERL_ARGS_ASSERT_STRIP_RETURN;
626
c39cd008
GS
627 /* outer loop optimized to do nothing if there are no CR-LFs */
628 while (s < e) {
629 if (*s++ == '\r' && *s == '\n') {
630 /* hit a CR-LF, need to copy the rest */
631 register char *d = s - 1;
632 *d++ = *s++;
633 while (s < e) {
634 if (*s == '\r' && s[1] == '\n')
635 s++;
636 *d++ = *s++;
637 }
638 SvCUR(sv) -= s - d;
639 return;
640 }
641 }
642}
a868473f 643
76e3520e 644STATIC I32
c39cd008 645S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 646{
f54cb97a 647 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
648 if (count > 0 && !maxlen)
649 strip_return(sv);
650 return count;
a868473f
NIS
651}
652#endif
653
ffb4593c 654/*
8eaa0acf
Z
655=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
656
657Creates and initialises a new lexer/parser state object, supplying
658a context in which to lex and parse from a new source of Perl code.
659A pointer to the new state object is placed in L</PL_parser>. An entry
660is made on the save stack so that upon unwinding the new state object
661will be destroyed and the former value of L</PL_parser> will be restored.
662Nothing else need be done to clean up the parsing context.
663
664The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
665non-null, provides a string (in SV form) containing code to be parsed.
666A copy of the string is made, so subsequent modification of I<line>
667does not affect parsing. I<rsfp>, if non-null, provides an input stream
668from which code will be read to be parsed. If both are non-null, the
669code in I<line> comes first and must consist of complete lines of input,
670and I<rsfp> supplies the remainder of the source.
671
e368b3bd
FC
672The I<flags> parameter is reserved for future use. Currently it is only
673used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
674
675=cut
676*/
ffb4593c 677
27fcb6ee
FC
678/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
679 can share filters with the current parser. */
680
a0d0e21e 681void
8eaa0acf 682Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 683{
97aff369 684 dVAR;
6ef55633 685 const char *s = NULL;
5486870f 686 yy_parser *parser, *oparser;
60d63348 687 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 688 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
689
690 /* create and initialise a parser */
691
199e78b7 692 Newxz(parser, 1, yy_parser);
5486870f 693 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
694 PL_parser = parser;
695
28ac2b49
Z
696 parser->stack = NULL;
697 parser->ps = NULL;
698 parser->stack_size = 0;
acdf0a21 699
e3abe207
DM
700 /* on scope exit, free this parser and restore any outer one */
701 SAVEPARSER(parser);
7c4baf47 702 parser->saved_curcop = PL_curcop;
e3abe207 703
acdf0a21 704 /* initialise lexer state */
8990e307 705
fb205e7a
DM
706#ifdef PERL_MAD
707 parser->curforce = -1;
708#else
709 parser->nexttoke = 0;
710#endif
ca4cfd28 711 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 712 parser->copline = NOLINE;
5afb0a62 713 parser->lex_state = LEX_NORMAL;
c2598295 714 parser->expect = XSTATE;
2f9285f8 715 parser->rsfp = rsfp;
27fcb6ee
FC
716 parser->rsfp_filters =
717 !(flags & LEX_START_SAME_FILTER) || !oparser
718 ? newAV()
719 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 720
199e78b7
DM
721 Newx(parser->lex_brackstack, 120, char);
722 Newx(parser->lex_casestack, 12, char);
723 *parser->lex_casestack = '\0';
02b34bbe 724
10efb74f 725 if (line) {
0528fd32 726 STRLEN len;
10efb74f 727 s = SvPV_const(line, len);
0abcdfa4
FC
728 parser->linestr = flags & LEX_START_COPIED
729 ? SvREFCNT_inc_simple_NN(line)
730 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 731 if (!len || s[len-1] != ';')
bdc0bf6f 732 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
733 } else {
734 parser->linestr = newSVpvs("\n;");
8990e307 735 }
f06b5848
DM
736 parser->oldoldbufptr =
737 parser->oldbufptr =
738 parser->bufptr =
739 parser->linestart = SvPVX(parser->linestr);
740 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
741 parser->last_lop = parser->last_uni = NULL;
60d63348 742 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES);
737c24fc 743
60d63348 744 parser->in_pod = parser->filtered = 0;
79072805 745}
a687059c 746
e3abe207
DM
747
748/* delete a parser object */
749
750void
751Perl_parser_free(pTHX_ const yy_parser *parser)
752{
7918f24d
NC
753 PERL_ARGS_ASSERT_PARSER_FREE;
754
7c4baf47 755 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
756 SvREFCNT_dec(parser->linestr);
757
2f9285f8
DM
758 if (parser->rsfp == PerlIO_stdin())
759 PerlIO_clearerr(parser->rsfp);
799361c3
SH
760 else if (parser->rsfp && (!parser->old_parser ||
761 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 762 PerlIO_close(parser->rsfp);
5486870f 763 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 764
e3abe207
DM
765 Safefree(parser->lex_brackstack);
766 Safefree(parser->lex_casestack);
767 PL_parser = parser->old_parser;
768 Safefree(parser);
769}
770
771
ffb4593c 772/*
f0e67a1d
Z
773=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
774
775Buffer scalar containing the chunk currently under consideration of the
776text currently being lexed. This is always a plain string scalar (for
777which C<SvPOK> is true). It is not intended to be used as a scalar by
778normal scalar means; instead refer to the buffer directly by the pointer
779variables described below.
780
781The lexer maintains various C<char*> pointers to things in the
782C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
783reallocated, all of these pointers must be updated. Don't attempt to
784do this manually, but rather use L</lex_grow_linestr> if you need to
785reallocate the buffer.
786
787The content of the text chunk in the buffer is commonly exactly one
788complete line of input, up to and including a newline terminator,
789but there are situations where it is otherwise. The octets of the
790buffer may be intended to be interpreted as either UTF-8 or Latin-1.
791The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
792flag on this scalar, which may disagree with it.
793
794For direct examination of the buffer, the variable
795L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
796lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
797of these pointers is usually preferable to examination of the scalar
798through normal scalar means.
799
800=for apidoc AmxU|char *|PL_parser-E<gt>bufend
801
802Direct pointer to the end of the chunk of text currently being lexed, the
803end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
804+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
805always located at the end of the buffer, and does not count as part of
806the buffer's contents.
807
808=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
809
810Points to the current position of lexing inside the lexer buffer.
811Characters around this point may be freely examined, within
812the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
813L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
814interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
815
816Lexing code (whether in the Perl core or not) moves this pointer past
817the characters that it consumes. It is also expected to perform some
818bookkeeping whenever a newline character is consumed. This movement
819can be more conveniently performed by the function L</lex_read_to>,
820which handles newlines appropriately.
821
822Interpretation of the buffer's octets can be abstracted out by
823using the slightly higher-level functions L</lex_peek_unichar> and
824L</lex_read_unichar>.
825
826=for apidoc AmxU|char *|PL_parser-E<gt>linestart
827
828Points to the start of the current line inside the lexer buffer.
829This is useful for indicating at which column an error occurred, and
830not much else. This must be updated by any lexing code that consumes
831a newline; the function L</lex_read_to> handles this detail.
832
833=cut
834*/
835
836/*
837=for apidoc Amx|bool|lex_bufutf8
838
839Indicates whether the octets in the lexer buffer
840(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
841of Unicode characters. If not, they should be interpreted as Latin-1
842characters. This is analogous to the C<SvUTF8> flag for scalars.
843
844In UTF-8 mode, it is not guaranteed that the lexer buffer actually
845contains valid UTF-8. Lexing code must be robust in the face of invalid
846encoding.
847
848The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
849is significant, but not the whole story regarding the input character
850encoding. Normally, when a file is being read, the scalar contains octets
851and its C<SvUTF8> flag is off, but the octets should be interpreted as
852UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
853however, the scalar may have the C<SvUTF8> flag on, and in this case its
854octets should be interpreted as UTF-8 unless the C<use bytes> pragma
855is in effect. This logic may change in the future; use this function
856instead of implementing the logic yourself.
857
858=cut
859*/
860
861bool
862Perl_lex_bufutf8(pTHX)
863{
864 return UTF;
865}
866
867/*
868=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
869
870Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
871at least I<len> octets (including terminating NUL). Returns a
872pointer to the reallocated buffer. This is necessary before making
873any direct modification of the buffer that would increase its length.
874L</lex_stuff_pvn> provides a more convenient way to insert text into
875the buffer.
876
877Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
878this function updates all of the lexer's variables that point directly
879into the buffer.
880
881=cut
882*/
883
884char *
885Perl_lex_grow_linestr(pTHX_ STRLEN len)
886{
887 SV *linestr;
888 char *buf;
889 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
890 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
891 linestr = PL_parser->linestr;
892 buf = SvPVX(linestr);
893 if (len <= SvLEN(linestr))
894 return buf;
895 bufend_pos = PL_parser->bufend - buf;
896 bufptr_pos = PL_parser->bufptr - buf;
897 oldbufptr_pos = PL_parser->oldbufptr - buf;
898 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
899 linestart_pos = PL_parser->linestart - buf;
900 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
901 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
902 buf = sv_grow(linestr, len);
903 PL_parser->bufend = buf + bufend_pos;
904 PL_parser->bufptr = buf + bufptr_pos;
905 PL_parser->oldbufptr = buf + oldbufptr_pos;
906 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
907 PL_parser->linestart = buf + linestart_pos;
908 if (PL_parser->last_uni)
909 PL_parser->last_uni = buf + last_uni_pos;
910 if (PL_parser->last_lop)
911 PL_parser->last_lop = buf + last_lop_pos;
912 return buf;
913}
914
915/*
83aa740e 916=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
917
918Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
919immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
920reallocating the buffer if necessary. This means that lexing code that
921runs later will see the characters as if they had appeared in the input.
922It is not recommended to do this as part of normal parsing, and most
923uses of this facility run the risk of the inserted characters being
924interpreted in an unintended manner.
925
926The string to be inserted is represented by I<len> octets starting
927at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
928according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
929The characters are recoded for the lexer buffer, according to how the
930buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 931to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
932function is more convenient.
933
934=cut
935*/
936
937void
83aa740e 938Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 939{
749123ff 940 dVAR;
f0e67a1d
Z
941 char *bufptr;
942 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
943 if (flags & ~(LEX_STUFF_UTF8))
944 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
945 if (UTF) {
946 if (flags & LEX_STUFF_UTF8) {
947 goto plain_copy;
948 } else {
949 STRLEN highhalf = 0;
83aa740e 950 const char *p, *e = pv+len;
f0e67a1d
Z
951 for (p = pv; p != e; p++)
952 highhalf += !!(((U8)*p) & 0x80);
953 if (!highhalf)
954 goto plain_copy;
955 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
956 bufptr = PL_parser->bufptr;
957 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
958 SvCUR_set(PL_parser->linestr,
959 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
960 PL_parser->bufend += len+highhalf;
961 for (p = pv; p != e; p++) {
962 U8 c = (U8)*p;
963 if (c & 0x80) {
964 *bufptr++ = (char)(0xc0 | (c >> 6));
965 *bufptr++ = (char)(0x80 | (c & 0x3f));
966 } else {
967 *bufptr++ = (char)c;
968 }
969 }
970 }
971 } else {
972 if (flags & LEX_STUFF_UTF8) {
973 STRLEN highhalf = 0;
83aa740e 974 const char *p, *e = pv+len;
f0e67a1d
Z
975 for (p = pv; p != e; p++) {
976 U8 c = (U8)*p;
977 if (c >= 0xc4) {
978 Perl_croak(aTHX_ "Lexing code attempted to stuff "
979 "non-Latin-1 character into Latin-1 input");
980 } else if (c >= 0xc2 && p+1 != e &&
981 (((U8)p[1]) & 0xc0) == 0x80) {
982 p++;
983 highhalf++;
984 } else if (c >= 0x80) {
985 /* malformed UTF-8 */
986 ENTER;
987 SAVESPTR(PL_warnhook);
988 PL_warnhook = PERL_WARNHOOK_FATAL;
989 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
990 LEAVE;
991 }
992 }
993 if (!highhalf)
994 goto plain_copy;
995 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
996 bufptr = PL_parser->bufptr;
997 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
998 SvCUR_set(PL_parser->linestr,
999 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1000 PL_parser->bufend += len-highhalf;
1001 for (p = pv; p != e; p++) {
1002 U8 c = (U8)*p;
1003 if (c & 0x80) {
1004 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1005 p++;
1006 } else {
1007 *bufptr++ = (char)c;
1008 }
1009 }
1010 } else {
1011 plain_copy:
1012 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1013 bufptr = PL_parser->bufptr;
1014 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1015 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1016 PL_parser->bufend += len;
1017 Copy(pv, bufptr, len, char);
1018 }
1019 }
1020}
1021
1022/*
9dcc53ea
Z
1023=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1024
1025Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1026immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1027reallocating the buffer if necessary. This means that lexing code that
1028runs later will see the characters as if they had appeared in the input.
1029It is not recommended to do this as part of normal parsing, and most
1030uses of this facility run the risk of the inserted characters being
1031interpreted in an unintended manner.
1032
1033The string to be inserted is represented by octets starting at I<pv>
1034and continuing to the first nul. These octets are interpreted as either
1035UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1036in I<flags>. The characters are recoded for the lexer buffer, according
1037to how the buffer is currently being interpreted (L</lex_bufutf8>).
1038If it is not convenient to nul-terminate a string to be inserted, the
1039L</lex_stuff_pvn> function is more appropriate.
1040
1041=cut
1042*/
1043
1044void
1045Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1046{
1047 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1048 lex_stuff_pvn(pv, strlen(pv), flags);
1049}
1050
1051/*
f0e67a1d
Z
1052=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1053
1054Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1055immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1056reallocating the buffer if necessary. This means that lexing code that
1057runs later will see the characters as if they had appeared in the input.
1058It is not recommended to do this as part of normal parsing, and most
1059uses of this facility run the risk of the inserted characters being
1060interpreted in an unintended manner.
1061
1062The string to be inserted is the string value of I<sv>. The characters
1063are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1064being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1065not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1066need to construct a scalar.
1067
1068=cut
1069*/
1070
1071void
1072Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1073{
1074 char *pv;
1075 STRLEN len;
1076 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1077 if (flags)
1078 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1079 pv = SvPV(sv, len);
1080 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1081}
1082
1083/*
1084=for apidoc Amx|void|lex_unstuff|char *ptr
1085
1086Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1087I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1088This hides the discarded text from any lexing code that runs later,
1089as if the text had never appeared.
1090
1091This is not the normal way to consume lexed text. For that, use
1092L</lex_read_to>.
1093
1094=cut
1095*/
1096
1097void
1098Perl_lex_unstuff(pTHX_ char *ptr)
1099{
1100 char *buf, *bufend;
1101 STRLEN unstuff_len;
1102 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1103 buf = PL_parser->bufptr;
1104 if (ptr < buf)
1105 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1106 if (ptr == buf)
1107 return;
1108 bufend = PL_parser->bufend;
1109 if (ptr > bufend)
1110 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1111 unstuff_len = ptr - buf;
1112 Move(ptr, buf, bufend+1-ptr, char);
1113 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1114 PL_parser->bufend = bufend - unstuff_len;
1115}
1116
1117/*
1118=for apidoc Amx|void|lex_read_to|char *ptr
1119
1120Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1121to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1122performing the correct bookkeeping whenever a newline character is passed.
1123This is the normal way to consume lexed text.
1124
1125Interpretation of the buffer's octets can be abstracted out by
1126using the slightly higher-level functions L</lex_peek_unichar> and
1127L</lex_read_unichar>.
1128
1129=cut
1130*/
1131
1132void
1133Perl_lex_read_to(pTHX_ char *ptr)
1134{
1135 char *s;
1136 PERL_ARGS_ASSERT_LEX_READ_TO;
1137 s = PL_parser->bufptr;
1138 if (ptr < s || ptr > PL_parser->bufend)
1139 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1140 for (; s != ptr; s++)
1141 if (*s == '\n') {
1142 CopLINE_inc(PL_curcop);
1143 PL_parser->linestart = s+1;
1144 }
1145 PL_parser->bufptr = ptr;
1146}
1147
1148/*
1149=for apidoc Amx|void|lex_discard_to|char *ptr
1150
1151Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1152up to I<ptr>. The remaining content of the buffer will be moved, and
1153all pointers into the buffer updated appropriately. I<ptr> must not
1154be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1155it is not permitted to discard text that has yet to be lexed.
1156
1157Normally it is not necessarily to do this directly, because it suffices to
1158use the implicit discarding behaviour of L</lex_next_chunk> and things
1159based on it. However, if a token stretches across multiple lines,
1f317c95 1160and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1161that purpose, then after completion of the token it would be wise to
1162explicitly discard the now-unneeded earlier lines, to avoid future
1163multi-line tokens growing the buffer without bound.
1164
1165=cut
1166*/
1167
1168void
1169Perl_lex_discard_to(pTHX_ char *ptr)
1170{
1171 char *buf;
1172 STRLEN discard_len;
1173 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1174 buf = SvPVX(PL_parser->linestr);
1175 if (ptr < buf)
1176 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1177 if (ptr == buf)
1178 return;
1179 if (ptr > PL_parser->bufptr)
1180 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1181 discard_len = ptr - buf;
1182 if (PL_parser->oldbufptr < ptr)
1183 PL_parser->oldbufptr = ptr;
1184 if (PL_parser->oldoldbufptr < ptr)
1185 PL_parser->oldoldbufptr = ptr;
1186 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1187 PL_parser->last_uni = NULL;
1188 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1189 PL_parser->last_lop = NULL;
1190 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1191 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1192 PL_parser->bufend -= discard_len;
1193 PL_parser->bufptr -= discard_len;
1194 PL_parser->oldbufptr -= discard_len;
1195 PL_parser->oldoldbufptr -= discard_len;
1196 if (PL_parser->last_uni)
1197 PL_parser->last_uni -= discard_len;
1198 if (PL_parser->last_lop)
1199 PL_parser->last_lop -= discard_len;
1200}
1201
1202/*
1203=for apidoc Amx|bool|lex_next_chunk|U32 flags
1204
1205Reads in the next chunk of text to be lexed, appending it to
1206L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1207looked to the end of the current chunk and wants to know more. It is
1208usual, but not necessary, for lexing to have consumed the entirety of
1209the current chunk at this time.
1210
1211If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1212chunk (i.e., the current chunk has been entirely consumed), normally the
1213current chunk will be discarded at the same time that the new chunk is
1214read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1215will not be discarded. If the current chunk has not been entirely
1216consumed, then it will not be discarded regardless of the flag.
1217
1218Returns true if some new text was added to the buffer, or false if the
1219buffer has reached the end of the input text.
1220
1221=cut
1222*/
1223
1224#define LEX_FAKE_EOF 0x80000000
1225
1226bool
1227Perl_lex_next_chunk(pTHX_ U32 flags)
1228{
1229 SV *linestr;
1230 char *buf;
1231 STRLEN old_bufend_pos, new_bufend_pos;
1232 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1233 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1234 bool got_some_for_debugger = 0;
f0e67a1d
Z
1235 bool got_some;
1236 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1237 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1238 linestr = PL_parser->linestr;
1239 buf = SvPVX(linestr);
1240 if (!(flags & LEX_KEEP_PREVIOUS) &&
1241 PL_parser->bufptr == PL_parser->bufend) {
1242 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1243 linestart_pos = 0;
1244 if (PL_parser->last_uni != PL_parser->bufend)
1245 PL_parser->last_uni = NULL;
1246 if (PL_parser->last_lop != PL_parser->bufend)
1247 PL_parser->last_lop = NULL;
1248 last_uni_pos = last_lop_pos = 0;
1249 *buf = 0;
1250 SvCUR(linestr) = 0;
1251 } else {
1252 old_bufend_pos = PL_parser->bufend - buf;
1253 bufptr_pos = PL_parser->bufptr - buf;
1254 oldbufptr_pos = PL_parser->oldbufptr - buf;
1255 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1256 linestart_pos = PL_parser->linestart - buf;
1257 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1258 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1259 }
1260 if (flags & LEX_FAKE_EOF) {
1261 goto eof;
60d63348 1262 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1263 got_some = 0;
1264 } else if (filter_gets(linestr, old_bufend_pos)) {
1265 got_some = 1;
17cc9359 1266 got_some_for_debugger = 1;
f0e67a1d 1267 } else {
580561a3
Z
1268 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1269 sv_setpvs(linestr, "");
f0e67a1d
Z
1270 eof:
1271 /* End of real input. Close filehandle (unless it was STDIN),
1272 * then add implicit termination.
1273 */
1274 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1275 PerlIO_clearerr(PL_parser->rsfp);
1276 else if (PL_parser->rsfp)
1277 (void)PerlIO_close(PL_parser->rsfp);
1278 PL_parser->rsfp = NULL;
60d63348 1279 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1280#ifdef PERL_MAD
1281 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1282 PL_faketokens = 1;
1283#endif
1284 if (!PL_in_eval && PL_minus_p) {
1285 sv_catpvs(linestr,
1286 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1287 PL_minus_n = PL_minus_p = 0;
1288 } else if (!PL_in_eval && PL_minus_n) {
1289 sv_catpvs(linestr, /*{*/";}");
1290 PL_minus_n = 0;
1291 } else
1292 sv_catpvs(linestr, ";");
1293 got_some = 1;
1294 }
1295 buf = SvPVX(linestr);
1296 new_bufend_pos = SvCUR(linestr);
1297 PL_parser->bufend = buf + new_bufend_pos;
1298 PL_parser->bufptr = buf + bufptr_pos;
1299 PL_parser->oldbufptr = buf + oldbufptr_pos;
1300 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1301 PL_parser->linestart = buf + linestart_pos;
1302 if (PL_parser->last_uni)
1303 PL_parser->last_uni = buf + last_uni_pos;
1304 if (PL_parser->last_lop)
1305 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1306 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1307 PL_curstash != PL_debstash) {
1308 /* debugger active and we're not compiling the debugger code,
1309 * so store the line into the debugger's array of lines
1310 */
1311 update_debugger_info(NULL, buf+old_bufend_pos,
1312 new_bufend_pos-old_bufend_pos);
1313 }
1314 return got_some;
1315}
1316
1317/*
1318=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1319
1320Looks ahead one (Unicode) character in the text currently being lexed.
1321Returns the codepoint (unsigned integer value) of the next character,
1322or -1 if lexing has reached the end of the input text. To consume the
1323peeked character, use L</lex_read_unichar>.
1324
1325If the next character is in (or extends into) the next chunk of input
1326text, the next chunk will be read in. Normally the current chunk will be
1327discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1328then the current chunk will not be discarded.
1329
1330If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1331is encountered, an exception is generated.
1332
1333=cut
1334*/
1335
1336I32
1337Perl_lex_peek_unichar(pTHX_ U32 flags)
1338{
749123ff 1339 dVAR;
f0e67a1d
Z
1340 char *s, *bufend;
1341 if (flags & ~(LEX_KEEP_PREVIOUS))
1342 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1343 s = PL_parser->bufptr;
1344 bufend = PL_parser->bufend;
1345 if (UTF) {
1346 U8 head;
1347 I32 unichar;
1348 STRLEN len, retlen;
1349 if (s == bufend) {
1350 if (!lex_next_chunk(flags))
1351 return -1;
1352 s = PL_parser->bufptr;
1353 bufend = PL_parser->bufend;
1354 }
1355 head = (U8)*s;
1356 if (!(head & 0x80))
1357 return head;
1358 if (head & 0x40) {
1359 len = PL_utf8skip[head];
1360 while ((STRLEN)(bufend-s) < len) {
1361 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1362 break;
1363 s = PL_parser->bufptr;
1364 bufend = PL_parser->bufend;
1365 }
1366 }
1367 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1368 if (retlen == (STRLEN)-1) {
1369 /* malformed UTF-8 */
1370 ENTER;
1371 SAVESPTR(PL_warnhook);
1372 PL_warnhook = PERL_WARNHOOK_FATAL;
1373 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1374 LEAVE;
1375 }
1376 return unichar;
1377 } else {
1378 if (s == bufend) {
1379 if (!lex_next_chunk(flags))
1380 return -1;
1381 s = PL_parser->bufptr;
1382 }
1383 return (U8)*s;
1384 }
1385}
1386
1387/*
1388=for apidoc Amx|I32|lex_read_unichar|U32 flags
1389
1390Reads the next (Unicode) character in the text currently being lexed.
1391Returns the codepoint (unsigned integer value) of the character read,
1392and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1393if lexing has reached the end of the input text. To non-destructively
1394examine the next character, use L</lex_peek_unichar> instead.
1395
1396If the next character is in (or extends into) the next chunk of input
1397text, the next chunk will be read in. Normally the current chunk will be
1398discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1399then the current chunk will not be discarded.
1400
1401If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1402is encountered, an exception is generated.
1403
1404=cut
1405*/
1406
1407I32
1408Perl_lex_read_unichar(pTHX_ U32 flags)
1409{
1410 I32 c;
1411 if (flags & ~(LEX_KEEP_PREVIOUS))
1412 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1413 c = lex_peek_unichar(flags);
1414 if (c != -1) {
1415 if (c == '\n')
1416 CopLINE_inc(PL_curcop);
d9018cbe
EB
1417 if (UTF)
1418 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1419 else
1420 ++(PL_parser->bufptr);
f0e67a1d
Z
1421 }
1422 return c;
1423}
1424
1425/*
1426=for apidoc Amx|void|lex_read_space|U32 flags
1427
1428Reads optional spaces, in Perl style, in the text currently being
1429lexed. The spaces may include ordinary whitespace characters and
1430Perl-style comments. C<#line> directives are processed if encountered.
1431L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1432at a non-space character (or the end of the input text).
1433
1434If spaces extend into the next chunk of input text, the next chunk will
1435be read in. Normally the current chunk will be discarded at the same
1436time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1437chunk will not be discarded.
1438
1439=cut
1440*/
1441
f0998909
Z
1442#define LEX_NO_NEXT_CHUNK 0x80000000
1443
f0e67a1d
Z
1444void
1445Perl_lex_read_space(pTHX_ U32 flags)
1446{
1447 char *s, *bufend;
1448 bool need_incline = 0;
f0998909 1449 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1450 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1451#ifdef PERL_MAD
1452 if (PL_skipwhite) {
1453 sv_free(PL_skipwhite);
1454 PL_skipwhite = NULL;
1455 }
1456 if (PL_madskills)
1457 PL_skipwhite = newSVpvs("");
1458#endif /* PERL_MAD */
1459 s = PL_parser->bufptr;
1460 bufend = PL_parser->bufend;
1461 while (1) {
1462 char c = *s;
1463 if (c == '#') {
1464 do {
1465 c = *++s;
1466 } while (!(c == '\n' || (c == 0 && s == bufend)));
1467 } else if (c == '\n') {
1468 s++;
1469 PL_parser->linestart = s;
1470 if (s == bufend)
1471 need_incline = 1;
1472 else
1473 incline(s);
1474 } else if (isSPACE(c)) {
1475 s++;
1476 } else if (c == 0 && s == bufend) {
1477 bool got_more;
1478#ifdef PERL_MAD
1479 if (PL_madskills)
1480 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1481#endif /* PERL_MAD */
f0998909
Z
1482 if (flags & LEX_NO_NEXT_CHUNK)
1483 break;
f0e67a1d
Z
1484 PL_parser->bufptr = s;
1485 CopLINE_inc(PL_curcop);
1486 got_more = lex_next_chunk(flags);
1487 CopLINE_dec(PL_curcop);
1488 s = PL_parser->bufptr;
1489 bufend = PL_parser->bufend;
1490 if (!got_more)
1491 break;
1492 if (need_incline && PL_parser->rsfp) {
1493 incline(s);
1494 need_incline = 0;
1495 }
1496 } else {
1497 break;
1498 }
1499 }
1500#ifdef PERL_MAD
1501 if (PL_madskills)
1502 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1503#endif /* PERL_MAD */
1504 PL_parser->bufptr = s;
1505}
1506
1507/*
ffb4593c
NT
1508 * S_incline
1509 * This subroutine has nothing to do with tilting, whether at windmills
1510 * or pinball tables. Its name is short for "increment line". It
57843af0 1511 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1512 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1513 * # line 500 "foo.pm"
1514 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1515 */
1516
76e3520e 1517STATIC void
d9095cec 1518S_incline(pTHX_ const char *s)
463ee0b2 1519{
97aff369 1520 dVAR;
d9095cec
NC
1521 const char *t;
1522 const char *n;
1523 const char *e;
8818d409 1524 line_t line_num;
463ee0b2 1525
7918f24d
NC
1526 PERL_ARGS_ASSERT_INCLINE;
1527
57843af0 1528 CopLINE_inc(PL_curcop);
463ee0b2
LW
1529 if (*s++ != '#')
1530 return;
d4c19fe8
AL
1531 while (SPACE_OR_TAB(*s))
1532 s++;
73659bf1
GS
1533 if (strnEQ(s, "line", 4))
1534 s += 4;
1535 else
1536 return;
084592ab 1537 if (SPACE_OR_TAB(*s))
73659bf1 1538 s++;
4e553d73 1539 else
73659bf1 1540 return;
d4c19fe8
AL
1541 while (SPACE_OR_TAB(*s))
1542 s++;
463ee0b2
LW
1543 if (!isDIGIT(*s))
1544 return;
d4c19fe8 1545
463ee0b2
LW
1546 n = s;
1547 while (isDIGIT(*s))
1548 s++;
07714eb4 1549 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1550 return;
bf4acbe4 1551 while (SPACE_OR_TAB(*s))
463ee0b2 1552 s++;
73659bf1 1553 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1554 s++;
73659bf1
GS
1555 e = t + 1;
1556 }
463ee0b2 1557 else {
c35e046a
AL
1558 t = s;
1559 while (!isSPACE(*t))
1560 t++;
73659bf1 1561 e = t;
463ee0b2 1562 }
bf4acbe4 1563 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1564 e++;
1565 if (*e != '\n' && *e != '\0')
1566 return; /* false alarm */
1567
8818d409
FC
1568 line_num = atoi(n)-1;
1569
f4dd75d9 1570 if (t - s > 0) {
d9095cec 1571 const STRLEN len = t - s;
19bad673
NC
1572 SV *const temp_sv = CopFILESV(PL_curcop);
1573 const char *cf;
1574 STRLEN tmplen;
1575
1576 if (temp_sv) {
1577 cf = SvPVX(temp_sv);
1578 tmplen = SvCUR(temp_sv);
1579 } else {
1580 cf = NULL;
1581 tmplen = 0;
1582 }
1583
d1299d44 1584 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1585 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1586 * to *{"::_<newfilename"} */
44867030
NC
1587 /* However, the long form of evals is only turned on by the
1588 debugger - usually they're "(eval %lu)" */
1589 char smallbuf[128];
1590 char *tmpbuf;
1591 GV **gvp;
d9095cec 1592 STRLEN tmplen2 = len;
798b63bc 1593 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1594 tmpbuf = smallbuf;
1595 else
2ae0db35 1596 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1597 tmpbuf[0] = '_';
1598 tmpbuf[1] = '<';
2ae0db35 1599 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1600 tmplen += 2;
8a5ee598
RGS
1601 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1602 if (gvp) {
44867030
NC
1603 char *tmpbuf2;
1604 GV *gv2;
1605
1606 if (tmplen2 + 2 <= sizeof smallbuf)
1607 tmpbuf2 = smallbuf;
1608 else
1609 Newx(tmpbuf2, tmplen2 + 2, char);
1610
1611 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1612 /* Either they malloc'd it, or we malloc'd it,
1613 so no prefix is present in ours. */
1614 tmpbuf2[0] = '_';
1615 tmpbuf2[1] = '<';
1616 }
1617
1618 memcpy(tmpbuf2 + 2, s, tmplen2);
1619 tmplen2 += 2;
1620
8a5ee598 1621 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1622 if (!isGV(gv2)) {
8a5ee598 1623 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1624 /* adjust ${"::_<newfilename"} to store the new file name */
1625 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1626 /* The line number may differ. If that is the case,
1627 alias the saved lines that are in the array.
1628 Otherwise alias the whole array. */
1629 if (CopLINE(PL_curcop) == line_num) {
1630 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1631 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1632 }
1633 else if (GvAV(*gvp)) {
1634 AV * const av = GvAV(*gvp);
1635 const I32 start = CopLINE(PL_curcop)+1;
1636 I32 items = AvFILLp(av) - start;
1637 if (items > 0) {
1638 AV * const av2 = GvAVn(gv2);
1639 SV **svp = AvARRAY(av) + start;
1640 I32 l = (I32)line_num+1;
1641 while (items--)
1642 av_store(av2, l++, SvREFCNT_inc(*svp++));
1643 }
1644 }
e5527e4b 1645 }
44867030
NC
1646
1647 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1648 }
e66cf94c 1649 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1650 }
05ec9bb3 1651 CopFILE_free(PL_curcop);
d9095cec 1652 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1653 }
8818d409 1654 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1655}
1656
29595ff2 1657#ifdef PERL_MAD
cd81e915 1658/* skip space before PL_thistoken */
29595ff2
NC
1659
1660STATIC char *
1661S_skipspace0(pTHX_ register char *s)
1662{
7918f24d
NC
1663 PERL_ARGS_ASSERT_SKIPSPACE0;
1664
29595ff2
NC
1665 s = skipspace(s);
1666 if (!PL_madskills)
1667 return s;
cd81e915
NC
1668 if (PL_skipwhite) {
1669 if (!PL_thiswhite)
6b29d1f5 1670 PL_thiswhite = newSVpvs("");
cd81e915
NC
1671 sv_catsv(PL_thiswhite, PL_skipwhite);
1672 sv_free(PL_skipwhite);
1673 PL_skipwhite = 0;
1674 }
1675 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1676 return s;
1677}
1678
cd81e915 1679/* skip space after PL_thistoken */
29595ff2
NC
1680
1681STATIC char *
1682S_skipspace1(pTHX_ register char *s)
1683{
d4c19fe8 1684 const char *start = s;
29595ff2
NC
1685 I32 startoff = start - SvPVX(PL_linestr);
1686
7918f24d
NC
1687 PERL_ARGS_ASSERT_SKIPSPACE1;
1688
29595ff2
NC
1689 s = skipspace(s);
1690 if (!PL_madskills)
1691 return s;
1692 start = SvPVX(PL_linestr) + startoff;
cd81e915 1693 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1694 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1695 PL_thistoken = newSVpvn(tstart, start - tstart);
1696 }
1697 PL_realtokenstart = -1;
1698 if (PL_skipwhite) {
1699 if (!PL_nextwhite)
6b29d1f5 1700 PL_nextwhite = newSVpvs("");
cd81e915
NC
1701 sv_catsv(PL_nextwhite, PL_skipwhite);
1702 sv_free(PL_skipwhite);
1703 PL_skipwhite = 0;
29595ff2
NC
1704 }
1705 return s;
1706}
1707
1708STATIC char *
1709S_skipspace2(pTHX_ register char *s, SV **svp)
1710{
c35e046a
AL
1711 char *start;
1712 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1713 const I32 startoff = s - SvPVX(PL_linestr);
1714
7918f24d
NC
1715 PERL_ARGS_ASSERT_SKIPSPACE2;
1716
29595ff2
NC
1717 s = skipspace(s);
1718 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1719 if (!PL_madskills || !svp)
1720 return s;
1721 start = SvPVX(PL_linestr) + startoff;
cd81e915 1722 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1723 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1724 PL_thistoken = newSVpvn(tstart, start - tstart);
1725 PL_realtokenstart = -1;
29595ff2 1726 }
cd81e915 1727 if (PL_skipwhite) {
29595ff2 1728 if (!*svp)
6b29d1f5 1729 *svp = newSVpvs("");
cd81e915
NC
1730 sv_setsv(*svp, PL_skipwhite);
1731 sv_free(PL_skipwhite);
1732 PL_skipwhite = 0;
29595ff2
NC
1733 }
1734
1735 return s;
1736}
1737#endif
1738
80a702cd 1739STATIC void
15f169a1 1740S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1741{
1742 AV *av = CopFILEAVx(PL_curcop);
1743 if (av) {
b9f83d2f 1744 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1745 if (orig_sv)
1746 sv_setsv(sv, orig_sv);
1747 else
1748 sv_setpvn(sv, buf, len);
80a702cd
RGS
1749 (void)SvIOK_on(sv);
1750 SvIV_set(sv, 0);
1751 av_store(av, (I32)CopLINE(PL_curcop), sv);
1752 }
1753}
1754
ffb4593c
NT
1755/*
1756 * S_skipspace
1757 * Called to gobble the appropriate amount and type of whitespace.
1758 * Skips comments as well.
1759 */
1760
76e3520e 1761STATIC char *
cea2e8a9 1762S_skipspace(pTHX_ register char *s)
a687059c 1763{
5db06880 1764#ifdef PERL_MAD
f0e67a1d
Z
1765 char *start = s;
1766#endif /* PERL_MAD */
7918f24d 1767 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1768#ifdef PERL_MAD
cd81e915
NC
1769 if (PL_skipwhite) {
1770 sv_free(PL_skipwhite);
f0e67a1d 1771 PL_skipwhite = NULL;
5db06880 1772 }
f0e67a1d 1773#endif /* PERL_MAD */
3280af22 1774 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1775 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1776 s++;
f0e67a1d
Z
1777 } else {
1778 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1779 PL_bufptr = s;
f0998909
Z
1780 lex_read_space(LEX_KEEP_PREVIOUS |
1781 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1782 LEX_NO_NEXT_CHUNK : 0));
3280af22 1783 s = PL_bufptr;
f0e67a1d
Z
1784 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1785 if (PL_linestart > PL_bufptr)
1786 PL_bufptr = PL_linestart;
1787 return s;
463ee0b2 1788 }
5db06880 1789#ifdef PERL_MAD
f0e67a1d
Z
1790 if (PL_madskills)
1791 PL_skipwhite = newSVpvn(start, s-start);
1792#endif /* PERL_MAD */
5db06880 1793 return s;
a687059c 1794}
378cc40b 1795
ffb4593c
NT
1796/*
1797 * S_check_uni
1798 * Check the unary operators to ensure there's no ambiguity in how they're
1799 * used. An ambiguous piece of code would be:
1800 * rand + 5
1801 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1802 * the +5 is its argument.
1803 */
1804
76e3520e 1805STATIC void
cea2e8a9 1806S_check_uni(pTHX)
ba106d47 1807{
97aff369 1808 dVAR;
d4c19fe8
AL
1809 const char *s;
1810 const char *t;
2f3197b3 1811
3280af22 1812 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1813 return;
3280af22
NIS
1814 while (isSPACE(*PL_last_uni))
1815 PL_last_uni++;
c35e046a
AL
1816 s = PL_last_uni;
1817 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1818 s++;
3280af22 1819 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1820 return;
6136c704 1821
9b387841
NC
1822 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1823 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1824 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1825}
1826
ffb4593c
NT
1827/*
1828 * LOP : macro to build a list operator. Its behaviour has been replaced
1829 * with a subroutine, S_lop() for which LOP is just another name.
1830 */
1831
a0d0e21e
LW
1832#define LOP(f,x) return lop(f,x,s)
1833
ffb4593c
NT
1834/*
1835 * S_lop
1836 * Build a list operator (or something that might be one). The rules:
1837 * - if we have a next token, then it's a list operator [why?]
1838 * - if the next thing is an opening paren, then it's a function
1839 * - else it's a list operator
1840 */
1841
76e3520e 1842STATIC I32
a0be28da 1843S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1844{
97aff369 1845 dVAR;
7918f24d
NC
1846
1847 PERL_ARGS_ASSERT_LOP;
1848
6154021b 1849 pl_yylval.ival = f;
35c8bce7 1850 CLINE;
3280af22
NIS
1851 PL_expect = x;
1852 PL_bufptr = s;
1853 PL_last_lop = PL_oldbufptr;
eb160463 1854 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1855#ifdef PERL_MAD
1856 if (PL_lasttoke)
78cdf107 1857 goto lstop;
5db06880 1858#else
3280af22 1859 if (PL_nexttoke)
78cdf107 1860 goto lstop;
5db06880 1861#endif
79072805 1862 if (*s == '(')
bbf60fe6 1863 return REPORT(FUNC);
29595ff2 1864 s = PEEKSPACE(s);
79072805 1865 if (*s == '(')
bbf60fe6 1866 return REPORT(FUNC);
78cdf107
Z
1867 else {
1868 lstop:
1869 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1870 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1871 return REPORT(LSTOP);
78cdf107 1872 }
79072805
LW
1873}
1874
5db06880
NC
1875#ifdef PERL_MAD
1876 /*
1877 * S_start_force
1878 * Sets up for an eventual force_next(). start_force(0) basically does
1879 * an unshift, while start_force(-1) does a push. yylex removes items
1880 * on the "pop" end.
1881 */
1882
1883STATIC void
1884S_start_force(pTHX_ int where)
1885{
1886 int i;
1887
cd81e915 1888 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1889 where = PL_lasttoke;
cd81e915
NC
1890 assert(PL_curforce < 0 || PL_curforce == where);
1891 if (PL_curforce != where) {
5db06880
NC
1892 for (i = PL_lasttoke; i > where; --i) {
1893 PL_nexttoke[i] = PL_nexttoke[i-1];
1894 }
1895 PL_lasttoke++;
1896 }
cd81e915 1897 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1898 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1899 PL_curforce = where;
1900 if (PL_nextwhite) {
5db06880 1901 if (PL_madskills)
6b29d1f5 1902 curmad('^', newSVpvs(""));
cd81e915 1903 CURMAD('_', PL_nextwhite);
5db06880
NC
1904 }
1905}
1906
1907STATIC void
1908S_curmad(pTHX_ char slot, SV *sv)
1909{
1910 MADPROP **where;
1911
1912 if (!sv)
1913 return;
cd81e915
NC
1914 if (PL_curforce < 0)
1915 where = &PL_thismad;
5db06880 1916 else
cd81e915 1917 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1918
cd81e915 1919 if (PL_faketokens)
76f68e9b 1920 sv_setpvs(sv, "");
5db06880
NC
1921 else {
1922 if (!IN_BYTES) {
1923 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1924 SvUTF8_on(sv);
1925 else if (PL_encoding) {
1926 sv_recode_to_utf8(sv, PL_encoding);
1927 }
1928 }
1929 }
1930
1931 /* keep a slot open for the head of the list? */
1932 if (slot != '_' && *where && (*where)->mad_key == '^') {
1933 (*where)->mad_key = slot;
daba3364 1934 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1935 (*where)->mad_val = (void*)sv;
1936 }
1937 else
1938 addmad(newMADsv(slot, sv), where, 0);
1939}
1940#else
b3f24c00
MHM
1941# define start_force(where) NOOP
1942# define curmad(slot, sv) NOOP
5db06880
NC
1943#endif
1944
ffb4593c
NT
1945/*
1946 * S_force_next
9cbb5ea2 1947 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1948 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1949 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1950 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1951 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1952 */
1953
4e553d73 1954STATIC void
cea2e8a9 1955S_force_next(pTHX_ I32 type)
79072805 1956{
97aff369 1957 dVAR;
704d4215
GG
1958#ifdef DEBUGGING
1959 if (DEBUG_T_TEST) {
1960 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1961 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1962 }
1963#endif
5db06880 1964#ifdef PERL_MAD
cd81e915 1965 if (PL_curforce < 0)
5db06880 1966 start_force(PL_lasttoke);
cd81e915 1967 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1968 if (PL_lex_state != LEX_KNOWNEXT)
1969 PL_lex_defer = PL_lex_state;
1970 PL_lex_state = LEX_KNOWNEXT;
1971 PL_lex_expect = PL_expect;
cd81e915 1972 PL_curforce = -1;
5db06880 1973#else
3280af22
NIS
1974 PL_nexttype[PL_nexttoke] = type;
1975 PL_nexttoke++;
1976 if (PL_lex_state != LEX_KNOWNEXT) {
1977 PL_lex_defer = PL_lex_state;
1978 PL_lex_expect = PL_expect;
1979 PL_lex_state = LEX_KNOWNEXT;
79072805 1980 }
5db06880 1981#endif
79072805
LW
1982}
1983
28ac2b49
Z
1984void
1985Perl_yyunlex(pTHX)
1986{
a7aaec61
Z
1987 int yyc = PL_parser->yychar;
1988 if (yyc != YYEMPTY) {
1989 if (yyc) {
1990 start_force(-1);
1991 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1992 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1993 PL_lex_allbrackets--;
a7aaec61 1994 PL_lex_brackets--;
78cdf107
Z
1995 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1996 } else if (yyc == '('/*)*/) {
1997 PL_lex_allbrackets--;
1998 yyc |= (2<<24);
a7aaec61
Z
1999 }
2000 force_next(yyc);
2001 }
28ac2b49
Z
2002 PL_parser->yychar = YYEMPTY;
2003 }
2004}
2005
d0a148a6 2006STATIC SV *
15f169a1 2007S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2008{
97aff369 2009 dVAR;
740cce10 2010 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2011 !IN_BYTES
2012 && UTF
2013 && !is_ascii_string((const U8*)start, len)
740cce10 2014 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2015 return sv;
2016}
2017
ffb4593c
NT
2018/*
2019 * S_force_word
2020 * When the lexer knows the next thing is a word (for instance, it has
2021 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2022 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2023 * lookahead.
ffb4593c
NT
2024 *
2025 * Arguments:
b1b65b59 2026 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2027 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2028 * int check_keyword : if true, Perl checks to make sure the word isn't
2029 * a keyword (do this if the word is a label, e.g. goto FOO)
2030 * int allow_pack : if true, : characters will also be allowed (require,
2031 * use, etc. do this)
9cbb5ea2 2032 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2033 */
2034
76e3520e 2035STATIC char *
cea2e8a9 2036S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2037{
97aff369 2038 dVAR;
463ee0b2
LW
2039 register char *s;
2040 STRLEN len;
4e553d73 2041
7918f24d
NC
2042 PERL_ARGS_ASSERT_FORCE_WORD;
2043
29595ff2 2044 start = SKIPSPACE1(start);
463ee0b2 2045 s = start;
7e2040f0 2046 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2047 (allow_pack && *s == ':') ||
15f0808c 2048 (allow_initial_tick && *s == '\'') )
a0d0e21e 2049 {
3280af22 2050 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2051 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2052 return start;
cd81e915 2053 start_force(PL_curforce);
5db06880
NC
2054 if (PL_madskills)
2055 curmad('X', newSVpvn(start,s-start));
463ee0b2 2056 if (token == METHOD) {
29595ff2 2057 s = SKIPSPACE1(s);
463ee0b2 2058 if (*s == '(')
3280af22 2059 PL_expect = XTERM;
463ee0b2 2060 else {
3280af22 2061 PL_expect = XOPERATOR;
463ee0b2 2062 }
79072805 2063 }
e74e6b3d 2064 if (PL_madskills)
63575281 2065 curmad('g', newSVpvs( "forced" ));
9ded7720 2066 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2067 = (OP*)newSVOP(OP_CONST,0,
2068 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2069 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2070 force_next(token);
2071 }
2072 return s;
2073}
2074
ffb4593c
NT
2075/*
2076 * S_force_ident
9cbb5ea2 2077 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2078 * text only contains the "foo" portion. The first argument is a pointer
2079 * to the "foo", and the second argument is the type symbol to prefix.
2080 * Forces the next token to be a "WORD".
9cbb5ea2 2081 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2082 */
2083
76e3520e 2084STATIC void
bfed75c6 2085S_force_ident(pTHX_ register const char *s, int kind)
79072805 2086{
97aff369 2087 dVAR;
7918f24d
NC
2088
2089 PERL_ARGS_ASSERT_FORCE_IDENT;
2090
c35e046a 2091 if (*s) {
90e5519e 2092 const STRLEN len = strlen(s);
728847b1
BF
2093 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2094 UTF ? SVf_UTF8 : 0));
cd81e915 2095 start_force(PL_curforce);
9ded7720 2096 NEXTVAL_NEXTTOKE.opval = o;
79072805 2097 force_next(WORD);
748a9306 2098 if (kind) {
11343788 2099 o->op_private = OPpCONST_ENTERED;
55497cff 2100 /* XXX see note in pp_entereval() for why we forgo typo
2101 warnings if the symbol must be introduced in an eval.
2102 GSAR 96-10-12 */
90e5519e 2103 gv_fetchpvn_flags(s, len,
728847b1
BF
2104 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2105 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2106 kind == '$' ? SVt_PV :
2107 kind == '@' ? SVt_PVAV :
2108 kind == '%' ? SVt_PVHV :
a0d0e21e 2109 SVt_PVGV
90e5519e 2110 );
748a9306 2111 }
79072805
LW
2112 }
2113}
2114
1571675a
GS
2115NV
2116Perl_str_to_version(pTHX_ SV *sv)
2117{
2118 NV retval = 0.0;
2119 NV nshift = 1.0;
2120 STRLEN len;
cfd0369c 2121 const char *start = SvPV_const(sv,len);
9d4ba2ae 2122 const char * const end = start + len;
504618e9 2123 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2124
2125 PERL_ARGS_ASSERT_STR_TO_VERSION;
2126
1571675a 2127 while (start < end) {
ba210ebe 2128 STRLEN skip;
1571675a
GS
2129 UV n;
2130 if (utf)
9041c2e3 2131 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2132 else {
2133 n = *(U8*)start;
2134 skip = 1;
2135 }
2136 retval += ((NV)n)/nshift;
2137 start += skip;
2138 nshift *= 1000;
2139 }
2140 return retval;
2141}
2142
4e553d73 2143/*
ffb4593c
NT
2144 * S_force_version
2145 * Forces the next token to be a version number.
e759cc13
RGS
2146 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2147 * and if "guessing" is TRUE, then no new token is created (and the caller
2148 * must use an alternative parsing method).
ffb4593c
NT
2149 */
2150
76e3520e 2151STATIC char *
e759cc13 2152S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2153{
97aff369 2154 dVAR;
5f66b61c 2155 OP *version = NULL;
44dcb63b 2156 char *d;
5db06880
NC
2157#ifdef PERL_MAD
2158 I32 startoff = s - SvPVX(PL_linestr);
2159#endif
89bfa8cd 2160
7918f24d
NC
2161 PERL_ARGS_ASSERT_FORCE_VERSION;
2162
29595ff2 2163 s = SKIPSPACE1(s);
89bfa8cd 2164
44dcb63b 2165 d = s;
dd629d5b 2166 if (*d == 'v')
44dcb63b 2167 d++;
44dcb63b 2168 if (isDIGIT(*d)) {
e759cc13
RGS
2169 while (isDIGIT(*d) || *d == '_' || *d == '.')
2170 d++;
5db06880
NC
2171#ifdef PERL_MAD
2172 if (PL_madskills) {
cd81e915 2173 start_force(PL_curforce);
5db06880
NC
2174 curmad('X', newSVpvn(s,d-s));
2175 }
2176#endif
4e4da3ac 2177 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2178 SV *ver;
8d08d9ba
DG
2179#ifdef USE_LOCALE_NUMERIC
2180 char *loc = setlocale(LC_NUMERIC, "C");
2181#endif
6154021b 2182 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2183#ifdef USE_LOCALE_NUMERIC
2184 setlocale(LC_NUMERIC, loc);
2185#endif
6154021b 2186 version = pl_yylval.opval;
dd629d5b
GS
2187 ver = cSVOPx(version)->op_sv;
2188 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2189 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2190 SvNV_set(ver, str_to_version(ver));
1571675a 2191 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2192 }
89bfa8cd 2193 }
5db06880
NC
2194 else if (guessing) {
2195#ifdef PERL_MAD
2196 if (PL_madskills) {
cd81e915
NC
2197 sv_free(PL_nextwhite); /* let next token collect whitespace */
2198 PL_nextwhite = 0;
5db06880
NC
2199 s = SvPVX(PL_linestr) + startoff;
2200 }
2201#endif
e759cc13 2202 return s;
5db06880 2203 }
89bfa8cd 2204 }
2205
5db06880
NC
2206#ifdef PERL_MAD
2207 if (PL_madskills && !version) {
cd81e915
NC
2208 sv_free(PL_nextwhite); /* let next token collect whitespace */
2209 PL_nextwhite = 0;
5db06880
NC
2210 s = SvPVX(PL_linestr) + startoff;
2211 }
2212#endif
89bfa8cd 2213 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2214 start_force(PL_curforce);
9ded7720 2215 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2216 force_next(WORD);
89bfa8cd 2217
e759cc13 2218 return s;
89bfa8cd 2219}
2220
ffb4593c 2221/*
91152fc1
DG
2222 * S_force_strict_version
2223 * Forces the next token to be a version number using strict syntax rules.
2224 */
2225
2226STATIC char *
2227S_force_strict_version(pTHX_ char *s)
2228{
2229 dVAR;
2230 OP *version = NULL;
2231#ifdef PERL_MAD
2232 I32 startoff = s - SvPVX(PL_linestr);
2233#endif
2234 const char *errstr = NULL;
2235
2236 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2237
2238 while (isSPACE(*s)) /* leading whitespace */
2239 s++;
2240
2241 if (is_STRICT_VERSION(s,&errstr)) {
2242 SV *ver = newSV(0);
2243 s = (char *)scan_version(s, ver, 0);
2244 version = newSVOP(OP_CONST, 0, ver);
2245 }
4e4da3ac
Z
2246 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2247 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2248 {
91152fc1
DG
2249 PL_bufptr = s;
2250 if (errstr)
2251 yyerror(errstr); /* version required */
2252 return s;
2253 }
2254
2255#ifdef PERL_MAD
2256 if (PL_madskills && !version) {
2257 sv_free(PL_nextwhite); /* let next token collect whitespace */
2258 PL_nextwhite = 0;
2259 s = SvPVX(PL_linestr) + startoff;
2260 }
2261#endif
2262 /* NOTE: The parser sees the package name and the VERSION swapped */
2263 start_force(PL_curforce);
2264 NEXTVAL_NEXTTOKE.opval = version;
2265 force_next(WORD);
2266
2267 return s;
2268}
2269
2270/*
ffb4593c
NT
2271 * S_tokeq
2272 * Tokenize a quoted string passed in as an SV. It finds the next
2273 * chunk, up to end of string or a backslash. It may make a new
2274 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2275 * turns \\ into \.
2276 */
2277
76e3520e 2278STATIC SV *
cea2e8a9 2279S_tokeq(pTHX_ SV *sv)
79072805 2280{
97aff369 2281 dVAR;
79072805
LW
2282 register char *s;
2283 register char *send;
2284 register char *d;
b3ac6de7
IZ
2285 STRLEN len = 0;
2286 SV *pv = sv;
79072805 2287
7918f24d
NC
2288 PERL_ARGS_ASSERT_TOKEQ;
2289
79072805 2290 if (!SvLEN(sv))
b3ac6de7 2291 goto finish;
79072805 2292
a0d0e21e 2293 s = SvPV_force(sv, len);
21a311ee 2294 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2295 goto finish;
463ee0b2 2296 send = s + len;
dcb21ed6
NC
2297 /* This is relying on the SV being "well formed" with a trailing '\0' */
2298 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2299 s++;
2300 if (s == send)
b3ac6de7 2301 goto finish;
79072805 2302 d = s;
be4731d2 2303 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2304 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2305 }
79072805
LW
2306 while (s < send) {
2307 if (*s == '\\') {
a0d0e21e 2308 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2309 s++; /* all that, just for this */
2310 }
2311 *d++ = *s++;
2312 }
2313 *d = '\0';
95a20fc0 2314 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2315 finish:
3280af22 2316 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2317 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2318 return sv;
2319}
2320
ffb4593c
NT
2321/*
2322 * Now come three functions related to double-quote context,
2323 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2324 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2325 * interact with PL_lex_state, and create fake ( ... ) argument lists
2326 * to handle functions and concatenation.
2327 * They assume that whoever calls them will be setting up a fake
2328 * join call, because each subthing puts a ',' after it. This lets
2329 * "lower \luPpEr"
2330 * become
2331 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2332 *
2333 * (I'm not sure whether the spurious commas at the end of lcfirst's
2334 * arguments and join's arguments are created or not).
2335 */
2336
2337/*
2338 * S_sublex_start
6154021b 2339 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2340 *
2341 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2342 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2343 *
2344 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2345 *
2346 * Everything else becomes a FUNC.
2347 *
2348 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2349 * had an OP_CONST or OP_READLINE). This just sets us up for a
2350 * call to S_sublex_push().
2351 */
2352
76e3520e 2353STATIC I32
cea2e8a9 2354S_sublex_start(pTHX)
79072805 2355{
97aff369 2356 dVAR;
6154021b 2357 register const I32 op_type = pl_yylval.ival;
79072805
LW
2358
2359 if (op_type == OP_NULL) {
6154021b 2360 pl_yylval.opval = PL_lex_op;
5f66b61c 2361 PL_lex_op = NULL;
79072805
LW
2362 return THING;
2363 }
2364 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2365 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2366
2367 if (SvTYPE(sv) == SVt_PVIV) {
2368 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2369 STRLEN len;
96a5add6 2370 const char * const p = SvPV_const(sv, len);
740cce10 2371 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2372 SvREFCNT_dec(sv);
2373 sv = nsv;
4e553d73 2374 }
6154021b 2375 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2376 PL_lex_stuff = NULL;
6f33ba73
RGS
2377 /* Allow <FH> // "foo" */
2378 if (op_type == OP_READLINE)
2379 PL_expect = XTERMORDORDOR;
79072805
LW
2380 return THING;
2381 }
e3f73d4e
RGS
2382 else if (op_type == OP_BACKTICK && PL_lex_op) {
2383 /* readpipe() vas overriden */
2384 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2385 pl_yylval.opval = PL_lex_op;
9b201d7d 2386 PL_lex_op = NULL;
e3f73d4e
RGS
2387 PL_lex_stuff = NULL;
2388 return THING;
2389 }
79072805 2390
3280af22 2391 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2392 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2393 PL_sublex_info.sub_op = PL_lex_op;
2394 PL_lex_state = LEX_INTERPPUSH;
55497cff 2395
3280af22
NIS
2396 PL_expect = XTERM;
2397 if (PL_lex_op) {
6154021b 2398 pl_yylval.opval = PL_lex_op;
5f66b61c 2399 PL_lex_op = NULL;
55497cff 2400 return PMFUNC;
2401 }
2402 else
2403 return FUNC;
2404}
2405
ffb4593c
NT
2406/*
2407 * S_sublex_push
2408 * Create a new scope to save the lexing state. The scope will be
2409 * ended in S_sublex_done. Returns a '(', starting the function arguments
2410 * to the uc, lc, etc. found before.
2411 * Sets PL_lex_state to LEX_INTERPCONCAT.
2412 */
2413
76e3520e 2414STATIC I32
cea2e8a9 2415S_sublex_push(pTHX)
55497cff 2416{
27da23d5 2417 dVAR;
f46d017c 2418 ENTER;
55497cff 2419
3280af22 2420 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2421 SAVEBOOL(PL_lex_dojoin);
3280af22 2422 SAVEI32(PL_lex_brackets);
78cdf107
Z
2423 SAVEI32(PL_lex_allbrackets);
2424 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2425 SAVEI32(PL_lex_casemods);
2426 SAVEI32(PL_lex_starts);
651b5b28 2427 SAVEI8(PL_lex_state);
7766f137 2428 SAVEVPTR(PL_lex_inpat);
98246f1e 2429 SAVEI16(PL_lex_inwhat);
57843af0 2430 SAVECOPLINE(PL_curcop);
3280af22 2431 SAVEPPTR(PL_bufptr);
8452ff4b 2432 SAVEPPTR(PL_bufend);
3280af22
NIS
2433 SAVEPPTR(PL_oldbufptr);
2434 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2435 SAVEPPTR(PL_last_lop);
2436 SAVEPPTR(PL_last_uni);
3280af22
NIS
2437 SAVEPPTR(PL_linestart);
2438 SAVESPTR(PL_linestr);
8edd5f42
RGS
2439 SAVEGENERICPV(PL_lex_brackstack);
2440 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2441
2442 PL_linestr = PL_lex_stuff;
a0714e2c 2443 PL_lex_stuff = NULL;
3280af22 2444
9cbb5ea2
GS
2445 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2446 = SvPVX(PL_linestr);
3280af22 2447 PL_bufend += SvCUR(PL_linestr);
bd61b366 2448 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2449 SAVEFREESV(PL_linestr);
2450
2451 PL_lex_dojoin = FALSE;
2452 PL_lex_brackets = 0;
78cdf107
Z
2453 PL_lex_allbrackets = 0;
2454 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2455 Newx(PL_lex_brackstack, 120, char);
2456 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2457 PL_lex_casemods = 0;
2458 *PL_lex_casestack = '\0';
2459 PL_lex_starts = 0;
2460 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2461 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2462
2463 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2464 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2465 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2466 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2467 else
5f66b61c 2468 PL_lex_inpat = NULL;
79072805 2469
55497cff 2470 return '(';
79072805
LW
2471}
2472
ffb4593c
NT
2473/*
2474 * S_sublex_done
2475 * Restores lexer state after a S_sublex_push.
2476 */
2477
76e3520e 2478STATIC I32
cea2e8a9 2479S_sublex_done(pTHX)
79072805 2480{
27da23d5 2481 dVAR;
3280af22 2482 if (!PL_lex_starts++) {
396482e1 2483 SV * const sv = newSVpvs("");
9aa983d2
JH
2484 if (SvUTF8(PL_linestr))
2485 SvUTF8_on(sv);
3280af22 2486 PL_expect = XOPERATOR;
6154021b 2487 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2488 return THING;
2489 }
2490
3280af22
NIS
2491 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2492 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2493 return yylex();
79072805
LW
2494 }
2495
ffb4593c 2496 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2497 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2498 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2499 PL_linestr = PL_lex_repl;
2500 PL_lex_inpat = 0;
2501 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2502 PL_bufend += SvCUR(PL_linestr);
bd61b366 2503 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2504 SAVEFREESV(PL_linestr);
2505 PL_lex_dojoin = FALSE;
2506 PL_lex_brackets = 0;
78cdf107
Z
2507 PL_lex_allbrackets = 0;
2508 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2509 PL_lex_casemods = 0;
2510 *PL_lex_casestack = '\0';
2511 PL_lex_starts = 0;
25da4f38 2512 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2513 PL_lex_state = LEX_INTERPNORMAL;
2514 PL_lex_starts++;
e9fa98b2
HS
2515 /* we don't clear PL_lex_repl here, so that we can check later
2516 whether this is an evalled subst; that means we rely on the
2517 logic to ensure sublex_done() is called again only via the
2518 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2519 }
e9fa98b2 2520 else {
3280af22 2521 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2522 PL_lex_repl = NULL;
e9fa98b2 2523 }
79072805 2524 return ',';
ffed7fef
LW
2525 }
2526 else {
5db06880
NC
2527#ifdef PERL_MAD
2528 if (PL_madskills) {
cd81e915
NC
2529 if (PL_thiswhite) {
2530 if (!PL_endwhite)
6b29d1f5 2531 PL_endwhite = newSVpvs("");
cd81e915
NC
2532 sv_catsv(PL_endwhite, PL_thiswhite);
2533 PL_thiswhite = 0;
2534 }
2535 if (PL_thistoken)
76f68e9b 2536 sv_setpvs(PL_thistoken,"");
5db06880 2537 else
cd81e915 2538 PL_realtokenstart = -1;
5db06880
NC
2539 }
2540#endif
f46d017c 2541 LEAVE;
3280af22
NIS
2542 PL_bufend = SvPVX(PL_linestr);
2543 PL_bufend += SvCUR(PL_linestr);
2544 PL_expect = XOPERATOR;
09bef843 2545 PL_sublex_info.sub_inwhat = 0;
79072805 2546 return ')';
ffed7fef
LW
2547 }
2548}
2549
02aa26ce
NT
2550/*
2551 scan_const
2552
2553 Extracts a pattern, double-quoted string, or transliteration. This
2554 is terrifying code.
2555
94def140 2556 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2557 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2558 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2559
94def140
TS
2560 Returns a pointer to the character scanned up to. If this is
2561 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2562 successfully parsed), will leave an OP for the substring scanned
6154021b 2563 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2564 by looking at the next characters herself.
2565
02aa26ce
NT
2566 In patterns:
2567 backslashes:
ff3f963a 2568 constants: \N{NAME} only
02aa26ce
NT
2569 case and quoting: \U \Q \E
2570 stops on @ and $, but not for $ as tail anchor
2571
2572 In transliterations:
2573 characters are VERY literal, except for - not at the start or end
94def140
TS
2574 of the string, which indicates a range. If the range is in bytes,
2575 scan_const expands the range to the full set of intermediate
2576 characters. If the range is in utf8, the hyphen is replaced with
2577 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2578
2579 In double-quoted strings:
2580 backslashes:
2581 double-quoted style: \r and \n
ff3f963a 2582 constants: \x31, etc.
94def140 2583 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2584 case and quoting: \U \Q \E
2585 stops on @ and $
2586
2587 scan_const does *not* construct ops to handle interpolated strings.
2588 It stops processing as soon as it finds an embedded $ or @ variable
2589 and leaves it to the caller to work out what's going on.
2590
94def140
TS
2591 embedded arrays (whether in pattern or not) could be:
2592 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2593
2594 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2595
2596 $ in pattern could be $foo or could be tail anchor. Assumption:
2597 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2598 followed by one of "()| \r\n\t"
02aa26ce
NT
2599
2600 \1 (backreferences) are turned into $1
2601
2602 The structure of the code is
2603 while (there's a character to process) {
94def140
TS
2604 handle transliteration ranges
2605 skip regexp comments /(?#comment)/ and codes /(?{code})/
2606 skip #-initiated comments in //x patterns
2607 check for embedded arrays
02aa26ce
NT
2608 check for embedded scalars
2609 if (backslash) {
94def140 2610 deprecate \1 in substitution replacements
02aa26ce
NT
2611 handle string-changing backslashes \l \U \Q \E, etc.
2612 switch (what was escaped) {
94def140 2613 handle \- in a transliteration (becomes a literal -)
ff3f963a 2614 if a pattern and not \N{, go treat as regular character
94def140
TS
2615 handle \132 (octal characters)
2616 handle \x15 and \x{1234} (hex characters)
ff3f963a 2617 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2618 handle \cV (control characters)
2619 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2620 } (end switch)
77a135fe 2621 continue
02aa26ce 2622 } (end if backslash)
77a135fe 2623 handle regular character
02aa26ce 2624 } (end while character to read)
4e553d73 2625
02aa26ce
NT
2626*/
2627
76e3520e 2628STATIC char *
cea2e8a9 2629S_scan_const(pTHX_ char *start)
79072805 2630{
97aff369 2631 dVAR;
3280af22 2632 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2633 SV *sv = newSV(send - start); /* sv for the constant. See
2634 note below on sizing. */
02aa26ce
NT
2635 register char *s = start; /* start of the constant */
2636 register char *d = SvPVX(sv); /* destination for copies */
2637 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2638 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2639 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2640 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2641 to be UTF8? But, this can
2642 show as true when the source
2643 isn't utf8, as for example
2644 when it is entirely composed
2645 of hex constants */
2646
2647 /* Note on sizing: The scanned constant is placed into sv, which is
2648 * initialized by newSV() assuming one byte of output for every byte of
2649 * input. This routine expects newSV() to allocate an extra byte for a
2650 * trailing NUL, which this routine will append if it gets to the end of
2651 * the input. There may be more bytes of input than output (eg., \N{LATIN
2652 * CAPITAL LETTER A}), or more output than input if the constant ends up
2653 * recoded to utf8, but each time a construct is found that might increase
2654 * the needed size, SvGROW() is called. Its size parameter each time is
2655 * based on the best guess estimate at the time, namely the length used so
2656 * far, plus the length the current construct will occupy, plus room for
2657 * the trailing NUL, plus one byte for every input byte still unscanned */
2658
012bcf8d 2659 UV uv;
4c3a8340
TS
2660#ifdef EBCDIC
2661 UV literal_endpoint = 0;
e294cc5d 2662 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2663#endif
012bcf8d 2664
7918f24d
NC
2665 PERL_ARGS_ASSERT_SCAN_CONST;
2666
bb16bae8 2667 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2668 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2669 /* If we are doing a trans and we know we want UTF8 set expectation */
2670 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2671 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2672 }
2673
2674
79072805 2675 while (s < send || dorange) {
ff3f963a 2676
02aa26ce 2677 /* get transliterations out of the way (they're most literal) */
3280af22 2678 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2679 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2680 if (dorange) {
1ba5c669
JH
2681 I32 i; /* current expanded character */
2682 I32 min; /* first character in range */
2683 I32 max; /* last character in range */
02aa26ce 2684
e294cc5d
JH
2685#ifdef EBCDIC
2686 UV uvmax = 0;
2687#endif
2688
2689 if (has_utf8
2690#ifdef EBCDIC
2691 && !native_range
2692#endif
2693 ) {
9d4ba2ae 2694 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2695 char *e = d++;
2696 while (e-- > c)
2697 *(e + 1) = *e;
25716404 2698 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2699 /* mark the range as done, and continue */
2700 dorange = FALSE;
2701 didrange = TRUE;
2702 continue;
2703 }
2b9d42f0 2704
95a20fc0 2705 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2706#ifdef EBCDIC
2707 SvGROW(sv,
2708 SvLEN(sv) + (has_utf8 ?
2709 (512 - UTF_CONTINUATION_MARK +
2710 UNISKIP(0x100))
2711 : 256));
2712 /* How many two-byte within 0..255: 128 in UTF-8,
2713 * 96 in UTF-8-mod. */
2714#else
9cbb5ea2 2715 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2716#endif
9cbb5ea2 2717 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2718#ifdef EBCDIC
2719 if (has_utf8) {
2720 int j;
2721 for (j = 0; j <= 1; j++) {
2722 char * const c = (char*)utf8_hop((U8*)d, -1);
2723 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2724 if (j)
2725 min = (U8)uv;
2726 else if (uv < 256)
2727 max = (U8)uv;
2728 else {
2729 max = (U8)0xff; /* only to \xff */
2730 uvmax = uv; /* \x{100} to uvmax */
2731 }
2732 d = c; /* eat endpoint chars */
2733 }
2734 }
2735 else {
2736#endif
2737 d -= 2; /* eat the first char and the - */
2738 min = (U8)*d; /* first char in range */
2739 max = (U8)d[1]; /* last char in range */
2740#ifdef EBCDIC
2741 }
2742#endif
8ada0baa 2743
c2e66d9e 2744 if (min > max) {
01ec43d0 2745 Perl_croak(aTHX_
d1573ac7 2746 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2747 (char)min, (char)max);
c2e66d9e
GS
2748 }
2749
c7f1f016 2750#ifdef EBCDIC
4c3a8340
TS
2751 if (literal_endpoint == 2 &&
2752 ((isLOWER(min) && isLOWER(max)) ||
2753 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2754 if (isLOWER(min)) {
2755 for (i = min; i <= max; i++)
2756 if (isLOWER(i))
db42d148 2757 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2758 } else {
2759 for (i = min; i <= max; i++)
2760 if (isUPPER(i))
db42d148 2761 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2762 }
2763 }
2764 else
2765#endif
2766 for (i = min; i <= max; i++)
e294cc5d
JH
2767#ifdef EBCDIC
2768 if (has_utf8) {
2769 const U8 ch = (U8)NATIVE_TO_UTF(i);
2770 if (UNI_IS_INVARIANT(ch))
2771 *d++ = (U8)i;
2772 else {
2773 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2774 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2775 }
2776 }
2777 else
2778#endif
2779 *d++ = (char)i;
2780
2781#ifdef EBCDIC
2782 if (uvmax) {
2783 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2784 if (uvmax > 0x101)
2785 *d++ = (char)UTF_TO_NATIVE(0xff);
2786 if (uvmax > 0x100)
2787 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2788 }
2789#endif
02aa26ce
NT
2790
2791 /* mark the range as done, and continue */
79072805 2792 dorange = FALSE;
01ec43d0 2793 didrange = TRUE;
4c3a8340
TS
2794#ifdef EBCDIC
2795 literal_endpoint = 0;
2796#endif
79072805 2797 continue;
4e553d73 2798 }
02aa26ce
NT
2799
2800 /* range begins (ignore - as first or last char) */
79072805 2801 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2802 if (didrange) {
1fafa243 2803 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2804 }
e294cc5d
JH
2805 if (has_utf8
2806#ifdef EBCDIC
2807 && !native_range
2808#endif
2809 ) {
25716404 2810 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2811 s++;
2812 continue;
2813 }
79072805
LW
2814 dorange = TRUE;
2815 s++;
01ec43d0
GS
2816 }
2817 else {
2818 didrange = FALSE;
4c3a8340
TS
2819#ifdef EBCDIC
2820 literal_endpoint = 0;
e294cc5d 2821 native_range = TRUE;
4c3a8340 2822#endif
01ec43d0 2823 }
79072805 2824 }
02aa26ce
NT
2825
2826 /* if we get here, we're not doing a transliteration */
2827
0f5d15d6
IZ
2828 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2829 except for the last char, which will be done separately. */
3280af22 2830 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2831 if (s[2] == '#') {
e994fd66 2832 while (s+1 < send && *s != ')')
db42d148 2833 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2834 }
2835 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2836 || (s[2] == '?' && s[3] == '{'))
155aba94 2837 {
cc6b7395 2838 I32 count = 1;
0f5d15d6 2839 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2840 char c;
2841
d9f97599
GS
2842 while (count && (c = *regparse)) {
2843 if (c == '\\' && regparse[1])
2844 regparse++;
4e553d73 2845 else if (c == '{')
cc6b7395 2846 count++;
4e553d73 2847 else if (c == '}')
cc6b7395 2848 count--;
d9f97599 2849 regparse++;
cc6b7395 2850 }
e994fd66 2851 if (*regparse != ')')
5bdf89e7 2852 regparse--; /* Leave one char for continuation. */
0f5d15d6 2853 while (s < regparse)
db42d148 2854 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2855 }
748a9306 2856 }
02aa26ce
NT
2857
2858 /* likewise skip #-initiated comments in //x patterns */
3280af22 2859 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2860 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2861 while (s+1 < send && *s != '\n')
db42d148 2862 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2863 }
02aa26ce 2864
5d1d4326 2865 /* check for embedded arrays
da6eedaa 2866 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2867 */
1749ea0d
TS
2868 else if (*s == '@' && s[1]) {
2869 if (isALNUM_lazy_if(s+1,UTF))
2870 break;
2871 if (strchr(":'{$", s[1]))
2872 break;
2873 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2874 break; /* in regexp, neither @+ nor @- are interpolated */
2875 }
02aa26ce
NT
2876
2877 /* check for embedded scalars. only stop if we're sure it's a
2878 variable.
2879 */
79072805 2880 else if (*s == '$') {
3280af22 2881 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2882 break;
77772344 2883 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2884 if (s[1] == '\\') {
2885 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2886 "Possible unintended interpolation of $\\ in regex");
77772344 2887 }
79072805 2888 break; /* in regexp, $ might be tail anchor */
77772344 2889 }
79072805 2890 }
02aa26ce 2891
2b9d42f0
NIS
2892 /* End of else if chain - OP_TRANS rejoin rest */
2893
02aa26ce 2894 /* backslashes */
79072805 2895 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2896 char* e; /* Can be used for ending '}', etc. */
2897
79072805 2898 s++;
02aa26ce 2899
7d0fc23c
KW
2900 /* warn on \1 - \9 in substitution replacements, but note that \11
2901 * is an octal; and \19 is \1 followed by '9' */
3280af22 2902 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2903 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2904 {
a2a5de95 2905 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2906 *--s = '$';
2907 break;
2908 }
02aa26ce
NT
2909
2910 /* string-change backslash escapes */
3280af22 2911 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2912 --s;
2913 break;
2914 }
ff3f963a
KW
2915 /* In a pattern, process \N, but skip any other backslash escapes.
2916 * This is because we don't want to translate an escape sequence
2917 * into a meta symbol and have the regex compiler use the meta
2918 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2919 * in spite of this, we do have to process \N here while the proper
2920 * charnames handler is in scope. See bugs #56444 and #62056.
2921 * There is a complication because \N in a pattern may also stand
2922 * for 'match a non-nl', and not mean a charname, in which case its
2923 * processing should be deferred to the regex compiler. To be a
2924 * charname it must be followed immediately by a '{', and not look
2925 * like \N followed by a curly quantifier, i.e., not something like
2926 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2927 * quantifier */
2928 else if (PL_lex_inpat
2929 && (*s != 'N'
2930 || s[1] != '{'
2931 || regcurly(s + 1)))
2932 {
cc74c5bd
TS
2933 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2934 goto default_action;
2935 }
02aa26ce 2936
79072805 2937 switch (*s) {
02aa26ce
NT
2938
2939 /* quoted - in transliterations */
79072805 2940 case '-':
3280af22 2941 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2942 *d++ = *s++;
2943 continue;
2944 }
2945 /* FALL THROUGH */
2946 default:
11b8faa4 2947 {
a2a5de95
NC
2948 if ((isALPHA(*s) || isDIGIT(*s)))
2949 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2950 "Unrecognized escape \\%c passed through",
2951 *s);
11b8faa4 2952 /* default action is to copy the quoted character */
f9a63242 2953 goto default_action;
11b8faa4 2954 }
02aa26ce 2955
632403cc 2956 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2957 case '0': case '1': case '2': case '3':
2958 case '4': case '5': case '6': case '7':
ba210ebe 2959 {
53305cf1
NC
2960 I32 flags = 0;
2961 STRLEN len = 3;
77a135fe 2962 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2963 s += len;
2964 }
012bcf8d 2965 goto NUM_ESCAPE_INSERT;
02aa26ce 2966
f0a2b745
KW
2967 /* eg. \o{24} indicates the octal constant \024 */
2968 case 'o':
2969 {
2970 STRLEN len;
454155d9 2971 const char* error;
f0a2b745 2972
454155d9 2973 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2974 s += len;
454155d9 2975 if (! valid) {
f0a2b745
KW
2976 yyerror(error);
2977 continue;
2978 }
2979 goto NUM_ESCAPE_INSERT;
2980 }
2981
77a135fe 2982 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2983 case 'x':
a0ed51b3
LW
2984 ++s;
2985 if (*s == '{') {
9d4ba2ae 2986 char* const e = strchr(s, '}');
a4c04bdc
NC
2987 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2988 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2989 STRLEN len;
355860ce 2990
53305cf1 2991 ++s;
adaeee49 2992 if (!e) {
a0ed51b3 2993 yyerror("Missing right brace on \\x{}");
355860ce 2994 continue;
ba210ebe 2995 }
53305cf1 2996 len = e - s;
77a135fe 2997 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2998 s = e + 1;
a0ed51b3
LW
2999 }
3000 else {
ba210ebe 3001 {
53305cf1 3002 STRLEN len = 2;
a4c04bdc 3003 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3004 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3005 s += len;
3006 }
012bcf8d
GS
3007 }
3008
3009 NUM_ESCAPE_INSERT:
ff3f963a
KW
3010 /* Insert oct or hex escaped character. There will always be
3011 * enough room in sv since such escapes will be longer than any
3012 * UTF-8 sequence they can end up as, except if they force us
3013 * to recode the rest of the string into utf8 */
ba7cea30 3014
77a135fe 3015 /* Here uv is the ordinal of the next character being added in
ff3f963a 3016 * unicode (converted from native). */
77a135fe 3017 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3018 if (!has_utf8 && uv > 255) {
77a135fe
KW
3019 /* Might need to recode whatever we have accumulated so
3020 * far if it contains any chars variant in utf8 or
3021 * utf-ebcdic. */
3022
3023 SvCUR_set(sv, d - SvPVX_const(sv));
3024 SvPOK_on(sv);
3025 *d = '\0';
77a135fe 3026 /* See Note on sizing above. */
7bf79863
KW
3027 sv_utf8_upgrade_flags_grow(sv,
3028 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3029 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3030 d = SvPVX(sv) + SvCUR(sv);
3031 has_utf8 = TRUE;
012bcf8d
GS
3032 }
3033
77a135fe
KW
3034 if (has_utf8) {
3035 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3036 if (PL_lex_inwhat == OP_TRANS &&
3037 PL_sublex_info.sub_op) {
3038 PL_sublex_info.sub_op->op_private |=
3039 (PL_lex_repl ? OPpTRANS_FROM_UTF
3040 : OPpTRANS_TO_UTF);
f9a63242 3041 }
e294cc5d
JH
3042#ifdef EBCDIC
3043 if (uv > 255 && !dorange)
3044 native_range = FALSE;
3045#endif
012bcf8d 3046 }
a0ed51b3 3047 else {
012bcf8d 3048 *d++ = (char)uv;
a0ed51b3 3049 }
012bcf8d
GS
3050 }
3051 else {
c4d5f83a 3052 *d++ = (char) uv;
a0ed51b3 3053 }
79072805 3054 continue;
02aa26ce 3055
4a2d328f 3056 case 'N':
ff3f963a
KW
3057 /* In a non-pattern \N must be a named character, like \N{LATIN
3058 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3059 * mean to match a non-newline. For non-patterns, named
3060 * characters are converted to their string equivalents. In
3061 * patterns, named characters are not converted to their
3062 * ultimate forms for the same reasons that other escapes
3063 * aren't. Instead, they are converted to the \N{U+...} form
3064 * to get the value from the charnames that is in effect right
3065 * now, while preserving the fact that it was a named character
3066 * so that the regex compiler knows this */
3067
3068 /* This section of code doesn't generally use the
3069 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3070 * a close examination of this macro and determined it is a
3071 * no-op except on utfebcdic variant characters. Every
3072 * character generated by this that would normally need to be
3073 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3074 * needed, and would complicate use of copy(). XXX There are
3075 * other parts of this file where the macro is used
3076 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3077
3078 /* The structure of this section of code (besides checking for
3079 * errors and upgrading to utf8) is:
3080 * Further disambiguate between the two meanings of \N, and if
3081 * not a charname, go process it elsewhere
0a96133f
KW
3082 * If of form \N{U+...}, pass it through if a pattern;
3083 * otherwise convert to utf8
3084 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3085 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3086
3087 /* Here, s points to the 'N'; the test below is guaranteed to
3088 * succeed if we are being called on a pattern as we already
3089 * know from a test above that the next character is a '{'.
3090 * On a non-pattern \N must mean 'named sequence, which
3091 * requires braces */
3092 s++;
3093 if (*s != '{') {
3094 yyerror("Missing braces on \\N{}");
3095 continue;
3096 }
3097 s++;
3098
0a96133f 3099 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3100 if (! (e = strchr(s, '}'))) {
3101 if (! PL_lex_inpat) {
5777a3f7 3102 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3103 } else {
3104 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3105 }
0a96133f 3106 continue;
ff3f963a 3107 }
cddc7ef4 3108
ff3f963a 3109 /* Here it looks like a named character */
cddc7ef4 3110
ff3f963a
KW
3111 if (PL_lex_inpat) {
3112
3113 /* XXX This block is temporary code. \N{} implies that the
3114 * pattern is to have Unicode semantics, and therefore
3115 * currently has to be encoded in utf8. By putting it in
3116 * utf8 now, we save a whole pass in the regular expression
3117 * compiler. Once that code is changed so Unicode
3118 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3119 * block should be removed. However, the code that parses
3120 * the output of this would have to be changed to not
3121 * necessarily expect utf8 */
ff3f963a 3122 if (!has_utf8) {
77a135fe 3123 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3124 SvPOK_on(sv);
e4f3eed8 3125 *d = '\0';
77a135fe 3126 /* See Note on sizing above. */
7bf79863 3127 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3128 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3129 /* 5 = '\N{' + cur char + NUL */
3130 (STRLEN)(send - s) + 5);
f08d6ad9 3131 d = SvPVX(sv) + SvCUR(sv);
89491803 3132 has_utf8 = TRUE;
ff3f963a
KW
3133 }
3134 }
423cee85 3135
ff3f963a
KW
3136 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3137 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3138 | PERL_SCAN_DISALLOW_PREFIX;
3139 STRLEN len;
3140
3141 /* For \N{U+...}, the '...' is a unicode value even on
3142 * EBCDIC machines */
3143 s += 2; /* Skip to next char after the 'U+' */
3144 len = e - s;
3145 uv = grok_hex(s, &len, &flags, NULL);
3146 if (len == 0 || len != (STRLEN)(e - s)) {
3147 yyerror("Invalid hexadecimal number in \\N{U+...}");
3148 s = e + 1;
3149 continue;
3150 }
3151
3152 if (PL_lex_inpat) {
3153
e2a7e165
KW
3154 /* On non-EBCDIC platforms, pass through to the regex
3155 * compiler unchanged. The reason we evaluated the
3156 * number above is to make sure there wasn't a syntax
3157 * error. But on EBCDIC we convert to native so
3158 * downstream code can continue to assume it's native
3159 */
ff3f963a 3160 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3161#ifdef EBCDIC
3162 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3163 and the \0 */
3164 "\\N{U+%X}",
3165 (unsigned int) UNI_TO_NATIVE(uv));
3166#else
ff3f963a
KW
3167 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3168 d += e - s + 1;
e2a7e165 3169#endif
ff3f963a
KW
3170 }
3171 else { /* Not a pattern: convert the hex to string */
3172
3173 /* If destination is not in utf8, unconditionally
3174 * recode it to be so. This is because \N{} implies
3175 * Unicode semantics, and scalars have to be in utf8
3176 * to guarantee those semantics */
3177 if (! has_utf8) {
3178 SvCUR_set(sv, d - SvPVX_const(sv));
3179 SvPOK_on(sv);
3180 *d = '\0';
3181 /* See Note on sizing above. */
3182 sv_utf8_upgrade_flags_grow(
3183 sv,
3184 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3185 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3186 d = SvPVX(sv) + SvCUR(sv);
3187 has_utf8 = TRUE;
3188 }
3189
3190 /* Add the string to the output */
3191 if (UNI_IS_INVARIANT(uv)) {
3192 *d++ = (char) uv;
3193 }
3194 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3195 }
3196 }
3197 else { /* Here is \N{NAME} but not \N{U+...}. */
3198
3199 SV *res; /* result from charnames */
3200 const char *str; /* the string in 'res' */
3201 STRLEN len; /* its length */
3202
3203 /* Get the value for NAME */
3204 res = newSVpvn(s, e - s);
3205 res = new_constant( NULL, 0, "charnames",
3206 /* includes all of: \N{...} */
3207 res, NULL, s - 3, e - s + 4 );
3208
3209 /* Most likely res will be in utf8 already since the
3210 * standard charnames uses pack U, but a custom translator
3211 * can leave it otherwise, so make sure. XXX This can be
3212 * revisited to not have charnames use utf8 for characters
3213 * that don't need it when regexes don't have to be in utf8
3214 * for Unicode semantics. If doing so, remember EBCDIC */
3215 sv_utf8_upgrade(res);
3216 str = SvPV_const(res, len);
3217
3218 /* Don't accept malformed input */
3219 if (! is_utf8_string((U8 *) str, len)) {
3220 yyerror("Malformed UTF-8 returned by \\N");
3221 }
3222 else if (PL_lex_inpat) {
3223
3224 if (! len) { /* The name resolved to an empty string */
3225 Copy("\\N{}", d, 4, char);
3226 d += 4;
3227 }
3228 else {
3229 /* In order to not lose information for the regex
3230 * compiler, pass the result in the specially made
3231 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3232 * the code points in hex of each character
3233 * returned by charnames */
3234
3235 const char *str_end = str + len;
3236 STRLEN char_length; /* cur char's byte length */
3237 STRLEN output_length; /* and the number of bytes
3238 after this is translated
3239 into hex digits */
3240 const STRLEN off = d - SvPVX_const(sv);
3241
3242 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3243 * max('U+', '.'); and 1 for NUL */
3244 char hex_string[2 * UTF8_MAXBYTES + 5];
3245
3246 /* Get the first character of the result. */
3247 U32 uv = utf8n_to_uvuni((U8 *) str,
3248 len,
3249 &char_length,
3250 UTF8_ALLOW_ANYUV);
3251
3252 /* The call to is_utf8_string() above hopefully
3253 * guarantees that there won't be an error. But
3254 * it's easy here to make sure. The function just
3255 * above warns and returns 0 if invalid utf8, but
3256 * it can also return 0 if the input is validly a
3257 * NUL. Disambiguate */
3258 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3259 uv = UNICODE_REPLACEMENT;
3260 }
3261
3262 /* Convert first code point to hex, including the
e2a7e165
KW
3263 * boiler plate before it. For all these, we
3264 * convert to native format so that downstream code
3265 * can continue to assume the input is native */
78c35590 3266 output_length =
3353de27 3267 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3268 "\\N{U+%X",
3269 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3270
3271 /* Make sure there is enough space to hold it */
3272 d = off + SvGROW(sv, off
3273 + output_length
3274 + (STRLEN)(send - e)
3275 + 2); /* '}' + NUL */
3276 /* And output it */
3277 Copy(hex_string, d, output_length, char);
3278 d += output_length;
3279
3280 /* For each subsequent character, append dot and
3281 * its ordinal in hex */
3282 while ((str += char_length) < str_end) {
3283 const STRLEN off = d - SvPVX_const(sv);
3284 U32 uv = utf8n_to_uvuni((U8 *) str,
3285 str_end - str,
3286 &char_length,
3287 UTF8_ALLOW_ANYUV);
3288 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3289 uv = UNICODE_REPLACEMENT;
3290 }
3291
78c35590 3292 output_length =
3353de27 3293 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3294 ".%X",
3295 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3296
3297 d = off + SvGROW(sv, off
3298 + output_length
3299 + (STRLEN)(send - e)
3300 + 2); /* '}' + NUL */
3301 Copy(hex_string, d, output_length, char);
3302 d += output_length;
3303 }
3304
3305 *d++ = '}'; /* Done. Add the trailing brace */
3306 }
3307 }
3308 else { /* Here, not in a pattern. Convert the name to a
3309 * string. */
3310
3311 /* If destination is not in utf8, unconditionally
3312 * recode it to be so. This is because \N{} implies
3313 * Unicode semantics, and scalars have to be in utf8
3314 * to guarantee those semantics */
3315 if (! has_utf8) {
3316 SvCUR_set(sv, d - SvPVX_const(sv));
3317 SvPOK_on(sv);
3318 *d = '\0';
3319 /* See Note on sizing above. */
3320 sv_utf8_upgrade_flags_grow(sv,
3321 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3322 len + (STRLEN)(send - s) + 1);
3323 d = SvPVX(sv) + SvCUR(sv);
3324 has_utf8 = TRUE;
3325 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3326
3327 /* See Note on sizing above. (NOTE: SvCUR() is not
3328 * set correctly here). */
3329 const STRLEN off = d - SvPVX_const(sv);
3330 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3331 }
3332 Copy(str, d, len, char);
3333 d += len;
423cee85 3334 }
423cee85 3335 SvREFCNT_dec(res);
cb233ae3
KW
3336
3337 /* Deprecate non-approved name syntax */
3338 if (ckWARN_d(WARN_DEPRECATED)) {
3339 bool problematic = FALSE;
3340 char* i = s;
3341
3342 /* For non-ut8 input, look to see that the first
3343 * character is an alpha, then loop through the rest
3344 * checking that each is a continuation */
3345 if (! this_utf8) {
3346 if (! isALPHAU(*i)) problematic = TRUE;
3347 else for (i = s + 1; i < e; i++) {
3348 if (isCHARNAME_CONT(*i)) continue;
3349 problematic = TRUE;
3350 break;
3351 }
3352 }
3353 else {
3354 /* Similarly for utf8. For invariants can check
3355 * directly. We accept anything above the latin1
3356 * range because it is immaterial to Perl if it is
3357 * correct or not, and is expensive to check. But
3358 * it is fairly easy in the latin1 range to convert
3359 * the variants into a single character and check
3360 * those */
3361 if (UTF8_IS_INVARIANT(*i)) {
3362 if (! isALPHAU(*i)) problematic = TRUE;
3363 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3364 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3365 *(i+1)))))
3366 {
3367 problematic = TRUE;
3368 }
3369 }
3370 if (! problematic) for (i = s + UTF8SKIP(s);
3371 i < e;
3372 i+= UTF8SKIP(i))
3373 {
3374 if (UTF8_IS_INVARIANT(*i)) {
3375 if (isCHARNAME_CONT(*i)) continue;
3376 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3377 continue;
3378 } else if (isCHARNAME_CONT(
3379 UNI_TO_NATIVE(
81c14aa2 3380 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3381 {
3382 continue;
3383 }
3384 problematic = TRUE;
3385 break;
3386 }
3387 }
3388 if (problematic) {
6e1bad6c
KW
3389 /* The e-i passed to the final %.*s makes sure that
3390 * should the trailing NUL be missing that this
3391 * print won't run off the end of the string */
cb233ae3 3392 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3393 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3394 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3395 }
3396 }
3397 } /* End \N{NAME} */
ff3f963a
KW
3398#ifdef EBCDIC
3399 if (!dorange)
3400 native_range = FALSE; /* \N{} is defined to be Unicode */
3401#endif
3402 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3403 continue;
3404
02aa26ce 3405 /* \c is a control character */
79072805
LW
3406 case 'c':
3407 s++;
961ce445 3408 if (s < send) {
17a3df4c 3409 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3410 }
961ce445
RGS
3411 else {
3412 yyerror("Missing control char name in \\c");
3413 }
79072805 3414 continue;
02aa26ce
NT
3415
3416 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3417 case 'b':
db42d148 3418 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3419 break;
3420 case 'n':
db42d148 3421 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3422 break;
3423 case 'r':
db42d148 3424 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3425 break;
3426 case 'f':
db42d148 3427 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3428 break;
3429 case 't':
db42d148 3430 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3431 break;
34a3fe2a 3432 case 'e':
db42d148 3433 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3434 break;
3435 case 'a':
db42d148 3436 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3437 break;
02aa26ce
NT
3438 } /* end switch */
3439
79072805
LW
3440 s++;
3441 continue;
02aa26ce 3442 } /* end if (backslash) */
4c3a8340
TS
3443#ifdef EBCDIC
3444 else
3445 literal_endpoint++;
3446#endif
02aa26ce 3447
f9a63242 3448 default_action:
77a135fe
KW
3449 /* If we started with encoded form, or already know we want it,
3450 then encode the next character */
3451 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3452 STRLEN len = 1;
77a135fe
KW
3453
3454
3455 /* One might think that it is wasted effort in the case of the
3456 * source being utf8 (this_utf8 == TRUE) to take the next character
3457 * in the source, convert it to an unsigned value, and then convert
3458 * it back again. But the source has not been validated here. The
3459 * routine that does the conversion checks for errors like
3460 * malformed utf8 */
3461
5f66b61c
AL
3462 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3463 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3464 if (!has_utf8) {
3465 SvCUR_set(sv, d - SvPVX_const(sv));
3466 SvPOK_on(sv);
3467 *d = '\0';
77a135fe 3468 /* See Note on sizing above. */
7bf79863
KW
3469 sv_utf8_upgrade_flags_grow(sv,
3470 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3471 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3472 d = SvPVX(sv) + SvCUR(sv);
3473 has_utf8 = TRUE;
3474 } else if (need > len) {
3475 /* encoded value larger than old, may need extra space (NOTE:
3476 * SvCUR() is not set correctly here). See Note on sizing
3477 * above. */
9d4ba2ae 3478 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3479 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3480 }
77a135fe
KW
3481 s += len;
3482
5f66b61c 3483 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3484#ifdef EBCDIC
3485 if (uv > 255 && !dorange)
3486 native_range = FALSE;
3487#endif
2b9d42f0
NIS
3488 }
3489 else {
3490 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3491 }
02aa26ce
NT
3492 } /* while loop to process each character */
3493
3494 /* terminate the string and set up the sv */
79072805 3495 *d = '\0';
95a20fc0 3496 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3497 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3498 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3499
79072805 3500 SvPOK_on(sv);
9f4817db 3501 if (PL_encoding && !has_utf8) {
d0063567
DK
3502 sv_recode_to_utf8(sv, PL_encoding);
3503 if (SvUTF8(sv))
3504 has_utf8 = TRUE;
9f4817db 3505 }
2b9d42f0 3506 if (has_utf8) {
7e2040f0 3507 SvUTF8_on(sv);
2b9d42f0 3508 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3509 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3510 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3511 }
3512 }
79072805 3513
02aa26ce 3514 /* shrink the sv if we allocated more than we used */
79072805 3515 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3516 SvPV_shrink_to_cur(sv);
79072805 3517 }
02aa26ce 3518
6154021b 3519 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3520 if (s > PL_bufptr) {
eb0d8d16
NC
3521 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3522 const char *const key = PL_lex_inpat ? "qr" : "q";
3523 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3524 const char *type;
3525 STRLEN typelen;
3526
3527 if (PL_lex_inwhat == OP_TRANS) {
3528 type = "tr";
3529 typelen = 2;
3530 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3531 type = "s";
3532 typelen = 1;
3533 } else {
3534 type = "qq";
3535 typelen = 2;
3536 }
3537
3538 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3539 type, typelen);
3540 }
6154021b 3541 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3542 } else
8990e307 3543 SvREFCNT_dec(sv);
79072805
LW
3544 return s;
3545}
3546
ffb4593c
NT
3547/* S_intuit_more
3548 * Returns TRUE if there's more to the expression (e.g., a subscript),
3549 * FALSE otherwise.
ffb4593c
NT
3550 *
3551 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3552 *
3553 * ->[ and ->{ return TRUE
3554 * { and [ outside a pattern are always subscripts, so return TRUE
3555 * if we're outside a pattern and it's not { or [, then return FALSE
3556 * if we're in a pattern and the first char is a {
3557 * {4,5} (any digits around the comma) returns FALSE
3558 * if we're in a pattern and the first char is a [
3559 * [] returns FALSE
3560 * [SOMETHING] has a funky algorithm to decide whether it's a
3561 * character class or not. It has to deal with things like
3562 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3563 * anything else returns TRUE
3564 */
3565
9cbb5ea2
GS
3566/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3567
76e3520e 3568STATIC int
cea2e8a9 3569S_intuit_more(pTHX_ register char *s)
79072805 3570{
97aff369 3571 dVAR;
7918f24d
NC
3572
3573 PERL_ARGS_ASSERT_INTUIT_MORE;
3574
3280af22 3575 if (PL_lex_brackets)
79072805
LW
3576 return TRUE;
3577 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3578 return TRUE;
3579 if (*s != '{' && *s != '[')
3580 return FALSE;
3280af22 3581 if (!PL_lex_inpat)
79072805
LW
3582 return TRUE;
3583
3584 /* In a pattern, so maybe we have {n,m}. */
3585 if (*s == '{') {
b3155d95 3586 if (regcurly(s)) {
79072805 3587 return FALSE;
b3155d95 3588 }
79072805 3589 return TRUE;
79072805
LW
3590 }
3591
3592 /* On the other hand, maybe we have a character class */
3593
3594 s++;
3595 if (*s == ']' || *s == '^')
3596 return FALSE;
3597 else {
ffb4593c 3598 /* this is terrifying, and it works */
79072805
LW
3599 int weight = 2; /* let's weigh the evidence */
3600 char seen[256];
f27ffc4a 3601 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3602 const char * const send = strchr(s,']');
3280af22 3603 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3604
3605 if (!send) /* has to be an expression */
3606 return TRUE;
3607
3608 Zero(seen,256,char);
3609 if (*s == '$')
3610 weight -= 3;
3611 else if (isDIGIT(*s)) {
3612 if (s[1] != ']') {
3613 if (isDIGIT(s[1]) && s[2] == ']')
3614 weight -= 10;
3615 }
3616 else
3617 weight -= 100;
3618 }
3619 for (; s < send; s++) {
3620 last_un_char = un_char;
3621 un_char = (unsigned char)*s;
3622 switch (*s) {
3623 case '@':
3624 case '&':
3625 case '$':
3626 weight -= seen[un_char] * 10;
7e2040f0 3627 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3628 int len;
8903cb82 3629 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3630 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3631 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3632 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3633 weight -= 100;
3634 else
3635 weight -= 10;
3636 }
3637 else if (*s == '$' && s[1] &&
93a17b20
LW
3638 strchr("[#!%*<>()-=",s[1])) {
3639 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3640 weight -= 10;
3641 else
3642 weight -= 1;
3643 }
3644 break;
3645 case '\\':
3646 un_char = 254;
3647 if (s[1]) {
93a17b20 3648 if (strchr("wds]",s[1]))
79072805 3649 weight += 100;
10edeb5d 3650 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3651 weight += 1;
93a17b20 3652 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3653 weight += 40;
3654 else if (isDIGIT(s[1])) {
3655 weight += 40;
3656 while (s[1] && isDIGIT(s[1]))
3657 s++;
3658 }
3659 }
3660 else
3661 weight += 100;
3662 break;
3663 case '-':
3664 if (s[1] == '\\')
3665 weight += 50;
93a17b20 3666 if (strchr("aA01! ",last_un_char))
79072805 3667 weight += 30;
93a17b20 3668 if (strchr("zZ79~",s[1]))
79072805 3669 weight += 30;
f27ffc4a
GS
3670 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3671 weight -= 5; /* cope with negative subscript */
79072805
LW
3672 break;
3673 default:
3792a11b
NC
3674 if (!isALNUM(last_un_char)
3675 && !(last_un_char == '$' || last_un_char == '@'
3676 || last_un_char == '&')
3677 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3678 char *d = tmpbuf;
3679 while (isALPHA(*s))
3680 *d++ = *s++;
3681 *d = '\0';
5458a98a 3682 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3683 weight -= 150;
3684 }
3685 if (un_char == last_un_char + 1)
3686 weight += 5;
3687 weight -= seen[un_char];
3688 break;
3689 }
3690 seen[un_char]++;
3691 }
3692 if (weight >= 0) /* probably a character class */
3693 return FALSE;
3694 }
3695
3696 return TRUE;
3697}
ffed7fef 3698
ffb4593c
NT
3699/*
3700 * S_intuit_method
3701 *
3702 * Does all the checking to disambiguate
3703 * foo bar
3704 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3705 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3706 *
3707 * First argument is the stuff after the first token, e.g. "bar".
3708 *
3709 * Not a method if bar is a filehandle.
3710 * Not a method if foo is a subroutine prototyped to take a filehandle.
3711 * Not a method if it's really "Foo $bar"
3712 * Method if it's "foo $bar"
3713 * Not a method if it's really "print foo $bar"
3714 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3715 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3716 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3717 * =>
3718 */
3719
76e3520e 3720STATIC int
62d55b22 3721S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3722{
97aff369 3723 dVAR;
a0d0e21e 3724 char *s = start + (*start == '$');
3280af22 3725 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3726 STRLEN len;
3727 GV* indirgv;
5db06880
NC
3728#ifdef PERL_MAD
3729 int soff;
3730#endif
a0d0e21e 3731
7918f24d
NC
3732 PERL_ARGS_ASSERT_INTUIT_METHOD;
3733
a0d0e21e 3734 if (gv) {
62d55b22 3735 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3736 return 0;
62d55b22
NC
3737 if (cv) {
3738 if (SvPOK(cv)) {
8fa6a409 3739 const char *proto = CvPROTO(cv);
62d55b22
NC
3740 if (proto) {
3741 if (*proto == ';')
3742 proto++;
3743 if (*proto == '*')
3744 return 0;
3745 }
b6c543e3
IZ
3746 }
3747 } else
c35e046a 3748 gv = NULL;
a0d0e21e 3749 }
8903cb82 3750 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3751 /* start is the beginning of the possible filehandle/object,
3752 * and s is the end of it
3753 * tmpbuf is a copy of it
3754 */
3755
a0d0e21e 3756 if (*start == '$') {
3ef1310e
RGS
3757 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3758 isUPPER(*PL_tokenbuf))
a0d0e21e 3759 return 0;
5db06880
NC
3760#ifdef PERL_MAD
3761 len = start - SvPVX(PL_linestr);
3762#endif
29595ff2 3763 s = PEEKSPACE(s);
f0092767 3764#ifdef PERL_MAD
5db06880
NC
3765 start = SvPVX(PL_linestr) + len;
3766#endif
3280af22
NIS
3767 PL_bufptr = start;
3768 PL_expect = XREF;
a0d0e21e
LW
3769 return *s == '(' ? FUNCMETH : METHOD;
3770 }<