This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move the close-on-exec logic to one place, at the end of S_open_script().
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a 153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
376fcdbf 275#define UNI2(f,x) { \
6154021b 276 pl_yylval.ival = f; \
376fcdbf
AL
277 PL_expect = x; \
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
6f33ba73
RGS
286#define UNI(f) UNI2(f,XTERM)
287#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
b5fb7ce3
FC
288#define UNIPROTO(f,optional) { \
289 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
290 OPERATOR(f); \
291 }
a687059c 292
376fcdbf 293#define UNIBRACK(f) { \
6154021b 294 pl_yylval.ival = f; \
376fcdbf
AL
295 PL_bufptr = s; \
296 PL_last_uni = PL_oldbufptr; \
297 if (*s == '(') \
298 return REPORT( (int)FUNC1 ); \
29595ff2 299 s = PEEKSPACE(s); \
376fcdbf
AL
300 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
301 }
79072805 302
9f68db38 303/* grandfather return to old style */
78cdf107
Z
304#define OLDLOP(f) \
305 do { \
306 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
307 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
308 pl_yylval.ival = (f); \
309 PL_expect = XTERM; \
310 PL_bufptr = s; \
311 return (int)LSTOP; \
312 } while(0)
79072805 313
8fa7f367
JH
314#ifdef DEBUGGING
315
6154021b 316/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
317enum token_type {
318 TOKENTYPE_NONE,
319 TOKENTYPE_IVAL,
6154021b 320 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
321 TOKENTYPE_PVAL,
322 TOKENTYPE_OPVAL,
323 TOKENTYPE_GVVAL
324};
325
6d4a66ac
NC
326static struct debug_tokens {
327 const int token;
328 enum token_type type;
329 const char *name;
330} const debug_tokens[] =
9041c2e3 331{
bbf60fe6
DM
332 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
333 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
334 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
335 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
336 { ARROW, TOKENTYPE_NONE, "ARROW" },
337 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
338 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
339 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
340 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
341 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 342 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
343 { DO, TOKENTYPE_NONE, "DO" },
344 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
345 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
346 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
347 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
348 { ELSE, TOKENTYPE_NONE, "ELSE" },
349 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
350 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
351 { FOR, TOKENTYPE_IVAL, "FOR" },
352 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
353 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
354 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 355 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
356 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
357 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
358 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 359 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
360 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
361 { IF, TOKENTYPE_IVAL, "IF" },
362 { LABEL, TOKENTYPE_PVAL, "LABEL" },
363 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
364 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
365 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
366 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
367 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
368 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
369 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
370 { MY, TOKENTYPE_IVAL, "MY" },
371 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
372 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
373 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
374 { OROP, TOKENTYPE_IVAL, "OROP" },
375 { OROR, TOKENTYPE_NONE, "OROR" },
376 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
377 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
378 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
379 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
380 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
381 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
382 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
383 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
384 { PREINC, TOKENTYPE_NONE, "PREINC" },
385 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
386 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
387 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
388 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
389 { SUB, TOKENTYPE_NONE, "SUB" },
390 { THING, TOKENTYPE_OPVAL, "THING" },
391 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
392 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
393 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
394 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
395 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
396 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 397 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
398 { WHILE, TOKENTYPE_IVAL, "WHILE" },
399 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 400 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 401 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
402};
403
6154021b 404/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 405
bbf60fe6 406STATIC int
704d4215 407S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 408{
97aff369 409 dVAR;
7918f24d
NC
410
411 PERL_ARGS_ASSERT_TOKEREPORT;
412
bbf60fe6 413 if (DEBUG_T_TEST) {
bd61b366 414 const char *name = NULL;
bbf60fe6 415 enum token_type type = TOKENTYPE_NONE;
f54cb97a 416 const struct debug_tokens *p;
396482e1 417 SV* const report = newSVpvs("<== ");
bbf60fe6 418
f54cb97a 419 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
420 if (p->token == (int)rv) {
421 name = p->name;
422 type = p->type;
423 break;
424 }
425 }
426 if (name)
54667de8 427 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
428 else if ((char)rv > ' ' && (char)rv < '~')
429 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
430 else if (!rv)
396482e1 431 sv_catpvs(report, "EOF");
bbf60fe6
DM
432 else
433 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
434 switch (type) {
435 case TOKENTYPE_NONE:
436 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
437 break;
438 case TOKENTYPE_IVAL:
704d4215 439 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_OPNUM:
442 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 443 PL_op_name[lvalp->ival]);
bbf60fe6
DM
444 break;
445 case TOKENTYPE_PVAL:
704d4215 446 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_OPVAL:
704d4215 449 if (lvalp->opval) {
401441c0 450 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
451 PL_op_name[lvalp->opval->op_type]);
452 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 453 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 454 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
455 }
456
457 }
401441c0 458 else
396482e1 459 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
460 break;
461 }
b6007c36 462 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
463 };
464 return (int)rv;
998054bd
SC
465}
466
b6007c36
DM
467
468/* print the buffer with suitable escapes */
469
470STATIC void
15f169a1 471S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 472{
396482e1 473 SV* const tmp = newSVpvs("");
7918f24d
NC
474
475 PERL_ARGS_ASSERT_PRINTBUF;
476
b6007c36
DM
477 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
478 SvREFCNT_dec(tmp);
479}
480
8fa7f367
JH
481#endif
482
8290c323
NC
483static int
484S_deprecate_commaless_var_list(pTHX) {
485 PL_expect = XTERM;
486 deprecate("comma-less variable list");
487 return REPORT(','); /* grandfather non-comma-format format */
488}
489
ffb4593c
NT
490/*
491 * S_ao
492 *
c963b151
BD
493 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
494 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
495 */
496
76e3520e 497STATIC int
cea2e8a9 498S_ao(pTHX_ int toketype)
a0d0e21e 499{
97aff369 500 dVAR;
3280af22
NIS
501 if (*PL_bufptr == '=') {
502 PL_bufptr++;
a0d0e21e 503 if (toketype == ANDAND)
6154021b 504 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 505 else if (toketype == OROR)
6154021b 506 pl_yylval.ival = OP_ORASSIGN;
c963b151 507 else if (toketype == DORDOR)
6154021b 508 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
509 toketype = ASSIGNOP;
510 }
511 return toketype;
512}
513
ffb4593c
NT
514/*
515 * S_no_op
516 * When Perl expects an operator and finds something else, no_op
517 * prints the warning. It always prints "<something> found where
518 * operator expected. It prints "Missing semicolon on previous line?"
519 * if the surprise occurs at the start of the line. "do you need to
520 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
521 * where the compiler doesn't know if foo is a method call or a function.
522 * It prints "Missing operator before end of line" if there's nothing
523 * after the missing operator, or "... before <...>" if there is something
524 * after the missing operator.
525 */
526
76e3520e 527STATIC void
15f169a1 528S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 529{
97aff369 530 dVAR;
9d4ba2ae
AL
531 char * const oldbp = PL_bufptr;
532 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 533
7918f24d
NC
534 PERL_ARGS_ASSERT_NO_OP;
535
1189a94a
GS
536 if (!s)
537 s = oldbp;
07c798fb 538 else
1189a94a 539 PL_bufptr = s;
cea2e8a9 540 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
541 if (ckWARN_d(WARN_SYNTAX)) {
542 if (is_first)
543 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
544 "\t(Missing semicolon on previous line?)\n");
545 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 546 const char *t;
c35e046a
AL
547 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
548 NOOP;
56da5a46
RGS
549 if (t < PL_bufptr && isSPACE(*t))
550 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551 "\t(Do you need to predeclare %.*s?)\n",
551405c4 552 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
553 }
554 else {
555 assert(s >= oldbp);
556 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 557 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 558 }
07c798fb 559 }
3280af22 560 PL_bufptr = oldbp;
8990e307
LW
561}
562
ffb4593c
NT
563/*
564 * S_missingterm
565 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 566 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
567 * If we're in a delimited string and the delimiter is a control
568 * character, it's reformatted into a two-char sequence like ^C.
569 * This is fatal.
570 */
571
76e3520e 572STATIC void
cea2e8a9 573S_missingterm(pTHX_ char *s)
8990e307 574{
97aff369 575 dVAR;
8990e307
LW
576 char tmpbuf[3];
577 char q;
578 if (s) {
9d4ba2ae 579 char * const nl = strrchr(s,'\n');
d2719217 580 if (nl)
8990e307
LW
581 *nl = '\0';
582 }
463559e7 583 else if (isCNTRL(PL_multi_close)) {
8990e307 584 *tmpbuf = '^';
585ec06d 585 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
586 tmpbuf[2] = '\0';
587 s = tmpbuf;
588 }
589 else {
eb160463 590 *tmpbuf = (char)PL_multi_close;
8990e307
LW
591 tmpbuf[1] = '\0';
592 s = tmpbuf;
593 }
594 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 595 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 596}
79072805 597
dd0ac2b9
FC
598#include "feature.h"
599
0d863452 600/*
0d863452
RH
601 * Check whether the named feature is enabled.
602 */
26ea9e12 603bool
3fff3427 604Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 605{
97aff369 606 dVAR;
4a731d7b 607 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
608
609 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
610
611 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 612
26ea9e12
NC
613 if (namelen > MAX_FEATURE_LEN)
614 return FALSE;
3fff3427 615 memcpy(&he_name[8], name, namelen);
7d69d4a6 616
c8ca97b0
NC
617 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
618 REFCOUNTED_HE_EXISTS));
0d863452
RH
619}
620
ffb4593c 621/*
9cbb5ea2
GS
622 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
623 * utf16-to-utf8-reversed.
ffb4593c
NT
624 */
625
c39cd008
GS
626#ifdef PERL_CR_FILTER
627static void
628strip_return(SV *sv)
629{
95a20fc0 630 register const char *s = SvPVX_const(sv);
9d4ba2ae 631 register const char * const e = s + SvCUR(sv);
7918f24d
NC
632
633 PERL_ARGS_ASSERT_STRIP_RETURN;
634
c39cd008
GS
635 /* outer loop optimized to do nothing if there are no CR-LFs */
636 while (s < e) {
637 if (*s++ == '\r' && *s == '\n') {
638 /* hit a CR-LF, need to copy the rest */
639 register char *d = s - 1;
640 *d++ = *s++;
641 while (s < e) {
642 if (*s == '\r' && s[1] == '\n')
643 s++;
644 *d++ = *s++;
645 }
646 SvCUR(sv) -= s - d;
647 return;
648 }
649 }
650}
a868473f 651
76e3520e 652STATIC I32
c39cd008 653S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 654{
f54cb97a 655 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
656 if (count > 0 && !maxlen)
657 strip_return(sv);
658 return count;
a868473f
NIS
659}
660#endif
661
ffb4593c 662/*
8eaa0acf
Z
663=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
664
665Creates and initialises a new lexer/parser state object, supplying
666a context in which to lex and parse from a new source of Perl code.
667A pointer to the new state object is placed in L</PL_parser>. An entry
668is made on the save stack so that upon unwinding the new state object
669will be destroyed and the former value of L</PL_parser> will be restored.
670Nothing else need be done to clean up the parsing context.
671
672The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
673non-null, provides a string (in SV form) containing code to be parsed.
674A copy of the string is made, so subsequent modification of I<line>
675does not affect parsing. I<rsfp>, if non-null, provides an input stream
676from which code will be read to be parsed. If both are non-null, the
677code in I<line> comes first and must consist of complete lines of input,
678and I<rsfp> supplies the remainder of the source.
679
e368b3bd
FC
680The I<flags> parameter is reserved for future use. Currently it is only
681used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
682
683=cut
684*/
ffb4593c 685
27fcb6ee
FC
686/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
687 can share filters with the current parser. */
688
a0d0e21e 689void
8eaa0acf 690Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 691{
97aff369 692 dVAR;
6ef55633 693 const char *s = NULL;
5486870f 694 yy_parser *parser, *oparser;
60d63348 695 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 696 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
697
698 /* create and initialise a parser */
699
199e78b7 700 Newxz(parser, 1, yy_parser);
5486870f 701 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
702 PL_parser = parser;
703
28ac2b49
Z
704 parser->stack = NULL;
705 parser->ps = NULL;
706 parser->stack_size = 0;
acdf0a21 707
e3abe207
DM
708 /* on scope exit, free this parser and restore any outer one */
709 SAVEPARSER(parser);
7c4baf47 710 parser->saved_curcop = PL_curcop;
e3abe207 711
acdf0a21 712 /* initialise lexer state */
8990e307 713
fb205e7a
DM
714#ifdef PERL_MAD
715 parser->curforce = -1;
716#else
717 parser->nexttoke = 0;
718#endif
ca4cfd28 719 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 720 parser->copline = NOLINE;
5afb0a62 721 parser->lex_state = LEX_NORMAL;
c2598295 722 parser->expect = XSTATE;
2f9285f8 723 parser->rsfp = rsfp;
27fcb6ee
FC
724 parser->rsfp_filters =
725 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
726 ? NULL
727 : MUTABLE_AV(SvREFCNT_inc(
728 oparser->rsfp_filters
729 ? oparser->rsfp_filters
730 : (oparser->rsfp_filters = newAV())
731 ));
2f9285f8 732
199e78b7
DM
733 Newx(parser->lex_brackstack, 120, char);
734 Newx(parser->lex_casestack, 12, char);
735 *parser->lex_casestack = '\0';
02b34bbe 736
10efb74f 737 if (line) {
0528fd32 738 STRLEN len;
10efb74f 739 s = SvPV_const(line, len);
0abcdfa4
FC
740 parser->linestr = flags & LEX_START_COPIED
741 ? SvREFCNT_inc_simple_NN(line)
742 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 743 if (!len || s[len-1] != ';')
bdc0bf6f 744 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
745 } else {
746 parser->linestr = newSVpvs("\n;");
8990e307 747 }
f06b5848
DM
748 parser->oldoldbufptr =
749 parser->oldbufptr =
750 parser->bufptr =
751 parser->linestart = SvPVX(parser->linestr);
752 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
753 parser->last_lop = parser->last_uni = NULL;
60d63348 754 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES);
737c24fc 755
60d63348 756 parser->in_pod = parser->filtered = 0;
79072805 757}
a687059c 758
e3abe207
DM
759
760/* delete a parser object */
761
762void
763Perl_parser_free(pTHX_ const yy_parser *parser)
764{
7918f24d
NC
765 PERL_ARGS_ASSERT_PARSER_FREE;
766
7c4baf47 767 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
768 SvREFCNT_dec(parser->linestr);
769
2f9285f8
DM
770 if (parser->rsfp == PerlIO_stdin())
771 PerlIO_clearerr(parser->rsfp);
799361c3
SH
772 else if (parser->rsfp && (!parser->old_parser ||
773 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 774 PerlIO_close(parser->rsfp);
5486870f 775 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 776
e3abe207
DM
777 Safefree(parser->lex_brackstack);
778 Safefree(parser->lex_casestack);
779 PL_parser = parser->old_parser;
780 Safefree(parser);
781}
782
783
ffb4593c 784/*
f0e67a1d
Z
785=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
786
787Buffer scalar containing the chunk currently under consideration of the
788text currently being lexed. This is always a plain string scalar (for
789which C<SvPOK> is true). It is not intended to be used as a scalar by
790normal scalar means; instead refer to the buffer directly by the pointer
791variables described below.
792
793The lexer maintains various C<char*> pointers to things in the
794C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
795reallocated, all of these pointers must be updated. Don't attempt to
796do this manually, but rather use L</lex_grow_linestr> if you need to
797reallocate the buffer.
798
799The content of the text chunk in the buffer is commonly exactly one
800complete line of input, up to and including a newline terminator,
801but there are situations where it is otherwise. The octets of the
802buffer may be intended to be interpreted as either UTF-8 or Latin-1.
803The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
804flag on this scalar, which may disagree with it.
805
806For direct examination of the buffer, the variable
807L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
808lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
809of these pointers is usually preferable to examination of the scalar
810through normal scalar means.
811
812=for apidoc AmxU|char *|PL_parser-E<gt>bufend
813
814Direct pointer to the end of the chunk of text currently being lexed, the
815end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
816+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
817always located at the end of the buffer, and does not count as part of
818the buffer's contents.
819
820=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
821
822Points to the current position of lexing inside the lexer buffer.
823Characters around this point may be freely examined, within
824the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
825L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
826interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
827
828Lexing code (whether in the Perl core or not) moves this pointer past
829the characters that it consumes. It is also expected to perform some
830bookkeeping whenever a newline character is consumed. This movement
831can be more conveniently performed by the function L</lex_read_to>,
832which handles newlines appropriately.
833
834Interpretation of the buffer's octets can be abstracted out by
835using the slightly higher-level functions L</lex_peek_unichar> and
836L</lex_read_unichar>.
837
838=for apidoc AmxU|char *|PL_parser-E<gt>linestart
839
840Points to the start of the current line inside the lexer buffer.
841This is useful for indicating at which column an error occurred, and
842not much else. This must be updated by any lexing code that consumes
843a newline; the function L</lex_read_to> handles this detail.
844
845=cut
846*/
847
848/*
849=for apidoc Amx|bool|lex_bufutf8
850
851Indicates whether the octets in the lexer buffer
852(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
853of Unicode characters. If not, they should be interpreted as Latin-1
854characters. This is analogous to the C<SvUTF8> flag for scalars.
855
856In UTF-8 mode, it is not guaranteed that the lexer buffer actually
857contains valid UTF-8. Lexing code must be robust in the face of invalid
858encoding.
859
860The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
861is significant, but not the whole story regarding the input character
862encoding. Normally, when a file is being read, the scalar contains octets
863and its C<SvUTF8> flag is off, but the octets should be interpreted as
864UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
865however, the scalar may have the C<SvUTF8> flag on, and in this case its
866octets should be interpreted as UTF-8 unless the C<use bytes> pragma
867is in effect. This logic may change in the future; use this function
868instead of implementing the logic yourself.
869
870=cut
871*/
872
873bool
874Perl_lex_bufutf8(pTHX)
875{
876 return UTF;
877}
878
879/*
880=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
881
882Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
883at least I<len> octets (including terminating NUL). Returns a
884pointer to the reallocated buffer. This is necessary before making
885any direct modification of the buffer that would increase its length.
886L</lex_stuff_pvn> provides a more convenient way to insert text into
887the buffer.
888
889Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
890this function updates all of the lexer's variables that point directly
891into the buffer.
892
893=cut
894*/
895
896char *
897Perl_lex_grow_linestr(pTHX_ STRLEN len)
898{
899 SV *linestr;
900 char *buf;
901 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
902 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
903 linestr = PL_parser->linestr;
904 buf = SvPVX(linestr);
905 if (len <= SvLEN(linestr))
906 return buf;
907 bufend_pos = PL_parser->bufend - buf;
908 bufptr_pos = PL_parser->bufptr - buf;
909 oldbufptr_pos = PL_parser->oldbufptr - buf;
910 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
911 linestart_pos = PL_parser->linestart - buf;
912 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
913 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
914 buf = sv_grow(linestr, len);
915 PL_parser->bufend = buf + bufend_pos;
916 PL_parser->bufptr = buf + bufptr_pos;
917 PL_parser->oldbufptr = buf + oldbufptr_pos;
918 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
919 PL_parser->linestart = buf + linestart_pos;
920 if (PL_parser->last_uni)
921 PL_parser->last_uni = buf + last_uni_pos;
922 if (PL_parser->last_lop)
923 PL_parser->last_lop = buf + last_lop_pos;
924 return buf;
925}
926
927/*
83aa740e 928=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
929
930Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
931immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
932reallocating the buffer if necessary. This means that lexing code that
933runs later will see the characters as if they had appeared in the input.
934It is not recommended to do this as part of normal parsing, and most
935uses of this facility run the risk of the inserted characters being
936interpreted in an unintended manner.
937
938The string to be inserted is represented by I<len> octets starting
939at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
940according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
941The characters are recoded for the lexer buffer, according to how the
942buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 943to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
944function is more convenient.
945
946=cut
947*/
948
949void
83aa740e 950Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 951{
749123ff 952 dVAR;
f0e67a1d
Z
953 char *bufptr;
954 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
955 if (flags & ~(LEX_STUFF_UTF8))
956 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
957 if (UTF) {
958 if (flags & LEX_STUFF_UTF8) {
959 goto plain_copy;
960 } else {
961 STRLEN highhalf = 0;
83aa740e 962 const char *p, *e = pv+len;
f0e67a1d
Z
963 for (p = pv; p != e; p++)
964 highhalf += !!(((U8)*p) & 0x80);
965 if (!highhalf)
966 goto plain_copy;
967 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
968 bufptr = PL_parser->bufptr;
969 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
970 SvCUR_set(PL_parser->linestr,
971 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
972 PL_parser->bufend += len+highhalf;
973 for (p = pv; p != e; p++) {
974 U8 c = (U8)*p;
975 if (c & 0x80) {
976 *bufptr++ = (char)(0xc0 | (c >> 6));
977 *bufptr++ = (char)(0x80 | (c & 0x3f));
978 } else {
979 *bufptr++ = (char)c;
980 }
981 }
982 }
983 } else {
984 if (flags & LEX_STUFF_UTF8) {
985 STRLEN highhalf = 0;
83aa740e 986 const char *p, *e = pv+len;
f0e67a1d
Z
987 for (p = pv; p != e; p++) {
988 U8 c = (U8)*p;
989 if (c >= 0xc4) {
990 Perl_croak(aTHX_ "Lexing code attempted to stuff "
991 "non-Latin-1 character into Latin-1 input");
992 } else if (c >= 0xc2 && p+1 != e &&
993 (((U8)p[1]) & 0xc0) == 0x80) {
994 p++;
995 highhalf++;
996 } else if (c >= 0x80) {
997 /* malformed UTF-8 */
998 ENTER;
999 SAVESPTR(PL_warnhook);
1000 PL_warnhook = PERL_WARNHOOK_FATAL;
1001 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1002 LEAVE;
1003 }
1004 }
1005 if (!highhalf)
1006 goto plain_copy;
1007 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1008 bufptr = PL_parser->bufptr;
1009 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1010 SvCUR_set(PL_parser->linestr,
1011 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1012 PL_parser->bufend += len-highhalf;
1013 for (p = pv; p != e; p++) {
1014 U8 c = (U8)*p;
1015 if (c & 0x80) {
1016 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1017 p++;
1018 } else {
1019 *bufptr++ = (char)c;
1020 }
1021 }
1022 } else {
1023 plain_copy:
1024 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1025 bufptr = PL_parser->bufptr;
1026 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1027 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1028 PL_parser->bufend += len;
1029 Copy(pv, bufptr, len, char);
1030 }
1031 }
1032}
1033
1034/*
9dcc53ea
Z
1035=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1036
1037Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1038immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1039reallocating the buffer if necessary. This means that lexing code that
1040runs later will see the characters as if they had appeared in the input.
1041It is not recommended to do this as part of normal parsing, and most
1042uses of this facility run the risk of the inserted characters being
1043interpreted in an unintended manner.
1044
1045The string to be inserted is represented by octets starting at I<pv>
1046and continuing to the first nul. These octets are interpreted as either
1047UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1048in I<flags>. The characters are recoded for the lexer buffer, according
1049to how the buffer is currently being interpreted (L</lex_bufutf8>).
1050If it is not convenient to nul-terminate a string to be inserted, the
1051L</lex_stuff_pvn> function is more appropriate.
1052
1053=cut
1054*/
1055
1056void
1057Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1058{
1059 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1060 lex_stuff_pvn(pv, strlen(pv), flags);
1061}
1062
1063/*
f0e67a1d
Z
1064=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1065
1066Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1067immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1068reallocating the buffer if necessary. This means that lexing code that
1069runs later will see the characters as if they had appeared in the input.
1070It is not recommended to do this as part of normal parsing, and most
1071uses of this facility run the risk of the inserted characters being
1072interpreted in an unintended manner.
1073
1074The string to be inserted is the string value of I<sv>. The characters
1075are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1076being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1077not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1078need to construct a scalar.
1079
1080=cut
1081*/
1082
1083void
1084Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1085{
1086 char *pv;
1087 STRLEN len;
1088 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1089 if (flags)
1090 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1091 pv = SvPV(sv, len);
1092 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1093}
1094
1095/*
1096=for apidoc Amx|void|lex_unstuff|char *ptr
1097
1098Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1099I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1100This hides the discarded text from any lexing code that runs later,
1101as if the text had never appeared.
1102
1103This is not the normal way to consume lexed text. For that, use
1104L</lex_read_to>.
1105
1106=cut
1107*/
1108
1109void
1110Perl_lex_unstuff(pTHX_ char *ptr)
1111{
1112 char *buf, *bufend;
1113 STRLEN unstuff_len;
1114 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1115 buf = PL_parser->bufptr;
1116 if (ptr < buf)
1117 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1118 if (ptr == buf)
1119 return;
1120 bufend = PL_parser->bufend;
1121 if (ptr > bufend)
1122 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1123 unstuff_len = ptr - buf;
1124 Move(ptr, buf, bufend+1-ptr, char);
1125 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1126 PL_parser->bufend = bufend - unstuff_len;
1127}
1128
1129/*
1130=for apidoc Amx|void|lex_read_to|char *ptr
1131
1132Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1133to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1134performing the correct bookkeeping whenever a newline character is passed.
1135This is the normal way to consume lexed text.
1136
1137Interpretation of the buffer's octets can be abstracted out by
1138using the slightly higher-level functions L</lex_peek_unichar> and
1139L</lex_read_unichar>.
1140
1141=cut
1142*/
1143
1144void
1145Perl_lex_read_to(pTHX_ char *ptr)
1146{
1147 char *s;
1148 PERL_ARGS_ASSERT_LEX_READ_TO;
1149 s = PL_parser->bufptr;
1150 if (ptr < s || ptr > PL_parser->bufend)
1151 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1152 for (; s != ptr; s++)
1153 if (*s == '\n') {
1154 CopLINE_inc(PL_curcop);
1155 PL_parser->linestart = s+1;
1156 }
1157 PL_parser->bufptr = ptr;
1158}
1159
1160/*
1161=for apidoc Amx|void|lex_discard_to|char *ptr
1162
1163Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1164up to I<ptr>. The remaining content of the buffer will be moved, and
1165all pointers into the buffer updated appropriately. I<ptr> must not
1166be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1167it is not permitted to discard text that has yet to be lexed.
1168
1169Normally it is not necessarily to do this directly, because it suffices to
1170use the implicit discarding behaviour of L</lex_next_chunk> and things
1171based on it. However, if a token stretches across multiple lines,
1f317c95 1172and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1173that purpose, then after completion of the token it would be wise to
1174explicitly discard the now-unneeded earlier lines, to avoid future
1175multi-line tokens growing the buffer without bound.
1176
1177=cut
1178*/
1179
1180void
1181Perl_lex_discard_to(pTHX_ char *ptr)
1182{
1183 char *buf;
1184 STRLEN discard_len;
1185 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1186 buf = SvPVX(PL_parser->linestr);
1187 if (ptr < buf)
1188 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1189 if (ptr == buf)
1190 return;
1191 if (ptr > PL_parser->bufptr)
1192 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1193 discard_len = ptr - buf;
1194 if (PL_parser->oldbufptr < ptr)
1195 PL_parser->oldbufptr = ptr;
1196 if (PL_parser->oldoldbufptr < ptr)
1197 PL_parser->oldoldbufptr = ptr;
1198 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1199 PL_parser->last_uni = NULL;
1200 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1201 PL_parser->last_lop = NULL;
1202 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1203 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1204 PL_parser->bufend -= discard_len;
1205 PL_parser->bufptr -= discard_len;
1206 PL_parser->oldbufptr -= discard_len;
1207 PL_parser->oldoldbufptr -= discard_len;
1208 if (PL_parser->last_uni)
1209 PL_parser->last_uni -= discard_len;
1210 if (PL_parser->last_lop)
1211 PL_parser->last_lop -= discard_len;
1212}
1213
1214/*
1215=for apidoc Amx|bool|lex_next_chunk|U32 flags
1216
1217Reads in the next chunk of text to be lexed, appending it to
1218L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1219looked to the end of the current chunk and wants to know more. It is
1220usual, but not necessary, for lexing to have consumed the entirety of
1221the current chunk at this time.
1222
1223If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1224chunk (i.e., the current chunk has been entirely consumed), normally the
1225current chunk will be discarded at the same time that the new chunk is
1226read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1227will not be discarded. If the current chunk has not been entirely
1228consumed, then it will not be discarded regardless of the flag.
1229
1230Returns true if some new text was added to the buffer, or false if the
1231buffer has reached the end of the input text.
1232
1233=cut
1234*/
1235
1236#define LEX_FAKE_EOF 0x80000000
1237
1238bool
1239Perl_lex_next_chunk(pTHX_ U32 flags)
1240{
1241 SV *linestr;
1242 char *buf;
1243 STRLEN old_bufend_pos, new_bufend_pos;
1244 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1245 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1246 bool got_some_for_debugger = 0;
f0e67a1d
Z
1247 bool got_some;
1248 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1249 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1250 linestr = PL_parser->linestr;
1251 buf = SvPVX(linestr);
1252 if (!(flags & LEX_KEEP_PREVIOUS) &&
1253 PL_parser->bufptr == PL_parser->bufend) {
1254 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1255 linestart_pos = 0;
1256 if (PL_parser->last_uni != PL_parser->bufend)
1257 PL_parser->last_uni = NULL;
1258 if (PL_parser->last_lop != PL_parser->bufend)
1259 PL_parser->last_lop = NULL;
1260 last_uni_pos = last_lop_pos = 0;
1261 *buf = 0;
1262 SvCUR(linestr) = 0;
1263 } else {
1264 old_bufend_pos = PL_parser->bufend - buf;
1265 bufptr_pos = PL_parser->bufptr - buf;
1266 oldbufptr_pos = PL_parser->oldbufptr - buf;
1267 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1268 linestart_pos = PL_parser->linestart - buf;
1269 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1270 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1271 }
1272 if (flags & LEX_FAKE_EOF) {
1273 goto eof;
60d63348 1274 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1275 got_some = 0;
1276 } else if (filter_gets(linestr, old_bufend_pos)) {
1277 got_some = 1;
17cc9359 1278 got_some_for_debugger = 1;
f0e67a1d 1279 } else {
580561a3
Z
1280 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1281 sv_setpvs(linestr, "");
f0e67a1d
Z
1282 eof:
1283 /* End of real input. Close filehandle (unless it was STDIN),
1284 * then add implicit termination.
1285 */
1286 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1287 PerlIO_clearerr(PL_parser->rsfp);
1288 else if (PL_parser->rsfp)
1289 (void)PerlIO_close(PL_parser->rsfp);
1290 PL_parser->rsfp = NULL;
60d63348 1291 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1292#ifdef PERL_MAD
1293 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1294 PL_faketokens = 1;
1295#endif
1296 if (!PL_in_eval && PL_minus_p) {
1297 sv_catpvs(linestr,
1298 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1299 PL_minus_n = PL_minus_p = 0;
1300 } else if (!PL_in_eval && PL_minus_n) {
1301 sv_catpvs(linestr, /*{*/";}");
1302 PL_minus_n = 0;
1303 } else
1304 sv_catpvs(linestr, ";");
1305 got_some = 1;
1306 }
1307 buf = SvPVX(linestr);
1308 new_bufend_pos = SvCUR(linestr);
1309 PL_parser->bufend = buf + new_bufend_pos;
1310 PL_parser->bufptr = buf + bufptr_pos;
1311 PL_parser->oldbufptr = buf + oldbufptr_pos;
1312 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1313 PL_parser->linestart = buf + linestart_pos;
1314 if (PL_parser->last_uni)
1315 PL_parser->last_uni = buf + last_uni_pos;
1316 if (PL_parser->last_lop)
1317 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1318 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1319 PL_curstash != PL_debstash) {
1320 /* debugger active and we're not compiling the debugger code,
1321 * so store the line into the debugger's array of lines
1322 */
1323 update_debugger_info(NULL, buf+old_bufend_pos,
1324 new_bufend_pos-old_bufend_pos);
1325 }
1326 return got_some;
1327}
1328
1329/*
1330=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1331
1332Looks ahead one (Unicode) character in the text currently being lexed.
1333Returns the codepoint (unsigned integer value) of the next character,
1334or -1 if lexing has reached the end of the input text. To consume the
1335peeked character, use L</lex_read_unichar>.
1336
1337If the next character is in (or extends into) the next chunk of input
1338text, the next chunk will be read in. Normally the current chunk will be
1339discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1340then the current chunk will not be discarded.
1341
1342If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1343is encountered, an exception is generated.
1344
1345=cut
1346*/
1347
1348I32
1349Perl_lex_peek_unichar(pTHX_ U32 flags)
1350{
749123ff 1351 dVAR;
f0e67a1d
Z
1352 char *s, *bufend;
1353 if (flags & ~(LEX_KEEP_PREVIOUS))
1354 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1355 s = PL_parser->bufptr;
1356 bufend = PL_parser->bufend;
1357 if (UTF) {
1358 U8 head;
1359 I32 unichar;
1360 STRLEN len, retlen;
1361 if (s == bufend) {
1362 if (!lex_next_chunk(flags))
1363 return -1;
1364 s = PL_parser->bufptr;
1365 bufend = PL_parser->bufend;
1366 }
1367 head = (U8)*s;
1368 if (!(head & 0x80))
1369 return head;
1370 if (head & 0x40) {
1371 len = PL_utf8skip[head];
1372 while ((STRLEN)(bufend-s) < len) {
1373 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1374 break;
1375 s = PL_parser->bufptr;
1376 bufend = PL_parser->bufend;
1377 }
1378 }
1379 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1380 if (retlen == (STRLEN)-1) {
1381 /* malformed UTF-8 */
1382 ENTER;
1383 SAVESPTR(PL_warnhook);
1384 PL_warnhook = PERL_WARNHOOK_FATAL;
1385 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1386 LEAVE;
1387 }
1388 return unichar;
1389 } else {
1390 if (s == bufend) {
1391 if (!lex_next_chunk(flags))
1392 return -1;
1393 s = PL_parser->bufptr;
1394 }
1395 return (U8)*s;
1396 }
1397}
1398
1399/*
1400=for apidoc Amx|I32|lex_read_unichar|U32 flags
1401
1402Reads the next (Unicode) character in the text currently being lexed.
1403Returns the codepoint (unsigned integer value) of the character read,
1404and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1405if lexing has reached the end of the input text. To non-destructively
1406examine the next character, use L</lex_peek_unichar> instead.
1407
1408If the next character is in (or extends into) the next chunk of input
1409text, the next chunk will be read in. Normally the current chunk will be
1410discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1411then the current chunk will not be discarded.
1412
1413If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1414is encountered, an exception is generated.
1415
1416=cut
1417*/
1418
1419I32
1420Perl_lex_read_unichar(pTHX_ U32 flags)
1421{
1422 I32 c;
1423 if (flags & ~(LEX_KEEP_PREVIOUS))
1424 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1425 c = lex_peek_unichar(flags);
1426 if (c != -1) {
1427 if (c == '\n')
1428 CopLINE_inc(PL_curcop);
d9018cbe
EB
1429 if (UTF)
1430 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1431 else
1432 ++(PL_parser->bufptr);
f0e67a1d
Z
1433 }
1434 return c;
1435}
1436
1437/*
1438=for apidoc Amx|void|lex_read_space|U32 flags
1439
1440Reads optional spaces, in Perl style, in the text currently being
1441lexed. The spaces may include ordinary whitespace characters and
1442Perl-style comments. C<#line> directives are processed if encountered.
1443L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1444at a non-space character (or the end of the input text).
1445
1446If spaces extend into the next chunk of input text, the next chunk will
1447be read in. Normally the current chunk will be discarded at the same
1448time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1449chunk will not be discarded.
1450
1451=cut
1452*/
1453
f0998909
Z
1454#define LEX_NO_NEXT_CHUNK 0x80000000
1455
f0e67a1d
Z
1456void
1457Perl_lex_read_space(pTHX_ U32 flags)
1458{
1459 char *s, *bufend;
1460 bool need_incline = 0;
f0998909 1461 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1462 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1463#ifdef PERL_MAD
1464 if (PL_skipwhite) {
1465 sv_free(PL_skipwhite);
1466 PL_skipwhite = NULL;
1467 }
1468 if (PL_madskills)
1469 PL_skipwhite = newSVpvs("");
1470#endif /* PERL_MAD */
1471 s = PL_parser->bufptr;
1472 bufend = PL_parser->bufend;
1473 while (1) {
1474 char c = *s;
1475 if (c == '#') {
1476 do {
1477 c = *++s;
1478 } while (!(c == '\n' || (c == 0 && s == bufend)));
1479 } else if (c == '\n') {
1480 s++;
1481 PL_parser->linestart = s;
1482 if (s == bufend)
1483 need_incline = 1;
1484 else
1485 incline(s);
1486 } else if (isSPACE(c)) {
1487 s++;
1488 } else if (c == 0 && s == bufend) {
1489 bool got_more;
1490#ifdef PERL_MAD
1491 if (PL_madskills)
1492 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1493#endif /* PERL_MAD */
f0998909
Z
1494 if (flags & LEX_NO_NEXT_CHUNK)
1495 break;
f0e67a1d
Z
1496 PL_parser->bufptr = s;
1497 CopLINE_inc(PL_curcop);
1498 got_more = lex_next_chunk(flags);
1499 CopLINE_dec(PL_curcop);
1500 s = PL_parser->bufptr;
1501 bufend = PL_parser->bufend;
1502 if (!got_more)
1503 break;
1504 if (need_incline && PL_parser->rsfp) {
1505 incline(s);
1506 need_incline = 0;
1507 }
1508 } else {
1509 break;
1510 }
1511 }
1512#ifdef PERL_MAD
1513 if (PL_madskills)
1514 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1515#endif /* PERL_MAD */
1516 PL_parser->bufptr = s;
1517}
1518
1519/*
ffb4593c
NT
1520 * S_incline
1521 * This subroutine has nothing to do with tilting, whether at windmills
1522 * or pinball tables. Its name is short for "increment line". It
57843af0 1523 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1524 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1525 * # line 500 "foo.pm"
1526 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1527 */
1528
76e3520e 1529STATIC void
d9095cec 1530S_incline(pTHX_ const char *s)
463ee0b2 1531{
97aff369 1532 dVAR;
d9095cec
NC
1533 const char *t;
1534 const char *n;
1535 const char *e;
8818d409 1536 line_t line_num;
463ee0b2 1537
7918f24d
NC
1538 PERL_ARGS_ASSERT_INCLINE;
1539
57843af0 1540 CopLINE_inc(PL_curcop);
463ee0b2
LW
1541 if (*s++ != '#')
1542 return;
d4c19fe8
AL
1543 while (SPACE_OR_TAB(*s))
1544 s++;
73659bf1
GS
1545 if (strnEQ(s, "line", 4))
1546 s += 4;
1547 else
1548 return;
084592ab 1549 if (SPACE_OR_TAB(*s))
73659bf1 1550 s++;
4e553d73 1551 else
73659bf1 1552 return;
d4c19fe8
AL
1553 while (SPACE_OR_TAB(*s))
1554 s++;
463ee0b2
LW
1555 if (!isDIGIT(*s))
1556 return;
d4c19fe8 1557
463ee0b2
LW
1558 n = s;
1559 while (isDIGIT(*s))
1560 s++;
07714eb4 1561 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1562 return;
bf4acbe4 1563 while (SPACE_OR_TAB(*s))
463ee0b2 1564 s++;
73659bf1 1565 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1566 s++;
73659bf1
GS
1567 e = t + 1;
1568 }
463ee0b2 1569 else {
c35e046a
AL
1570 t = s;
1571 while (!isSPACE(*t))
1572 t++;
73659bf1 1573 e = t;
463ee0b2 1574 }
bf4acbe4 1575 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1576 e++;
1577 if (*e != '\n' && *e != '\0')
1578 return; /* false alarm */
1579
8818d409
FC
1580 line_num = atoi(n)-1;
1581
f4dd75d9 1582 if (t - s > 0) {
d9095cec 1583 const STRLEN len = t - s;
19bad673
NC
1584 SV *const temp_sv = CopFILESV(PL_curcop);
1585 const char *cf;
1586 STRLEN tmplen;
1587
1588 if (temp_sv) {
1589 cf = SvPVX(temp_sv);
1590 tmplen = SvCUR(temp_sv);
1591 } else {
1592 cf = NULL;
1593 tmplen = 0;
1594 }
1595
d1299d44 1596 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1597 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1598 * to *{"::_<newfilename"} */
44867030
NC
1599 /* However, the long form of evals is only turned on by the
1600 debugger - usually they're "(eval %lu)" */
1601 char smallbuf[128];
1602 char *tmpbuf;
1603 GV **gvp;
d9095cec 1604 STRLEN tmplen2 = len;
798b63bc 1605 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1606 tmpbuf = smallbuf;
1607 else
2ae0db35 1608 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1609 tmpbuf[0] = '_';
1610 tmpbuf[1] = '<';
2ae0db35 1611 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1612 tmplen += 2;
8a5ee598
RGS
1613 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1614 if (gvp) {
44867030
NC
1615 char *tmpbuf2;
1616 GV *gv2;
1617
1618 if (tmplen2 + 2 <= sizeof smallbuf)
1619 tmpbuf2 = smallbuf;
1620 else
1621 Newx(tmpbuf2, tmplen2 + 2, char);
1622
1623 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1624 /* Either they malloc'd it, or we malloc'd it,
1625 so no prefix is present in ours. */
1626 tmpbuf2[0] = '_';
1627 tmpbuf2[1] = '<';
1628 }
1629
1630 memcpy(tmpbuf2 + 2, s, tmplen2);
1631 tmplen2 += 2;
1632
8a5ee598 1633 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1634 if (!isGV(gv2)) {
8a5ee598 1635 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1636 /* adjust ${"::_<newfilename"} to store the new file name */
1637 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1638 /* The line number may differ. If that is the case,
1639 alias the saved lines that are in the array.
1640 Otherwise alias the whole array. */
1641 if (CopLINE(PL_curcop) == line_num) {
1642 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1643 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1644 }
1645 else if (GvAV(*gvp)) {
1646 AV * const av = GvAV(*gvp);
1647 const I32 start = CopLINE(PL_curcop)+1;
1648 I32 items = AvFILLp(av) - start;
1649 if (items > 0) {
1650 AV * const av2 = GvAVn(gv2);
1651 SV **svp = AvARRAY(av) + start;
1652 I32 l = (I32)line_num+1;
1653 while (items--)
1654 av_store(av2, l++, SvREFCNT_inc(*svp++));
1655 }
1656 }
e5527e4b 1657 }
44867030
NC
1658
1659 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1660 }
e66cf94c 1661 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1662 }
05ec9bb3 1663 CopFILE_free(PL_curcop);
d9095cec 1664 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1665 }
8818d409 1666 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1667}
1668
29595ff2 1669#ifdef PERL_MAD
cd81e915 1670/* skip space before PL_thistoken */
29595ff2
NC
1671
1672STATIC char *
1673S_skipspace0(pTHX_ register char *s)
1674{
7918f24d
NC
1675 PERL_ARGS_ASSERT_SKIPSPACE0;
1676
29595ff2
NC
1677 s = skipspace(s);
1678 if (!PL_madskills)
1679 return s;
cd81e915
NC
1680 if (PL_skipwhite) {
1681 if (!PL_thiswhite)
6b29d1f5 1682 PL_thiswhite = newSVpvs("");
cd81e915
NC
1683 sv_catsv(PL_thiswhite, PL_skipwhite);
1684 sv_free(PL_skipwhite);
1685 PL_skipwhite = 0;
1686 }
1687 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1688 return s;
1689}
1690
cd81e915 1691/* skip space after PL_thistoken */
29595ff2
NC
1692
1693STATIC char *
1694S_skipspace1(pTHX_ register char *s)
1695{
d4c19fe8 1696 const char *start = s;
29595ff2
NC
1697 I32 startoff = start - SvPVX(PL_linestr);
1698
7918f24d
NC
1699 PERL_ARGS_ASSERT_SKIPSPACE1;
1700
29595ff2
NC
1701 s = skipspace(s);
1702 if (!PL_madskills)
1703 return s;
1704 start = SvPVX(PL_linestr) + startoff;
cd81e915 1705 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1706 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1707 PL_thistoken = newSVpvn(tstart, start - tstart);
1708 }
1709 PL_realtokenstart = -1;
1710 if (PL_skipwhite) {
1711 if (!PL_nextwhite)
6b29d1f5 1712 PL_nextwhite = newSVpvs("");
cd81e915
NC
1713 sv_catsv(PL_nextwhite, PL_skipwhite);
1714 sv_free(PL_skipwhite);
1715 PL_skipwhite = 0;
29595ff2
NC
1716 }
1717 return s;
1718}
1719
1720STATIC char *
1721S_skipspace2(pTHX_ register char *s, SV **svp)
1722{
c35e046a
AL
1723 char *start;
1724 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1725 const I32 startoff = s - SvPVX(PL_linestr);
1726
7918f24d
NC
1727 PERL_ARGS_ASSERT_SKIPSPACE2;
1728
29595ff2
NC
1729 s = skipspace(s);
1730 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1731 if (!PL_madskills || !svp)
1732 return s;
1733 start = SvPVX(PL_linestr) + startoff;
cd81e915 1734 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1735 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1736 PL_thistoken = newSVpvn(tstart, start - tstart);
1737 PL_realtokenstart = -1;
29595ff2 1738 }
cd81e915 1739 if (PL_skipwhite) {
29595ff2 1740 if (!*svp)
6b29d1f5 1741 *svp = newSVpvs("");
cd81e915
NC
1742 sv_setsv(*svp, PL_skipwhite);
1743 sv_free(PL_skipwhite);
1744 PL_skipwhite = 0;
29595ff2
NC
1745 }
1746
1747 return s;
1748}
1749#endif
1750
80a702cd 1751STATIC void
15f169a1 1752S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1753{
1754 AV *av = CopFILEAVx(PL_curcop);
1755 if (av) {
b9f83d2f 1756 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1757 if (orig_sv)
1758 sv_setsv(sv, orig_sv);
1759 else
1760 sv_setpvn(sv, buf, len);
80a702cd
RGS
1761 (void)SvIOK_on(sv);
1762 SvIV_set(sv, 0);
1763 av_store(av, (I32)CopLINE(PL_curcop), sv);
1764 }
1765}
1766
ffb4593c
NT
1767/*
1768 * S_skipspace
1769 * Called to gobble the appropriate amount and type of whitespace.
1770 * Skips comments as well.
1771 */
1772
76e3520e 1773STATIC char *
cea2e8a9 1774S_skipspace(pTHX_ register char *s)
a687059c 1775{
5db06880 1776#ifdef PERL_MAD
f0e67a1d
Z
1777 char *start = s;
1778#endif /* PERL_MAD */
7918f24d 1779 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1780#ifdef PERL_MAD
cd81e915
NC
1781 if (PL_skipwhite) {
1782 sv_free(PL_skipwhite);
f0e67a1d 1783 PL_skipwhite = NULL;
5db06880 1784 }
f0e67a1d 1785#endif /* PERL_MAD */
3280af22 1786 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1787 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1788 s++;
f0e67a1d
Z
1789 } else {
1790 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1791 PL_bufptr = s;
f0998909
Z
1792 lex_read_space(LEX_KEEP_PREVIOUS |
1793 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1794 LEX_NO_NEXT_CHUNK : 0));
3280af22 1795 s = PL_bufptr;
f0e67a1d
Z
1796 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1797 if (PL_linestart > PL_bufptr)
1798 PL_bufptr = PL_linestart;
1799 return s;
463ee0b2 1800 }
5db06880 1801#ifdef PERL_MAD
f0e67a1d
Z
1802 if (PL_madskills)
1803 PL_skipwhite = newSVpvn(start, s-start);
1804#endif /* PERL_MAD */
5db06880 1805 return s;
a687059c 1806}
378cc40b 1807
ffb4593c
NT
1808/*
1809 * S_check_uni
1810 * Check the unary operators to ensure there's no ambiguity in how they're
1811 * used. An ambiguous piece of code would be:
1812 * rand + 5
1813 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1814 * the +5 is its argument.
1815 */
1816
76e3520e 1817STATIC void
cea2e8a9 1818S_check_uni(pTHX)
ba106d47 1819{
97aff369 1820 dVAR;
d4c19fe8
AL
1821 const char *s;
1822 const char *t;
2f3197b3 1823
3280af22 1824 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1825 return;
3280af22
NIS
1826 while (isSPACE(*PL_last_uni))
1827 PL_last_uni++;
c35e046a
AL
1828 s = PL_last_uni;
1829 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1830 s++;
3280af22 1831 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1832 return;
6136c704 1833
9b387841
NC
1834 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1835 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1836 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1837}
1838
ffb4593c
NT
1839/*
1840 * LOP : macro to build a list operator. Its behaviour has been replaced
1841 * with a subroutine, S_lop() for which LOP is just another name.
1842 */
1843
a0d0e21e
LW
1844#define LOP(f,x) return lop(f,x,s)
1845
ffb4593c
NT
1846/*
1847 * S_lop
1848 * Build a list operator (or something that might be one). The rules:
1849 * - if we have a next token, then it's a list operator [why?]
1850 * - if the next thing is an opening paren, then it's a function
1851 * - else it's a list operator
1852 */
1853
76e3520e 1854STATIC I32
a0be28da 1855S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1856{
97aff369 1857 dVAR;
7918f24d
NC
1858
1859 PERL_ARGS_ASSERT_LOP;
1860
6154021b 1861 pl_yylval.ival = f;
35c8bce7 1862 CLINE;
3280af22
NIS
1863 PL_expect = x;
1864 PL_bufptr = s;
1865 PL_last_lop = PL_oldbufptr;
eb160463 1866 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1867#ifdef PERL_MAD
1868 if (PL_lasttoke)
78cdf107 1869 goto lstop;
5db06880 1870#else
3280af22 1871 if (PL_nexttoke)
78cdf107 1872 goto lstop;
5db06880 1873#endif
79072805 1874 if (*s == '(')
bbf60fe6 1875 return REPORT(FUNC);
29595ff2 1876 s = PEEKSPACE(s);
79072805 1877 if (*s == '(')
bbf60fe6 1878 return REPORT(FUNC);
78cdf107
Z
1879 else {
1880 lstop:
1881 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1882 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1883 return REPORT(LSTOP);
78cdf107 1884 }
79072805
LW
1885}
1886
5db06880
NC
1887#ifdef PERL_MAD
1888 /*
1889 * S_start_force
1890 * Sets up for an eventual force_next(). start_force(0) basically does
1891 * an unshift, while start_force(-1) does a push. yylex removes items
1892 * on the "pop" end.
1893 */
1894
1895STATIC void
1896S_start_force(pTHX_ int where)
1897{
1898 int i;
1899
cd81e915 1900 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1901 where = PL_lasttoke;
cd81e915
NC
1902 assert(PL_curforce < 0 || PL_curforce == where);
1903 if (PL_curforce != where) {
5db06880
NC
1904 for (i = PL_lasttoke; i > where; --i) {
1905 PL_nexttoke[i] = PL_nexttoke[i-1];
1906 }
1907 PL_lasttoke++;
1908 }
cd81e915 1909 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1910 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1911 PL_curforce = where;
1912 if (PL_nextwhite) {
5db06880 1913 if (PL_madskills)
6b29d1f5 1914 curmad('^', newSVpvs(""));
cd81e915 1915 CURMAD('_', PL_nextwhite);
5db06880
NC
1916 }
1917}
1918
1919STATIC void
1920S_curmad(pTHX_ char slot, SV *sv)
1921{
1922 MADPROP **where;
1923
1924 if (!sv)
1925 return;
cd81e915
NC
1926 if (PL_curforce < 0)
1927 where = &PL_thismad;
5db06880 1928 else
cd81e915 1929 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1930
cd81e915 1931 if (PL_faketokens)
76f68e9b 1932 sv_setpvs(sv, "");
5db06880
NC
1933 else {
1934 if (!IN_BYTES) {
1935 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1936 SvUTF8_on(sv);
1937 else if (PL_encoding) {
1938 sv_recode_to_utf8(sv, PL_encoding);
1939 }
1940 }
1941 }
1942
1943 /* keep a slot open for the head of the list? */
1944 if (slot != '_' && *where && (*where)->mad_key == '^') {
1945 (*where)->mad_key = slot;
daba3364 1946 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1947 (*where)->mad_val = (void*)sv;
1948 }
1949 else
1950 addmad(newMADsv(slot, sv), where, 0);
1951}
1952#else
b3f24c00
MHM
1953# define start_force(where) NOOP
1954# define curmad(slot, sv) NOOP
5db06880
NC
1955#endif
1956
ffb4593c
NT
1957/*
1958 * S_force_next
9cbb5ea2 1959 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1960 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1961 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1962 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1963 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1964 */
1965
4e553d73 1966STATIC void
cea2e8a9 1967S_force_next(pTHX_ I32 type)
79072805 1968{
97aff369 1969 dVAR;
704d4215
GG
1970#ifdef DEBUGGING
1971 if (DEBUG_T_TEST) {
1972 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1973 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1974 }
1975#endif
5db06880 1976#ifdef PERL_MAD
cd81e915 1977 if (PL_curforce < 0)
5db06880 1978 start_force(PL_lasttoke);
cd81e915 1979 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1980 if (PL_lex_state != LEX_KNOWNEXT)
1981 PL_lex_defer = PL_lex_state;
1982 PL_lex_state = LEX_KNOWNEXT;
1983 PL_lex_expect = PL_expect;
cd81e915 1984 PL_curforce = -1;
5db06880 1985#else
3280af22
NIS
1986 PL_nexttype[PL_nexttoke] = type;
1987 PL_nexttoke++;
1988 if (PL_lex_state != LEX_KNOWNEXT) {
1989 PL_lex_defer = PL_lex_state;
1990 PL_lex_expect = PL_expect;
1991 PL_lex_state = LEX_KNOWNEXT;
79072805 1992 }
5db06880 1993#endif
79072805
LW
1994}
1995
28ac2b49
Z
1996void
1997Perl_yyunlex(pTHX)
1998{
a7aaec61
Z
1999 int yyc = PL_parser->yychar;
2000 if (yyc != YYEMPTY) {
2001 if (yyc) {
2002 start_force(-1);
2003 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2004 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2005 PL_lex_allbrackets--;
a7aaec61 2006 PL_lex_brackets--;
78cdf107
Z
2007 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2008 } else if (yyc == '('/*)*/) {
2009 PL_lex_allbrackets--;
2010 yyc |= (2<<24);
a7aaec61
Z
2011 }
2012 force_next(yyc);
2013 }
28ac2b49
Z
2014 PL_parser->yychar = YYEMPTY;
2015 }
2016}
2017
d0a148a6 2018STATIC SV *
15f169a1 2019S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2020{
97aff369 2021 dVAR;
740cce10 2022 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2023 !IN_BYTES
2024 && UTF
2025 && !is_ascii_string((const U8*)start, len)
740cce10 2026 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2027 return sv;
2028}
2029
ffb4593c
NT
2030/*
2031 * S_force_word
2032 * When the lexer knows the next thing is a word (for instance, it has
2033 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2034 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2035 * lookahead.
ffb4593c
NT
2036 *
2037 * Arguments:
b1b65b59 2038 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2039 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2040 * int check_keyword : if true, Perl checks to make sure the word isn't
2041 * a keyword (do this if the word is a label, e.g. goto FOO)
2042 * int allow_pack : if true, : characters will also be allowed (require,
2043 * use, etc. do this)
9cbb5ea2 2044 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2045 */
2046
76e3520e 2047STATIC char *
cea2e8a9 2048S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2049{
97aff369 2050 dVAR;
463ee0b2
LW
2051 register char *s;
2052 STRLEN len;
4e553d73 2053
7918f24d
NC
2054 PERL_ARGS_ASSERT_FORCE_WORD;
2055
29595ff2 2056 start = SKIPSPACE1(start);
463ee0b2 2057 s = start;
7e2040f0 2058 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2059 (allow_pack && *s == ':') ||
15f0808c 2060 (allow_initial_tick && *s == '\'') )
a0d0e21e 2061 {
3280af22 2062 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2063 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2064 return start;
cd81e915 2065 start_force(PL_curforce);
5db06880
NC
2066 if (PL_madskills)
2067 curmad('X', newSVpvn(start,s-start));
463ee0b2 2068 if (token == METHOD) {
29595ff2 2069 s = SKIPSPACE1(s);
463ee0b2 2070 if (*s == '(')
3280af22 2071 PL_expect = XTERM;
463ee0b2 2072 else {
3280af22 2073 PL_expect = XOPERATOR;
463ee0b2 2074 }
79072805 2075 }
e74e6b3d 2076 if (PL_madskills)
63575281 2077 curmad('g', newSVpvs( "forced" ));
9ded7720 2078 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2079 = (OP*)newSVOP(OP_CONST,0,
2080 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2081 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2082 force_next(token);
2083 }
2084 return s;
2085}
2086
ffb4593c
NT
2087/*
2088 * S_force_ident
9cbb5ea2 2089 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2090 * text only contains the "foo" portion. The first argument is a pointer
2091 * to the "foo", and the second argument is the type symbol to prefix.
2092 * Forces the next token to be a "WORD".
9cbb5ea2 2093 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2094 */
2095
76e3520e 2096STATIC void
bfed75c6 2097S_force_ident(pTHX_ register const char *s, int kind)
79072805 2098{
97aff369 2099 dVAR;
7918f24d
NC
2100
2101 PERL_ARGS_ASSERT_FORCE_IDENT;
2102
c35e046a 2103 if (*s) {
90e5519e 2104 const STRLEN len = strlen(s);
728847b1
BF
2105 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2106 UTF ? SVf_UTF8 : 0));
cd81e915 2107 start_force(PL_curforce);
9ded7720 2108 NEXTVAL_NEXTTOKE.opval = o;
79072805 2109 force_next(WORD);
748a9306 2110 if (kind) {
11343788 2111 o->op_private = OPpCONST_ENTERED;
55497cff 2112 /* XXX see note in pp_entereval() for why we forgo typo
2113 warnings if the symbol must be introduced in an eval.
2114 GSAR 96-10-12 */
90e5519e 2115 gv_fetchpvn_flags(s, len,
728847b1
BF
2116 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2117 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2118 kind == '$' ? SVt_PV :
2119 kind == '@' ? SVt_PVAV :
2120 kind == '%' ? SVt_PVHV :
a0d0e21e 2121 SVt_PVGV
90e5519e 2122 );
748a9306 2123 }
79072805
LW
2124 }
2125}
2126
1571675a
GS
2127NV
2128Perl_str_to_version(pTHX_ SV *sv)
2129{
2130 NV retval = 0.0;
2131 NV nshift = 1.0;
2132 STRLEN len;
cfd0369c 2133 const char *start = SvPV_const(sv,len);
9d4ba2ae 2134 const char * const end = start + len;
504618e9 2135 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2136
2137 PERL_ARGS_ASSERT_STR_TO_VERSION;
2138
1571675a 2139 while (start < end) {
ba210ebe 2140 STRLEN skip;
1571675a
GS
2141 UV n;
2142 if (utf)
9041c2e3 2143 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2144 else {
2145 n = *(U8*)start;
2146 skip = 1;
2147 }
2148 retval += ((NV)n)/nshift;
2149 start += skip;
2150 nshift *= 1000;
2151 }
2152 return retval;
2153}
2154
4e553d73 2155/*
ffb4593c
NT
2156 * S_force_version
2157 * Forces the next token to be a version number.
e759cc13
RGS
2158 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2159 * and if "guessing" is TRUE, then no new token is created (and the caller
2160 * must use an alternative parsing method).
ffb4593c
NT
2161 */
2162
76e3520e 2163STATIC char *
e759cc13 2164S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2165{
97aff369 2166 dVAR;
5f66b61c 2167 OP *version = NULL;
44dcb63b 2168 char *d;
5db06880
NC
2169#ifdef PERL_MAD
2170 I32 startoff = s - SvPVX(PL_linestr);
2171#endif
89bfa8cd 2172
7918f24d
NC
2173 PERL_ARGS_ASSERT_FORCE_VERSION;
2174
29595ff2 2175 s = SKIPSPACE1(s);
89bfa8cd 2176
44dcb63b 2177 d = s;
dd629d5b 2178 if (*d == 'v')
44dcb63b 2179 d++;
44dcb63b 2180 if (isDIGIT(*d)) {
e759cc13
RGS
2181 while (isDIGIT(*d) || *d == '_' || *d == '.')
2182 d++;
5db06880
NC
2183#ifdef PERL_MAD
2184 if (PL_madskills) {
cd81e915 2185 start_force(PL_curforce);
5db06880
NC
2186 curmad('X', newSVpvn(s,d-s));
2187 }
2188#endif
4e4da3ac 2189 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2190 SV *ver;
8d08d9ba 2191#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2192 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2193 setlocale(LC_NUMERIC, "C");
8d08d9ba 2194#endif
6154021b 2195 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2196#ifdef USE_LOCALE_NUMERIC
2197 setlocale(LC_NUMERIC, loc);
909d3787 2198 Safefree(loc);
8d08d9ba 2199#endif
6154021b 2200 version = pl_yylval.opval;
dd629d5b
GS
2201 ver = cSVOPx(version)->op_sv;
2202 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2203 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2204 SvNV_set(ver, str_to_version(ver));
1571675a 2205 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2206 }
89bfa8cd 2207 }
5db06880
NC
2208 else if (guessing) {
2209#ifdef PERL_MAD
2210 if (PL_madskills) {
cd81e915
NC
2211 sv_free(PL_nextwhite); /* let next token collect whitespace */
2212 PL_nextwhite = 0;
5db06880
NC
2213 s = SvPVX(PL_linestr) + startoff;
2214 }
2215#endif
e759cc13 2216 return s;
5db06880 2217 }
89bfa8cd 2218 }
2219
5db06880
NC
2220#ifdef PERL_MAD
2221 if (PL_madskills && !version) {
cd81e915
NC
2222 sv_free(PL_nextwhite); /* let next token collect whitespace */
2223 PL_nextwhite = 0;
5db06880
NC
2224 s = SvPVX(PL_linestr) + startoff;
2225 }
2226#endif
89bfa8cd 2227 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2228 start_force(PL_curforce);
9ded7720 2229 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2230 force_next(WORD);
89bfa8cd 2231
e759cc13 2232 return s;
89bfa8cd 2233}
2234
ffb4593c 2235/*
91152fc1
DG
2236 * S_force_strict_version
2237 * Forces the next token to be a version number using strict syntax rules.
2238 */
2239
2240STATIC char *
2241S_force_strict_version(pTHX_ char *s)
2242{
2243 dVAR;
2244 OP *version = NULL;
2245#ifdef PERL_MAD
2246 I32 startoff = s - SvPVX(PL_linestr);
2247#endif
2248 const char *errstr = NULL;
2249
2250 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2251
2252 while (isSPACE(*s)) /* leading whitespace */
2253 s++;
2254
2255 if (is_STRICT_VERSION(s,&errstr)) {
2256 SV *ver = newSV(0);
2257 s = (char *)scan_version(s, ver, 0);
2258 version = newSVOP(OP_CONST, 0, ver);
2259 }
4e4da3ac
Z
2260 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2261 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2262 {
91152fc1
DG
2263 PL_bufptr = s;
2264 if (errstr)
2265 yyerror(errstr); /* version required */
2266 return s;
2267 }
2268
2269#ifdef PERL_MAD
2270 if (PL_madskills && !version) {
2271 sv_free(PL_nextwhite); /* let next token collect whitespace */
2272 PL_nextwhite = 0;
2273 s = SvPVX(PL_linestr) + startoff;
2274 }
2275#endif
2276 /* NOTE: The parser sees the package name and the VERSION swapped */
2277 start_force(PL_curforce);
2278 NEXTVAL_NEXTTOKE.opval = version;
2279 force_next(WORD);
2280
2281 return s;
2282}
2283
2284/*
ffb4593c
NT
2285 * S_tokeq
2286 * Tokenize a quoted string passed in as an SV. It finds the next
2287 * chunk, up to end of string or a backslash. It may make a new
2288 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2289 * turns \\ into \.
2290 */
2291
76e3520e 2292STATIC SV *
cea2e8a9 2293S_tokeq(pTHX_ SV *sv)
79072805 2294{
97aff369 2295 dVAR;
79072805
LW
2296 register char *s;
2297 register char *send;
2298 register char *d;
b3ac6de7
IZ
2299 STRLEN len = 0;
2300 SV *pv = sv;
79072805 2301
7918f24d
NC
2302 PERL_ARGS_ASSERT_TOKEQ;
2303
79072805 2304 if (!SvLEN(sv))
b3ac6de7 2305 goto finish;
79072805 2306
a0d0e21e 2307 s = SvPV_force(sv, len);
21a311ee 2308 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2309 goto finish;
463ee0b2 2310 send = s + len;
dcb21ed6
NC
2311 /* This is relying on the SV being "well formed" with a trailing '\0' */
2312 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2313 s++;
2314 if (s == send)
b3ac6de7 2315 goto finish;
79072805 2316 d = s;
be4731d2 2317 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2318 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2319 }
79072805
LW
2320 while (s < send) {
2321 if (*s == '\\') {
a0d0e21e 2322 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2323 s++; /* all that, just for this */
2324 }
2325 *d++ = *s++;
2326 }
2327 *d = '\0';
95a20fc0 2328 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2329 finish:
3280af22 2330 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2331 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2332 return sv;
2333}
2334
ffb4593c
NT
2335/*
2336 * Now come three functions related to double-quote context,
2337 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2338 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2339 * interact with PL_lex_state, and create fake ( ... ) argument lists
2340 * to handle functions and concatenation.
2341 * They assume that whoever calls them will be setting up a fake
2342 * join call, because each subthing puts a ',' after it. This lets
2343 * "lower \luPpEr"
2344 * become
2345 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2346 *
2347 * (I'm not sure whether the spurious commas at the end of lcfirst's
2348 * arguments and join's arguments are created or not).
2349 */
2350
2351/*
2352 * S_sublex_start
6154021b 2353 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2354 *
2355 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2356 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2357 *
2358 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2359 *
2360 * Everything else becomes a FUNC.
2361 *
2362 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2363 * had an OP_CONST or OP_READLINE). This just sets us up for a
2364 * call to S_sublex_push().
2365 */
2366
76e3520e 2367STATIC I32
cea2e8a9 2368S_sublex_start(pTHX)
79072805 2369{
97aff369 2370 dVAR;
6154021b 2371 register const I32 op_type = pl_yylval.ival;
79072805
LW
2372
2373 if (op_type == OP_NULL) {
6154021b 2374 pl_yylval.opval = PL_lex_op;
5f66b61c 2375 PL_lex_op = NULL;
79072805
LW
2376 return THING;
2377 }
2378 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2379 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2380
2381 if (SvTYPE(sv) == SVt_PVIV) {
2382 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2383 STRLEN len;
96a5add6 2384 const char * const p = SvPV_const(sv, len);
740cce10 2385 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2386 SvREFCNT_dec(sv);
2387 sv = nsv;
4e553d73 2388 }
6154021b 2389 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2390 PL_lex_stuff = NULL;
6f33ba73
RGS
2391 /* Allow <FH> // "foo" */
2392 if (op_type == OP_READLINE)
2393 PL_expect = XTERMORDORDOR;
79072805
LW
2394 return THING;
2395 }
e3f73d4e
RGS
2396 else if (op_type == OP_BACKTICK && PL_lex_op) {
2397 /* readpipe() vas overriden */
2398 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2399 pl_yylval.opval = PL_lex_op;
9b201d7d 2400 PL_lex_op = NULL;
e3f73d4e
RGS
2401 PL_lex_stuff = NULL;
2402 return THING;
2403 }
79072805 2404
3280af22 2405 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2406 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2407 PL_sublex_info.sub_op = PL_lex_op;
2408 PL_lex_state = LEX_INTERPPUSH;
55497cff 2409
3280af22
NIS
2410 PL_expect = XTERM;
2411 if (PL_lex_op) {
6154021b 2412 pl_yylval.opval = PL_lex_op;
5f66b61c 2413 PL_lex_op = NULL;
55497cff 2414 return PMFUNC;
2415 }
2416 else
2417 return FUNC;
2418}
2419
ffb4593c
NT
2420/*
2421 * S_sublex_push
2422 * Create a new scope to save the lexing state. The scope will be
2423 * ended in S_sublex_done. Returns a '(', starting the function arguments
2424 * to the uc, lc, etc. found before.
2425 * Sets PL_lex_state to LEX_INTERPCONCAT.
2426 */
2427
76e3520e 2428STATIC I32
cea2e8a9 2429S_sublex_push(pTHX)
55497cff 2430{
27da23d5 2431 dVAR;
f46d017c 2432 ENTER;
55497cff 2433
3280af22 2434 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2435 SAVEBOOL(PL_lex_dojoin);
3280af22 2436 SAVEI32(PL_lex_brackets);
78cdf107
Z
2437 SAVEI32(PL_lex_allbrackets);
2438 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2439 SAVEI32(PL_lex_casemods);
2440 SAVEI32(PL_lex_starts);
651b5b28 2441 SAVEI8(PL_lex_state);
7766f137 2442 SAVEVPTR(PL_lex_inpat);
98246f1e 2443 SAVEI16(PL_lex_inwhat);
57843af0 2444 SAVECOPLINE(PL_curcop);
3280af22 2445 SAVEPPTR(PL_bufptr);
8452ff4b 2446 SAVEPPTR(PL_bufend);
3280af22
NIS
2447 SAVEPPTR(PL_oldbufptr);
2448 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2449 SAVEPPTR(PL_last_lop);
2450 SAVEPPTR(PL_last_uni);
3280af22
NIS
2451 SAVEPPTR(PL_linestart);
2452 SAVESPTR(PL_linestr);
8edd5f42
RGS
2453 SAVEGENERICPV(PL_lex_brackstack);
2454 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2455
2456 PL_linestr = PL_lex_stuff;
a0714e2c 2457 PL_lex_stuff = NULL;
3280af22 2458
9cbb5ea2
GS
2459 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2460 = SvPVX(PL_linestr);
3280af22 2461 PL_bufend += SvCUR(PL_linestr);
bd61b366 2462 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2463 SAVEFREESV(PL_linestr);
2464
2465 PL_lex_dojoin = FALSE;
2466 PL_lex_brackets = 0;
78cdf107
Z
2467 PL_lex_allbrackets = 0;
2468 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2469 Newx(PL_lex_brackstack, 120, char);
2470 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2471 PL_lex_casemods = 0;
2472 *PL_lex_casestack = '\0';
2473 PL_lex_starts = 0;
2474 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2475 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2476
2477 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2478 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2479 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2480 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2481 else
5f66b61c 2482 PL_lex_inpat = NULL;
79072805 2483
55497cff 2484 return '(';
79072805
LW
2485}
2486
ffb4593c
NT
2487/*
2488 * S_sublex_done
2489 * Restores lexer state after a S_sublex_push.
2490 */
2491
76e3520e 2492STATIC I32
cea2e8a9 2493S_sublex_done(pTHX)
79072805 2494{
27da23d5 2495 dVAR;
3280af22 2496 if (!PL_lex_starts++) {
396482e1 2497 SV * const sv = newSVpvs("");
9aa983d2
JH
2498 if (SvUTF8(PL_linestr))
2499 SvUTF8_on(sv);
3280af22 2500 PL_expect = XOPERATOR;
6154021b 2501 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2502 return THING;
2503 }
2504
3280af22
NIS
2505 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2506 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2507 return yylex();
79072805
LW
2508 }
2509
ffb4593c 2510 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2511 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2512 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2513 PL_linestr = PL_lex_repl;
2514 PL_lex_inpat = 0;
2515 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2516 PL_bufend += SvCUR(PL_linestr);
bd61b366 2517 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2518 SAVEFREESV(PL_linestr);
2519 PL_lex_dojoin = FALSE;
2520 PL_lex_brackets = 0;
78cdf107
Z
2521 PL_lex_allbrackets = 0;
2522 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2523 PL_lex_casemods = 0;
2524 *PL_lex_casestack = '\0';
2525 PL_lex_starts = 0;
25da4f38 2526 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2527 PL_lex_state = LEX_INTERPNORMAL;
2528 PL_lex_starts++;
e9fa98b2
HS
2529 /* we don't clear PL_lex_repl here, so that we can check later
2530 whether this is an evalled subst; that means we rely on the
2531 logic to ensure sublex_done() is called again only via the
2532 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2533 }
e9fa98b2 2534 else {
3280af22 2535 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2536 PL_lex_repl = NULL;
e9fa98b2 2537 }
79072805 2538 return ',';
ffed7fef
LW
2539 }
2540 else {
5db06880
NC
2541#ifdef PERL_MAD
2542 if (PL_madskills) {
cd81e915
NC
2543 if (PL_thiswhite) {
2544 if (!PL_endwhite)
6b29d1f5 2545 PL_endwhite = newSVpvs("");
cd81e915
NC
2546 sv_catsv(PL_endwhite, PL_thiswhite);
2547 PL_thiswhite = 0;
2548 }
2549 if (PL_thistoken)
76f68e9b 2550 sv_setpvs(PL_thistoken,"");
5db06880 2551 else
cd81e915 2552 PL_realtokenstart = -1;
5db06880
NC
2553 }
2554#endif
f46d017c 2555 LEAVE;
3280af22
NIS
2556 PL_bufend = SvPVX(PL_linestr);
2557 PL_bufend += SvCUR(PL_linestr);
2558 PL_expect = XOPERATOR;
09bef843 2559 PL_sublex_info.sub_inwhat = 0;
79072805 2560 return ')';
ffed7fef
LW
2561 }
2562}
2563
02aa26ce
NT
2564/*
2565 scan_const
2566
2567 Extracts a pattern, double-quoted string, or transliteration. This
2568 is terrifying code.
2569
94def140 2570 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2571 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2572 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2573
94def140
TS
2574 Returns a pointer to the character scanned up to. If this is
2575 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2576 successfully parsed), will leave an OP for the substring scanned
6154021b 2577 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2578 by looking at the next characters herself.
2579
02aa26ce
NT
2580 In patterns:
2581 backslashes:
ff3f963a 2582 constants: \N{NAME} only
02aa26ce
NT
2583 case and quoting: \U \Q \E
2584 stops on @ and $, but not for $ as tail anchor
2585
2586 In transliterations:
2587 characters are VERY literal, except for - not at the start or end
94def140
TS
2588 of the string, which indicates a range. If the range is in bytes,
2589 scan_const expands the range to the full set of intermediate
2590 characters. If the range is in utf8, the hyphen is replaced with
2591 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2592
2593 In double-quoted strings:
2594 backslashes:
2595 double-quoted style: \r and \n
ff3f963a 2596 constants: \x31, etc.
94def140 2597 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2598 case and quoting: \U \Q \E
2599 stops on @ and $
2600
2601 scan_const does *not* construct ops to handle interpolated strings.
2602 It stops processing as soon as it finds an embedded $ or @ variable
2603 and leaves it to the caller to work out what's going on.
2604
94def140
TS
2605 embedded arrays (whether in pattern or not) could be:
2606 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2607
2608 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2609
2610 $ in pattern could be $foo or could be tail anchor. Assumption:
2611 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2612 followed by one of "()| \r\n\t"
02aa26ce
NT
2613
2614 \1 (backreferences) are turned into $1
2615
2616 The structure of the code is
2617 while (there's a character to process) {
94def140
TS
2618 handle transliteration ranges
2619 skip regexp comments /(?#comment)/ and codes /(?{code})/
2620 skip #-initiated comments in //x patterns
2621 check for embedded arrays
02aa26ce
NT
2622 check for embedded scalars
2623 if (backslash) {
94def140 2624 deprecate \1 in substitution replacements
02aa26ce
NT
2625 handle string-changing backslashes \l \U \Q \E, etc.
2626 switch (what was escaped) {
94def140 2627 handle \- in a transliteration (becomes a literal -)
ff3f963a 2628 if a pattern and not \N{, go treat as regular character
94def140
TS
2629 handle \132 (octal characters)
2630 handle \x15 and \x{1234} (hex characters)
ff3f963a 2631 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2632 handle \cV (control characters)
2633 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2634 } (end switch)
77a135fe 2635 continue
02aa26ce 2636 } (end if backslash)
77a135fe 2637 handle regular character
02aa26ce 2638 } (end while character to read)
4e553d73 2639
02aa26ce
NT
2640*/
2641
76e3520e 2642STATIC char *
cea2e8a9 2643S_scan_const(pTHX_ char *start)
79072805 2644{
97aff369 2645 dVAR;
3280af22 2646 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2647 SV *sv = newSV(send - start); /* sv for the constant. See
2648 note below on sizing. */
02aa26ce
NT
2649 register char *s = start; /* start of the constant */
2650 register char *d = SvPVX(sv); /* destination for copies */
2651 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2652 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2653 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2654 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2655 to be UTF8? But, this can
2656 show as true when the source
2657 isn't utf8, as for example
2658 when it is entirely composed
2659 of hex constants */
2660
2661 /* Note on sizing: The scanned constant is placed into sv, which is
2662 * initialized by newSV() assuming one byte of output for every byte of
2663 * input. This routine expects newSV() to allocate an extra byte for a
2664 * trailing NUL, which this routine will append if it gets to the end of
2665 * the input. There may be more bytes of input than output (eg., \N{LATIN
2666 * CAPITAL LETTER A}), or more output than input if the constant ends up
2667 * recoded to utf8, but each time a construct is found that might increase
2668 * the needed size, SvGROW() is called. Its size parameter each time is
2669 * based on the best guess estimate at the time, namely the length used so
2670 * far, plus the length the current construct will occupy, plus room for
2671 * the trailing NUL, plus one byte for every input byte still unscanned */
2672
012bcf8d 2673 UV uv;
4c3a8340
TS
2674#ifdef EBCDIC
2675 UV literal_endpoint = 0;
e294cc5d 2676 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2677#endif
012bcf8d 2678
7918f24d
NC
2679 PERL_ARGS_ASSERT_SCAN_CONST;
2680
bb16bae8 2681 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2682 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2683 /* If we are doing a trans and we know we want UTF8 set expectation */
2684 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2685 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2686 }
2687
2688
79072805 2689 while (s < send || dorange) {
ff3f963a 2690
02aa26ce 2691 /* get transliterations out of the way (they're most literal) */
3280af22 2692 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2693 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2694 if (dorange) {
1ba5c669
JH
2695 I32 i; /* current expanded character */
2696 I32 min; /* first character in range */
2697 I32 max; /* last character in range */
02aa26ce 2698
e294cc5d
JH
2699#ifdef EBCDIC
2700 UV uvmax = 0;
2701#endif
2702
2703 if (has_utf8
2704#ifdef EBCDIC
2705 && !native_range
2706#endif
2707 ) {
9d4ba2ae 2708 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2709 char *e = d++;
2710 while (e-- > c)
2711 *(e + 1) = *e;
25716404 2712 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2713 /* mark the range as done, and continue */
2714 dorange = FALSE;
2715 didrange = TRUE;
2716 continue;
2717 }
2b9d42f0 2718
95a20fc0 2719 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2720#ifdef EBCDIC
2721 SvGROW(sv,
2722 SvLEN(sv) + (has_utf8 ?
2723 (512 - UTF_CONTINUATION_MARK +
2724 UNISKIP(0x100))
2725 : 256));
2726 /* How many two-byte within 0..255: 128 in UTF-8,
2727 * 96 in UTF-8-mod. */
2728#else
9cbb5ea2 2729 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2730#endif
9cbb5ea2 2731 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2732#ifdef EBCDIC
2733 if (has_utf8) {
2734 int j;
2735 for (j = 0; j <= 1; j++) {
2736 char * const c = (char*)utf8_hop((U8*)d, -1);
2737 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2738 if (j)
2739 min = (U8)uv;
2740 else if (uv < 256)
2741 max = (U8)uv;
2742 else {
2743 max = (U8)0xff; /* only to \xff */
2744 uvmax = uv; /* \x{100} to uvmax */
2745 }
2746 d = c; /* eat endpoint chars */
2747 }
2748 }
2749 else {
2750#endif
2751 d -= 2; /* eat the first char and the - */
2752 min = (U8)*d; /* first char in range */
2753 max = (U8)d[1]; /* last char in range */
2754#ifdef EBCDIC
2755 }
2756#endif
8ada0baa 2757
c2e66d9e 2758 if (min > max) {
01ec43d0 2759 Perl_croak(aTHX_
d1573ac7 2760 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2761 (char)min, (char)max);
c2e66d9e
GS
2762 }
2763
c7f1f016 2764#ifdef EBCDIC
4c3a8340
TS
2765 if (literal_endpoint == 2 &&
2766 ((isLOWER(min) && isLOWER(max)) ||
2767 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2768 if (isLOWER(min)) {
2769 for (i = min; i <= max; i++)
2770 if (isLOWER(i))
db42d148 2771 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2772 } else {
2773 for (i = min; i <= max; i++)
2774 if (isUPPER(i))
db42d148 2775 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2776 }
2777 }
2778 else
2779#endif
2780 for (i = min; i <= max; i++)
e294cc5d
JH
2781#ifdef EBCDIC
2782 if (has_utf8) {
2783 const U8 ch = (U8)NATIVE_TO_UTF(i);
2784 if (UNI_IS_INVARIANT(ch))
2785 *d++ = (U8)i;
2786 else {
2787 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2788 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2789 }
2790 }
2791 else
2792#endif
2793 *d++ = (char)i;
2794
2795#ifdef EBCDIC
2796 if (uvmax) {
2797 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2798 if (uvmax > 0x101)
2799 *d++ = (char)UTF_TO_NATIVE(0xff);
2800 if (uvmax > 0x100)
2801 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2802 }
2803#endif
02aa26ce
NT
2804
2805 /* mark the range as done, and continue */
79072805 2806 dorange = FALSE;
01ec43d0 2807 didrange = TRUE;
4c3a8340
TS
2808#ifdef EBCDIC
2809 literal_endpoint = 0;
2810#endif
79072805 2811 continue;
4e553d73 2812 }
02aa26ce
NT
2813
2814 /* range begins (ignore - as first or last char) */
79072805 2815 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2816 if (didrange) {
1fafa243 2817 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2818 }
e294cc5d
JH
2819 if (has_utf8
2820#ifdef EBCDIC
2821 && !native_range
2822#endif
2823 ) {
25716404 2824 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2825 s++;
2826 continue;
2827 }
79072805
LW
2828 dorange = TRUE;
2829 s++;
01ec43d0
GS
2830 }
2831 else {
2832 didrange = FALSE;
4c3a8340
TS
2833#ifdef EBCDIC
2834 literal_endpoint = 0;
e294cc5d 2835 native_range = TRUE;
4c3a8340 2836#endif
01ec43d0 2837 }
79072805 2838 }
02aa26ce
NT
2839
2840 /* if we get here, we're not doing a transliteration */
2841
0f5d15d6
IZ
2842 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2843 except for the last char, which will be done separately. */
3280af22 2844 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2845 if (s[2] == '#') {
e994fd66 2846 while (s+1 < send && *s != ')')
db42d148 2847 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2848 }
2849 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2850 || (s[2] == '?' && s[3] == '{'))
155aba94 2851 {
cc6b7395 2852 I32 count = 1;
0f5d15d6 2853 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2854 char c;
2855
d9f97599
GS
2856 while (count && (c = *regparse)) {
2857 if (c == '\\' && regparse[1])
2858 regparse++;
4e553d73 2859 else if (c == '{')
cc6b7395 2860 count++;
4e553d73 2861 else if (c == '}')
cc6b7395 2862 count--;
d9f97599 2863 regparse++;
cc6b7395 2864 }
e994fd66 2865 if (*regparse != ')')
5bdf89e7 2866 regparse--; /* Leave one char for continuation. */
0f5d15d6 2867 while (s < regparse)
db42d148 2868 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2869 }
748a9306 2870 }
02aa26ce
NT
2871
2872 /* likewise skip #-initiated comments in //x patterns */
3280af22 2873 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2874 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2875 while (s+1 < send && *s != '\n')
db42d148 2876 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2877 }
02aa26ce 2878
5d1d4326 2879 /* check for embedded arrays
da6eedaa 2880 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2881 */
1749ea0d
TS
2882 else if (*s == '@' && s[1]) {
2883 if (isALNUM_lazy_if(s+1,UTF))
2884 break;
2885 if (strchr(":'{$", s[1]))
2886 break;
2887 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2888 break; /* in regexp, neither @+ nor @- are interpolated */
2889 }
02aa26ce
NT
2890
2891 /* check for embedded scalars. only stop if we're sure it's a
2892 variable.
2893 */
79072805 2894 else if (*s == '$') {
3280af22 2895 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2896 break;
77772344 2897 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2898 if (s[1] == '\\') {
2899 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2900 "Possible unintended interpolation of $\\ in regex");
77772344 2901 }
79072805 2902 break; /* in regexp, $ might be tail anchor */
77772344 2903 }
79072805 2904 }
02aa26ce 2905
2b9d42f0
NIS
2906 /* End of else if chain - OP_TRANS rejoin rest */
2907
02aa26ce 2908 /* backslashes */
79072805 2909 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2910 char* e; /* Can be used for ending '}', etc. */
2911
79072805 2912 s++;
02aa26ce 2913
7d0fc23c
KW
2914 /* warn on \1 - \9 in substitution replacements, but note that \11
2915 * is an octal; and \19 is \1 followed by '9' */
3280af22 2916 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2917 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2918 {
a2a5de95 2919 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2920 *--s = '$';
2921 break;
2922 }
02aa26ce
NT
2923
2924 /* string-change backslash escapes */
838f2281 2925 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
2926 --s;
2927 break;
2928 }
ff3f963a
KW
2929 /* In a pattern, process \N, but skip any other backslash escapes.
2930 * This is because we don't want to translate an escape sequence
2931 * into a meta symbol and have the regex compiler use the meta
2932 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2933 * in spite of this, we do have to process \N here while the proper
2934 * charnames handler is in scope. See bugs #56444 and #62056.
2935 * There is a complication because \N in a pattern may also stand
2936 * for 'match a non-nl', and not mean a charname, in which case its
2937 * processing should be deferred to the regex compiler. To be a
2938 * charname it must be followed immediately by a '{', and not look
2939 * like \N followed by a curly quantifier, i.e., not something like
2940 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2941 * quantifier */
2942 else if (PL_lex_inpat
2943 && (*s != 'N'
2944 || s[1] != '{'
2945 || regcurly(s + 1)))
2946 {
cc74c5bd
TS
2947 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2948 goto default_action;
2949 }
02aa26ce 2950
79072805 2951 switch (*s) {
02aa26ce
NT
2952
2953 /* quoted - in transliterations */
79072805 2954 case '-':
3280af22 2955 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2956 *d++ = *s++;
2957 continue;
2958 }
2959 /* FALL THROUGH */
2960 default:
11b8faa4 2961 {
a2a5de95
NC
2962 if ((isALPHA(*s) || isDIGIT(*s)))
2963 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2964 "Unrecognized escape \\%c passed through",
2965 *s);
11b8faa4 2966 /* default action is to copy the quoted character */
f9a63242 2967 goto default_action;
11b8faa4 2968 }
02aa26ce 2969
632403cc 2970 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2971 case '0': case '1': case '2': case '3':
2972 case '4': case '5': case '6': case '7':
ba210ebe 2973 {
53305cf1
NC
2974 I32 flags = 0;
2975 STRLEN len = 3;
77a135fe 2976 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2977 s += len;
2978 }
012bcf8d 2979 goto NUM_ESCAPE_INSERT;
02aa26ce 2980
f0a2b745
KW
2981 /* eg. \o{24} indicates the octal constant \024 */
2982 case 'o':
2983 {
2984 STRLEN len;
454155d9 2985 const char* error;
f0a2b745 2986
454155d9 2987 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2988 s += len;
454155d9 2989 if (! valid) {
f0a2b745
KW
2990 yyerror(error);
2991 continue;
2992 }
2993 goto NUM_ESCAPE_INSERT;
2994 }
2995
77a135fe 2996 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2997 case 'x':
a0ed51b3
LW
2998 ++s;
2999 if (*s == '{') {
9d4ba2ae 3000 char* const e = strchr(s, '}');
a4c04bdc
NC
3001 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
3002 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 3003 STRLEN len;
355860ce 3004
53305cf1 3005 ++s;
adaeee49 3006 if (!e) {
a0ed51b3 3007 yyerror("Missing right brace on \\x{}");
355860ce 3008 continue;
ba210ebe 3009 }
53305cf1 3010 len = e - s;
77a135fe 3011 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 3012 s = e + 1;
a0ed51b3
LW
3013 }
3014 else {
ba210ebe 3015 {
53305cf1 3016 STRLEN len = 2;
a4c04bdc 3017 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3018 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3019 s += len;
3020 }
012bcf8d
GS
3021 }
3022
3023 NUM_ESCAPE_INSERT:
ff3f963a
KW
3024 /* Insert oct or hex escaped character. There will always be
3025 * enough room in sv since such escapes will be longer than any
3026 * UTF-8 sequence they can end up as, except if they force us
3027 * to recode the rest of the string into utf8 */
ba7cea30 3028
77a135fe 3029 /* Here uv is the ordinal of the next character being added in
ff3f963a 3030 * unicode (converted from native). */
77a135fe 3031 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3032 if (!has_utf8 && uv > 255) {
77a135fe
KW
3033 /* Might need to recode whatever we have accumulated so
3034 * far if it contains any chars variant in utf8 or
3035 * utf-ebcdic. */
3036
3037 SvCUR_set(sv, d - SvPVX_const(sv));
3038 SvPOK_on(sv);
3039 *d = '\0';
77a135fe 3040 /* See Note on sizing above. */
7bf79863
KW
3041 sv_utf8_upgrade_flags_grow(sv,
3042 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3043 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3044 d = SvPVX(sv) + SvCUR(sv);
3045 has_utf8 = TRUE;
012bcf8d
GS
3046 }
3047
77a135fe
KW
3048 if (has_utf8) {
3049 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3050 if (PL_lex_inwhat == OP_TRANS &&
3051 PL_sublex_info.sub_op) {
3052 PL_sublex_info.sub_op->op_private |=
3053 (PL_lex_repl ? OPpTRANS_FROM_UTF
3054 : OPpTRANS_TO_UTF);
f9a63242 3055 }
e294cc5d
JH
3056#ifdef EBCDIC
3057 if (uv > 255 && !dorange)
3058 native_range = FALSE;
3059#endif
012bcf8d 3060 }
a0ed51b3 3061 else {
012bcf8d 3062 *d++ = (char)uv;
a0ed51b3 3063 }
012bcf8d
GS
3064 }
3065 else {
c4d5f83a 3066 *d++ = (char) uv;
a0ed51b3 3067 }
79072805 3068 continue;
02aa26ce 3069
4a2d328f 3070 case 'N':
ff3f963a
KW
3071 /* In a non-pattern \N must be a named character, like \N{LATIN
3072 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3073 * mean to match a non-newline. For non-patterns, named
3074 * characters are converted to their string equivalents. In
3075 * patterns, named characters are not converted to their
3076 * ultimate forms for the same reasons that other escapes
3077 * aren't. Instead, they are converted to the \N{U+...} form
3078 * to get the value from the charnames that is in effect right
3079 * now, while preserving the fact that it was a named character
3080 * so that the regex compiler knows this */
3081
3082 /* This section of code doesn't generally use the
3083 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3084 * a close examination of this macro and determined it is a
3085 * no-op except on utfebcdic variant characters. Every
3086 * character generated by this that would normally need to be
3087 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3088 * needed, and would complicate use of copy(). XXX There are
3089 * other parts of this file where the macro is used
3090 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3091
3092 /* The structure of this section of code (besides checking for
3093 * errors and upgrading to utf8) is:
3094 * Further disambiguate between the two meanings of \N, and if
3095 * not a charname, go process it elsewhere
0a96133f
KW
3096 * If of form \N{U+...}, pass it through if a pattern;
3097 * otherwise convert to utf8
3098 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3099 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3100
3101 /* Here, s points to the 'N'; the test below is guaranteed to
3102 * succeed if we are being called on a pattern as we already
3103 * know from a test above that the next character is a '{'.
3104 * On a non-pattern \N must mean 'named sequence, which
3105 * requires braces */
3106 s++;
3107 if (*s != '{') {
3108 yyerror("Missing braces on \\N{}");
3109 continue;
3110 }
3111 s++;
3112
0a96133f 3113 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3114 if (! (e = strchr(s, '}'))) {
3115 if (! PL_lex_inpat) {
5777a3f7 3116 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3117 } else {
3118 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3119 }
0a96133f 3120 continue;
ff3f963a 3121 }
cddc7ef4 3122
ff3f963a 3123 /* Here it looks like a named character */
cddc7ef4 3124
ff3f963a
KW
3125 if (PL_lex_inpat) {
3126
3127 /* XXX This block is temporary code. \N{} implies that the
3128 * pattern is to have Unicode semantics, and therefore
3129 * currently has to be encoded in utf8. By putting it in
3130 * utf8 now, we save a whole pass in the regular expression
3131 * compiler. Once that code is changed so Unicode
3132 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3133 * block should be removed. However, the code that parses
3134 * the output of this would have to be changed to not
3135 * necessarily expect utf8 */
ff3f963a 3136 if (!has_utf8) {
77a135fe 3137 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3138 SvPOK_on(sv);
e4f3eed8 3139 *d = '\0';
77a135fe 3140 /* See Note on sizing above. */
7bf79863 3141 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3142 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3143 /* 5 = '\N{' + cur char + NUL */
3144 (STRLEN)(send - s) + 5);
f08d6ad9 3145 d = SvPVX(sv) + SvCUR(sv);
89491803 3146 has_utf8 = TRUE;
ff3f963a
KW
3147 }
3148 }
423cee85 3149
ff3f963a
KW
3150 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3151 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3152 | PERL_SCAN_DISALLOW_PREFIX;
3153 STRLEN len;
3154
3155 /* For \N{U+...}, the '...' is a unicode value even on
3156 * EBCDIC machines */
3157 s += 2; /* Skip to next char after the 'U+' */
3158 len = e - s;
3159 uv = grok_hex(s, &len, &flags, NULL);
3160 if (len == 0 || len != (STRLEN)(e - s)) {
3161 yyerror("Invalid hexadecimal number in \\N{U+...}");
3162 s = e + 1;
3163 continue;
3164 }
3165
3166 if (PL_lex_inpat) {
3167
e2a7e165
KW
3168 /* On non-EBCDIC platforms, pass through to the regex
3169 * compiler unchanged. The reason we evaluated the
3170 * number above is to make sure there wasn't a syntax
3171 * error. But on EBCDIC we convert to native so
3172 * downstream code can continue to assume it's native
3173 */
ff3f963a 3174 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3175#ifdef EBCDIC
3176 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3177 and the \0 */
3178 "\\N{U+%X}",
3179 (unsigned int) UNI_TO_NATIVE(uv));
3180#else
ff3f963a
KW
3181 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3182 d += e - s + 1;
e2a7e165 3183#endif
ff3f963a
KW
3184 }
3185 else { /* Not a pattern: convert the hex to string */
3186
3187 /* If destination is not in utf8, unconditionally
3188 * recode it to be so. This is because \N{} implies
3189 * Unicode semantics, and scalars have to be in utf8
3190 * to guarantee those semantics */
3191 if (! has_utf8) {
3192 SvCUR_set(sv, d - SvPVX_const(sv));
3193 SvPOK_on(sv);
3194 *d = '\0';
3195 /* See Note on sizing above. */
3196 sv_utf8_upgrade_flags_grow(
3197 sv,
3198 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3199 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3200 d = SvPVX(sv) + SvCUR(sv);
3201 has_utf8 = TRUE;
3202 }
3203
3204 /* Add the string to the output */
3205 if (UNI_IS_INVARIANT(uv)) {
3206 *d++ = (char) uv;
3207 }
3208 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3209 }
3210 }
3211 else { /* Here is \N{NAME} but not \N{U+...}. */
3212
3213 SV *res; /* result from charnames */
3214 const char *str; /* the string in 'res' */
3215 STRLEN len; /* its length */
3216
3217 /* Get the value for NAME */
3218 res = newSVpvn(s, e - s);
3219 res = new_constant( NULL, 0, "charnames",
3220 /* includes all of: \N{...} */
3221 res, NULL, s - 3, e - s + 4 );
3222
3223 /* Most likely res will be in utf8 already since the
3224 * standard charnames uses pack U, but a custom translator
3225 * can leave it otherwise, so make sure. XXX This can be
3226 * revisited to not have charnames use utf8 for characters
3227 * that don't need it when regexes don't have to be in utf8
3228 * for Unicode semantics. If doing so, remember EBCDIC */
3229 sv_utf8_upgrade(res);
3230 str = SvPV_const(res, len);
3231
3232 /* Don't accept malformed input */
3233 if (! is_utf8_string((U8 *) str, len)) {
3234 yyerror("Malformed UTF-8 returned by \\N");
3235 }
3236 else if (PL_lex_inpat) {
3237
3238 if (! len) { /* The name resolved to an empty string */
3239 Copy("\\N{}", d, 4, char);
3240 d += 4;
3241 }
3242 else {
3243 /* In order to not lose information for the regex
3244 * compiler, pass the result in the specially made
3245 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3246 * the code points in hex of each character
3247 * returned by charnames */
3248
3249 const char *str_end = str + len;
3250 STRLEN char_length; /* cur char's byte length */
3251 STRLEN output_length; /* and the number of bytes
3252 after this is translated
3253 into hex digits */
3254 const STRLEN off = d - SvPVX_const(sv);
3255
3256 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3257 * max('U+', '.'); and 1 for NUL */
3258 char hex_string[2 * UTF8_MAXBYTES + 5];
3259
3260 /* Get the first character of the result. */
3261 U32 uv = utf8n_to_uvuni((U8 *) str,
3262 len,
3263 &char_length,
3264 UTF8_ALLOW_ANYUV);
3265
3266 /* The call to is_utf8_string() above hopefully
3267 * guarantees that there won't be an error. But
3268 * it's easy here to make sure. The function just
3269 * above warns and returns 0 if invalid utf8, but
3270 * it can also return 0 if the input is validly a
3271 * NUL. Disambiguate */
3272 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3273 uv = UNICODE_REPLACEMENT;
3274 }
3275
3276 /* Convert first code point to hex, including the
e2a7e165
KW
3277 * boiler plate before it. For all these, we
3278 * convert to native format so that downstream code
3279 * can continue to assume the input is native */
78c35590 3280 output_length =
3353de27 3281 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3282 "\\N{U+%X",
3283 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3284
3285 /* Make sure there is enough space to hold it */
3286 d = off + SvGROW(sv, off
3287 + output_length
3288 + (STRLEN)(send - e)
3289 + 2); /* '}' + NUL */
3290 /* And output it */
3291 Copy(hex_string, d, output_length, char);
3292 d += output_length;
3293
3294 /* For each subsequent character, append dot and
3295 * its ordinal in hex */
3296 while ((str += char_length) < str_end) {
3297 const STRLEN off = d - SvPVX_const(sv);
3298 U32 uv = utf8n_to_uvuni((U8 *) str,
3299 str_end - str,
3300 &char_length,
3301 UTF8_ALLOW_ANYUV);
3302 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3303 uv = UNICODE_REPLACEMENT;
3304 }
3305
78c35590 3306 output_length =
3353de27 3307 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3308 ".%X",
3309 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3310
3311 d = off + SvGROW(sv, off
3312 + output_length
3313 + (STRLEN)(send - e)
3314 + 2); /* '}' + NUL */
3315 Copy(hex_string, d, output_length, char);
3316 d += output_length;
3317 }
3318
3319 *d++ = '}'; /* Done. Add the trailing brace */
3320 }
3321 }
3322 else { /* Here, not in a pattern. Convert the name to a
3323 * string. */
3324
3325 /* If destination is not in utf8, unconditionally
3326 * recode it to be so. This is because \N{} implies
3327 * Unicode semantics, and scalars have to be in utf8
3328 * to guarantee those semantics */
3329 if (! has_utf8) {
3330 SvCUR_set(sv, d - SvPVX_const(sv));
3331 SvPOK_on(sv);
3332 *d = '\0';
3333 /* See Note on sizing above. */
3334 sv_utf8_upgrade_flags_grow(sv,
3335 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3336 len + (STRLEN)(send - s) + 1);
3337 d = SvPVX(sv) + SvCUR(sv);
3338 has_utf8 = TRUE;
3339 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3340
3341 /* See Note on sizing above. (NOTE: SvCUR() is not
3342 * set correctly here). */
3343 const STRLEN off = d - SvPVX_const(sv);
3344 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3345 }
3346 Copy(str, d, len, char);
3347 d += len;
423cee85 3348 }
423cee85 3349 SvREFCNT_dec(res);
cb233ae3
KW
3350
3351 /* Deprecate non-approved name syntax */
3352 if (ckWARN_d(WARN_DEPRECATED)) {
3353 bool problematic = FALSE;
3354 char* i = s;
3355
3356 /* For non-ut8 input, look to see that the first
3357 * character is an alpha, then loop through the rest
3358 * checking that each is a continuation */
3359 if (! this_utf8) {
3360 if (! isALPHAU(*i)) problematic = TRUE;
3361 else for (i = s + 1; i < e; i++) {
3362 if (isCHARNAME_CONT(*i)) continue;
3363 problematic = TRUE;
3364 break;
3365 }
3366 }
3367 else {
3368 /* Similarly for utf8. For invariants can check
3369 * directly. We accept anything above the latin1
3370 * range because it is immaterial to Perl if it is
3371 * correct or not, and is expensive to check. But
3372 * it is fairly easy in the latin1 range to convert
3373 * the variants into a single character and check
3374 * those */
3375 if (UTF8_IS_INVARIANT(*i)) {
3376 if (! isALPHAU(*i)) problematic = TRUE;
3377 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3378 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3379 *(i+1)))))
3380 {
3381 problematic = TRUE;
3382 }
3383 }
3384 if (! problematic) for (i = s + UTF8SKIP(s);
3385 i < e;
3386 i+= UTF8SKIP(i))
3387 {
3388 if (UTF8_IS_INVARIANT(*i)) {
3389 if (isCHARNAME_CONT(*i)) continue;
3390 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3391 continue;
3392 } else if (isCHARNAME_CONT(
3393 UNI_TO_NATIVE(
81c14aa2 3394 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3395 {
3396 continue;
3397 }
3398 problematic = TRUE;
3399 break;
3400 }
3401 }
3402 if (problematic) {
6e1bad6c
KW
3403 /* The e-i passed to the final %.*s makes sure that
3404 * should the trailing NUL be missing that this
3405 * print won't run off the end of the string */
cb233ae3 3406 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3407 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3408 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3409 }
3410 }
3411 } /* End \N{NAME} */
ff3f963a
KW
3412#ifdef EBCDIC
3413 if (!dorange)
3414 native_range = FALSE; /* \N{} is defined to be Unicode */
3415#endif
3416 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3417 continue;
3418
02aa26ce 3419 /* \c is a control character */
79072805
LW
3420 case 'c':
3421 s++;
961ce445 3422 if (s < send) {
17a3df4c 3423 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3424 }
961ce445
RGS
3425 else {
3426 yyerror("Missing control char name in \\c");
3427 }
79072805 3428 continue;
02aa26ce
NT
3429
3430 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3431 case 'b':
db42d148 3432 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3433 break;
3434 case 'n':
db42d148 3435 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3436 break;
3437 case 'r':
db42d148 3438 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3439 break;
3440 case 'f':
db42d148 3441 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3442 break;
3443 case 't':
db42d148 3444 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3445 break;
34a3fe2a 3446 case 'e':
db42d148 3447 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3448 break;
3449 case 'a':
db42d148 3450 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3451 break;
02aa26ce
NT
3452 } /* end switch */
3453
79072805
LW
3454 s++;
3455 continue;
02aa26ce 3456 } /* end if (backslash) */
4c3a8340
TS
3457#ifdef EBCDIC
3458 else
3459 literal_endpoint++;
3460#endif
02aa26ce 3461
f9a63242 3462 default_action:
77a135fe
KW
3463 /* If we started with encoded form, or already know we want it,
3464 then encode the next character */
3465 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3466 STRLEN len = 1;
77a135fe
KW
3467
3468
3469 /* One might think that it is wasted effort in the case of the
3470 * source being utf8 (this_utf8 == TRUE) to take the next character
3471 * in the source, convert it to an unsigned value, and then convert
3472 * it back again. But the source has not been validated here. The
3473 * routine that does the conversion checks for errors like
3474 * malformed utf8 */
3475
5f66b61c
AL
3476 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3477 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3478 if (!has_utf8) {
3479 SvCUR_set(sv, d - SvPVX_const(sv));
3480 SvPOK_on(sv);
3481 *d = '\0';
77a135fe 3482 /* See Note on sizing above. */
7bf79863
KW
3483 sv_utf8_upgrade_flags_grow(sv,
3484 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3485 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3486 d = SvPVX(sv) + SvCUR(sv);
3487 has_utf8 = TRUE;
3488 } else if (need > len) {
3489 /* encoded value larger than old, may need extra space (NOTE:
3490 * SvCUR() is not set correctly here). See Note on sizing
3491 * above. */
9d4ba2ae 3492 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3493 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3494 }
77a135fe
KW
3495 s += len;
3496
5f66b61c 3497 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3498#ifdef EBCDIC
3499 if (uv > 255 && !dorange)
3500 native_range = FALSE;
3501#endif
2b9d42f0
NIS
3502 }
3503 else {
3504 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3505 }
02aa26ce
NT
3506 } /* while loop to process each character */
3507
3508 /* terminate the string and set up the sv */
79072805 3509 *d = '\0';
95a20fc0 3510 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3511 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3512 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3513 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3514
79072805 3515 SvPOK_on(sv);
9f4817db 3516 if (PL_encoding && !has_utf8) {
d0063567
DK
3517 sv_recode_to_utf8(sv, PL_encoding);
3518 if (SvUTF8(sv))
3519 has_utf8 = TRUE;
9f4817db 3520 }
2b9d42f0 3521 if (has_utf8) {
7e2040f0 3522 SvUTF8_on(sv);
2b9d42f0 3523 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3524 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3525 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3526 }
3527 }
79072805 3528
02aa26ce 3529 /* shrink the sv if we allocated more than we used */
79072805 3530 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3531 SvPV_shrink_to_cur(sv);
79072805 3532 }
02aa26ce 3533
6154021b 3534 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3535 if (s > PL_bufptr) {
eb0d8d16
NC
3536 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3537 const char *const key = PL_lex_inpat ? "qr" : "q";
3538 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3539 const char *type;
3540 STRLEN typelen;
3541
3542 if (PL_lex_inwhat == OP_TRANS) {
3543 type = "tr";
3544 typelen = 2;
3545 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3546 type = "s";
3547 typelen = 1;
3548 } else {
3549 type = "qq";
3550 typelen = 2;
3551 }
3552
3553 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3554 type, typelen);
3555 }
6154021b 3556 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3557 } else
8990e307 3558 SvREFCNT_dec(sv);
79072805
LW
3559 return s;
3560}
3561
ffb4593c
NT
3562/* S_intuit_more
3563 * Returns TRUE if there's more to the expression (e.g., a subscript),
3564 * FALSE otherwise.
ffb4593c
NT
3565 *
3566 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3567 *
3568 * ->[ and ->{ return TRUE
3569 * { and [ outside a pattern are always subscripts, so return TRUE
3570 * if we're outside a pattern and it's not { or [, then return FALSE
3571 * if we're in a pattern and the first char is a {
3572 * {4,5} (any digits around the comma) returns FALSE
3573 * if we're in a pattern and the first char is a [
3574 * [] returns FALSE
3575 * [SOMETHING] has a funky algorithm to decide whether it's a
3576 * character class or not. It has to deal with things like
3577 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3578 * anything else returns TRUE
3579 */
3580
9cbb5ea2
GS
3581/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3582
76e3520e 3583STATIC int
cea2e8a9 3584S_intuit_more(pTHX_ register char *s)
79072805 3585{
97aff369 3586 dVAR;
7918f24d
NC
3587
3588 PERL_ARGS_ASSERT_INTUIT_MORE;
3589
3280af22 3590 if (PL_lex_brackets)
79072805
LW
3591 return TRUE;
3592 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3593 return TRUE;
3594 if (*s != '{' && *s != '[')
3595 return FALSE;
3280af22 3596 if (!PL_lex_inpat)
79072805
LW
3597 return TRUE;
3598
3599 /* In a pattern, so maybe we have {n,m}. */
3600 if (*s == '{') {
b3155d95 3601 if (regcurly(s)) {
79072805 3602 return FALSE;
b3155d95 3603 }
79072805 3604 return TRUE;
79072805
LW
3605 }
3606
3607 /* On the other hand, maybe we have a character class */
3608
3609 s++;
3610 if (*s == ']' || *s == '^')
3611 return FALSE;
3612 else {
ffb4593c 3613 /* this is terrifying, and it works */
79072805
LW
3614 int weight = 2; /* let's weigh the evidence */
3615 char seen[256];
f27ffc4a 3616 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3617 const char * const send = strchr(s,']');
3280af22 3618 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3619
3620 if (!send) /* has to be an expression */
3621 return TRUE;
3622
3623 Zero(seen,256,char);
3624 if (*s == '$')
3625 weight -= 3;
3626 else if (isDIGIT(*s)) {
3627 if (s[1] != ']') {
3628 if (isDIGIT(s[1]) && s[2] == ']')
3629 weight -= 10;
3630 }
3631 else
3632 weight -= 100;
3633 }
3634 for (; s < send; s++) {
3635 last_un_char = un_char;
3636 un_char = (unsigned char)*s;
3637 switch (*s) {
3638 case '@':
3639 case '&':
3640 case '$':
3641 weight -= seen[un_char] * 10;
7e2040f0 3642 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3643 int len;
8903cb82 3644 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3645 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3646 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3647 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3648 weight -= 100;
3649 else
3650 weight -= 10;
3651 }
3652 else if (*s == '$' && s[1] &&
93a17b20
LW
3653 strchr("[#!%*<>()-=",s[1])) {
3654 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3655 weight -= 10;
3656 else
3657 weight -= 1;
3658 }
3659 break;
3660 case '\\':
3661 un_char = 254;
3662 if (s[1]) {
93a17b20 3663 if (strchr("wds]",s[1]))
79072805 3664 weight += 100;
10edeb5d 3665 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3666 weight += 1;
93a17b20 3667 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3668 weight += 40;
3669 else if (isDIGIT(s[1])) {
3670 weight += 40;
3671 while (s[1] && isDIGIT(s[1]))
3672 s++;
3673 }
3674 }
3675 else
3676 weight += 100;
3677 break;
3678 case '-':
3679 if (s[1] == '\\')
3680 weight += 50;
93a17b20 3681 if (strchr("aA01! ",last_un_char))
79072805 3682 weight += 30;
93a17b20 3683 if (strchr("zZ79~",s[1]))
79072805 3684 weight += 30;
f27ffc4a
GS
3685 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3686 weight -= 5; /* cope with negative subscript */
79072805
LW
3687 break;
3688 default:
3792a11b
NC
3689 if (!isALNUM(last_un_char)
3690 && !(last_un_char == '$' || last_un_char == '@'
3691 || last_un_char == '&')
3692 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3693 char *d = tmpbuf;
3694 while (isALPHA(*s))
3695 *d++ = *s++;
3696 *d = '\0';
5458a98a 3697 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3698 weight -= 150;
3699 }
3700 if (un_char == last_un_char + 1)
3701 weight += 5;
3702 weight -= seen[un_char];
3703 break;
3704 }
3705 seen[un_char]++;
3706 }
3707 if (weight >= 0) /* probably a character class */
3708 return FALSE;
3709 }
3710
3711 return TRUE;
3712}
ffed7fef 3713
ffb4593c
NT
3714/*
3715 * S_intuit_method
3716 *
3717 * Does all the checking to disambiguate
3718 * foo bar
3719 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3720 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3721 *
3722 * First argument is the stuff after the first token, e.g. "bar".
3723 *
3724 * Not a method if bar is a filehandle.
3725 * Not a method if foo is a subroutine prototyped to take a filehandle.
3726 * Not a method if it's really "Foo $bar"
3727 * Method if it's "foo $bar"
3728 * Not a method if it's really "print foo $bar"
3729 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3730 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3731 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3732 * =>
3733 */
3734
76e3520e 3735STATIC int
62d55b22 3736S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3737{
97aff369 3738 dVAR;
a0d0e21e 3739 char *s = start + (*start == '$');
3280af22 3740 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3741 STRLEN len;
3742 GV* indirgv;
5db06880
NC
3743#ifdef PERL_MAD
3744 int soff;
3745#endif
a0d0e21e 3746
7918f24d
NC
3747 PERL_ARGS_ASSERT_INTUIT_METHOD;
3748
a0d0e21e 3749 if (gv) {
62d55b22 3750 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3751 return 0;
62d55b22
NC
3752 if (cv) {
3753 if (SvPOK(cv)) {
8fa6a409 3754 const char *proto = CvPROTO(cv);
62d55b22
NC
3755 if (proto) {
3756 if (*proto == ';')
3757 proto++;
3758 if (*proto == '*')
3759 return 0;
3760 }
b6c543e3
IZ
3761 }
3762 } else
c35e046a 3763 gv = NULL;
a0d0e21e 3764 }
8903cb82 3765 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3766 /* start is the beginning of the possible filehandle/object,
3767 * and s is the end of it
3768 * tmpbuf is a copy of it