This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
When parsing subs with user-defined prototypes, store information needed to throw...
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a
PP
153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
7eb971ee 227 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 228 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
229 * BOop : bitwise or or xor
230 * BAop : bitwise and
231 * SHop : shift operator
232 * PWop : power operator
9cbb5ea2 233 * PMop : pattern-matching operator
ffb4593c
NT
234 * Aop : addition-level operator
235 * Mop : multiplication-level operator
236 * Eop : equality-testing operator
e5edeb50 237 * Rop : relational operator <= != gt
ffb4593c
NT
238 *
239 * Also see LOP and lop() below.
240 */
241
998054bd 242#ifdef DEBUGGING /* Serve -DT. */
704d4215 243# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 244#else
bbf60fe6 245# define REPORT(retval) (retval)
998054bd
SC
246#endif
247
bbf60fe6
DM
248#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
249#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
250#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
251#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
252#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
253#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
254#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
255#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
256#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
257#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 258#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
259#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
260#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
261#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
262#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
263#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
264#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
265#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
266#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
267#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
268#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 269
a687059c
LW
270/* This bit of chicanery makes a unary function followed by
271 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
272 * The UNIDOR macro is for unary functions that can be followed by the //
273 * operator (such as C<shift // 0>).
a687059c 274 */
376fcdbf 275#define UNI2(f,x) { \
6154021b 276 pl_yylval.ival = f; \
376fcdbf
AL
277 PL_expect = x; \
278 PL_bufptr = s; \
279 PL_last_uni = PL_oldbufptr; \
280 PL_last_lop_op = f; \
281 if (*s == '(') \
282 return REPORT( (int)FUNC1 ); \
29595ff2 283 s = PEEKSPACE(s); \
376fcdbf
AL
284 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
285 }
6f33ba73
RGS
286#define UNI(f) UNI2(f,XTERM)
287#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
22393538
MH
288#define UNIPROTO(f) { \
289 PL_last_uni = PL_oldbufptr; \
290 OPERATOR(f); \
291 }
a687059c 292
376fcdbf 293#define UNIBRACK(f) { \
6154021b 294 pl_yylval.ival = f; \
376fcdbf
AL
295 PL_bufptr = s; \
296 PL_last_uni = PL_oldbufptr; \
297 if (*s == '(') \
298 return REPORT( (int)FUNC1 ); \
29595ff2 299 s = PEEKSPACE(s); \
376fcdbf
AL
300 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
301 }
79072805 302
9f68db38 303/* grandfather return to old style */
78cdf107
Z
304#define OLDLOP(f) \
305 do { \
306 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
307 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
308 pl_yylval.ival = (f); \
309 PL_expect = XTERM; \
310 PL_bufptr = s; \
311 return (int)LSTOP; \
312 } while(0)
79072805 313
8fa7f367
JH
314#ifdef DEBUGGING
315
6154021b 316/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
317enum token_type {
318 TOKENTYPE_NONE,
319 TOKENTYPE_IVAL,
6154021b 320 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
321 TOKENTYPE_PVAL,
322 TOKENTYPE_OPVAL,
323 TOKENTYPE_GVVAL
324};
325
6d4a66ac
NC
326static struct debug_tokens {
327 const int token;
328 enum token_type type;
329 const char *name;
330} const debug_tokens[] =
9041c2e3 331{
bbf60fe6
DM
332 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
333 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
334 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
335 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
336 { ARROW, TOKENTYPE_NONE, "ARROW" },
337 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
338 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
339 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
340 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
341 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 342 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
343 { DO, TOKENTYPE_NONE, "DO" },
344 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
345 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
346 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
347 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
348 { ELSE, TOKENTYPE_NONE, "ELSE" },
349 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
350 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
351 { FOR, TOKENTYPE_IVAL, "FOR" },
352 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
353 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
354 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 355 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
356 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
357 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
358 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 359 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
360 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
361 { IF, TOKENTYPE_IVAL, "IF" },
362 { LABEL, TOKENTYPE_PVAL, "LABEL" },
363 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
364 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
365 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
366 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
367 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
368 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
369 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
370 { MY, TOKENTYPE_IVAL, "MY" },
371 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
372 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
373 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
374 { OROP, TOKENTYPE_IVAL, "OROP" },
375 { OROR, TOKENTYPE_NONE, "OROR" },
376 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
377 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
378 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
379 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
380 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
381 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
382 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
383 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
384 { PREINC, TOKENTYPE_NONE, "PREINC" },
385 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
386 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
387 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
388 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
389 { SUB, TOKENTYPE_NONE, "SUB" },
390 { THING, TOKENTYPE_OPVAL, "THING" },
391 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
392 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
393 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
394 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
395 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
396 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 397 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
398 { WHILE, TOKENTYPE_IVAL, "WHILE" },
399 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 400 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 401 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
402};
403
6154021b 404/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 405
bbf60fe6 406STATIC int
704d4215 407S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 408{
97aff369 409 dVAR;
7918f24d
NC
410
411 PERL_ARGS_ASSERT_TOKEREPORT;
412
bbf60fe6 413 if (DEBUG_T_TEST) {
bd61b366 414 const char *name = NULL;
bbf60fe6 415 enum token_type type = TOKENTYPE_NONE;
f54cb97a 416 const struct debug_tokens *p;
396482e1 417 SV* const report = newSVpvs("<== ");
bbf60fe6 418
f54cb97a 419 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
420 if (p->token == (int)rv) {
421 name = p->name;
422 type = p->type;
423 break;
424 }
425 }
426 if (name)
54667de8 427 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
428 else if ((char)rv > ' ' && (char)rv < '~')
429 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
430 else if (!rv)
396482e1 431 sv_catpvs(report, "EOF");
bbf60fe6
DM
432 else
433 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
434 switch (type) {
435 case TOKENTYPE_NONE:
436 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
437 break;
438 case TOKENTYPE_IVAL:
704d4215 439 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_OPNUM:
442 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 443 PL_op_name[lvalp->ival]);
bbf60fe6
DM
444 break;
445 case TOKENTYPE_PVAL:
704d4215 446 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
447 break;
448 case TOKENTYPE_OPVAL:
704d4215 449 if (lvalp->opval) {
401441c0 450 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
451 PL_op_name[lvalp->opval->op_type]);
452 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 453 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 454 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
455 }
456
457 }
401441c0 458 else
396482e1 459 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
460 break;
461 }
b6007c36 462 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
463 };
464 return (int)rv;
998054bd
SC
465}
466
b6007c36
DM
467
468/* print the buffer with suitable escapes */
469
470STATIC void
15f169a1 471S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 472{
396482e1 473 SV* const tmp = newSVpvs("");
7918f24d
NC
474
475 PERL_ARGS_ASSERT_PRINTBUF;
476
b6007c36
DM
477 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
478 SvREFCNT_dec(tmp);
479}
480
8fa7f367
JH
481#endif
482
8290c323
NC
483static int
484S_deprecate_commaless_var_list(pTHX) {
485 PL_expect = XTERM;
486 deprecate("comma-less variable list");
487 return REPORT(','); /* grandfather non-comma-format format */
488}
489
ffb4593c
NT
490/*
491 * S_ao
492 *
c963b151
BD
493 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
494 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
495 */
496
76e3520e 497STATIC int
cea2e8a9 498S_ao(pTHX_ int toketype)
a0d0e21e 499{
97aff369 500 dVAR;
3280af22
NIS
501 if (*PL_bufptr == '=') {
502 PL_bufptr++;
a0d0e21e 503 if (toketype == ANDAND)
6154021b 504 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 505 else if (toketype == OROR)
6154021b 506 pl_yylval.ival = OP_ORASSIGN;
c963b151 507 else if (toketype == DORDOR)
6154021b 508 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
509 toketype = ASSIGNOP;
510 }
511 return toketype;
512}
513
ffb4593c
NT
514/*
515 * S_no_op
516 * When Perl expects an operator and finds something else, no_op
517 * prints the warning. It always prints "<something> found where
518 * operator expected. It prints "Missing semicolon on previous line?"
519 * if the surprise occurs at the start of the line. "do you need to
520 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
521 * where the compiler doesn't know if foo is a method call or a function.
522 * It prints "Missing operator before end of line" if there's nothing
523 * after the missing operator, or "... before <...>" if there is something
524 * after the missing operator.
525 */
526
76e3520e 527STATIC void
15f169a1 528S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 529{
97aff369 530 dVAR;
9d4ba2ae
AL
531 char * const oldbp = PL_bufptr;
532 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 533
7918f24d
NC
534 PERL_ARGS_ASSERT_NO_OP;
535
1189a94a
GS
536 if (!s)
537 s = oldbp;
07c798fb 538 else
1189a94a 539 PL_bufptr = s;
cea2e8a9 540 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
541 if (ckWARN_d(WARN_SYNTAX)) {
542 if (is_first)
543 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
544 "\t(Missing semicolon on previous line?)\n");
545 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 546 const char *t;
c35e046a
AL
547 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
548 NOOP;
56da5a46
RGS
549 if (t < PL_bufptr && isSPACE(*t))
550 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551 "\t(Do you need to predeclare %.*s?)\n",
551405c4 552 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
553 }
554 else {
555 assert(s >= oldbp);
556 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 557 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 558 }
07c798fb 559 }
3280af22 560 PL_bufptr = oldbp;
8990e307
LW
561}
562
ffb4593c
NT
563/*
564 * S_missingterm
565 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 566 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
567 * If we're in a delimited string and the delimiter is a control
568 * character, it's reformatted into a two-char sequence like ^C.
569 * This is fatal.
570 */
571
76e3520e 572STATIC void
cea2e8a9 573S_missingterm(pTHX_ char *s)
8990e307 574{
97aff369 575 dVAR;
8990e307
LW
576 char tmpbuf[3];
577 char q;
578 if (s) {
9d4ba2ae 579 char * const nl = strrchr(s,'\n');
d2719217 580 if (nl)
8990e307
LW
581 *nl = '\0';
582 }
463559e7 583 else if (isCNTRL(PL_multi_close)) {
8990e307 584 *tmpbuf = '^';
585ec06d 585 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
586 tmpbuf[2] = '\0';
587 s = tmpbuf;
588 }
589 else {
eb160463 590 *tmpbuf = (char)PL_multi_close;
8990e307
LW
591 tmpbuf[1] = '\0';
592 s = tmpbuf;
593 }
594 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 595 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 596}
79072805 597
0d863452 598/*
0d863452
RH
599 * Check whether the named feature is enabled.
600 */
26ea9e12
NC
601bool
602Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 603{
97aff369 604 dVAR;
0d863452 605 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 606 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
607
608 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
609
26ea9e12
NC
610 if (namelen > MAX_FEATURE_LEN)
611 return FALSE;
4a731d7b 612 memcpy(&he_name[8], name, namelen);
d4c19fe8 613
7b9ef140 614 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
615}
616
ffb4593c 617/*
9cbb5ea2
GS
618 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
619 * utf16-to-utf8-reversed.
ffb4593c
NT
620 */
621
c39cd008
GS
622#ifdef PERL_CR_FILTER
623static void
624strip_return(SV *sv)
625{
95a20fc0 626 register const char *s = SvPVX_const(sv);
9d4ba2ae 627 register const char * const e = s + SvCUR(sv);
7918f24d
NC
628
629 PERL_ARGS_ASSERT_STRIP_RETURN;
630
c39cd008
GS
631 /* outer loop optimized to do nothing if there are no CR-LFs */
632 while (s < e) {
633 if (*s++ == '\r' && *s == '\n') {
634 /* hit a CR-LF, need to copy the rest */
635 register char *d = s - 1;
636 *d++ = *s++;
637 while (s < e) {
638 if (*s == '\r' && s[1] == '\n')
639 s++;
640 *d++ = *s++;
641 }
642 SvCUR(sv) -= s - d;
643 return;
644 }
645 }
646}
a868473f 647
76e3520e 648STATIC I32
c39cd008 649S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 650{
f54cb97a 651 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
652 if (count > 0 && !maxlen)
653 strip_return(sv);
654 return count;
a868473f
NIS
655}
656#endif
657
ffb4593c 658/*
8eaa0acf
Z
659=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
660
661Creates and initialises a new lexer/parser state object, supplying
662a context in which to lex and parse from a new source of Perl code.
663A pointer to the new state object is placed in L</PL_parser>. An entry
664is made on the save stack so that upon unwinding the new state object
665will be destroyed and the former value of L</PL_parser> will be restored.
666Nothing else need be done to clean up the parsing context.
667
668The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
669non-null, provides a string (in SV form) containing code to be parsed.
670A copy of the string is made, so subsequent modification of I<line>
671does not affect parsing. I<rsfp>, if non-null, provides an input stream
672from which code will be read to be parsed. If both are non-null, the
673code in I<line> comes first and must consist of complete lines of input,
674and I<rsfp> supplies the remainder of the source.
675
e368b3bd
FC
676The I<flags> parameter is reserved for future use. Currently it is only
677used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
678
679=cut
680*/
ffb4593c 681
27fcb6ee
FC
682/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
683 can share filters with the current parser. */
684
a0d0e21e 685void
8eaa0acf 686Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 687{
97aff369 688 dVAR;
6ef55633 689 const char *s = NULL;
5486870f 690 yy_parser *parser, *oparser;
60d63348 691 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 692 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
693
694 /* create and initialise a parser */
695
199e78b7 696 Newxz(parser, 1, yy_parser);
5486870f 697 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
698 PL_parser = parser;
699
28ac2b49
Z
700 parser->stack = NULL;
701 parser->ps = NULL;
702 parser->stack_size = 0;
acdf0a21 703
e3abe207
DM
704 /* on scope exit, free this parser and restore any outer one */
705 SAVEPARSER(parser);
7c4baf47 706 parser->saved_curcop = PL_curcop;
e3abe207 707
acdf0a21 708 /* initialise lexer state */
8990e307 709
fb205e7a
DM
710#ifdef PERL_MAD
711 parser->curforce = -1;
712#else
713 parser->nexttoke = 0;
714#endif
ca4cfd28 715 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 716 parser->copline = NOLINE;
5afb0a62 717 parser->lex_state = LEX_NORMAL;
c2598295 718 parser->expect = XSTATE;
2f9285f8 719 parser->rsfp = rsfp;
27fcb6ee
FC
720 parser->rsfp_filters =
721 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
722 ? NULL
723 : MUTABLE_AV(SvREFCNT_inc(
724 oparser->rsfp_filters
725 ? oparser->rsfp_filters
726 : (oparser->rsfp_filters = newAV())
727 ));
2f9285f8 728
199e78b7
DM
729 Newx(parser->lex_brackstack, 120, char);
730 Newx(parser->lex_casestack, 12, char);
731 *parser->lex_casestack = '\0';
02b34bbe 732
10efb74f 733 if (line) {
0528fd32 734 STRLEN len;
10efb74f 735 s = SvPV_const(line, len);
0abcdfa4
FC
736 parser->linestr = flags & LEX_START_COPIED
737 ? SvREFCNT_inc_simple_NN(line)
738 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 739 if (!len || s[len-1] != ';')
bdc0bf6f 740 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
741 } else {
742 parser->linestr = newSVpvs("\n;");
8990e307 743 }
f06b5848
DM
744 parser->oldoldbufptr =
745 parser->oldbufptr =
746 parser->bufptr =
747 parser->linestart = SvPVX(parser->linestr);
748 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
749 parser->last_lop = parser->last_uni = NULL;
60d63348 750 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES);
737c24fc 751
60d63348 752 parser->in_pod = parser->filtered = 0;
79072805 753}
a687059c 754
e3abe207
DM
755
756/* delete a parser object */
757
758void
759Perl_parser_free(pTHX_ const yy_parser *parser)
760{
7918f24d
NC
761 PERL_ARGS_ASSERT_PARSER_FREE;
762
7c4baf47 763 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
764 SvREFCNT_dec(parser->linestr);
765
2f9285f8
DM
766 if (parser->rsfp == PerlIO_stdin())
767 PerlIO_clearerr(parser->rsfp);
799361c3
SH
768 else if (parser->rsfp && (!parser->old_parser ||
769 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 770 PerlIO_close(parser->rsfp);
5486870f 771 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 772
e3abe207
DM
773 Safefree(parser->lex_brackstack);
774 Safefree(parser->lex_casestack);
775 PL_parser = parser->old_parser;
776 Safefree(parser);
777}
778
779
ffb4593c 780/*
f0e67a1d
Z
781=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
782
783Buffer scalar containing the chunk currently under consideration of the
784text currently being lexed. This is always a plain string scalar (for
785which C<SvPOK> is true). It is not intended to be used as a scalar by
786normal scalar means; instead refer to the buffer directly by the pointer
787variables described below.
788
789The lexer maintains various C<char*> pointers to things in the
790C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
791reallocated, all of these pointers must be updated. Don't attempt to
792do this manually, but rather use L</lex_grow_linestr> if you need to
793reallocate the buffer.
794
795The content of the text chunk in the buffer is commonly exactly one
796complete line of input, up to and including a newline terminator,
797but there are situations where it is otherwise. The octets of the
798buffer may be intended to be interpreted as either UTF-8 or Latin-1.
799The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
800flag on this scalar, which may disagree with it.
801
802For direct examination of the buffer, the variable
803L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
804lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
805of these pointers is usually preferable to examination of the scalar
806through normal scalar means.
807
808=for apidoc AmxU|char *|PL_parser-E<gt>bufend
809
810Direct pointer to the end of the chunk of text currently being lexed, the
811end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
812+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
813always located at the end of the buffer, and does not count as part of
814the buffer's contents.
815
816=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
817
818Points to the current position of lexing inside the lexer buffer.
819Characters around this point may be freely examined, within
820the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
821L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
822interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
823
824Lexing code (whether in the Perl core or not) moves this pointer past
825the characters that it consumes. It is also expected to perform some
826bookkeeping whenever a newline character is consumed. This movement
827can be more conveniently performed by the function L</lex_read_to>,
828which handles newlines appropriately.
829
830Interpretation of the buffer's octets can be abstracted out by
831using the slightly higher-level functions L</lex_peek_unichar> and
832L</lex_read_unichar>.
833
834=for apidoc AmxU|char *|PL_parser-E<gt>linestart
835
836Points to the start of the current line inside the lexer buffer.
837This is useful for indicating at which column an error occurred, and
838not much else. This must be updated by any lexing code that consumes
839a newline; the function L</lex_read_to> handles this detail.
840
841=cut
842*/
843
844/*
845=for apidoc Amx|bool|lex_bufutf8
846
847Indicates whether the octets in the lexer buffer
848(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
849of Unicode characters. If not, they should be interpreted as Latin-1
850characters. This is analogous to the C<SvUTF8> flag for scalars.
851
852In UTF-8 mode, it is not guaranteed that the lexer buffer actually
853contains valid UTF-8. Lexing code must be robust in the face of invalid
854encoding.
855
856The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
857is significant, but not the whole story regarding the input character
858encoding. Normally, when a file is being read, the scalar contains octets
859and its C<SvUTF8> flag is off, but the octets should be interpreted as
860UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
861however, the scalar may have the C<SvUTF8> flag on, and in this case its
862octets should be interpreted as UTF-8 unless the C<use bytes> pragma
863is in effect. This logic may change in the future; use this function
864instead of implementing the logic yourself.
865
866=cut
867*/
868
869bool
870Perl_lex_bufutf8(pTHX)
871{
872 return UTF;
873}
874
875/*
876=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
877
878Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
879at least I<len> octets (including terminating NUL). Returns a
880pointer to the reallocated buffer. This is necessary before making
881any direct modification of the buffer that would increase its length.
882L</lex_stuff_pvn> provides a more convenient way to insert text into
883the buffer.
884
885Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
886this function updates all of the lexer's variables that point directly
887into the buffer.
888
889=cut
890*/
891
892char *
893Perl_lex_grow_linestr(pTHX_ STRLEN len)
894{
895 SV *linestr;
896 char *buf;
897 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
898 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
899 linestr = PL_parser->linestr;
900 buf = SvPVX(linestr);
901 if (len <= SvLEN(linestr))
902 return buf;
903 bufend_pos = PL_parser->bufend - buf;
904 bufptr_pos = PL_parser->bufptr - buf;
905 oldbufptr_pos = PL_parser->oldbufptr - buf;
906 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
907 linestart_pos = PL_parser->linestart - buf;
908 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
909 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
910 buf = sv_grow(linestr, len);
911 PL_parser->bufend = buf + bufend_pos;
912 PL_parser->bufptr = buf + bufptr_pos;
913 PL_parser->oldbufptr = buf + oldbufptr_pos;
914 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
915 PL_parser->linestart = buf + linestart_pos;
916 if (PL_parser->last_uni)
917 PL_parser->last_uni = buf + last_uni_pos;
918 if (PL_parser->last_lop)
919 PL_parser->last_lop = buf + last_lop_pos;
920 return buf;
921}
922
923/*
83aa740e 924=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
925
926Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
927immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
928reallocating the buffer if necessary. This means that lexing code that
929runs later will see the characters as if they had appeared in the input.
930It is not recommended to do this as part of normal parsing, and most
931uses of this facility run the risk of the inserted characters being
932interpreted in an unintended manner.
933
934The string to be inserted is represented by I<len> octets starting
935at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
936according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
937The characters are recoded for the lexer buffer, according to how the
938buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 939to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
940function is more convenient.
941
942=cut
943*/
944
945void
83aa740e 946Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 947{
749123ff 948 dVAR;
f0e67a1d
Z
949 char *bufptr;
950 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
951 if (flags & ~(LEX_STUFF_UTF8))
952 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
953 if (UTF) {
954 if (flags & LEX_STUFF_UTF8) {
955 goto plain_copy;
956 } else {
957 STRLEN highhalf = 0;
83aa740e 958 const char *p, *e = pv+len;
f0e67a1d
Z
959 for (p = pv; p != e; p++)
960 highhalf += !!(((U8)*p) & 0x80);
961 if (!highhalf)
962 goto plain_copy;
963 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
964 bufptr = PL_parser->bufptr;
965 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
966 SvCUR_set(PL_parser->linestr,
967 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
968 PL_parser->bufend += len+highhalf;
969 for (p = pv; p != e; p++) {
970 U8 c = (U8)*p;
971 if (c & 0x80) {
972 *bufptr++ = (char)(0xc0 | (c >> 6));
973 *bufptr++ = (char)(0x80 | (c & 0x3f));
974 } else {
975 *bufptr++ = (char)c;
976 }
977 }
978 }
979 } else {
980 if (flags & LEX_STUFF_UTF8) {
981 STRLEN highhalf = 0;
83aa740e 982 const char *p, *e = pv+len;
f0e67a1d
Z
983 for (p = pv; p != e; p++) {
984 U8 c = (U8)*p;
985 if (c >= 0xc4) {
986 Perl_croak(aTHX_ "Lexing code attempted to stuff "
987 "non-Latin-1 character into Latin-1 input");
988 } else if (c >= 0xc2 && p+1 != e &&
989 (((U8)p[1]) & 0xc0) == 0x80) {
990 p++;
991 highhalf++;
992 } else if (c >= 0x80) {
993 /* malformed UTF-8 */
994 ENTER;
995 SAVESPTR(PL_warnhook);
996 PL_warnhook = PERL_WARNHOOK_FATAL;
997 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
998 LEAVE;
999 }
1000 }
1001 if (!highhalf)
1002 goto plain_copy;
1003 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1004 bufptr = PL_parser->bufptr;
1005 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1006 SvCUR_set(PL_parser->linestr,
1007 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1008 PL_parser->bufend += len-highhalf;
1009 for (p = pv; p != e; p++) {
1010 U8 c = (U8)*p;
1011 if (c & 0x80) {
1012 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1013 p++;
1014 } else {
1015 *bufptr++ = (char)c;
1016 }
1017 }
1018 } else {
1019 plain_copy:
1020 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1021 bufptr = PL_parser->bufptr;
1022 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1023 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1024 PL_parser->bufend += len;
1025 Copy(pv, bufptr, len, char);
1026 }
1027 }
1028}
1029
1030/*
9dcc53ea
Z
1031=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1032
1033Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1034immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1035reallocating the buffer if necessary. This means that lexing code that
1036runs later will see the characters as if they had appeared in the input.
1037It is not recommended to do this as part of normal parsing, and most
1038uses of this facility run the risk of the inserted characters being
1039interpreted in an unintended manner.
1040
1041The string to be inserted is represented by octets starting at I<pv>
1042and continuing to the first nul. These octets are interpreted as either
1043UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1044in I<flags>. The characters are recoded for the lexer buffer, according
1045to how the buffer is currently being interpreted (L</lex_bufutf8>).
1046If it is not convenient to nul-terminate a string to be inserted, the
1047L</lex_stuff_pvn> function is more appropriate.
1048
1049=cut
1050*/
1051
1052void
1053Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1054{
1055 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1056 lex_stuff_pvn(pv, strlen(pv), flags);
1057}
1058
1059/*
f0e67a1d
Z
1060=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1061
1062Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1063immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1064reallocating the buffer if necessary. This means that lexing code that
1065runs later will see the characters as if they had appeared in the input.
1066It is not recommended to do this as part of normal parsing, and most
1067uses of this facility run the risk of the inserted characters being
1068interpreted in an unintended manner.
1069
1070The string to be inserted is the string value of I<sv>. The characters
1071are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1072being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1073not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1074need to construct a scalar.
1075
1076=cut
1077*/
1078
1079void
1080Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1081{
1082 char *pv;
1083 STRLEN len;
1084 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1085 if (flags)
1086 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1087 pv = SvPV(sv, len);
1088 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1089}
1090
1091/*
1092=for apidoc Amx|void|lex_unstuff|char *ptr
1093
1094Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1095I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1096This hides the discarded text from any lexing code that runs later,
1097as if the text had never appeared.
1098
1099This is not the normal way to consume lexed text. For that, use
1100L</lex_read_to>.
1101
1102=cut
1103*/
1104
1105void
1106Perl_lex_unstuff(pTHX_ char *ptr)
1107{
1108 char *buf, *bufend;
1109 STRLEN unstuff_len;
1110 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1111 buf = PL_parser->bufptr;
1112 if (ptr < buf)
1113 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1114 if (ptr == buf)
1115 return;
1116 bufend = PL_parser->bufend;
1117 if (ptr > bufend)
1118 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1119 unstuff_len = ptr - buf;
1120 Move(ptr, buf, bufend+1-ptr, char);
1121 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1122 PL_parser->bufend = bufend - unstuff_len;
1123}
1124
1125/*
1126=for apidoc Amx|void|lex_read_to|char *ptr
1127
1128Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1129to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1130performing the correct bookkeeping whenever a newline character is passed.
1131This is the normal way to consume lexed text.
1132
1133Interpretation of the buffer's octets can be abstracted out by
1134using the slightly higher-level functions L</lex_peek_unichar> and
1135L</lex_read_unichar>.
1136
1137=cut
1138*/
1139
1140void
1141Perl_lex_read_to(pTHX_ char *ptr)
1142{
1143 char *s;
1144 PERL_ARGS_ASSERT_LEX_READ_TO;
1145 s = PL_parser->bufptr;
1146 if (ptr < s || ptr > PL_parser->bufend)
1147 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1148 for (; s != ptr; s++)
1149 if (*s == '\n') {
1150 CopLINE_inc(PL_curcop);
1151 PL_parser->linestart = s+1;
1152 }
1153 PL_parser->bufptr = ptr;
1154}
1155
1156/*
1157=for apidoc Amx|void|lex_discard_to|char *ptr
1158
1159Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1160up to I<ptr>. The remaining content of the buffer will be moved, and
1161all pointers into the buffer updated appropriately. I<ptr> must not
1162be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1163it is not permitted to discard text that has yet to be lexed.
1164
1165Normally it is not necessarily to do this directly, because it suffices to
1166use the implicit discarding behaviour of L</lex_next_chunk> and things
1167based on it. However, if a token stretches across multiple lines,
1f317c95 1168and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1169that purpose, then after completion of the token it would be wise to
1170explicitly discard the now-unneeded earlier lines, to avoid future
1171multi-line tokens growing the buffer without bound.
1172
1173=cut
1174*/
1175
1176void
1177Perl_lex_discard_to(pTHX_ char *ptr)
1178{
1179 char *buf;
1180 STRLEN discard_len;
1181 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1182 buf = SvPVX(PL_parser->linestr);
1183 if (ptr < buf)
1184 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1185 if (ptr == buf)
1186 return;
1187 if (ptr > PL_parser->bufptr)
1188 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1189 discard_len = ptr - buf;
1190 if (PL_parser->oldbufptr < ptr)
1191 PL_parser->oldbufptr = ptr;
1192 if (PL_parser->oldoldbufptr < ptr)
1193 PL_parser->oldoldbufptr = ptr;
1194 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1195 PL_parser->last_uni = NULL;
1196 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1197 PL_parser->last_lop = NULL;
1198 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1199 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1200 PL_parser->bufend -= discard_len;
1201 PL_parser->bufptr -= discard_len;
1202 PL_parser->oldbufptr -= discard_len;
1203 PL_parser->oldoldbufptr -= discard_len;
1204 if (PL_parser->last_uni)
1205 PL_parser->last_uni -= discard_len;
1206 if (PL_parser->last_lop)
1207 PL_parser->last_lop -= discard_len;
1208}
1209
1210/*
1211=for apidoc Amx|bool|lex_next_chunk|U32 flags
1212
1213Reads in the next chunk of text to be lexed, appending it to
1214L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1215looked to the end of the current chunk and wants to know more. It is
1216usual, but not necessary, for lexing to have consumed the entirety of
1217the current chunk at this time.
1218
1219If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1220chunk (i.e., the current chunk has been entirely consumed), normally the
1221current chunk will be discarded at the same time that the new chunk is
1222read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1223will not be discarded. If the current chunk has not been entirely
1224consumed, then it will not be discarded regardless of the flag.
1225
1226Returns true if some new text was added to the buffer, or false if the
1227buffer has reached the end of the input text.
1228
1229=cut
1230*/
1231
1232#define LEX_FAKE_EOF 0x80000000
1233
1234bool
1235Perl_lex_next_chunk(pTHX_ U32 flags)
1236{
1237 SV *linestr;
1238 char *buf;
1239 STRLEN old_bufend_pos, new_bufend_pos;
1240 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1241 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1242 bool got_some_for_debugger = 0;
f0e67a1d
Z
1243 bool got_some;
1244 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1245 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1246 linestr = PL_parser->linestr;
1247 buf = SvPVX(linestr);
1248 if (!(flags & LEX_KEEP_PREVIOUS) &&
1249 PL_parser->bufptr == PL_parser->bufend) {
1250 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1251 linestart_pos = 0;
1252 if (PL_parser->last_uni != PL_parser->bufend)
1253 PL_parser->last_uni = NULL;
1254 if (PL_parser->last_lop != PL_parser->bufend)
1255 PL_parser->last_lop = NULL;
1256 last_uni_pos = last_lop_pos = 0;
1257 *buf = 0;
1258 SvCUR(linestr) = 0;
1259 } else {
1260 old_bufend_pos = PL_parser->bufend - buf;
1261 bufptr_pos = PL_parser->bufptr - buf;
1262 oldbufptr_pos = PL_parser->oldbufptr - buf;
1263 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1264 linestart_pos = PL_parser->linestart - buf;
1265 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1266 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1267 }
1268 if (flags & LEX_FAKE_EOF) {
1269 goto eof;
60d63348 1270 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1271 got_some = 0;
1272 } else if (filter_gets(linestr, old_bufend_pos)) {
1273 got_some = 1;
17cc9359 1274 got_some_for_debugger = 1;
f0e67a1d 1275 } else {
580561a3
Z
1276 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1277 sv_setpvs(linestr, "");
f0e67a1d
Z
1278 eof:
1279 /* End of real input. Close filehandle (unless it was STDIN),
1280 * then add implicit termination.
1281 */
1282 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1283 PerlIO_clearerr(PL_parser->rsfp);
1284 else if (PL_parser->rsfp)
1285 (void)PerlIO_close(PL_parser->rsfp);
1286 PL_parser->rsfp = NULL;
60d63348 1287 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1288#ifdef PERL_MAD
1289 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1290 PL_faketokens = 1;
1291#endif
1292 if (!PL_in_eval && PL_minus_p) {
1293 sv_catpvs(linestr,
1294 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1295 PL_minus_n = PL_minus_p = 0;
1296 } else if (!PL_in_eval && PL_minus_n) {
1297 sv_catpvs(linestr, /*{*/";}");
1298 PL_minus_n = 0;
1299 } else
1300 sv_catpvs(linestr, ";");
1301 got_some = 1;
1302 }
1303 buf = SvPVX(linestr);
1304 new_bufend_pos = SvCUR(linestr);
1305 PL_parser->bufend = buf + new_bufend_pos;
1306 PL_parser->bufptr = buf + bufptr_pos;
1307 PL_parser->oldbufptr = buf + oldbufptr_pos;
1308 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1309 PL_parser->linestart = buf + linestart_pos;
1310 if (PL_parser->last_uni)
1311 PL_parser->last_uni = buf + last_uni_pos;
1312 if (PL_parser->last_lop)
1313 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1314 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1315 PL_curstash != PL_debstash) {
1316 /* debugger active and we're not compiling the debugger code,
1317 * so store the line into the debugger's array of lines
1318 */
1319 update_debugger_info(NULL, buf+old_bufend_pos,
1320 new_bufend_pos-old_bufend_pos);
1321 }
1322 return got_some;
1323}
1324
1325/*
1326=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1327
1328Looks ahead one (Unicode) character in the text currently being lexed.
1329Returns the codepoint (unsigned integer value) of the next character,
1330or -1 if lexing has reached the end of the input text. To consume the
1331peeked character, use L</lex_read_unichar>.
1332
1333If the next character is in (or extends into) the next chunk of input
1334text, the next chunk will be read in. Normally the current chunk will be
1335discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1336then the current chunk will not be discarded.
1337
1338If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1339is encountered, an exception is generated.
1340
1341=cut
1342*/
1343
1344I32
1345Perl_lex_peek_unichar(pTHX_ U32 flags)
1346{
749123ff 1347 dVAR;
f0e67a1d
Z
1348 char *s, *bufend;
1349 if (flags & ~(LEX_KEEP_PREVIOUS))
1350 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1351 s = PL_parser->bufptr;
1352 bufend = PL_parser->bufend;
1353 if (UTF) {
1354 U8 head;
1355 I32 unichar;
1356 STRLEN len, retlen;
1357 if (s == bufend) {
1358 if (!lex_next_chunk(flags))
1359 return -1;
1360 s = PL_parser->bufptr;
1361 bufend = PL_parser->bufend;
1362 }
1363 head = (U8)*s;
1364 if (!(head & 0x80))
1365 return head;
1366 if (head & 0x40) {
1367 len = PL_utf8skip[head];
1368 while ((STRLEN)(bufend-s) < len) {
1369 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1370 break;
1371 s = PL_parser->bufptr;
1372 bufend = PL_parser->bufend;
1373 }
1374 }
1375 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1376 if (retlen == (STRLEN)-1) {
1377 /* malformed UTF-8 */
1378 ENTER;
1379 SAVESPTR(PL_warnhook);
1380 PL_warnhook = PERL_WARNHOOK_FATAL;
1381 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1382 LEAVE;
1383 }
1384 return unichar;
1385 } else {
1386 if (s == bufend) {
1387 if (!lex_next_chunk(flags))
1388 return -1;
1389 s = PL_parser->bufptr;
1390 }
1391 return (U8)*s;
1392 }
1393}
1394
1395/*
1396=for apidoc Amx|I32|lex_read_unichar|U32 flags
1397
1398Reads the next (Unicode) character in the text currently being lexed.
1399Returns the codepoint (unsigned integer value) of the character read,
1400and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1401if lexing has reached the end of the input text. To non-destructively
1402examine the next character, use L</lex_peek_unichar> instead.
1403
1404If the next character is in (or extends into) the next chunk of input
1405text, the next chunk will be read in. Normally the current chunk will be
1406discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1407then the current chunk will not be discarded.
1408
1409If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1410is encountered, an exception is generated.
1411
1412=cut
1413*/
1414
1415I32
1416Perl_lex_read_unichar(pTHX_ U32 flags)
1417{
1418 I32 c;
1419 if (flags & ~(LEX_KEEP_PREVIOUS))
1420 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1421 c = lex_peek_unichar(flags);
1422 if (c != -1) {
1423 if (c == '\n')
1424 CopLINE_inc(PL_curcop);
d9018cbe
EB
1425 if (UTF)
1426 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1427 else
1428 ++(PL_parser->bufptr);
f0e67a1d
Z
1429 }
1430 return c;
1431}
1432
1433/*
1434=for apidoc Amx|void|lex_read_space|U32 flags
1435
1436Reads optional spaces, in Perl style, in the text currently being
1437lexed. The spaces may include ordinary whitespace characters and
1438Perl-style comments. C<#line> directives are processed if encountered.
1439L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1440at a non-space character (or the end of the input text).
1441
1442If spaces extend into the next chunk of input text, the next chunk will
1443be read in. Normally the current chunk will be discarded at the same
1444time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1445chunk will not be discarded.
1446
1447=cut
1448*/
1449
f0998909
Z
1450#define LEX_NO_NEXT_CHUNK 0x80000000
1451
f0e67a1d
Z
1452void
1453Perl_lex_read_space(pTHX_ U32 flags)
1454{
1455 char *s, *bufend;
1456 bool need_incline = 0;
f0998909 1457 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1458 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1459#ifdef PERL_MAD
1460 if (PL_skipwhite) {
1461 sv_free(PL_skipwhite);
1462 PL_skipwhite = NULL;
1463 }
1464 if (PL_madskills)
1465 PL_skipwhite = newSVpvs("");
1466#endif /* PERL_MAD */
1467 s = PL_parser->bufptr;
1468 bufend = PL_parser->bufend;
1469 while (1) {
1470 char c = *s;
1471 if (c == '#') {
1472 do {
1473 c = *++s;
1474 } while (!(c == '\n' || (c == 0 && s == bufend)));
1475 } else if (c == '\n') {
1476 s++;
1477 PL_parser->linestart = s;
1478 if (s == bufend)
1479 need_incline = 1;
1480 else
1481 incline(s);
1482 } else if (isSPACE(c)) {
1483 s++;
1484 } else if (c == 0 && s == bufend) {
1485 bool got_more;
1486#ifdef PERL_MAD
1487 if (PL_madskills)
1488 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1489#endif /* PERL_MAD */
f0998909
Z
1490 if (flags & LEX_NO_NEXT_CHUNK)
1491 break;
f0e67a1d
Z
1492 PL_parser->bufptr = s;
1493 CopLINE_inc(PL_curcop);
1494 got_more = lex_next_chunk(flags);
1495 CopLINE_dec(PL_curcop);
1496 s = PL_parser->bufptr;
1497 bufend = PL_parser->bufend;
1498 if (!got_more)
1499 break;
1500 if (need_incline && PL_parser->rsfp) {
1501 incline(s);
1502 need_incline = 0;
1503 }
1504 } else {
1505 break;
1506 }
1507 }
1508#ifdef PERL_MAD
1509 if (PL_madskills)
1510 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1511#endif /* PERL_MAD */
1512 PL_parser->bufptr = s;
1513}
1514
1515/*
ffb4593c
NT
1516 * S_incline
1517 * This subroutine has nothing to do with tilting, whether at windmills
1518 * or pinball tables. Its name is short for "increment line". It
57843af0 1519 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1520 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1521 * # line 500 "foo.pm"
1522 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1523 */
1524
76e3520e 1525STATIC void
d9095cec 1526S_incline(pTHX_ const char *s)
463ee0b2 1527{
97aff369 1528 dVAR;
d9095cec
NC
1529 const char *t;
1530 const char *n;
1531 const char *e;
8818d409 1532 line_t line_num;
463ee0b2 1533
7918f24d
NC
1534 PERL_ARGS_ASSERT_INCLINE;
1535
57843af0 1536 CopLINE_inc(PL_curcop);
463ee0b2
LW
1537 if (*s++ != '#')
1538 return;
d4c19fe8
AL
1539 while (SPACE_OR_TAB(*s))
1540 s++;
73659bf1
GS
1541 if (strnEQ(s, "line", 4))
1542 s += 4;
1543 else
1544 return;
084592ab 1545 if (SPACE_OR_TAB(*s))
73659bf1 1546 s++;
4e553d73 1547 else
73659bf1 1548 return;
d4c19fe8
AL
1549 while (SPACE_OR_TAB(*s))
1550 s++;
463ee0b2
LW
1551 if (!isDIGIT(*s))
1552 return;
d4c19fe8 1553
463ee0b2
LW
1554 n = s;
1555 while (isDIGIT(*s))
1556 s++;
07714eb4 1557 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1558 return;
bf4acbe4 1559 while (SPACE_OR_TAB(*s))
463ee0b2 1560 s++;
73659bf1 1561 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1562 s++;
73659bf1
GS
1563 e = t + 1;
1564 }
463ee0b2 1565 else {
c35e046a
AL
1566 t = s;
1567 while (!isSPACE(*t))
1568 t++;
73659bf1 1569 e = t;
463ee0b2 1570 }
bf4acbe4 1571 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1572 e++;
1573 if (*e != '\n' && *e != '\0')
1574 return; /* false alarm */
1575
8818d409
FC
1576 line_num = atoi(n)-1;
1577
f4dd75d9 1578 if (t - s > 0) {
d9095cec 1579 const STRLEN len = t - s;
19bad673
NC
1580 SV *const temp_sv = CopFILESV(PL_curcop);
1581 const char *cf;
1582 STRLEN tmplen;
1583
1584 if (temp_sv) {
1585 cf = SvPVX(temp_sv);
1586 tmplen = SvCUR(temp_sv);
1587 } else {
1588 cf = NULL;
1589 tmplen = 0;
1590 }
1591
d1299d44 1592 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1593 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1594 * to *{"::_<newfilename"} */
44867030
NC
1595 /* However, the long form of evals is only turned on by the
1596 debugger - usually they're "(eval %lu)" */
1597 char smallbuf[128];
1598 char *tmpbuf;
1599 GV **gvp;
d9095cec 1600 STRLEN tmplen2 = len;
798b63bc 1601 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1602 tmpbuf = smallbuf;
1603 else
2ae0db35 1604 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1605 tmpbuf[0] = '_';
1606 tmpbuf[1] = '<';
2ae0db35 1607 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1608 tmplen += 2;
8a5ee598
RGS
1609 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1610 if (gvp) {
44867030
NC
1611 char *tmpbuf2;
1612 GV *gv2;
1613
1614 if (tmplen2 + 2 <= sizeof smallbuf)
1615 tmpbuf2 = smallbuf;
1616 else
1617 Newx(tmpbuf2, tmplen2 + 2, char);
1618
1619 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1620 /* Either they malloc'd it, or we malloc'd it,
1621 so no prefix is present in ours. */
1622 tmpbuf2[0] = '_';
1623 tmpbuf2[1] = '<';
1624 }
1625
1626 memcpy(tmpbuf2 + 2, s, tmplen2);
1627 tmplen2 += 2;
1628
8a5ee598 1629 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1630 if (!isGV(gv2)) {
8a5ee598 1631 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1632 /* adjust ${"::_<newfilename"} to store the new file name */
1633 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1634 /* The line number may differ. If that is the case,
1635 alias the saved lines that are in the array.
1636 Otherwise alias the whole array. */
1637 if (CopLINE(PL_curcop) == line_num) {
1638 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1639 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1640 }
1641 else if (GvAV(*gvp)) {
1642 AV * const av = GvAV(*gvp);
1643 const I32 start = CopLINE(PL_curcop)+1;
1644 I32 items = AvFILLp(av) - start;
1645 if (items > 0) {
1646 AV * const av2 = GvAVn(gv2);
1647 SV **svp = AvARRAY(av) + start;
1648 I32 l = (I32)line_num+1;
1649 while (items--)
1650 av_store(av2, l++, SvREFCNT_inc(*svp++));
1651 }
1652 }
e5527e4b 1653 }
44867030
NC
1654
1655 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1656 }
e66cf94c 1657 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1658 }
05ec9bb3 1659 CopFILE_free(PL_curcop);
d9095cec 1660 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1661 }
8818d409 1662 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1663}
1664
29595ff2 1665#ifdef PERL_MAD
cd81e915 1666/* skip space before PL_thistoken */
29595ff2
NC
1667
1668STATIC char *
1669S_skipspace0(pTHX_ register char *s)
1670{
7918f24d
NC
1671 PERL_ARGS_ASSERT_SKIPSPACE0;
1672
29595ff2
NC
1673 s = skipspace(s);
1674 if (!PL_madskills)
1675 return s;
cd81e915
NC
1676 if (PL_skipwhite) {
1677 if (!PL_thiswhite)
6b29d1f5 1678 PL_thiswhite = newSVpvs("");
cd81e915
NC
1679 sv_catsv(PL_thiswhite, PL_skipwhite);
1680 sv_free(PL_skipwhite);
1681 PL_skipwhite = 0;
1682 }
1683 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1684 return s;
1685}
1686
cd81e915 1687/* skip space after PL_thistoken */
29595ff2
NC
1688
1689STATIC char *
1690S_skipspace1(pTHX_ register char *s)
1691{
d4c19fe8 1692 const char *start = s;
29595ff2
NC
1693 I32 startoff = start - SvPVX(PL_linestr);
1694
7918f24d
NC
1695 PERL_ARGS_ASSERT_SKIPSPACE1;
1696
29595ff2
NC
1697 s = skipspace(s);
1698 if (!PL_madskills)
1699 return s;
1700 start = SvPVX(PL_linestr) + startoff;
cd81e915 1701 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1702 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1703 PL_thistoken = newSVpvn(tstart, start - tstart);
1704 }
1705 PL_realtokenstart = -1;
1706 if (PL_skipwhite) {
1707 if (!PL_nextwhite)
6b29d1f5 1708 PL_nextwhite = newSVpvs("");
cd81e915
NC
1709 sv_catsv(PL_nextwhite, PL_skipwhite);
1710 sv_free(PL_skipwhite);
1711 PL_skipwhite = 0;
29595ff2
NC
1712 }
1713 return s;
1714}
1715
1716STATIC char *
1717S_skipspace2(pTHX_ register char *s, SV **svp)
1718{
c35e046a
AL
1719 char *start;
1720 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1721 const I32 startoff = s - SvPVX(PL_linestr);
1722
7918f24d
NC
1723 PERL_ARGS_ASSERT_SKIPSPACE2;
1724
29595ff2
NC
1725 s = skipspace(s);
1726 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1727 if (!PL_madskills || !svp)
1728 return s;
1729 start = SvPVX(PL_linestr) + startoff;
cd81e915 1730 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1731 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1732 PL_thistoken = newSVpvn(tstart, start - tstart);
1733 PL_realtokenstart = -1;
29595ff2 1734 }
cd81e915 1735 if (PL_skipwhite) {
29595ff2 1736 if (!*svp)
6b29d1f5 1737 *svp = newSVpvs("");
cd81e915
NC
1738 sv_setsv(*svp, PL_skipwhite);
1739 sv_free(PL_skipwhite);
1740 PL_skipwhite = 0;
29595ff2
NC
1741 }
1742
1743 return s;
1744}
1745#endif
1746
80a702cd 1747STATIC void
15f169a1 1748S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1749{
1750 AV *av = CopFILEAVx(PL_curcop);
1751 if (av) {
b9f83d2f 1752 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1753 if (orig_sv)
1754 sv_setsv(sv, orig_sv);
1755 else
1756 sv_setpvn(sv, buf, len);
80a702cd
RGS
1757 (void)SvIOK_on(sv);
1758 SvIV_set(sv, 0);
1759 av_store(av, (I32)CopLINE(PL_curcop), sv);
1760 }
1761}
1762
ffb4593c
NT
1763/*
1764 * S_skipspace
1765 * Called to gobble the appropriate amount and type of whitespace.
1766 * Skips comments as well.
1767 */
1768
76e3520e 1769STATIC char *
cea2e8a9 1770S_skipspace(pTHX_ register char *s)
a687059c 1771{
5db06880 1772#ifdef PERL_MAD
f0e67a1d
Z
1773 char *start = s;
1774#endif /* PERL_MAD */
7918f24d 1775 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1776#ifdef PERL_MAD
cd81e915
NC
1777 if (PL_skipwhite) {
1778 sv_free(PL_skipwhite);
f0e67a1d 1779 PL_skipwhite = NULL;
5db06880 1780 }
f0e67a1d 1781#endif /* PERL_MAD */
3280af22 1782 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1783 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1784 s++;
f0e67a1d
Z
1785 } else {
1786 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1787 PL_bufptr = s;
f0998909
Z
1788 lex_read_space(LEX_KEEP_PREVIOUS |
1789 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1790 LEX_NO_NEXT_CHUNK : 0));
3280af22 1791 s = PL_bufptr;
f0e67a1d
Z
1792 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1793 if (PL_linestart > PL_bufptr)
1794 PL_bufptr = PL_linestart;
1795 return s;
463ee0b2 1796 }
5db06880 1797#ifdef PERL_MAD
f0e67a1d
Z
1798 if (PL_madskills)
1799 PL_skipwhite = newSVpvn(start, s-start);
1800#endif /* PERL_MAD */
5db06880 1801 return s;
a687059c 1802}
378cc40b 1803
ffb4593c
NT
1804/*
1805 * S_check_uni
1806 * Check the unary operators to ensure there's no ambiguity in how they're
1807 * used. An ambiguous piece of code would be:
1808 * rand + 5
1809 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1810 * the +5 is its argument.
1811 */
1812
76e3520e 1813STATIC void
cea2e8a9 1814S_check_uni(pTHX)
ba106d47 1815{
97aff369 1816 dVAR;
d4c19fe8
AL
1817 const char *s;
1818 const char *t;
2f3197b3 1819
3280af22 1820 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1821 return;
3280af22
NIS
1822 while (isSPACE(*PL_last_uni))
1823 PL_last_uni++;
c35e046a
AL
1824 s = PL_last_uni;
1825 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1826 s++;
3280af22 1827 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1828 return;
6136c704 1829
9b387841
NC
1830 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1831 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1832 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1833}
1834
ffb4593c
NT
1835/*
1836 * LOP : macro to build a list operator. Its behaviour has been replaced
1837 * with a subroutine, S_lop() for which LOP is just another name.
1838 */
1839
a0d0e21e
LW
1840#define LOP(f,x) return lop(f,x,s)
1841
ffb4593c
NT
1842/*
1843 * S_lop
1844 * Build a list operator (or something that might be one). The rules:
1845 * - if we have a next token, then it's a list operator [why?]
1846 * - if the next thing is an opening paren, then it's a function
1847 * - else it's a list operator
1848 */
1849
76e3520e 1850STATIC I32
a0be28da 1851S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1852{
97aff369 1853 dVAR;
7918f24d
NC
1854
1855 PERL_ARGS_ASSERT_LOP;
1856
6154021b 1857 pl_yylval.ival = f;
35c8bce7 1858 CLINE;
3280af22
NIS
1859 PL_expect = x;
1860 PL_bufptr = s;
1861 PL_last_lop = PL_oldbufptr;
eb160463 1862 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1863#ifdef PERL_MAD
1864 if (PL_lasttoke)
78cdf107 1865 goto lstop;
5db06880 1866#else
3280af22 1867 if (PL_nexttoke)
78cdf107 1868 goto lstop;
5db06880 1869#endif
79072805 1870 if (*s == '(')
bbf60fe6 1871 return REPORT(FUNC);
29595ff2 1872 s = PEEKSPACE(s);
79072805 1873 if (*s == '(')
bbf60fe6 1874 return REPORT(FUNC);
78cdf107
Z
1875 else {
1876 lstop:
1877 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1878 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1879 return REPORT(LSTOP);
78cdf107 1880 }
79072805
LW
1881}
1882
5db06880
NC
1883#ifdef PERL_MAD
1884 /*
1885 * S_start_force
1886 * Sets up for an eventual force_next(). start_force(0) basically does
1887 * an unshift, while start_force(-1) does a push. yylex removes items
1888 * on the "pop" end.
1889 */
1890
1891STATIC void
1892S_start_force(pTHX_ int where)
1893{
1894 int i;
1895
cd81e915 1896 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1897 where = PL_lasttoke;
cd81e915
NC
1898 assert(PL_curforce < 0 || PL_curforce == where);
1899 if (PL_curforce != where) {
5db06880
NC
1900 for (i = PL_lasttoke; i > where; --i) {
1901 PL_nexttoke[i] = PL_nexttoke[i-1];
1902 }
1903 PL_lasttoke++;
1904 }
cd81e915 1905 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1906 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1907 PL_curforce = where;
1908 if (PL_nextwhite) {
5db06880 1909 if (PL_madskills)
6b29d1f5 1910 curmad('^', newSVpvs(""));
cd81e915 1911 CURMAD('_', PL_nextwhite);
5db06880
NC
1912 }
1913}
1914
1915STATIC void
1916S_curmad(pTHX_ char slot, SV *sv)
1917{
1918 MADPROP **where;
1919
1920 if (!sv)
1921 return;
cd81e915
NC
1922 if (PL_curforce < 0)
1923 where = &PL_thismad;
5db06880 1924 else
cd81e915 1925 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1926
cd81e915 1927 if (PL_faketokens)
76f68e9b 1928 sv_setpvs(sv, "");
5db06880
NC
1929 else {
1930 if (!IN_BYTES) {
1931 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1932 SvUTF8_on(sv);
1933 else if (PL_encoding) {
1934 sv_recode_to_utf8(sv, PL_encoding);
1935 }
1936 }
1937 }
1938
1939 /* keep a slot open for the head of the list? */
1940 if (slot != '_' && *where && (*where)->mad_key == '^') {
1941 (*where)->mad_key = slot;
daba3364 1942 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1943 (*where)->mad_val = (void*)sv;
1944 }
1945 else
1946 addmad(newMADsv(slot, sv), where, 0);
1947}
1948#else
b3f24c00
MHM
1949# define start_force(where) NOOP
1950# define curmad(slot, sv) NOOP
5db06880
NC
1951#endif
1952
ffb4593c
NT
1953/*
1954 * S_force_next
9cbb5ea2 1955 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1956 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1957 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1958 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1959 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1960 */
1961
4e553d73 1962STATIC void
cea2e8a9 1963S_force_next(pTHX_ I32 type)
79072805 1964{
97aff369 1965 dVAR;
704d4215
GG
1966#ifdef DEBUGGING
1967 if (DEBUG_T_TEST) {
1968 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1969 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1970 }
1971#endif
5db06880 1972#ifdef PERL_MAD
cd81e915 1973 if (PL_curforce < 0)
5db06880 1974 start_force(PL_lasttoke);
cd81e915 1975 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1976 if (PL_lex_state != LEX_KNOWNEXT)
1977 PL_lex_defer = PL_lex_state;
1978 PL_lex_state = LEX_KNOWNEXT;
1979 PL_lex_expect = PL_expect;
cd81e915 1980 PL_curforce = -1;
5db06880 1981#else
3280af22
NIS
1982 PL_nexttype[PL_nexttoke] = type;
1983 PL_nexttoke++;
1984 if (PL_lex_state != LEX_KNOWNEXT) {
1985 PL_lex_defer = PL_lex_state;
1986 PL_lex_expect = PL_expect;
1987 PL_lex_state = LEX_KNOWNEXT;
79072805 1988 }
5db06880 1989#endif
79072805
LW
1990}
1991
28ac2b49
Z
1992void
1993Perl_yyunlex(pTHX)
1994{
a7aaec61
Z
1995 int yyc = PL_parser->yychar;
1996 if (yyc != YYEMPTY) {
1997 if (yyc) {
1998 start_force(-1);
1999 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2000 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2001 PL_lex_allbrackets--;
a7aaec61 2002 PL_lex_brackets--;
78cdf107
Z
2003 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2004 } else if (yyc == '('/*)*/) {
2005 PL_lex_allbrackets--;
2006 yyc |= (2<<24);
a7aaec61
Z
2007 }
2008 force_next(yyc);
2009 }
28ac2b49
Z
2010 PL_parser->yychar = YYEMPTY;
2011 }
2012}
2013
d0a148a6 2014STATIC SV *
15f169a1 2015S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2016{
97aff369 2017 dVAR;
740cce10 2018 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2019 !IN_BYTES
2020 && UTF
2021 && !is_ascii_string((const U8*)start, len)
740cce10 2022 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2023 return sv;
2024}
2025
ffb4593c
NT
2026/*
2027 * S_force_word
2028 * When the lexer knows the next thing is a word (for instance, it has
2029 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2030 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2031 * lookahead.
ffb4593c
NT
2032 *
2033 * Arguments:
b1b65b59 2034 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2035 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2036 * int check_keyword : if true, Perl checks to make sure the word isn't
2037 * a keyword (do this if the word is a label, e.g. goto FOO)
2038 * int allow_pack : if true, : characters will also be allowed (require,
2039 * use, etc. do this)
9cbb5ea2 2040 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2041 */
2042
76e3520e 2043STATIC char *
cea2e8a9 2044S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2045{
97aff369 2046 dVAR;
463ee0b2
LW
2047 register char *s;
2048 STRLEN len;
4e553d73 2049
7918f24d
NC
2050 PERL_ARGS_ASSERT_FORCE_WORD;
2051
29595ff2 2052 start = SKIPSPACE1(start);
463ee0b2 2053 s = start;
7e2040f0 2054 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2055 (allow_pack && *s == ':') ||
15f0808c 2056 (allow_initial_tick && *s == '\'') )
a0d0e21e 2057 {
3280af22 2058 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2059 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2060 return start;
cd81e915 2061 start_force(PL_curforce);
5db06880
NC
2062 if (PL_madskills)
2063 curmad('X', newSVpvn(start,s-start));
463ee0b2 2064 if (token == METHOD) {
29595ff2 2065 s = SKIPSPACE1(s);
463ee0b2 2066 if (*s == '(')
3280af22 2067 PL_expect = XTERM;
463ee0b2 2068 else {
3280af22 2069 PL_expect = XOPERATOR;
463ee0b2 2070 }
79072805 2071 }
e74e6b3d 2072 if (PL_madskills)
63575281 2073 curmad('g', newSVpvs( "forced" ));
9ded7720 2074 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2075 = (OP*)newSVOP(OP_CONST,0,
2076 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2077 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2078 force_next(token);
2079 }
2080 return s;
2081}
2082
ffb4593c
NT
2083/*
2084 * S_force_ident
9cbb5ea2 2085 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2086 * text only contains the "foo" portion. The first argument is a pointer
2087 * to the "foo", and the second argument is the type symbol to prefix.
2088 * Forces the next token to be a "WORD".
9cbb5ea2 2089 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2090 */
2091
76e3520e 2092STATIC void
bfed75c6 2093S_force_ident(pTHX_ register const char *s, int kind)
79072805 2094{
97aff369 2095 dVAR;
7918f24d
NC
2096
2097 PERL_ARGS_ASSERT_FORCE_IDENT;
2098
c35e046a 2099 if (*s) {
90e5519e 2100 const STRLEN len = strlen(s);
728847b1
BF
2101 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2102 UTF ? SVf_UTF8 : 0));
cd81e915 2103 start_force(PL_curforce);
9ded7720 2104 NEXTVAL_NEXTTOKE.opval = o;
79072805 2105 force_next(WORD);
748a9306 2106 if (kind) {
11343788 2107 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2108 /* XXX see note in pp_entereval() for why we forgo typo
2109 warnings if the symbol must be introduced in an eval.
2110 GSAR 96-10-12 */
90e5519e 2111 gv_fetchpvn_flags(s, len,
728847b1
BF
2112 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2113 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2114 kind == '$' ? SVt_PV :
2115 kind == '@' ? SVt_PVAV :
2116 kind == '%' ? SVt_PVHV :
a0d0e21e 2117 SVt_PVGV
90e5519e 2118 );
748a9306 2119 }
79072805
LW
2120 }
2121}
2122
1571675a
GS
2123NV
2124Perl_str_to_version(pTHX_ SV *sv)
2125{
2126 NV retval = 0.0;
2127 NV nshift = 1.0;
2128 STRLEN len;
cfd0369c 2129 const char *start = SvPV_const(sv,len);
9d4ba2ae 2130 const char * const end = start + len;
504618e9 2131 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2132
2133 PERL_ARGS_ASSERT_STR_TO_VERSION;
2134
1571675a 2135 while (start < end) {
ba210ebe 2136 STRLEN skip;
1571675a
GS
2137 UV n;
2138 if (utf)
9041c2e3 2139 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2140 else {
2141 n = *(U8*)start;
2142 skip = 1;
2143 }
2144 retval += ((NV)n)/nshift;
2145 start += skip;
2146 nshift *= 1000;
2147 }
2148 return retval;
2149}
2150
4e553d73 2151/*
ffb4593c
NT
2152 * S_force_version
2153 * Forces the next token to be a version number.
e759cc13
RGS
2154 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2155 * and if "guessing" is TRUE, then no new token is created (and the caller
2156 * must use an alternative parsing method).
ffb4593c
NT
2157 */
2158
76e3520e 2159STATIC char *
e759cc13 2160S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2161{
97aff369 2162 dVAR;
5f66b61c 2163 OP *version = NULL;
44dcb63b 2164 char *d;
5db06880
NC
2165#ifdef PERL_MAD
2166 I32 startoff = s - SvPVX(PL_linestr);
2167#endif
89bfa8cd 2168
7918f24d
NC
2169 PERL_ARGS_ASSERT_FORCE_VERSION;
2170
29595ff2 2171 s = SKIPSPACE1(s);
89bfa8cd 2172
44dcb63b 2173 d = s;
dd629d5b 2174 if (*d == 'v')
44dcb63b 2175 d++;
44dcb63b 2176 if (isDIGIT(*d)) {
e759cc13
RGS
2177 while (isDIGIT(*d) || *d == '_' || *d == '.')
2178 d++;
5db06880
NC
2179#ifdef PERL_MAD
2180 if (PL_madskills) {
cd81e915 2181 start_force(PL_curforce);
5db06880
NC
2182 curmad('X', newSVpvn(s,d-s));
2183 }
2184#endif
4e4da3ac 2185 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2186 SV *ver;
8d08d9ba
DG
2187#ifdef USE_LOCALE_NUMERIC
2188 char *loc = setlocale(LC_NUMERIC, "C");
2189#endif
6154021b 2190 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2191#ifdef USE_LOCALE_NUMERIC
2192 setlocale(LC_NUMERIC, loc);
2193#endif
6154021b 2194 version = pl_yylval.opval;
dd629d5b
GS
2195 ver = cSVOPx(version)->op_sv;
2196 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2197 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2198 SvNV_set(ver, str_to_version(ver));
1571675a 2199 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2200 }
89bfa8cd 2201 }
5db06880
NC
2202 else if (guessing) {
2203#ifdef PERL_MAD
2204 if (PL_madskills) {
cd81e915
NC
2205 sv_free(PL_nextwhite); /* let next token collect whitespace */
2206 PL_nextwhite = 0;
5db06880
NC
2207 s = SvPVX(PL_linestr) + startoff;
2208 }
2209#endif
e759cc13 2210 return s;
5db06880 2211 }
89bfa8cd
PP
2212 }
2213
5db06880
NC
2214#ifdef PERL_MAD
2215 if (PL_madskills && !version) {
cd81e915
NC
2216 sv_free(PL_nextwhite); /* let next token collect whitespace */
2217 PL_nextwhite = 0;
5db06880
NC
2218 s = SvPVX(PL_linestr) + startoff;
2219 }
2220#endif
89bfa8cd 2221 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2222 start_force(PL_curforce);
9ded7720 2223 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2224 force_next(WORD);
89bfa8cd 2225
e759cc13 2226 return s;
89bfa8cd
PP
2227}
2228
ffb4593c 2229/*
91152fc1
DG
2230 * S_force_strict_version
2231 * Forces the next token to be a version number using strict syntax rules.
2232 */
2233
2234STATIC char *
2235S_force_strict_version(pTHX_ char *s)
2236{
2237 dVAR;
2238 OP *version = NULL;
2239#ifdef PERL_MAD
2240 I32 startoff = s - SvPVX(PL_linestr);
2241#endif
2242 const char *errstr = NULL;
2243
2244 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2245
2246 while (isSPACE(*s)) /* leading whitespace */
2247 s++;
2248
2249 if (is_STRICT_VERSION(s,&errstr)) {
2250 SV *ver = newSV(0);
2251 s = (char *)scan_version(s, ver, 0);
2252 version = newSVOP(OP_CONST, 0, ver);
2253 }
4e4da3ac
Z
2254 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2255 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2256 {
91152fc1
DG
2257 PL_bufptr = s;
2258 if (errstr)
2259 yyerror(errstr); /* version required */
2260 return s;
2261 }
2262
2263#ifdef PERL_MAD
2264 if (PL_madskills && !version) {
2265 sv_free(PL_nextwhite); /* let next token collect whitespace */
2266 PL_nextwhite = 0;
2267 s = SvPVX(PL_linestr) + startoff;
2268 }
2269#endif
2270 /* NOTE: The parser sees the package name and the VERSION swapped */
2271 start_force(PL_curforce);
2272 NEXTVAL_NEXTTOKE.opval = version;
2273 force_next(WORD);
2274
2275 return s;
2276}
2277
2278/*
ffb4593c
NT
2279 * S_tokeq
2280 * Tokenize a quoted string passed in as an SV. It finds the next
2281 * chunk, up to end of string or a backslash. It may make a new
2282 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2283 * turns \\ into \.
2284 */
2285
76e3520e 2286STATIC SV *
cea2e8a9 2287S_tokeq(pTHX_ SV *sv)
79072805 2288{
97aff369 2289 dVAR;
79072805
LW
2290 register char *s;
2291 register char *send;
2292 register char *d;
b3ac6de7
IZ
2293 STRLEN len = 0;
2294 SV *pv = sv;
79072805 2295
7918f24d
NC
2296 PERL_ARGS_ASSERT_TOKEQ;
2297
79072805 2298 if (!SvLEN(sv))
b3ac6de7 2299 goto finish;
79072805 2300
a0d0e21e 2301 s = SvPV_force(sv, len);
21a311ee 2302 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2303 goto finish;
463ee0b2 2304 send = s + len;
dcb21ed6
NC
2305 /* This is relying on the SV being "well formed" with a trailing '\0' */
2306 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2307 s++;
2308 if (s == send)
b3ac6de7 2309 goto finish;
79072805 2310 d = s;
be4731d2 2311 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2312 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2313 }
79072805
LW
2314 while (s < send) {
2315 if (*s == '\\') {
a0d0e21e 2316 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2317 s++; /* all that, just for this */
2318 }
2319 *d++ = *s++;
2320 }
2321 *d = '\0';
95a20fc0 2322 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2323 finish:
3280af22 2324 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2325 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2326 return sv;
2327}
2328
ffb4593c
NT
2329/*
2330 * Now come three functions related to double-quote context,
2331 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2332 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2333 * interact with PL_lex_state, and create fake ( ... ) argument lists
2334 * to handle functions and concatenation.
2335 * They assume that whoever calls them will be setting up a fake
2336 * join call, because each subthing puts a ',' after it. This lets
2337 * "lower \luPpEr"
2338 * become
2339 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2340 *
2341 * (I'm not sure whether the spurious commas at the end of lcfirst's
2342 * arguments and join's arguments are created or not).
2343 */
2344
2345/*
2346 * S_sublex_start
6154021b 2347 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2348 *
2349 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2350 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2351 *
2352 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2353 *
2354 * Everything else becomes a FUNC.
2355 *
2356 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2357 * had an OP_CONST or OP_READLINE). This just sets us up for a
2358 * call to S_sublex_push().
2359 */
2360
76e3520e 2361STATIC I32
cea2e8a9 2362S_sublex_start(pTHX)
79072805 2363{
97aff369 2364 dVAR;
6154021b 2365 register const I32 op_type = pl_yylval.ival;
79072805
LW
2366
2367 if (op_type == OP_NULL) {
6154021b 2368 pl_yylval.opval = PL_lex_op;
5f66b61c 2369 PL_lex_op = NULL;
79072805
LW
2370 return THING;
2371 }
2372 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2373 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2374
2375 if (SvTYPE(sv) == SVt_PVIV) {
2376 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2377 STRLEN len;
96a5add6 2378 const char * const p = SvPV_const(sv, len);
740cce10 2379 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2380 SvREFCNT_dec(sv);
2381 sv = nsv;
4e553d73 2382 }
6154021b 2383 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2384 PL_lex_stuff = NULL;
6f33ba73
RGS
2385 /* Allow <FH> // "foo" */
2386 if (op_type == OP_READLINE)
2387 PL_expect = XTERMORDORDOR;
79072805
LW
2388 return THING;
2389 }
e3f73d4e
RGS
2390 else if (op_type == OP_BACKTICK && PL_lex_op) {
2391 /* readpipe() vas overriden */
2392 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2393 pl_yylval.opval = PL_lex_op;
9b201d7d 2394 PL_lex_op = NULL;
e3f73d4e
RGS
2395 PL_lex_stuff = NULL;
2396 return THING;
2397 }
79072805 2398
3280af22 2399 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2400 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2401 PL_sublex_info.sub_op = PL_lex_op;
2402 PL_lex_state = LEX_INTERPPUSH;
55497cff 2403
3280af22
NIS
2404 PL_expect = XTERM;
2405 if (PL_lex_op) {
6154021b 2406 pl_yylval.opval = PL_lex_op;
5f66b61c 2407 PL_lex_op = NULL;
55497cff
PP
2408 return PMFUNC;
2409 }
2410 else
2411 return FUNC;
2412}
2413
ffb4593c
NT
2414/*
2415 * S_sublex_push
2416 * Create a new scope to save the lexing state. The scope will be
2417 * ended in S_sublex_done. Returns a '(', starting the function arguments
2418 * to the uc, lc, etc. found before.
2419 * Sets PL_lex_state to LEX_INTERPCONCAT.
2420 */
2421
76e3520e 2422STATIC I32
cea2e8a9 2423S_sublex_push(pTHX)
55497cff 2424{
27da23d5 2425 dVAR;
f46d017c 2426 ENTER;
55497cff 2427
3280af22 2428 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2429 SAVEBOOL(PL_lex_dojoin);
3280af22 2430 SAVEI32(PL_lex_brackets);
78cdf107
Z
2431 SAVEI32(PL_lex_allbrackets);
2432 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2433 SAVEI32(PL_lex_casemods);
2434 SAVEI32(PL_lex_starts);
651b5b28 2435 SAVEI8(PL_lex_state);
7766f137 2436 SAVEVPTR(PL_lex_inpat);
98246f1e 2437 SAVEI16(PL_lex_inwhat);
57843af0 2438 SAVECOPLINE(PL_curcop);
3280af22 2439 SAVEPPTR(PL_bufptr);
8452ff4b 2440 SAVEPPTR(PL_bufend);
3280af22
NIS
2441 SAVEPPTR(PL_oldbufptr);
2442 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2443 SAVEPPTR(PL_last_lop);
2444 SAVEPPTR(PL_last_uni);
3280af22
NIS
2445 SAVEPPTR(PL_linestart);
2446 SAVESPTR(PL_linestr);
8edd5f42
RGS
2447 SAVEGENERICPV(PL_lex_brackstack);
2448 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2449
2450 PL_linestr = PL_lex_stuff;
a0714e2c 2451 PL_lex_stuff = NULL;
3280af22 2452
9cbb5ea2
GS
2453 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2454 = SvPVX(PL_linestr);
3280af22 2455 PL_bufend += SvCUR(PL_linestr);
bd61b366 2456 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2457 SAVEFREESV(PL_linestr);
2458
2459 PL_lex_dojoin = FALSE;
2460 PL_lex_brackets = 0;
78cdf107
Z
2461 PL_lex_allbrackets = 0;
2462 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2463 Newx(PL_lex_brackstack, 120, char);
2464 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2465 PL_lex_casemods = 0;
2466 *PL_lex_casestack = '\0';
2467 PL_lex_starts = 0;
2468 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2469 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2470
2471 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2472 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2473 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2474 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2475 else
5f66b61c 2476 PL_lex_inpat = NULL;
79072805 2477
55497cff 2478 return '(';
79072805
LW
2479}
2480
ffb4593c
NT
2481/*
2482 * S_sublex_done
2483 * Restores lexer state after a S_sublex_push.
2484 */
2485
76e3520e 2486STATIC I32
cea2e8a9 2487S_sublex_done(pTHX)
79072805 2488{
27da23d5 2489 dVAR;
3280af22 2490 if (!PL_lex_starts++) {
396482e1 2491 SV * const sv = newSVpvs("");
9aa983d2
JH
2492 if (SvUTF8(PL_linestr))
2493 SvUTF8_on(sv);
3280af22 2494 PL_expect = XOPERATOR;
6154021b 2495 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2496 return THING;
2497 }
2498
3280af22
NIS
2499 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2500 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2501 return yylex();
79072805
LW
2502 }
2503
ffb4593c 2504 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2505 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2506 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2507 PL_linestr = PL_lex_repl;
2508 PL_lex_inpat = 0;
2509 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2510 PL_bufend += SvCUR(PL_linestr);
bd61b366 2511 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2512 SAVEFREESV(PL_linestr);
2513 PL_lex_dojoin = FALSE;
2514 PL_lex_brackets = 0;
78cdf107
Z
2515 PL_lex_allbrackets = 0;
2516 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2517 PL_lex_casemods = 0;
2518 *PL_lex_casestack = '\0';
2519 PL_lex_starts = 0;
25da4f38 2520 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2521 PL_lex_state = LEX_INTERPNORMAL;
2522 PL_lex_starts++;
e9fa98b2
HS
2523 /* we don't clear PL_lex_repl here, so that we can check later
2524 whether this is an evalled subst; that means we rely on the
2525 logic to ensure sublex_done() is called again only via the
2526 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2527 }
e9fa98b2 2528 else {
3280af22 2529 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2530 PL_lex_repl = NULL;
e9fa98b2 2531 }
79072805 2532 return ',';
ffed7fef
LW
2533 }
2534 else {
5db06880
NC
2535#ifdef PERL_MAD
2536 if (PL_madskills) {
cd81e915
NC
2537 if (PL_thiswhite) {
2538 if (!PL_endwhite)
6b29d1f5 2539 PL_endwhite = newSVpvs("");
cd81e915
NC
2540 sv_catsv(PL_endwhite, PL_thiswhite);
2541 PL_thiswhite = 0;
2542 }
2543 if (PL_thistoken)
76f68e9b 2544 sv_setpvs(PL_thistoken,"");
5db06880 2545 else
cd81e915 2546 PL_realtokenstart = -1;
5db06880
NC
2547 }
2548#endif
f46d017c 2549 LEAVE;
3280af22
NIS
2550 PL_bufend = SvPVX(PL_linestr);
2551 PL_bufend += SvCUR(PL_linestr);
2552 PL_expect = XOPERATOR;
09bef843 2553 PL_sublex_info.sub_inwhat = 0;
79072805 2554 return ')';
ffed7fef
LW
2555 }
2556}
2557
02aa26ce
NT
2558/*
2559 scan_const
2560
2561 Extracts a pattern, double-quoted string, or transliteration. This
2562 is terrifying code.
2563
94def140 2564 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2565 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2566 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2567
94def140
ST
2568 Returns a pointer to the character scanned up to. If this is
2569 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2570 successfully parsed), will leave an OP for the substring scanned
6154021b 2571 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2572 by looking at the next characters herself.
2573
02aa26ce
NT
2574 In patterns:
2575 backslashes:
ff3f963a 2576 constants: \N{NAME} only
02aa26ce
NT
2577 case and quoting: \U \Q \E
2578 stops on @ and $, but not for $ as tail anchor
2579
2580 In transliterations:
2581 characters are VERY literal, except for - not at the start or end
94def140
ST
2582 of the string, which indicates a range. If the range is in bytes,
2583 scan_const expands the range to the full set of intermediate
2584 characters. If the range is in utf8, the hyphen is replaced with
2585 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2586
2587 In double-quoted strings:
2588 backslashes:
2589 double-quoted style: \r and \n
ff3f963a 2590 constants: \x31, etc.
94def140 2591 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2592 case and quoting: \U \Q \E
2593 stops on @ and $
2594
2595 scan_const does *not* construct ops to handle interpolated strings.
2596 It stops processing as soon as it finds an embedded $ or @ variable
2597 and leaves it to the caller to work out what's going on.
2598
94def140
ST
2599 embedded arrays (whether in pattern or not) could be:
2600 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2601
2602 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2603
2604 $ in pattern could be $foo or could be tail anchor. Assumption:
2605 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2606 followed by one of "()| \r\n\t"
02aa26ce
NT
2607
2608 \1 (backreferences) are turned into $1
2609
2610 The structure of the code is
2611 while (there's a character to process) {
94def140
ST
2612 handle transliteration ranges
2613 skip regexp comments /(?#comment)/ and codes /(?{code})/
2614 skip #-initiated comments in //x patterns
2615 check for embedded arrays
02aa26ce
NT
2616 check for embedded scalars
2617 if (backslash) {
94def140 2618 deprecate \1 in substitution replacements
02aa26ce
NT
2619 handle string-changing backslashes \l \U \Q \E, etc.
2620 switch (what was escaped) {
94def140 2621 handle \- in a transliteration (becomes a literal -)
ff3f963a 2622 if a pattern and not \N{, go treat as regular character
94def140
ST
2623 handle \132 (octal characters)
2624 handle \x15 and \x{1234} (hex characters)
ff3f963a 2625 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2626 handle \cV (control characters)
2627 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2628 } (end switch)
77a135fe 2629 continue
02aa26ce 2630 } (end if backslash)
77a135fe 2631 handle regular character
02aa26ce 2632 } (end while character to read)
4e553d73 2633
02aa26ce
NT
2634*/
2635
76e3520e 2636STATIC char *
cea2e8a9 2637S_scan_const(pTHX_ char *start)
79072805 2638{
97aff369 2639 dVAR;
3280af22 2640 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2641 SV *sv = newSV(send - start); /* sv for the constant. See
2642 note below on sizing. */
02aa26ce
NT
2643 register char *s = start; /* start of the constant */
2644 register char *d = SvPVX(sv); /* destination for copies */
2645 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2646 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2647 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2648 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2649 to be UTF8? But, this can
2650 show as true when the source
2651 isn't utf8, as for example
2652 when it is entirely composed
2653 of hex constants */
2654
2655 /* Note on sizing: The scanned constant is placed into sv, which is
2656 * initialized by newSV() assuming one byte of output for every byte of
2657 * input. This routine expects newSV() to allocate an extra byte for a
2658 * trailing NUL, which this routine will append if it gets to the end of
2659 * the input. There may be more bytes of input than output (eg., \N{LATIN
2660 * CAPITAL LETTER A}), or more output than input if the constant ends up
2661 * recoded to utf8, but each time a construct is found that might increase
2662 * the needed size, SvGROW() is called. Its size parameter each time is
2663 * based on the best guess estimate at the time, namely the length used so
2664 * far, plus the length the current construct will occupy, plus room for
2665 * the trailing NUL, plus one byte for every input byte still unscanned */
2666
012bcf8d 2667 UV uv;
4c3a8340
ST
2668#ifdef EBCDIC
2669 UV literal_endpoint = 0;
e294cc5d 2670 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2671#endif
012bcf8d 2672
7918f24d
NC
2673 PERL_ARGS_ASSERT_SCAN_CONST;
2674
bb16bae8 2675 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2676 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2677 /* If we are doing a trans and we know we want UTF8 set expectation */
2678 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2679 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2680 }
2681
2682
79072805 2683 while (s < send || dorange) {
ff3f963a 2684
02aa26ce 2685 /* get transliterations out of the way (they're most literal) */
3280af22 2686 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2687 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2688 if (dorange) {
1ba5c669
JH
2689 I32 i; /* current expanded character */
2690 I32 min; /* first character in range */
2691 I32 max; /* last character in range */
02aa26ce 2692
e294cc5d
JH
2693#ifdef EBCDIC
2694 UV uvmax = 0;
2695#endif
2696
2697 if (has_utf8
2698#ifdef EBCDIC
2699 && !native_range
2700#endif
2701 ) {
9d4ba2ae 2702 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2703 char *e = d++;
2704 while (e-- > c)
2705 *(e + 1) = *e;
25716404 2706 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2707 /* mark the range as done, and continue */
2708 dorange = FALSE;
2709 didrange = TRUE;
2710 continue;
2711 }
2b9d42f0 2712
95a20fc0 2713 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2714#ifdef EBCDIC
2715 SvGROW(sv,
2716 SvLEN(sv) + (has_utf8 ?
2717 (512 - UTF_CONTINUATION_MARK +
2718 UNISKIP(0x100))
2719 : 256));
2720 /* How many two-byte within 0..255: 128 in UTF-8,
2721 * 96 in UTF-8-mod. */
2722#else
9cbb5ea2 2723 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2724#endif
9cbb5ea2 2725 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2726#ifdef EBCDIC
2727 if (has_utf8) {
2728 int j;
2729 for (j = 0; j <= 1; j++) {
2730 char * const c = (char*)utf8_hop((U8*)d, -1);
2731 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2732 if (j)
2733 min = (U8)uv;
2734 else if (uv < 256)
2735 max = (U8)uv;
2736 else {
2737 max = (U8)0xff; /* only to \xff */
2738 uvmax = uv; /* \x{100} to uvmax */
2739 }
2740 d = c; /* eat endpoint chars */
2741 }
2742 }
2743 else {
2744#endif
2745 d -= 2; /* eat the first char and the - */
2746 min = (U8)*d; /* first char in range */
2747 max = (U8)d[1]; /* last char in range */
2748#ifdef EBCDIC
2749 }
2750#endif
8ada0baa 2751
c2e66d9e 2752 if (min > max) {
01ec43d0 2753 Perl_croak(aTHX_
d1573ac7 2754 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2755 (char)min, (char)max);
c2e66d9e
GS
2756 }
2757
c7f1f016 2758#ifdef EBCDIC
4c3a8340
ST
2759 if (literal_endpoint == 2 &&
2760 ((isLOWER(min) && isLOWER(max)) ||
2761 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2762 if (isLOWER(min)) {
2763 for (i = min; i <= max; i++)
2764 if (isLOWER(i))
db42d148 2765 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2766 } else {
2767 for (i = min; i <= max; i++)
2768 if (isUPPER(i))
db42d148 2769 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2770 }
2771 }
2772 else
2773#endif
2774 for (i = min; i <= max; i++)
e294cc5d
JH
2775#ifdef EBCDIC
2776 if (has_utf8) {
2777 const U8 ch = (U8)NATIVE_TO_UTF(i);
2778 if (UNI_IS_INVARIANT(ch))
2779 *d++ = (U8)i;
2780 else {
2781 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2782 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2783 }
2784 }
2785 else
2786#endif
2787 *d++ = (char)i;
2788
2789#ifdef EBCDIC
2790 if (uvmax) {
2791 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2792 if (uvmax > 0x101)
2793 *d++ = (char)UTF_TO_NATIVE(0xff);
2794 if (uvmax > 0x100)
2795 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2796 }
2797#endif
02aa26ce
NT
2798
2799 /* mark the range as done, and continue */
79072805 2800 dorange = FALSE;
01ec43d0 2801 didrange = TRUE;
4c3a8340
ST
2802#ifdef EBCDIC
2803 literal_endpoint = 0;
2804#endif
79072805 2805 continue;
4e553d73 2806 }
02aa26ce
NT
2807
2808 /* range begins (ignore - as first or last char) */
79072805 2809 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2810 if (didrange) {
1fafa243 2811 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2812 }
e294cc5d
JH
2813 if (has_utf8
2814#ifdef EBCDIC
2815 && !native_range
2816#endif
2817 ) {
25716404 2818 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2819 s++;
2820 continue;
2821 }
79072805
LW
2822 dorange = TRUE;
2823 s++;
01ec43d0
GS
2824 }
2825 else {
2826 didrange = FALSE;
4c3a8340
ST
2827#ifdef EBCDIC
2828 literal_endpoint = 0;
e294cc5d 2829 native_range = TRUE;
4c3a8340 2830#endif
01ec43d0 2831 }
79072805 2832 }
02aa26ce
NT
2833
2834 /* if we get here, we're not doing a transliteration */
2835
0f5d15d6
IZ
2836 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2837 except for the last char, which will be done separately. */
3280af22 2838 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2839 if (s[2] == '#') {
e994fd66 2840 while (s+1 < send && *s != ')')
db42d148 2841 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2842 }
2843 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2844 || (s[2] == '?' && s[3] == '{'))
155aba94 2845 {
cc6b7395 2846 I32 count = 1;
0f5d15d6 2847 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2848 char c;
2849
d9f97599
GS
2850 while (count && (c = *regparse)) {
2851 if (c == '\\' && regparse[1])
2852 regparse++;
4e553d73 2853 else if (c == '{')
cc6b7395 2854 count++;
4e553d73 2855 else if (c == '}')
cc6b7395 2856 count--;
d9f97599 2857 regparse++;
cc6b7395 2858 }
e994fd66 2859 if (*regparse != ')')
5bdf89e7 2860 regparse--; /* Leave one char for continuation. */
0f5d15d6 2861 while (s < regparse)
db42d148 2862 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2863 }
748a9306 2864 }
02aa26ce
NT
2865
2866 /* likewise skip #-initiated comments in //x patterns */
3280af22 2867 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2868 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2869 while (s+1 < send && *s != '\n')
db42d148 2870 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2871 }
02aa26ce 2872
5d1d4326 2873 /* check for embedded arrays
da6eedaa 2874 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2875 */
1749ea0d
ST
2876 else if (*s == '@' && s[1]) {
2877 if (isALNUM_lazy_if(s+1,UTF))
2878 break;
2879 if (strchr(":'{$", s[1]))
2880 break;
2881 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2882 break; /* in regexp, neither @+ nor @- are interpolated */
2883 }
02aa26ce
NT
2884
2885 /* check for embedded scalars. only stop if we're sure it's a
2886 variable.
2887 */
79072805 2888 else if (*s == '$') {
3280af22 2889 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2890 break;
77772344 2891 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2892 if (s[1] == '\\') {
2893 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2894 "Possible unintended interpolation of $\\ in regex");
77772344 2895 }
79072805 2896 break; /* in regexp, $ might be tail anchor */
77772344 2897 }
79072805 2898 }
02aa26ce 2899
2b9d42f0
NIS
2900 /* End of else if chain - OP_TRANS rejoin rest */
2901
02aa26ce 2902 /* backslashes */
79072805 2903 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2904 char* e; /* Can be used for ending '}', etc. */
2905
79072805 2906 s++;
02aa26ce 2907
7d0fc23c
KW
2908 /* warn on \1 - \9 in substitution replacements, but note that \11
2909 * is an octal; and \19 is \1 followed by '9' */
3280af22 2910 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2911 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2912 {
a2a5de95 2913 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2914 *--s = '$';
2915 break;
2916 }
02aa26ce
NT
2917
2918 /* string-change backslash escapes */
3280af22 2919 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2920 --s;
2921 break;
2922 }
ff3f963a
KW
2923 /* In a pattern, process \N, but skip any other backslash escapes.
2924 * This is because we don't want to translate an escape sequence
2925 * into a meta symbol and have the regex compiler use the meta
2926 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2927 * in spite of this, we do have to process \N here while the proper
2928 * charnames handler is in scope. See bugs #56444 and #62056.
2929 * There is a complication because \N in a pattern may also stand
2930 * for 'match a non-nl', and not mean a charname, in which case its
2931 * processing should be deferred to the regex compiler. To be a
2932 * charname it must be followed immediately by a '{', and not look
2933 * like \N followed by a curly quantifier, i.e., not something like
2934 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2935 * quantifier */
2936 else if (PL_lex_inpat
2937 && (*s != 'N'
2938 || s[1] != '{'
2939 || regcurly(s + 1)))
2940 {
cc74c5bd
ST
2941 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2942 goto default_action;
2943 }
02aa26ce 2944
79072805 2945 switch (*s) {
02aa26ce
NT
2946
2947 /* quoted - in transliterations */
79072805 2948 case '-':
3280af22 2949 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2950 *d++ = *s++;
2951 continue;
2952 }
2953 /* FALL THROUGH */
2954 default:
11b8faa4 2955 {
a2a5de95
NC
2956 if ((isALPHA(*s) || isDIGIT(*s)))
2957 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2958 "Unrecognized escape \\%c passed through",
2959 *s);
11b8faa4 2960 /* default action is to copy the quoted character */
f9a63242 2961 goto default_action;
11b8faa4 2962 }
02aa26ce 2963
632403cc 2964 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2965 case '0': case '1': case '2': case '3':
2966 case '4': case '5': case '6': case '7':
ba210ebe 2967 {
53305cf1
NC
2968 I32 flags = 0;
2969 STRLEN len = 3;
77a135fe 2970 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2971 s += len;
2972 }
012bcf8d 2973 goto NUM_ESCAPE_INSERT;
02aa26ce 2974
f0a2b745
KW
2975 /* eg. \o{24} indicates the octal constant \024 */
2976 case 'o':
2977 {
2978 STRLEN len;
454155d9 2979 const char* error;
f0a2b745 2980
454155d9 2981 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2982 s += len;
454155d9 2983 if (! valid) {
f0a2b745
KW
2984 yyerror(error);
2985 continue;
2986 }
2987 goto NUM_ESCAPE_INSERT;
2988 }
2989
77a135fe 2990 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2991 case 'x':
a0ed51b3
LW
2992 ++s;
2993 if (*s == '{') {
9d4ba2ae 2994 char* const e = strchr(s, '}');
a4c04bdc
NC
2995 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2996 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2997 STRLEN len;
355860ce 2998
53305cf1 2999 ++s;
adaeee49 3000 if (!e) {
a0ed51b3 3001 yyerror("Missing right brace on \\x{}");
355860ce 3002 continue;
ba210ebe 3003 }
53305cf1 3004 len = e - s;
77a135fe 3005 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 3006 s = e + 1;
a0ed51b3
LW
3007 }
3008 else {
ba210ebe 3009 {
53305cf1 3010 STRLEN len = 2;
a4c04bdc 3011 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3012 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3013 s += len;
3014 }
012bcf8d
GS
3015 }
3016
3017 NUM_ESCAPE_INSERT:
ff3f963a
KW
3018 /* Insert oct or hex escaped character. There will always be
3019 * enough room in sv since such escapes will be longer than any
3020 * UTF-8 sequence they can end up as, except if they force us
3021 * to recode the rest of the string into utf8 */
ba7cea30 3022
77a135fe 3023 /* Here uv is the ordinal of the next character being added in
ff3f963a 3024 * unicode (converted from native). */
77a135fe 3025 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3026 if (!has_utf8 && uv > 255) {
77a135fe
KW
3027 /* Might need to recode whatever we have accumulated so
3028 * far if it contains any chars variant in utf8 or
3029 * utf-ebcdic. */
3030
3031 SvCUR_set(sv, d - SvPVX_const(sv));
3032 SvPOK_on(sv);
3033 *d = '\0';
77a135fe 3034 /* See Note on sizing above. */
7bf79863
KW
3035 sv_utf8_upgrade_flags_grow(sv,
3036 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3037 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3038 d = SvPVX(sv) + SvCUR(sv);
3039 has_utf8 = TRUE;
012bcf8d
GS
3040 }
3041
77a135fe
KW
3042 if (has_utf8) {
3043 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3044 if (PL_lex_inwhat == OP_TRANS &&
3045 PL_sublex_info.sub_op) {
3046 PL_sublex_info.sub_op->op_private |=
3047 (PL_lex_repl ? OPpTRANS_FROM_UTF
3048 : OPpTRANS_TO_UTF);
f9a63242 3049 }
e294cc5d
JH
3050#ifdef EBCDIC
3051 if (uv > 255 && !dorange)
3052 native_range = FALSE;
3053#endif
012bcf8d 3054 }
a0ed51b3 3055 else {
012bcf8d 3056 *d++ = (char)uv;
a0ed51b3 3057 }
012bcf8d
GS
3058 }
3059 else {
c4d5f83a 3060 *d++ = (char) uv;
a0ed51b3 3061 }
79072805 3062 continue;
02aa26ce 3063
4a2d328f 3064 case 'N':
ff3f963a
KW
3065 /* In a non-pattern \N must be a named character, like \N{LATIN
3066 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3067 * mean to match a non-newline. For non-patterns, named
3068 * characters are converted to their string equivalents. In
3069 * patterns, named characters are not converted to their
3070 * ultimate forms for the same reasons that other escapes
3071 * aren't. Instead, they are converted to the \N{U+...} form
3072 * to get the value from the charnames that is in effect right
3073 * now, while preserving the fact that it was a named character
3074 * so that the regex compiler knows this */
3075
3076 /* This section of code doesn't generally use the
3077 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3078 * a close examination of this macro and determined it is a
3079 * no-op except on utfebcdic variant characters. Every
3080 * character generated by this that would normally need to be
3081 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3082 * needed, and would complicate use of copy(). XXX There are
3083 * other parts of this file where the macro is used
3084 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3085
3086 /* The structure of this section of code (besides checking for
3087 * errors and upgrading to utf8) is:
3088 * Further disambiguate between the two meanings of \N, and if
3089 * not a charname, go process it elsewhere
0a96133f
KW
3090 * If of form \N{U+...}, pass it through if a pattern;
3091 * otherwise convert to utf8
3092 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3093 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3094
3095 /* Here, s points to the 'N'; the test below is guaranteed to
3096 * succeed if we are being called on a pattern as we already
3097 * know from a test above that the next character is a '{'.
3098 * On a non-pattern \N must mean 'named sequence, which
3099 * requires braces */
3100 s++;
3101 if (*s != '{') {
3102 yyerror("Missing braces on \\N{}");
3103 continue;
3104 }
3105 s++;
3106
0a96133f 3107 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3108 if (! (e = strchr(s, '}'))) {
3109 if (! PL_lex_inpat) {
5777a3f7 3110 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3111 } else {
3112 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3113 }
0a96133f 3114 continue;
ff3f963a 3115 }
cddc7ef4 3116
ff3f963a 3117 /* Here it looks like a named character */
cddc7ef4 3118
ff3f963a
KW
3119 if (PL_lex_inpat) {
3120
3121 /* XXX This block is temporary code. \N{} implies that the
3122 * pattern is to have Unicode semantics, and therefore
3123 * currently has to be encoded in utf8. By putting it in
3124 * utf8 now, we save a whole pass in the regular expression
3125 * compiler. Once that code is changed so Unicode
3126 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3127 * block should be removed. However, the code that parses
3128 * the output of this would have to be changed to not
3129 * necessarily expect utf8 */
ff3f963a 3130 if (!has_utf8) {
77a135fe 3131 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3132 SvPOK_on(sv);
e4f3eed8 3133 *d = '\0';
77a135fe 3134 /* See Note on sizing above. */
7bf79863 3135 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3136 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3137 /* 5 = '\N{' + cur char + NUL */
3138 (STRLEN)(send - s) + 5);
f08d6ad9 3139 d = SvPVX(sv) + SvCUR(sv);
89491803 3140 has_utf8 = TRUE;
ff3f963a
KW
3141 }
3142 }
423cee85 3143
ff3f963a
KW
3144 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3145 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3146 | PERL_SCAN_DISALLOW_PREFIX;
3147 STRLEN len;
3148
3149 /* For \N{U+...}, the '...' is a unicode value even on
3150 * EBCDIC machines */
3151 s += 2; /* Skip to next char after the 'U+' */
3152 len = e - s;
3153 uv = grok_hex(s, &len, &flags, NULL);
3154 if (len == 0 || len != (STRLEN)(e - s)) {
3155 yyerror("Invalid hexadecimal number in \\N{U+...}");
3156 s = e + 1;
3157 continue;
3158 }
3159
3160 if (PL_lex_inpat) {
3161
e2a7e165
KW
3162 /* On non-EBCDIC platforms, pass through to the regex
3163 * compiler unchanged. The reason we evaluated the
3164 * number above is to make sure there wasn't a syntax
3165 * error. But on EBCDIC we convert to native so
3166 * downstream code can continue to assume it's native
3167 */
ff3f963a 3168 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3169#ifdef EBCDIC
3170 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3171 and the \0 */
3172 "\\N{U+%X}",
3173 (unsigned int) UNI_TO_NATIVE(uv));
3174#else
ff3f963a
KW
3175 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3176 d += e - s + 1;
e2a7e165 3177#endif
ff3f963a
KW
3178 }
3179 else { /* Not a pattern: convert the hex to string */
3180
3181 /* If destination is not in utf8, unconditionally
3182 * recode it to be so. This is because \N{} implies
3183 * Unicode semantics, and scalars have to be in utf8
3184 * to guarantee those semantics */
3185 if (! has_utf8) {
3186 SvCUR_set(sv, d - SvPVX_const(sv));
3187 SvPOK_on(sv);
3188 *d = '\0';
3189 /* See Note on sizing above. */
3190 sv_utf8_upgrade_flags_grow(
3191 sv,
3192 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3193 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3194 d = SvPVX(sv) + SvCUR(sv);
3195 has_utf8 = TRUE;
3196 }
3197
3198 /* Add the string to the output */
3199 if (UNI_IS_INVARIANT(uv)) {
3200 *d++ = (char) uv;
3201 }
3202 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3203 }
3204 }
3205 else { /* Here is \N{NAME} but not \N{U+...}. */
3206
3207 SV *res; /* result from charnames */
3208 const char *str; /* the string in 'res' */
3209 STRLEN len; /* its length */
3210
3211 /* Get the value for NAME */
3212 res = newSVpvn(s, e - s);
3213 res = new_constant( NULL, 0, "charnames",
3214 /* includes all of: \N{...} */
3215 res, NULL, s - 3, e - s + 4 );
3216
3217 /* Most likely res will be in utf8 already since the
3218 * standard charnames uses pack U, but a custom translator
3219 * can leave it otherwise, so make sure. XXX This can be
3220 * revisited to not have charnames use utf8 for characters
3221 * that don't need it when regexes don't have to be in utf8
3222 * for Unicode semantics. If doing so, remember EBCDIC */
3223 sv_utf8_upgrade(res);
3224 str = SvPV_const(res, len);
3225
3226 /* Don't accept malformed input */
3227 if (! is_utf8_string((U8 *) str, len)) {
3228 yyerror("Malformed UTF-8 returned by \\N");
3229 }
3230 else if (PL_lex_inpat) {
3231
3232 if (! len) { /* The name resolved to an empty string */
3233 Copy("\\N{}", d, 4, char);
3234 d += 4;
3235 }
3236 else {
3237 /* In order to not lose information for the regex
3238 * compiler, pass the result in the specially made
3239 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3240 * the code points in hex of each character
3241 * returned by charnames */
3242
3243 const char *str_end = str + len;
3244 STRLEN char_length; /* cur char's byte length */
3245 STRLEN output_length; /* and the number of bytes
3246 after this is translated
3247 into hex digits */
3248 const STRLEN off = d - SvPVX_const(sv);
3249
3250 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3251 * max('U+', '.'); and 1 for NUL */
3252 char hex_string[2 * UTF8_MAXBYTES + 5];
3253
3254 /* Get the first character of the result. */
3255 U32 uv = utf8n_to_uvuni((U8 *) str,
3256 len,
3257 &char_length,
3258 UTF8_ALLOW_ANYUV);
3259
3260 /* The call to is_utf8_string() above hopefully
3261 * guarantees that there won't be an error. But
3262 * it's easy here to make sure. The function just
3263 * above warns and returns 0 if invalid utf8, but
3264 * it can also return 0 if the input is validly a
3265 * NUL. Disambiguate */
3266 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3267 uv = UNICODE_REPLACEMENT;
3268 }
3269
3270 /* Convert first code point to hex, including the
e2a7e165
KW
3271 * boiler plate before it. For all these, we
3272 * convert to native format so that downstream code
3273 * can continue to assume the input is native */
78c35590 3274 output_length =
3353de27 3275 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3276 "\\N{U+%X",
3277 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3278
3279 /* Make sure there is enough space to hold it */
3280 d = off + SvGROW(sv, off
3281 + output_length
3282 + (STRLEN)(send - e)
3283 + 2); /* '}' + NUL */
3284 /* And output it */
3285 Copy(hex_string, d, output_length, char);
3286 d += output_length;
3287
3288 /* For each subsequent character, append dot and
3289 * its ordinal in hex */
3290 while ((str += char_length) < str_end) {
3291 const STRLEN off = d - SvPVX_const(sv);
3292 U32 uv = utf8n_to_uvuni((U8 *) str,
3293 str_end - str,
3294 &char_length,
3295 UTF8_ALLOW_ANYUV);
3296 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3297 uv = UNICODE_REPLACEMENT;
3298 }
3299
78c35590 3300 output_length =
3353de27 3301 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3302 ".%X",
3303 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3304
3305 d = off + SvGROW(sv, off
3306 + output_length
3307 + (STRLEN)(send - e)
3308 + 2); /* '}' + NUL */
3309 Copy(hex_string, d, output_length, char);
3310 d += output_length;
3311 }
3312
3313 *d++ = '}'; /* Done. Add the trailing brace */
3314 }
3315 }
3316 else { /* Here, not in a pattern. Convert the name to a
3317 * string. */
3318
3319 /* If destination is not in utf8, unconditionally
3320 * recode it to be so. This is because \N{} implies
3321 * Unicode semantics, and scalars have to be in utf8
3322 * to guarantee those semantics */
3323 if (! has_utf8) {
3324 SvCUR_set(sv, d - SvPVX_const(sv));
3325 SvPOK_on(sv);
3326 *d = '\0';
3327 /* See Note on sizing above. */
3328 sv_utf8_upgrade_flags_grow(sv,
3329 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3330 len + (STRLEN)(send - s) + 1);
3331 d = SvPVX(sv) + SvCUR(sv);
3332 has_utf8 = TRUE;
3333 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3334
3335 /* See Note on sizing above. (NOTE: SvCUR() is not
3336 * set correctly here). */
3337 const STRLEN off = d - SvPVX_const(sv);
3338 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3339 }
3340 Copy(str, d, len, char);
3341 d += len;
423cee85 3342 }
423cee85 3343 SvREFCNT_dec(res);
cb233ae3
KW
3344
3345 /* Deprecate non-approved name syntax */
3346 if (ckWARN_d(WARN_DEPRECATED)) {
3347 bool problematic = FALSE;
3348 char* i = s;
3349
3350 /* For non-ut8 input, look to see that the first
3351 * character is an alpha, then loop through the rest
3352 * checking that each is a continuation */
3353 if (! this_utf8) {
3354 if (! isALPHAU(*i)) problematic = TRUE;
3355 else for (i = s + 1; i < e; i++) {
3356 if (isCHARNAME_CONT(*i)) continue;
3357 problematic = TRUE;
3358 break;
3359 }
3360 }
3361 else {
3362 /* Similarly for utf8. For invariants can check
3363 * directly. We accept anything above the latin1
3364 * range because it is immaterial to Perl if it is
3365 * correct or not, and is expensive to check. But
3366 * it is fairly easy in the latin1 range to convert
3367 * the variants into a single character and check
3368 * those */
3369 if (UTF8_IS_INVARIANT(*i)) {
3370 if (! isALPHAU(*i)) problematic = TRUE;
3371 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3372 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3373 *(i+1)))))
3374 {
3375 problematic = TRUE;
3376 }
3377 }
3378 if (! problematic) for (i = s + UTF8SKIP(s);
3379 i < e;
3380 i+= UTF8SKIP(i))
3381 {
3382 if (UTF8_IS_INVARIANT(*i)) {
3383 if (isCHARNAME_CONT(*i)) continue;
3384 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3385 continue;
3386 } else if (isCHARNAME_CONT(
3387 UNI_TO_NATIVE(
81c14aa2 3388 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3389 {
3390 continue;
3391 }
3392 problematic = TRUE;
3393 break;
3394 }
3395 }
3396 if (problematic) {
6e1bad6c
KW
3397 /* The e-i passed to the final %.*s makes sure that
3398 * should the trailing NUL be missing that this
3399 * print won't run off the end of the string */
cb233ae3 3400 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3401 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3402 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3403 }
3404 }
3405 } /* End \N{NAME} */
ff3f963a
KW
3406#ifdef EBCDIC
3407 if (!dorange)
3408 native_range = FALSE; /* \N{} is defined to be Unicode */
3409#endif
3410 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3411 continue;
3412
02aa26ce 3413 /* \c is a control character */
79072805
LW
3414 case 'c':
3415 s++;
961ce445 3416 if (s < send) {
17a3df4c 3417 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3418 }
961ce445
RGS
3419 else {
3420 yyerror("Missing control char name in \\c");
3421 }
79072805 3422 continue;
02aa26ce
NT
3423
3424 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3425 case 'b':
db42d148 3426 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3427 break;
3428 case 'n':
db42d148 3429 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3430 break;
3431 case 'r':
db42d148 3432 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3433 break;
3434 case 'f':
db42d148 3435 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3436 break;
3437 case 't':
db42d148 3438 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3439 break;
34a3fe2a 3440 case 'e':
db42d148 3441 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3442 break;
3443 case 'a':
db42d148 3444 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3445 break;
02aa26ce
NT
3446 } /* end switch */
3447
79072805
LW
3448 s++;
3449 continue;
02aa26ce 3450 } /* end if (backslash) */
4c3a8340
ST
3451#ifdef EBCDIC
3452 else
3453 literal_endpoint++;
3454#endif
02aa26ce 3455
f9a63242 3456 default_action:
77a135fe
KW
3457 /* If we started with encoded form, or already know we want it,
3458 then encode the next character */
3459 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3460 STRLEN len = 1;
77a135fe
KW
3461
3462
3463 /* One might think that it is wasted effort in the case of the
3464 * source being utf8 (this_utf8 == TRUE) to take the next character
3465 * in the source, convert it to an unsigned value, and then convert
3466 * it back again. But the source has not been validated here. The
3467 * routine that does the conversion checks for errors like
3468 * malformed utf8 */
3469
5f66b61c
AL
3470 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3471 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3472 if (!has_utf8) {
3473 SvCUR_set(sv, d - SvPVX_const(sv));
3474 SvPOK_on(sv);
3475 *d = '\0';
77a135fe 3476 /* See Note on sizing above. */
7bf79863
KW
3477 sv_utf8_upgrade_flags_grow(sv,
3478 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3479 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3480 d = SvPVX(sv) + SvCUR(sv);
3481 has_utf8 = TRUE;
3482 } else if (need > len) {
3483 /* encoded value larger than old, may need extra space (NOTE:
3484 * SvCUR() is not set correctly here). See Note on sizing
3485 * above. */
9d4ba2ae 3486 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3487 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3488 }
77a135fe
KW
3489 s += len;
3490
5f66b61c 3491 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3492#ifdef EBCDIC
3493 if (uv > 255 && !dorange)
3494 native_range = FALSE;
3495#endif
2b9d42f0
NIS
3496 }
3497 else {
3498 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3499 }
02aa26ce
NT
3500 } /* while loop to process each character */
3501
3502 /* terminate the string and set up the sv */
79072805 3503 *d = '\0';
95a20fc0 3504 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3505 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3506 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3507
79072805 3508 SvPOK_on(sv);
9f4817db 3509 if (PL_encoding && !has_utf8) {
d0063567
DK
3510 sv_recode_to_utf8(sv, PL_encoding);
3511 if (SvUTF8(sv))
3512 has_utf8 = TRUE;
9f4817db 3513 }
2b9d42f0 3514 if (has_utf8) {
7e2040f0 3515 SvUTF8_on(sv);
2b9d42f0 3516 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3517 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3518 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3519 }
3520 }
79072805 3521
02aa26ce 3522 /* shrink the sv if we allocated more than we used */
79072805 3523 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3524 SvPV_shrink_to_cur(sv);
79072805 3525 }
02aa26ce 3526
6154021b 3527 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3528 if (s > PL_bufptr) {
eb0d8d16
NC
3529 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3530 const char *const key = PL_lex_inpat ? "qr" : "q";
3531 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3532 const char *type;
3533 STRLEN typelen;
3534
3535 if (PL_lex_inwhat == OP_TRANS) {
3536 type = "tr";
3537 typelen = 2;
3538 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3539 type = "s";
3540 typelen = 1;
3541 } else {
3542 type = "qq";
3543 typelen = 2;
3544 }
3545
3546 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3547 type, typelen);
3548 }
6154021b 3549 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3550 } else
8990e307 3551 SvREFCNT_dec(sv);
79072805
LW
3552 return s;
3553}
3554
ffb4593c
NT
3555/* S_intuit_more
3556 * Returns TRUE if there's more to the expression (e.g., a subscript),
3557 * FALSE otherwise.
ffb4593c
NT
3558 *
3559 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3560 *
3561 * ->[ and ->{ return TRUE
3562 * { and [ outside a pattern are always subscripts, so return TRUE
3563 * if we're outside a pattern and it's not { or [, then return FALSE
3564 * if we're in a pattern and the first char is a {
3565 * {4,5} (any digits around the comma) returns FALSE
3566 * if we're in a pattern and the first char is a [
3567 * [] returns FALSE
3568 * [SOMETHING] has a funky algorithm to decide whether it's a
3569 * character class or not. It has to deal with things like
3570 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3571 * anything else returns TRUE
3572 */
3573
9cbb5ea2
GS
3574/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3575
76e3520e 3576STATIC int
cea2e8a9 3577S_intuit_more(pTHX_ register char *s)
79072805 3578{
97aff369 3579 dVAR;
7918f24d
NC
3580
3581 PERL_ARGS_ASSERT_INTUIT_MORE;
3582
3280af22 3583 if (PL_lex_brackets)
79072805
LW
3584 return TRUE;
3585 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3586 return TRUE;
3587 if (*s != '{' && *s != '[')
3588 return FALSE;
3280af22 3589 if (!PL_lex_inpat)
79072805
LW
3590 return TRUE;
3591
3592 /* In a pattern, so maybe we have {n,m}. */
3593 if (*s == '{') {
b3155d95 3594 if (regcurly(s)) {
79072805 3595 return FALSE;
b3155d95 3596 }
79072805 3597 return TRUE;
79072805
LW
3598 }
3599
3600 /* On the other hand, maybe we have a character class */
3601
3602 s++;
3603 if (*s == ']' || *s == '^')
3604 return FALSE;
3605 else {
ffb4593c 3606 /* this is terrifying, and it works */
79072805
LW
3607 int weight = 2; /* let's weigh the evidence */
3608 char seen[256];
f27ffc4a 3609 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3610 const char * const send = strchr(s,']');
3280af22 3611 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3612
3613 if (!send) /* has to be an expression */
3614 return TRUE;
3615
3616 Zero(seen,256,char);
3617 if (*s == '$')
3618 weight -= 3;
3619 else if (isDIGIT(*s)) {
3620 if (s[1] != ']') {
3621 if (isDIGIT(s[1]) && s[2] == ']')
3622 weight -= 10;
3623 }
3624 else
3625 weight -= 100;
3626 }
3627 for (; s < send; s++) {
3628 last_un_char = un_char;
3629 un_char = (unsigned char)*s;
3630 switch (*s) {
3631 case '@':
3632 case '&':
3633 case '$':
3634 weight -= seen[un_char] * 10;
7e2040f0 3635 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3636 int len;
8903cb82 3637 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3638 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3639 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3640 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3641 weight -= 100;
3642 else
3643 weight -= 10;
3644 }
3645 else if (*s == '$' && s[1] &&
93a17b20
LW
3646 strchr("[#!%*<>()-=",s[1])) {
3647 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3648 weight -= 10;
3649 else
3650 weight -= 1;
3651 }
3652 break;
3653 case '\\':
3654 un_char = 254;
3655 if (s[1]) {
93a17b20 3656 if (strchr("wds]",s[1]))
79072805 3657 weight += 100;
10edeb5d 3658 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3659 weight += 1;
93a17b20 3660 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3661 weight += 40;
3662 else if (isDIGIT(s[1])) {
3663 weight += 40;
3664 while (s[1] && isDIGIT(s[1]))
3665 s++;
3666 }
3667 }
3668 else
3669 weight += 100;
3670 break;
3671 case '-':
3672 if (s[1] == '\\')
3673 weight += 50;
93a17b20 3674 if (strchr("aA01! ",last_un_char))
79072805 3675 weight += 30;
93a17b20 3676 if (strchr("zZ79~",s[1]))
79072805 3677 weight += 30;
f27ffc4a
GS
3678 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3679 weight -= 5; /* cope with negative subscript */
79072805
LW
3680 break;
3681 default:
3792a11b
NC
3682 if (!isALNUM(last_un_char)
3683 && !(last_un_char == '$' || last_un_char == '@'
3684 || last_un_char == '&')
3685 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3686 char *d = tmpbuf;
3687 while (isALPHA(*s))
3688 *d++ = *s++;
3689 *d = '\0';
5458a98a 3690 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3691 weight -= 150;
3692 }
3693 if (un_char == last_un_char + 1)
3694 weight += 5;
3695 weight -= seen[un_char];
3696 break;
3697 }
3698 seen[un_char]++;
3699 }
3700 if (weight >= 0) /* probably a character class */
3701 return FALSE;
3702 }
3703
3704 return TRUE;
3705}
ffed7fef 3706
ffb4593c
NT
3707/*
3708 * S_intuit_method
3709 *
3710 * Does all the checking to disambiguate
3711 * foo bar
3712 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3713 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3714 *
3715 * First argument is the stuff after the first token, e.g. "bar".
3716 *
3717 * Not a method if bar is a filehandle.
3718 * Not a method if foo is a subroutine prototyped to take a filehandle.
3719 * Not a method if it's really "Foo $bar"
3720 * Method if it's "foo $bar"
3721 * Not a method if it's really "print foo $bar"
3722 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3723 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3724 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3725 * =>
3726 */
3727
76e3520e 3728STATIC int
62d55b22 3729S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3730{
97aff369 3731 dVAR;
a0d0e21e 3732 char *s = start + (*start == '$');
3280af22 3733 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3734 STRLEN len;
3735 GV* indirgv;
5db06880
NC
3736#ifdef PERL_MAD
3737 int soff;
3738#endif
a0d0e21e 3739
7918f24d
NC
3740 PERL_ARGS_ASSERT_INTUIT_METHOD;
3741
a0d0e21e 3742 if (gv) {
62d55b22 3743 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3744 return 0;
62d55b22
NC
3745 if (cv) {
3746 if (SvPOK(cv)) {
8fa6a409 3747 const char *proto = CvPROTO(cv);
62d55b22
NC
3748 if (proto) {
3749 if (*proto == ';')
3750 proto++;
3751 if (*proto == '*')
3752 return 0;
3753 }
b6c543e3
IZ
3754 }
3755 } else
c35e046a 3756 gv = NULL;
a0d0e21e 3757 }
8903cb82 3758 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3759 /* start is the beginning of the possible filehandle/object,
3760 * and s is the end of it
3761 * tmpbuf is a copy of it
3762 */
3763
a0d0e21e 3764 if (*start == '$') {
3ef1310e
RGS
3765 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3766 isUPPER(*PL_tokenbuf))
a0d0e21e 3767 return 0;
5db06880
NC
3768#ifdef PERL_MAD
3769 len = start - SvPVX(PL_linestr);
3770#endif
29595ff2 3771<