This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: move leftbracket label
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
802a15e9 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805 150 * can get by with a single comparison (if the compiler is smart enough).
9da1dd8f
DM
151 *
152 * These values refer to the various states within a sublex parse,
153 * i.e. within a double quotish string
79072805
LW
154 */
155
fb73857a 156/* #define LEX_NOTPARSING 11 is done in perl.h. */
157
b6007c36
DM
158#define LEX_NORMAL 10 /* normal code (ie not within "...") */
159#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
160#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
161#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
162#define LEX_INTERPSTART 6 /* expecting the start of a $var */
163
164 /* at end of code, eg "$x" followed by: */
165#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
166#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
167
168#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
169 string or after \E, $foo, etc */
170#define LEX_INTERPCONST 2 /* NOT USED */
171#define LEX_FORMLINE 1 /* expecting a format line */
172#define LEX_KNOWNEXT 0 /* next token known; just return it */
173
79072805 174
bbf60fe6 175#ifdef DEBUGGING
27da23d5 176static const char* const lex_state_names[] = {
bbf60fe6
DM
177 "KNOWNEXT",
178 "FORMLINE",
179 "INTERPCONST",
180 "INTERPCONCAT",
181 "INTERPENDMAYBE",
182 "INTERPEND",
183 "INTERPSTART",
184 "INTERPPUSH",
185 "INTERPCASEMOD",
186 "INTERPNORMAL",
187 "NORMAL"
188};
189#endif
190
79072805
LW
191#ifdef ff_next
192#undef ff_next
d48672a2
LW
193#endif
194
79072805 195#include "keywords.h"
fe14fcc3 196
ffb4593c
NT
197/* CLINE is a macro that ensures PL_copline has a sane value */
198
ae986130
LW
199#ifdef CLINE
200#undef CLINE
201#endif
57843af0 202#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 203
5db06880 204#ifdef PERL_MAD
29595ff2
NC
205# define SKIPSPACE0(s) skipspace0(s)
206# define SKIPSPACE1(s) skipspace1(s)
207# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
208# define PEEKSPACE(s) skipspace2(s,0)
209#else
210# define SKIPSPACE0(s) skipspace(s)
211# define SKIPSPACE1(s) skipspace(s)
212# define SKIPSPACE2(s,tsv) skipspace(s)
213# define PEEKSPACE(s) skipspace(s)
214#endif
215
ffb4593c
NT
216/*
217 * Convenience functions to return different tokens and prime the
9cbb5ea2 218 * lexer for the next token. They all take an argument.
ffb4593c
NT
219 *
220 * TOKEN : generic token (used for '(', DOLSHARP, etc)
221 * OPERATOR : generic operator
222 * AOPERATOR : assignment operator
223 * PREBLOCK : beginning the block after an if, while, foreach, ...
224 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
225 * PREREF : *EXPR where EXPR is not a simple identifier
226 * TERM : expression term
227 * LOOPX : loop exiting command (goto, last, dump, etc)
228 * FTST : file test operator
229 * FUN0 : zero-argument function
7eb971ee 230 * FUN0OP : zero-argument function, with its op created in this file
2d2e263d 231 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
232 * BOop : bitwise or or xor
233 * BAop : bitwise and
234 * SHop : shift operator
235 * PWop : power operator
9cbb5ea2 236 * PMop : pattern-matching operator
ffb4593c
NT
237 * Aop : addition-level operator
238 * Mop : multiplication-level operator
239 * Eop : equality-testing operator
e5edeb50 240 * Rop : relational operator <= != gt
ffb4593c
NT
241 *
242 * Also see LOP and lop() below.
243 */
244
998054bd 245#ifdef DEBUGGING /* Serve -DT. */
704d4215 246# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 247#else
bbf60fe6 248# define REPORT(retval) (retval)
998054bd
SC
249#endif
250
bbf60fe6
DM
251#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
252#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
253#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
254#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
255#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
256#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
257#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
258#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
259#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
260#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
7eb971ee 261#define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
6154021b
RGS
262#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
263#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
264#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
265#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
266#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
267#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
268#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
269#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
270#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
271#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 272
a687059c
LW
273/* This bit of chicanery makes a unary function followed by
274 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
275 * The UNIDOR macro is for unary functions that can be followed by the //
276 * operator (such as C<shift // 0>).
a687059c 277 */
d68ce4ac 278#define UNI3(f,x,have_x) { \
6154021b 279 pl_yylval.ival = f; \
d68ce4ac 280 if (have_x) PL_expect = x; \
376fcdbf
AL
281 PL_bufptr = s; \
282 PL_last_uni = PL_oldbufptr; \
283 PL_last_lop_op = f; \
284 if (*s == '(') \
285 return REPORT( (int)FUNC1 ); \
29595ff2 286 s = PEEKSPACE(s); \
376fcdbf
AL
287 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
288 }
d68ce4ac
FC
289#define UNI(f) UNI3(f,XTERM,1)
290#define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
b5fb7ce3
FC
291#define UNIPROTO(f,optional) { \
292 if (optional) PL_last_uni = PL_oldbufptr; \
22393538
MH
293 OPERATOR(f); \
294 }
a687059c 295
d68ce4ac 296#define UNIBRACK(f) UNI3(f,0,0)
79072805 297
9f68db38 298/* grandfather return to old style */
78cdf107
Z
299#define OLDLOP(f) \
300 do { \
301 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
302 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
303 pl_yylval.ival = (f); \
304 PL_expect = XTERM; \
305 PL_bufptr = s; \
306 return (int)LSTOP; \
307 } while(0)
79072805 308
8fa7f367
JH
309#ifdef DEBUGGING
310
6154021b 311/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
312enum token_type {
313 TOKENTYPE_NONE,
314 TOKENTYPE_IVAL,
6154021b 315 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
316 TOKENTYPE_PVAL,
317 TOKENTYPE_OPVAL,
318 TOKENTYPE_GVVAL
319};
320
6d4a66ac
NC
321static struct debug_tokens {
322 const int token;
323 enum token_type type;
324 const char *name;
325} const debug_tokens[] =
9041c2e3 326{
bbf60fe6
DM
327 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
328 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
329 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
330 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
331 { ARROW, TOKENTYPE_NONE, "ARROW" },
332 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
333 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
334 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
335 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
336 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 337 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
338 { DO, TOKENTYPE_NONE, "DO" },
339 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
340 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
341 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
342 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
343 { ELSE, TOKENTYPE_NONE, "ELSE" },
344 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
345 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
346 { FOR, TOKENTYPE_IVAL, "FOR" },
347 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
348 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
349 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
7eb971ee 350 { FUNC0OP, TOKENTYPE_OPVAL, "FUNC0OP" },
bbf60fe6
DM
351 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
352 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
353 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 354 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
355 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
356 { IF, TOKENTYPE_IVAL, "IF" },
5db1eb8d 357 { LABEL, TOKENTYPE_OPVAL, "LABEL" },
bbf60fe6
DM
358 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
359 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
360 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
361 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
362 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
363 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
364 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
365 { MY, TOKENTYPE_IVAL, "MY" },
366 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
367 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
368 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
369 { OROP, TOKENTYPE_IVAL, "OROP" },
370 { OROR, TOKENTYPE_NONE, "OROR" },
371 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
372 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
373 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
374 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
375 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
376 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
377 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
378 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
379 { PREINC, TOKENTYPE_NONE, "PREINC" },
380 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
381 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
382 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
383 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
384 { SUB, TOKENTYPE_NONE, "SUB" },
385 { THING, TOKENTYPE_OPVAL, "THING" },
386 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
387 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
388 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
389 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
390 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
391 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 392 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
393 { WHILE, TOKENTYPE_IVAL, "WHILE" },
394 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 395 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 396 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
397};
398
6154021b 399/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 400
bbf60fe6 401STATIC int
704d4215 402S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 403{
97aff369 404 dVAR;
7918f24d
NC
405
406 PERL_ARGS_ASSERT_TOKEREPORT;
407
bbf60fe6 408 if (DEBUG_T_TEST) {
bd61b366 409 const char *name = NULL;
bbf60fe6 410 enum token_type type = TOKENTYPE_NONE;
f54cb97a 411 const struct debug_tokens *p;
396482e1 412 SV* const report = newSVpvs("<== ");
bbf60fe6 413
f54cb97a 414 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
415 if (p->token == (int)rv) {
416 name = p->name;
417 type = p->type;
418 break;
419 }
420 }
421 if (name)
54667de8 422 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
423 else if ((char)rv > ' ' && (char)rv < '~')
424 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
425 else if (!rv)
396482e1 426 sv_catpvs(report, "EOF");
bbf60fe6
DM
427 else
428 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
429 switch (type) {
430 case TOKENTYPE_NONE:
431 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
432 break;
433 case TOKENTYPE_IVAL:
704d4215 434 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
435 break;
436 case TOKENTYPE_OPNUM:
437 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 438 PL_op_name[lvalp->ival]);
bbf60fe6
DM
439 break;
440 case TOKENTYPE_PVAL:
704d4215 441 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
442 break;
443 case TOKENTYPE_OPVAL:
704d4215 444 if (lvalp->opval) {
401441c0 445 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
446 PL_op_name[lvalp->opval->op_type]);
447 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 448 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 449 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
450 }
451
452 }
401441c0 453 else
396482e1 454 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
455 break;
456 }
b6007c36 457 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
458 };
459 return (int)rv;
998054bd
SC
460}
461
b6007c36
DM
462
463/* print the buffer with suitable escapes */
464
465STATIC void
15f169a1 466S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 467{
396482e1 468 SV* const tmp = newSVpvs("");
7918f24d
NC
469
470 PERL_ARGS_ASSERT_PRINTBUF;
471
b6007c36
DM
472 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
473 SvREFCNT_dec(tmp);
474}
475
8fa7f367
JH
476#endif
477
8290c323
NC
478static int
479S_deprecate_commaless_var_list(pTHX) {
480 PL_expect = XTERM;
481 deprecate("comma-less variable list");
482 return REPORT(','); /* grandfather non-comma-format format */
483}
484
ffb4593c
NT
485/*
486 * S_ao
487 *
c963b151
BD
488 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
489 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
490 */
491
76e3520e 492STATIC int
cea2e8a9 493S_ao(pTHX_ int toketype)
a0d0e21e 494{
97aff369 495 dVAR;
3280af22
NIS
496 if (*PL_bufptr == '=') {
497 PL_bufptr++;
a0d0e21e 498 if (toketype == ANDAND)
6154021b 499 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 500 else if (toketype == OROR)
6154021b 501 pl_yylval.ival = OP_ORASSIGN;
c963b151 502 else if (toketype == DORDOR)
6154021b 503 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
504 toketype = ASSIGNOP;
505 }
506 return toketype;
507}
508
ffb4593c
NT
509/*
510 * S_no_op
511 * When Perl expects an operator and finds something else, no_op
512 * prints the warning. It always prints "<something> found where
513 * operator expected. It prints "Missing semicolon on previous line?"
514 * if the surprise occurs at the start of the line. "do you need to
515 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
516 * where the compiler doesn't know if foo is a method call or a function.
517 * It prints "Missing operator before end of line" if there's nothing
518 * after the missing operator, or "... before <...>" if there is something
519 * after the missing operator.
520 */
521
76e3520e 522STATIC void
15f169a1 523S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 524{
97aff369 525 dVAR;
9d4ba2ae
AL
526 char * const oldbp = PL_bufptr;
527 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 528
7918f24d
NC
529 PERL_ARGS_ASSERT_NO_OP;
530
1189a94a
GS
531 if (!s)
532 s = oldbp;
07c798fb 533 else
1189a94a 534 PL_bufptr = s;
734ab321 535 yywarn(Perl_form(aTHX_ "%s found where operator expected", what), UTF ? SVf_UTF8 : 0);
56da5a46
RGS
536 if (ckWARN_d(WARN_SYNTAX)) {
537 if (is_first)
538 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
539 "\t(Missing semicolon on previous line?)\n");
540 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 541 const char *t;
734ab321
BF
542 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':');
543 t += UTF ? UTF8SKIP(t) : 1)
c35e046a 544 NOOP;
56da5a46
RGS
545 if (t < PL_bufptr && isSPACE(*t))
546 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
547 "\t(Do you need to predeclare %"SVf"?)\n",
548 SVfARG(newSVpvn_flags(PL_oldoldbufptr, (STRLEN)(t - PL_oldoldbufptr),
549 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46
RGS
550 }
551 else {
552 assert(s >= oldbp);
553 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
734ab321
BF
554 "\t(Missing operator before %"SVf"?)\n",
555 SVfARG(newSVpvn_flags(oldbp, (STRLEN)(s - oldbp),
556 SVs_TEMP | (UTF ? SVf_UTF8 : 0))));
56da5a46 557 }
07c798fb 558 }
3280af22 559 PL_bufptr = oldbp;
8990e307
LW
560}
561
ffb4593c
NT
562/*
563 * S_missingterm
564 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 565 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
566 * If we're in a delimited string and the delimiter is a control
567 * character, it's reformatted into a two-char sequence like ^C.
568 * This is fatal.
569 */
570
76e3520e 571STATIC void
cea2e8a9 572S_missingterm(pTHX_ char *s)
8990e307 573{
97aff369 574 dVAR;
8990e307
LW
575 char tmpbuf[3];
576 char q;
577 if (s) {
9d4ba2ae 578 char * const nl = strrchr(s,'\n');
d2719217 579 if (nl)
8990e307
LW
580 *nl = '\0';
581 }
463559e7 582 else if (isCNTRL(PL_multi_close)) {
8990e307 583 *tmpbuf = '^';
585ec06d 584 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
585 tmpbuf[2] = '\0';
586 s = tmpbuf;
587 }
588 else {
eb160463 589 *tmpbuf = (char)PL_multi_close;
8990e307
LW
590 tmpbuf[1] = '\0';
591 s = tmpbuf;
592 }
593 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 594 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 595}
79072805 596
dd0ac2b9
FC
597#include "feature.h"
598
0d863452 599/*
0d863452
RH
600 * Check whether the named feature is enabled.
601 */
26ea9e12 602bool
3fff3427 603Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 604{
97aff369 605 dVAR;
4a731d7b 606 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
607
608 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
ca4d40c4
FC
609
610 assert(CURRENT_FEATURE_BUNDLE == FEATURE_BUNDLE_CUSTOM);
7918f24d 611
26ea9e12
NC
612 if (namelen > MAX_FEATURE_LEN)
613 return FALSE;
3fff3427 614 memcpy(&he_name[8], name, namelen);
7d69d4a6 615
c8ca97b0
NC
616 return cBOOL(cop_hints_fetch_pvn(PL_curcop, he_name, 8 + namelen, 0,
617 REFCOUNTED_HE_EXISTS));
0d863452
RH
618}
619
ffb4593c 620/*
9cbb5ea2
GS
621 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
622 * utf16-to-utf8-reversed.
ffb4593c
NT
623 */
624
c39cd008
GS
625#ifdef PERL_CR_FILTER
626static void
627strip_return(SV *sv)
628{
95a20fc0 629 register const char *s = SvPVX_const(sv);
9d4ba2ae 630 register const char * const e = s + SvCUR(sv);
7918f24d
NC
631
632 PERL_ARGS_ASSERT_STRIP_RETURN;
633
c39cd008
GS
634 /* outer loop optimized to do nothing if there are no CR-LFs */
635 while (s < e) {
636 if (*s++ == '\r' && *s == '\n') {
637 /* hit a CR-LF, need to copy the rest */
638 register char *d = s - 1;
639 *d++ = *s++;
640 while (s < e) {
641 if (*s == '\r' && s[1] == '\n')
642 s++;
643 *d++ = *s++;
644 }
645 SvCUR(sv) -= s - d;
646 return;
647 }
648 }
649}
a868473f 650
76e3520e 651STATIC I32
c39cd008 652S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 653{
f54cb97a 654 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
655 if (count > 0 && !maxlen)
656 strip_return(sv);
657 return count;
a868473f
NIS
658}
659#endif
660
ffb4593c 661/*
8eaa0acf
Z
662=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
663
664Creates and initialises a new lexer/parser state object, supplying
665a context in which to lex and parse from a new source of Perl code.
666A pointer to the new state object is placed in L</PL_parser>. An entry
667is made on the save stack so that upon unwinding the new state object
668will be destroyed and the former value of L</PL_parser> will be restored.
669Nothing else need be done to clean up the parsing context.
670
671The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
672non-null, provides a string (in SV form) containing code to be parsed.
673A copy of the string is made, so subsequent modification of I<line>
674does not affect parsing. I<rsfp>, if non-null, provides an input stream
675from which code will be read to be parsed. If both are non-null, the
676code in I<line> comes first and must consist of complete lines of input,
677and I<rsfp> supplies the remainder of the source.
678
e368b3bd
FC
679The I<flags> parameter is reserved for future use. Currently it is only
680used by perl internally, so extensions should always pass zero.
8eaa0acf
Z
681
682=cut
683*/
ffb4593c 684
27fcb6ee 685/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
87606032
NC
686 can share filters with the current parser.
687 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
688 caller, hence isn't owned by the parser, so shouldn't be closed on parser
689 destruction. This is used to handle the case of defaulting to reading the
690 script from the standard input because no filename was given on the command
691 line (without getting confused by situation where STDIN has been closed, so
692 the script handle is opened on fd 0) */
27fcb6ee 693
a0d0e21e 694void
8eaa0acf 695Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 696{
97aff369 697 dVAR;
6ef55633 698 const char *s = NULL;
5486870f 699 yy_parser *parser, *oparser;
60d63348 700 if (flags && flags & ~LEX_START_FLAGS)
8eaa0acf 701 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
702
703 /* create and initialise a parser */
704
199e78b7 705 Newxz(parser, 1, yy_parser);
5486870f 706 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
707 PL_parser = parser;
708
28ac2b49
Z
709 parser->stack = NULL;
710 parser->ps = NULL;
711 parser->stack_size = 0;
acdf0a21 712
e3abe207
DM
713 /* on scope exit, free this parser and restore any outer one */
714 SAVEPARSER(parser);
7c4baf47 715 parser->saved_curcop = PL_curcop;
e3abe207 716
acdf0a21 717 /* initialise lexer state */
8990e307 718
fb205e7a
DM
719#ifdef PERL_MAD
720 parser->curforce = -1;
721#else
722 parser->nexttoke = 0;
723#endif
ca4cfd28 724 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 725 parser->copline = NOLINE;
5afb0a62 726 parser->lex_state = LEX_NORMAL;
c2598295 727 parser->expect = XSTATE;
2f9285f8 728 parser->rsfp = rsfp;
27fcb6ee
FC
729 parser->rsfp_filters =
730 !(flags & LEX_START_SAME_FILTER) || !oparser
d3cd8e11
FC
731 ? NULL
732 : MUTABLE_AV(SvREFCNT_inc(
733 oparser->rsfp_filters
734 ? oparser->rsfp_filters
735 : (oparser->rsfp_filters = newAV())
736 ));
2f9285f8 737
199e78b7
DM
738 Newx(parser->lex_brackstack, 120, char);
739 Newx(parser->lex_casestack, 12, char);
740 *parser->lex_casestack = '\0';
02b34bbe 741
10efb74f 742 if (line) {
0528fd32 743 STRLEN len;
10efb74f 744 s = SvPV_const(line, len);
0abcdfa4
FC
745 parser->linestr = flags & LEX_START_COPIED
746 ? SvREFCNT_inc_simple_NN(line)
747 : newSVpvn_flags(s, len, SvUTF8(line));
37555a72 748 if (!len || s[len-1] != ';')
bdc0bf6f 749 sv_catpvs(parser->linestr, "\n;");
0abcdfa4
FC
750 } else {
751 parser->linestr = newSVpvs("\n;");
8990e307 752 }
f06b5848
DM
753 parser->oldoldbufptr =
754 parser->oldbufptr =
755 parser->bufptr =
756 parser->linestart = SvPVX(parser->linestr);
757 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
758 parser->last_lop = parser->last_uni = NULL;
87606032
NC
759 parser->lex_flags = flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
760 |LEX_DONT_CLOSE_RSFP);
737c24fc 761
60d63348 762 parser->in_pod = parser->filtered = 0;
79072805 763}
a687059c 764
e3abe207
DM
765
766/* delete a parser object */
767
768void
769Perl_parser_free(pTHX_ const yy_parser *parser)
770{
7918f24d
NC
771 PERL_ARGS_ASSERT_PARSER_FREE;
772
7c4baf47 773 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
774 SvREFCNT_dec(parser->linestr);
775
87606032 776 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
2f9285f8 777 PerlIO_clearerr(parser->rsfp);
799361c3
SH
778 else if (parser->rsfp && (!parser->old_parser ||
779 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 780 PerlIO_close(parser->rsfp);
5486870f 781 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 782
e3abe207
DM
783 Safefree(parser->lex_brackstack);
784 Safefree(parser->lex_casestack);
785 PL_parser = parser->old_parser;
786 Safefree(parser);
787}
788
789
ffb4593c 790/*
f0e67a1d
Z
791=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
792
793Buffer scalar containing the chunk currently under consideration of the
794text currently being lexed. This is always a plain string scalar (for
795which C<SvPOK> is true). It is not intended to be used as a scalar by
796normal scalar means; instead refer to the buffer directly by the pointer
797variables described below.
798
799The lexer maintains various C<char*> pointers to things in the
800C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
801reallocated, all of these pointers must be updated. Don't attempt to
802do this manually, but rather use L</lex_grow_linestr> if you need to
803reallocate the buffer.
804
805The content of the text chunk in the buffer is commonly exactly one
806complete line of input, up to and including a newline terminator,
807but there are situations where it is otherwise. The octets of the
808buffer may be intended to be interpreted as either UTF-8 or Latin-1.
809The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
810flag on this scalar, which may disagree with it.
811
812For direct examination of the buffer, the variable
813L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
814lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
815of these pointers is usually preferable to examination of the scalar
816through normal scalar means.
817
818=for apidoc AmxU|char *|PL_parser-E<gt>bufend
819
820Direct pointer to the end of the chunk of text currently being lexed, the
821end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
822+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
823always located at the end of the buffer, and does not count as part of
824the buffer's contents.
825
826=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
827
828Points to the current position of lexing inside the lexer buffer.
829Characters around this point may be freely examined, within
830the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
831L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
832interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
833
834Lexing code (whether in the Perl core or not) moves this pointer past
835the characters that it consumes. It is also expected to perform some
836bookkeeping whenever a newline character is consumed. This movement
837can be more conveniently performed by the function L</lex_read_to>,
838which handles newlines appropriately.
839
840Interpretation of the buffer's octets can be abstracted out by
841using the slightly higher-level functions L</lex_peek_unichar> and
842L</lex_read_unichar>.
843
844=for apidoc AmxU|char *|PL_parser-E<gt>linestart
845
846Points to the start of the current line inside the lexer buffer.
847This is useful for indicating at which column an error occurred, and
848not much else. This must be updated by any lexing code that consumes
849a newline; the function L</lex_read_to> handles this detail.
850
851=cut
852*/
853
854/*
855=for apidoc Amx|bool|lex_bufutf8
856
857Indicates whether the octets in the lexer buffer
858(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
859of Unicode characters. If not, they should be interpreted as Latin-1
860characters. This is analogous to the C<SvUTF8> flag for scalars.
861
862In UTF-8 mode, it is not guaranteed that the lexer buffer actually
863contains valid UTF-8. Lexing code must be robust in the face of invalid
864encoding.
865
866The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
867is significant, but not the whole story regarding the input character
868encoding. Normally, when a file is being read, the scalar contains octets
869and its C<SvUTF8> flag is off, but the octets should be interpreted as
870UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
871however, the scalar may have the C<SvUTF8> flag on, and in this case its
872octets should be interpreted as UTF-8 unless the C<use bytes> pragma
873is in effect. This logic may change in the future; use this function
874instead of implementing the logic yourself.
875
876=cut
877*/
878
879bool
880Perl_lex_bufutf8(pTHX)
881{
882 return UTF;
883}
884
885/*
886=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
887
888Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
889at least I<len> octets (including terminating NUL). Returns a
890pointer to the reallocated buffer. This is necessary before making
891any direct modification of the buffer that would increase its length.
892L</lex_stuff_pvn> provides a more convenient way to insert text into
893the buffer.
894
895Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
896this function updates all of the lexer's variables that point directly
897into the buffer.
898
899=cut
900*/
901
902char *
903Perl_lex_grow_linestr(pTHX_ STRLEN len)
904{
905 SV *linestr;
906 char *buf;
907 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
c7641931 908 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
f0e67a1d
Z
909 linestr = PL_parser->linestr;
910 buf = SvPVX(linestr);
911 if (len <= SvLEN(linestr))
912 return buf;
913 bufend_pos = PL_parser->bufend - buf;
914 bufptr_pos = PL_parser->bufptr - buf;
915 oldbufptr_pos = PL_parser->oldbufptr - buf;
916 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
917 linestart_pos = PL_parser->linestart - buf;
918 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
919 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
c7641931
DM
920 re_eval_start_pos = PL_sublex_info.re_eval_start ?
921 PL_sublex_info.re_eval_start - buf : 0;
922
f0e67a1d 923 buf = sv_grow(linestr, len);
c7641931 924
f0e67a1d
Z
925 PL_parser->bufend = buf + bufend_pos;
926 PL_parser->bufptr = buf + bufptr_pos;
927 PL_parser->oldbufptr = buf + oldbufptr_pos;
928 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
929 PL_parser->linestart = buf + linestart_pos;
930 if (PL_parser->last_uni)
931 PL_parser->last_uni = buf + last_uni_pos;
932 if (PL_parser->last_lop)
933 PL_parser->last_lop = buf + last_lop_pos;
c7641931
DM
934 if (PL_sublex_info.re_eval_start)
935 PL_sublex_info.re_eval_start = buf + re_eval_start_pos;
f0e67a1d
Z
936 return buf;
937}
938
939/*
83aa740e 940=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
941
942Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
943immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
944reallocating the buffer if necessary. This means that lexing code that
945runs later will see the characters as if they had appeared in the input.
946It is not recommended to do this as part of normal parsing, and most
947uses of this facility run the risk of the inserted characters being
948interpreted in an unintended manner.
949
950The string to be inserted is represented by I<len> octets starting
951at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
952according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
953The characters are recoded for the lexer buffer, according to how the
954buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 955to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
956function is more convenient.
957
958=cut
959*/
960
961void
83aa740e 962Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 963{
749123ff 964 dVAR;
f0e67a1d
Z
965 char *bufptr;
966 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
967 if (flags & ~(LEX_STUFF_UTF8))
968 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
969 if (UTF) {
970 if (flags & LEX_STUFF_UTF8) {
971 goto plain_copy;
972 } else {
973 STRLEN highhalf = 0;
83aa740e 974 const char *p, *e = pv+len;
f0e67a1d
Z
975 for (p = pv; p != e; p++)
976 highhalf += !!(((U8)*p) & 0x80);
977 if (!highhalf)
978 goto plain_copy;
979 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
980 bufptr = PL_parser->bufptr;
981 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
982 SvCUR_set(PL_parser->linestr,
983 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
984 PL_parser->bufend += len+highhalf;
985 for (p = pv; p != e; p++) {
986 U8 c = (U8)*p;
987 if (c & 0x80) {
988 *bufptr++ = (char)(0xc0 | (c >> 6));
989 *bufptr++ = (char)(0x80 | (c & 0x3f));
990 } else {
991 *bufptr++ = (char)c;
992 }
993 }
994 }
995 } else {
996 if (flags & LEX_STUFF_UTF8) {
997 STRLEN highhalf = 0;
83aa740e 998 const char *p, *e = pv+len;
f0e67a1d
Z
999 for (p = pv; p != e; p++) {
1000 U8 c = (U8)*p;
1001 if (c >= 0xc4) {
1002 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1003 "non-Latin-1 character into Latin-1 input");
1004 } else if (c >= 0xc2 && p+1 != e &&
1005 (((U8)p[1]) & 0xc0) == 0x80) {
1006 p++;
1007 highhalf++;
1008 } else if (c >= 0x80) {
1009 /* malformed UTF-8 */
1010 ENTER;
1011 SAVESPTR(PL_warnhook);
1012 PL_warnhook = PERL_WARNHOOK_FATAL;
1013 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
1014 LEAVE;
1015 }
1016 }
1017 if (!highhalf)
1018 goto plain_copy;
1019 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1020 bufptr = PL_parser->bufptr;
1021 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
1022 SvCUR_set(PL_parser->linestr,
1023 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1024 PL_parser->bufend += len-highhalf;
1025 for (p = pv; p != e; p++) {
1026 U8 c = (U8)*p;
1027 if (c & 0x80) {
1028 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1029 p++;
1030 } else {
1031 *bufptr++ = (char)c;
1032 }
1033 }
1034 } else {
1035 plain_copy:
1036 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1037 bufptr = PL_parser->bufptr;
1038 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1039 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1040 PL_parser->bufend += len;
1041 Copy(pv, bufptr, len, char);
1042 }
1043 }
1044}
1045
1046/*
9dcc53ea
Z
1047=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1048
1049Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1050immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1051reallocating the buffer if necessary. This means that lexing code that
1052runs later will see the characters as if they had appeared in the input.
1053It is not recommended to do this as part of normal parsing, and most
1054uses of this facility run the risk of the inserted characters being
1055interpreted in an unintended manner.
1056
1057The string to be inserted is represented by octets starting at I<pv>
1058and continuing to the first nul. These octets are interpreted as either
1059UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1060in I<flags>. The characters are recoded for the lexer buffer, according
1061to how the buffer is currently being interpreted (L</lex_bufutf8>).
1062If it is not convenient to nul-terminate a string to be inserted, the
1063L</lex_stuff_pvn> function is more appropriate.
1064
1065=cut
1066*/
1067
1068void
1069Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1070{
1071 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1072 lex_stuff_pvn(pv, strlen(pv), flags);
1073}
1074
1075/*
f0e67a1d
Z
1076=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1077
1078Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1079immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1080reallocating the buffer if necessary. This means that lexing code that
1081runs later will see the characters as if they had appeared in the input.
1082It is not recommended to do this as part of normal parsing, and most
1083uses of this facility run the risk of the inserted characters being
1084interpreted in an unintended manner.
1085
1086The string to be inserted is the string value of I<sv>. The characters
1087are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1088being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1089not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1090need to construct a scalar.
1091
1092=cut
1093*/
1094
1095void
1096Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1097{
1098 char *pv;
1099 STRLEN len;
1100 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1101 if (flags)
1102 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1103 pv = SvPV(sv, len);
1104 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1105}
1106
1107/*
1108=for apidoc Amx|void|lex_unstuff|char *ptr
1109
1110Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1111I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1112This hides the discarded text from any lexing code that runs later,
1113as if the text had never appeared.
1114
1115This is not the normal way to consume lexed text. For that, use
1116L</lex_read_to>.
1117
1118=cut
1119*/
1120
1121void
1122Perl_lex_unstuff(pTHX_ char *ptr)
1123{
1124 char *buf, *bufend;
1125 STRLEN unstuff_len;
1126 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1127 buf = PL_parser->bufptr;
1128 if (ptr < buf)
1129 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1130 if (ptr == buf)
1131 return;
1132 bufend = PL_parser->bufend;
1133 if (ptr > bufend)
1134 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1135 unstuff_len = ptr - buf;
1136 Move(ptr, buf, bufend+1-ptr, char);
1137 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1138 PL_parser->bufend = bufend - unstuff_len;
1139}
1140
1141/*
1142=for apidoc Amx|void|lex_read_to|char *ptr
1143
1144Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1145to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1146performing the correct bookkeeping whenever a newline character is passed.
1147This is the normal way to consume lexed text.
1148
1149Interpretation of the buffer's octets can be abstracted out by
1150using the slightly higher-level functions L</lex_peek_unichar> and
1151L</lex_read_unichar>.
1152
1153=cut
1154*/
1155
1156void
1157Perl_lex_read_to(pTHX_ char *ptr)
1158{
1159 char *s;
1160 PERL_ARGS_ASSERT_LEX_READ_TO;
1161 s = PL_parser->bufptr;
1162 if (ptr < s || ptr > PL_parser->bufend)
1163 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1164 for (; s != ptr; s++)
1165 if (*s == '\n') {
1166 CopLINE_inc(PL_curcop);
1167 PL_parser->linestart = s+1;
1168 }
1169 PL_parser->bufptr = ptr;
1170}
1171
1172/*
1173=for apidoc Amx|void|lex_discard_to|char *ptr
1174
1175Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1176up to I<ptr>. The remaining content of the buffer will be moved, and
1177all pointers into the buffer updated appropriately. I<ptr> must not
1178be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1179it is not permitted to discard text that has yet to be lexed.
1180
1181Normally it is not necessarily to do this directly, because it suffices to
1182use the implicit discarding behaviour of L</lex_next_chunk> and things
1183based on it. However, if a token stretches across multiple lines,
1f317c95 1184and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1185that purpose, then after completion of the token it would be wise to
1186explicitly discard the now-unneeded earlier lines, to avoid future
1187multi-line tokens growing the buffer without bound.
1188
1189=cut
1190*/
1191
1192void
1193Perl_lex_discard_to(pTHX_ char *ptr)
1194{
1195 char *buf;
1196 STRLEN discard_len;
1197 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1198 buf = SvPVX(PL_parser->linestr);
1199 if (ptr < buf)
1200 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1201 if (ptr == buf)
1202 return;
1203 if (ptr > PL_parser->bufptr)
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1205 discard_len = ptr - buf;
1206 if (PL_parser->oldbufptr < ptr)
1207 PL_parser->oldbufptr = ptr;
1208 if (PL_parser->oldoldbufptr < ptr)
1209 PL_parser->oldoldbufptr = ptr;
1210 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1211 PL_parser->last_uni = NULL;
1212 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1213 PL_parser->last_lop = NULL;
1214 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1215 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1216 PL_parser->bufend -= discard_len;
1217 PL_parser->bufptr -= discard_len;
1218 PL_parser->oldbufptr -= discard_len;
1219 PL_parser->oldoldbufptr -= discard_len;
1220 if (PL_parser->last_uni)
1221 PL_parser->last_uni -= discard_len;
1222 if (PL_parser->last_lop)
1223 PL_parser->last_lop -= discard_len;
1224}
1225
1226/*
1227=for apidoc Amx|bool|lex_next_chunk|U32 flags
1228
1229Reads in the next chunk of text to be lexed, appending it to
1230L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1231looked to the end of the current chunk and wants to know more. It is
1232usual, but not necessary, for lexing to have consumed the entirety of
1233the current chunk at this time.
1234
1235If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1236chunk (i.e., the current chunk has been entirely consumed), normally the
1237current chunk will be discarded at the same time that the new chunk is
1238read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1239will not be discarded. If the current chunk has not been entirely
1240consumed, then it will not be discarded regardless of the flag.
1241
1242Returns true if some new text was added to the buffer, or false if the
1243buffer has reached the end of the input text.
1244
1245=cut
1246*/
1247
1248#define LEX_FAKE_EOF 0x80000000
1249
1250bool
1251Perl_lex_next_chunk(pTHX_ U32 flags)
1252{
1253 SV *linestr;
1254 char *buf;
1255 STRLEN old_bufend_pos, new_bufend_pos;
1256 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1257 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1258 bool got_some_for_debugger = 0;
f0e67a1d
Z
1259 bool got_some;
1260 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1261 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1262 linestr = PL_parser->linestr;
1263 buf = SvPVX(linestr);
1264 if (!(flags & LEX_KEEP_PREVIOUS) &&
1265 PL_parser->bufptr == PL_parser->bufend) {
1266 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1267 linestart_pos = 0;
1268 if (PL_parser->last_uni != PL_parser->bufend)
1269 PL_parser->last_uni = NULL;
1270 if (PL_parser->last_lop != PL_parser->bufend)
1271 PL_parser->last_lop = NULL;
1272 last_uni_pos = last_lop_pos = 0;
1273 *buf = 0;
1274 SvCUR(linestr) = 0;
1275 } else {
1276 old_bufend_pos = PL_parser->bufend - buf;
1277 bufptr_pos = PL_parser->bufptr - buf;
1278 oldbufptr_pos = PL_parser->oldbufptr - buf;
1279 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1280 linestart_pos = PL_parser->linestart - buf;
1281 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1282 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1283 }
1284 if (flags & LEX_FAKE_EOF) {
1285 goto eof;
60d63348 1286 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
f0e67a1d
Z
1287 got_some = 0;
1288 } else if (filter_gets(linestr, old_bufend_pos)) {
1289 got_some = 1;
17cc9359 1290 got_some_for_debugger = 1;
f0e67a1d 1291 } else {
580561a3
Z
1292 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1293 sv_setpvs(linestr, "");
f0e67a1d
Z
1294 eof:
1295 /* End of real input. Close filehandle (unless it was STDIN),
1296 * then add implicit termination.
1297 */
87606032 1298 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
f0e67a1d
Z
1299 PerlIO_clearerr(PL_parser->rsfp);
1300 else if (PL_parser->rsfp)
1301 (void)PerlIO_close(PL_parser->rsfp);
1302 PL_parser->rsfp = NULL;
60d63348 1303 PL_parser->in_pod = PL_parser->filtered = 0;
f0e67a1d
Z
1304#ifdef PERL_MAD
1305 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1306 PL_faketokens = 1;
1307#endif
1308 if (!PL_in_eval && PL_minus_p) {
1309 sv_catpvs(linestr,
1310 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1311 PL_minus_n = PL_minus_p = 0;
1312 } else if (!PL_in_eval && PL_minus_n) {
1313 sv_catpvs(linestr, /*{*/";}");
1314 PL_minus_n = 0;
1315 } else
1316 sv_catpvs(linestr, ";");
1317 got_some = 1;
1318 }
1319 buf = SvPVX(linestr);
1320 new_bufend_pos = SvCUR(linestr);
1321 PL_parser->bufend = buf + new_bufend_pos;
1322 PL_parser->bufptr = buf + bufptr_pos;
1323 PL_parser->oldbufptr = buf + oldbufptr_pos;
1324 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1325 PL_parser->linestart = buf + linestart_pos;
1326 if (PL_parser->last_uni)
1327 PL_parser->last_uni = buf + last_uni_pos;
1328 if (PL_parser->last_lop)
1329 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1330 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1331 PL_curstash != PL_debstash) {
1332 /* debugger active and we're not compiling the debugger code,
1333 * so store the line into the debugger's array of lines
1334 */
1335 update_debugger_info(NULL, buf+old_bufend_pos,
1336 new_bufend_pos-old_bufend_pos);
1337 }
1338 return got_some;
1339}
1340
1341/*
1342=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1343
1344Looks ahead one (Unicode) character in the text currently being lexed.
1345Returns the codepoint (unsigned integer value) of the next character,
1346or -1 if lexing has reached the end of the input text. To consume the
1347peeked character, use L</lex_read_unichar>.
1348
1349If the next character is in (or extends into) the next chunk of input
1350text, the next chunk will be read in. Normally the current chunk will be
1351discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1352then the current chunk will not be discarded.
1353
1354If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1355is encountered, an exception is generated.
1356
1357=cut
1358*/
1359
1360I32
1361Perl_lex_peek_unichar(pTHX_ U32 flags)
1362{
749123ff 1363 dVAR;
f0e67a1d
Z
1364 char *s, *bufend;
1365 if (flags & ~(LEX_KEEP_PREVIOUS))
1366 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1367 s = PL_parser->bufptr;
1368 bufend = PL_parser->bufend;
1369 if (UTF) {
1370 U8 head;
1371 I32 unichar;
1372 STRLEN len, retlen;
1373 if (s == bufend) {
1374 if (!lex_next_chunk(flags))
1375 return -1;
1376 s = PL_parser->bufptr;
1377 bufend = PL_parser->bufend;
1378 }
1379 head = (U8)*s;
1380 if (!(head & 0x80))
1381 return head;
1382 if (head & 0x40) {
1383 len = PL_utf8skip[head];
1384 while ((STRLEN)(bufend-s) < len) {
1385 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1386 break;
1387 s = PL_parser->bufptr;
1388 bufend = PL_parser->bufend;
1389 }
1390 }
1391 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1392 if (retlen == (STRLEN)-1) {
1393 /* malformed UTF-8 */
1394 ENTER;
1395 SAVESPTR(PL_warnhook);
1396 PL_warnhook = PERL_WARNHOOK_FATAL;
1397 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1398 LEAVE;
1399 }
1400 return unichar;
1401 } else {
1402 if (s == bufend) {
1403 if (!lex_next_chunk(flags))
1404 return -1;
1405 s = PL_parser->bufptr;
1406 }
1407 return (U8)*s;
1408 }
1409}
1410
1411/*
1412=for apidoc Amx|I32|lex_read_unichar|U32 flags
1413
1414Reads the next (Unicode) character in the text currently being lexed.
1415Returns the codepoint (unsigned integer value) of the character read,
1416and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1417if lexing has reached the end of the input text. To non-destructively
1418examine the next character, use L</lex_peek_unichar> instead.
1419
1420If the next character is in (or extends into) the next chunk of input
1421text, the next chunk will be read in. Normally the current chunk will be
1422discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1423then the current chunk will not be discarded.
1424
1425If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1426is encountered, an exception is generated.
1427
1428=cut
1429*/
1430
1431I32
1432Perl_lex_read_unichar(pTHX_ U32 flags)
1433{
1434 I32 c;
1435 if (flags & ~(LEX_KEEP_PREVIOUS))
1436 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1437 c = lex_peek_unichar(flags);
1438 if (c != -1) {
1439 if (c == '\n')
1440 CopLINE_inc(PL_curcop);
d9018cbe
EB
1441 if (UTF)
1442 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1443 else
1444 ++(PL_parser->bufptr);
f0e67a1d
Z
1445 }
1446 return c;
1447}
1448
1449/*
1450=for apidoc Amx|void|lex_read_space|U32 flags
1451
1452Reads optional spaces, in Perl style, in the text currently being
1453lexed. The spaces may include ordinary whitespace characters and
1454Perl-style comments. C<#line> directives are processed if encountered.
1455L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1456at a non-space character (or the end of the input text).
1457
1458If spaces extend into the next chunk of input text, the next chunk will
1459be read in. Normally the current chunk will be discarded at the same
1460time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1461chunk will not be discarded.
1462
1463=cut
1464*/
1465
f0998909
Z
1466#define LEX_NO_NEXT_CHUNK 0x80000000
1467
f0e67a1d
Z
1468void
1469Perl_lex_read_space(pTHX_ U32 flags)
1470{
1471 char *s, *bufend;
1472 bool need_incline = 0;
f0998909 1473 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1474 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1475#ifdef PERL_MAD
1476 if (PL_skipwhite) {
1477 sv_free(PL_skipwhite);
1478 PL_skipwhite = NULL;
1479 }
1480 if (PL_madskills)
1481 PL_skipwhite = newSVpvs("");
1482#endif /* PERL_MAD */
1483 s = PL_parser->bufptr;
1484 bufend = PL_parser->bufend;
1485 while (1) {
1486 char c = *s;
1487 if (c == '#') {
1488 do {
1489 c = *++s;
1490 } while (!(c == '\n' || (c == 0 && s == bufend)));
1491 } else if (c == '\n') {
1492 s++;
1493 PL_parser->linestart = s;
1494 if (s == bufend)
1495 need_incline = 1;
1496 else
1497 incline(s);
1498 } else if (isSPACE(c)) {
1499 s++;
1500 } else if (c == 0 && s == bufend) {
1501 bool got_more;
1502#ifdef PERL_MAD
1503 if (PL_madskills)
1504 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1505#endif /* PERL_MAD */
f0998909
Z
1506 if (flags & LEX_NO_NEXT_CHUNK)
1507 break;
f0e67a1d
Z
1508 PL_parser->bufptr = s;
1509 CopLINE_inc(PL_curcop);
1510 got_more = lex_next_chunk(flags);
1511 CopLINE_dec(PL_curcop);
1512 s = PL_parser->bufptr;
1513 bufend = PL_parser->bufend;
1514 if (!got_more)
1515 break;
1516 if (need_incline && PL_parser->rsfp) {
1517 incline(s);
1518 need_incline = 0;
1519 }
1520 } else {
1521 break;
1522 }
1523 }
1524#ifdef PERL_MAD
1525 if (PL_madskills)
1526 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1527#endif /* PERL_MAD */
1528 PL_parser->bufptr = s;
1529}
1530
1531/*
ffb4593c
NT
1532 * S_incline
1533 * This subroutine has nothing to do with tilting, whether at windmills
1534 * or pinball tables. Its name is short for "increment line". It
57843af0 1535 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1536 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1537 * # line 500 "foo.pm"
1538 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1539 */
1540
76e3520e 1541STATIC void
d9095cec 1542S_incline(pTHX_ const char *s)
463ee0b2 1543{
97aff369 1544 dVAR;
d9095cec
NC
1545 const char *t;
1546 const char *n;
1547 const char *e;
8818d409 1548 line_t line_num;
463ee0b2 1549
7918f24d
NC
1550 PERL_ARGS_ASSERT_INCLINE;
1551
57843af0 1552 CopLINE_inc(PL_curcop);
463ee0b2
LW
1553 if (*s++ != '#')
1554 return;
d4c19fe8
AL
1555 while (SPACE_OR_TAB(*s))
1556 s++;
73659bf1
GS
1557 if (strnEQ(s, "line", 4))
1558 s += 4;
1559 else
1560 return;
084592ab 1561 if (SPACE_OR_TAB(*s))
73659bf1 1562 s++;
4e553d73 1563 else
73659bf1 1564 return;
d4c19fe8
AL
1565 while (SPACE_OR_TAB(*s))
1566 s++;
463ee0b2
LW
1567 if (!isDIGIT(*s))
1568 return;
d4c19fe8 1569
463ee0b2
LW
1570 n = s;
1571 while (isDIGIT(*s))
1572 s++;
07714eb4 1573 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1574 return;
bf4acbe4 1575 while (SPACE_OR_TAB(*s))
463ee0b2 1576 s++;
73659bf1 1577 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1578 s++;
73659bf1
GS
1579 e = t + 1;
1580 }
463ee0b2 1581 else {
c35e046a
AL
1582 t = s;
1583 while (!isSPACE(*t))
1584 t++;
73659bf1 1585 e = t;
463ee0b2 1586 }
bf4acbe4 1587 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1588 e++;
1589 if (*e != '\n' && *e != '\0')
1590 return; /* false alarm */
1591
8818d409
FC
1592 line_num = atoi(n)-1;
1593
f4dd75d9 1594 if (t - s > 0) {
d9095cec 1595 const STRLEN len = t - s;
19bad673
NC
1596 SV *const temp_sv = CopFILESV(PL_curcop);
1597 const char *cf;
1598 STRLEN tmplen;
1599
1600 if (temp_sv) {
1601 cf = SvPVX(temp_sv);
1602 tmplen = SvCUR(temp_sv);
1603 } else {
1604 cf = NULL;
1605 tmplen = 0;
1606 }
1607
d1299d44 1608 if (!PL_rsfp && !PL_parser->filtered) {
e66cf94c
RGS
1609 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1610 * to *{"::_<newfilename"} */
44867030
NC
1611 /* However, the long form of evals is only turned on by the
1612 debugger - usually they're "(eval %lu)" */
1613 char smallbuf[128];
1614 char *tmpbuf;
1615 GV **gvp;
d9095cec 1616 STRLEN tmplen2 = len;
798b63bc 1617 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1618 tmpbuf = smallbuf;
1619 else
2ae0db35 1620 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1621 tmpbuf[0] = '_';
1622 tmpbuf[1] = '<';
2ae0db35 1623 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1624 tmplen += 2;
8a5ee598
RGS
1625 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1626 if (gvp) {
44867030
NC
1627 char *tmpbuf2;
1628 GV *gv2;
1629
1630 if (tmplen2 + 2 <= sizeof smallbuf)
1631 tmpbuf2 = smallbuf;
1632 else
1633 Newx(tmpbuf2, tmplen2 + 2, char);
1634
1635 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1636 /* Either they malloc'd it, or we malloc'd it,
1637 so no prefix is present in ours. */
1638 tmpbuf2[0] = '_';
1639 tmpbuf2[1] = '<';
1640 }
1641
1642 memcpy(tmpbuf2 + 2, s, tmplen2);
1643 tmplen2 += 2;
1644
8a5ee598 1645 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1646 if (!isGV(gv2)) {
8a5ee598 1647 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1648 /* adjust ${"::_<newfilename"} to store the new file name */
1649 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1650 /* The line number may differ. If that is the case,
1651 alias the saved lines that are in the array.
1652 Otherwise alias the whole array. */
1653 if (CopLINE(PL_curcop) == line_num) {
1654 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1655 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1656 }
1657 else if (GvAV(*gvp)) {
1658 AV * const av = GvAV(*gvp);
1659 const I32 start = CopLINE(PL_curcop)+1;
1660 I32 items = AvFILLp(av) - start;
1661 if (items > 0) {
1662 AV * const av2 = GvAVn(gv2);
1663 SV **svp = AvARRAY(av) + start;
1664 I32 l = (I32)line_num+1;
1665 while (items--)
1666 av_store(av2, l++, SvREFCNT_inc(*svp++));
1667 }
1668 }
e5527e4b 1669 }
44867030
NC
1670
1671 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1672 }
e66cf94c 1673 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1674 }
05ec9bb3 1675 CopFILE_free(PL_curcop);
d9095cec 1676 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1677 }
8818d409 1678 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1679}
1680
29595ff2 1681#ifdef PERL_MAD
cd81e915 1682/* skip space before PL_thistoken */
29595ff2
NC
1683
1684STATIC char *
1685S_skipspace0(pTHX_ register char *s)
1686{
7918f24d
NC
1687 PERL_ARGS_ASSERT_SKIPSPACE0;
1688
29595ff2
NC
1689 s = skipspace(s);
1690 if (!PL_madskills)
1691 return s;
cd81e915
NC
1692 if (PL_skipwhite) {
1693 if (!PL_thiswhite)
6b29d1f5 1694 PL_thiswhite = newSVpvs("");
cd81e915
NC
1695 sv_catsv(PL_thiswhite, PL_skipwhite);
1696 sv_free(PL_skipwhite);
1697 PL_skipwhite = 0;
1698 }
1699 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1700 return s;
1701}
1702
cd81e915 1703/* skip space after PL_thistoken */
29595ff2
NC
1704
1705STATIC char *
1706S_skipspace1(pTHX_ register char *s)
1707{
d4c19fe8 1708 const char *start = s;
29595ff2
NC
1709 I32 startoff = start - SvPVX(PL_linestr);
1710
7918f24d
NC
1711 PERL_ARGS_ASSERT_SKIPSPACE1;
1712
29595ff2
NC
1713 s = skipspace(s);
1714 if (!PL_madskills)
1715 return s;
1716 start = SvPVX(PL_linestr) + startoff;
cd81e915 1717 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1718 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1719 PL_thistoken = newSVpvn(tstart, start - tstart);
1720 }
1721 PL_realtokenstart = -1;
1722 if (PL_skipwhite) {
1723 if (!PL_nextwhite)
6b29d1f5 1724 PL_nextwhite = newSVpvs("");
cd81e915
NC
1725 sv_catsv(PL_nextwhite, PL_skipwhite);
1726 sv_free(PL_skipwhite);
1727 PL_skipwhite = 0;
29595ff2
NC
1728 }
1729 return s;
1730}
1731
1732STATIC char *
1733S_skipspace2(pTHX_ register char *s, SV **svp)
1734{
c35e046a
AL
1735 char *start;
1736 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1737 const I32 startoff = s - SvPVX(PL_linestr);
1738
7918f24d
NC
1739 PERL_ARGS_ASSERT_SKIPSPACE2;
1740
29595ff2
NC
1741 s = skipspace(s);
1742 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1743 if (!PL_madskills || !svp)
1744 return s;
1745 start = SvPVX(PL_linestr) + startoff;
cd81e915 1746 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1747 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1748 PL_thistoken = newSVpvn(tstart, start - tstart);
1749 PL_realtokenstart = -1;
29595ff2 1750 }
cd81e915 1751 if (PL_skipwhite) {
29595ff2 1752 if (!*svp)
6b29d1f5 1753 *svp = newSVpvs("");
cd81e915
NC
1754 sv_setsv(*svp, PL_skipwhite);
1755 sv_free(PL_skipwhite);
1756 PL_skipwhite = 0;
29595ff2
NC
1757 }
1758
1759 return s;
1760}
1761#endif
1762
80a702cd 1763STATIC void
15f169a1 1764S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1765{
1766 AV *av = CopFILEAVx(PL_curcop);
1767 if (av) {
b9f83d2f 1768 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1769 if (orig_sv)
1770 sv_setsv(sv, orig_sv);
1771 else
1772 sv_setpvn(sv, buf, len);
80a702cd
RGS
1773 (void)SvIOK_on(sv);
1774 SvIV_set(sv, 0);
1775 av_store(av, (I32)CopLINE(PL_curcop), sv);
1776 }
1777}
1778
ffb4593c
NT
1779/*
1780 * S_skipspace
1781 * Called to gobble the appropriate amount and type of whitespace.
1782 * Skips comments as well.
1783 */
1784
76e3520e 1785STATIC char *
cea2e8a9 1786S_skipspace(pTHX_ register char *s)
a687059c 1787{
5db06880 1788#ifdef PERL_MAD
f0e67a1d
Z
1789 char *start = s;
1790#endif /* PERL_MAD */
7918f24d 1791 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1792#ifdef PERL_MAD
cd81e915
NC
1793 if (PL_skipwhite) {
1794 sv_free(PL_skipwhite);
f0e67a1d 1795 PL_skipwhite = NULL;
5db06880 1796 }
f0e67a1d 1797#endif /* PERL_MAD */
3280af22 1798 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1799 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1800 s++;
f0e67a1d
Z
1801 } else {
1802 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1803 PL_bufptr = s;
f0998909
Z
1804 lex_read_space(LEX_KEEP_PREVIOUS |
1805 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1806 LEX_NO_NEXT_CHUNK : 0));
3280af22 1807 s = PL_bufptr;
f0e67a1d
Z
1808 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1809 if (PL_linestart > PL_bufptr)
1810 PL_bufptr = PL_linestart;
1811 return s;
463ee0b2 1812 }
5db06880 1813#ifdef PERL_MAD
f0e67a1d
Z
1814 if (PL_madskills)
1815 PL_skipwhite = newSVpvn(start, s-start);
1816#endif /* PERL_MAD */
5db06880 1817 return s;
a687059c 1818}
378cc40b 1819
ffb4593c
NT
1820/*
1821 * S_check_uni
1822 * Check the unary operators to ensure there's no ambiguity in how they're
1823 * used. An ambiguous piece of code would be:
1824 * rand + 5
1825 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1826 * the +5 is its argument.
1827 */
1828
76e3520e 1829STATIC void
cea2e8a9 1830S_check_uni(pTHX)
ba106d47 1831{
97aff369 1832 dVAR;
d4c19fe8
AL
1833 const char *s;
1834 const char *t;
2f3197b3 1835
3280af22 1836 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1837 return;
3280af22
NIS
1838 while (isSPACE(*PL_last_uni))
1839 PL_last_uni++;
c35e046a
AL
1840 s = PL_last_uni;
1841 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1842 s++;
3280af22 1843 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1844 return;
6136c704 1845
9b387841
NC
1846 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1847 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1848 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1849}
1850
ffb4593c
NT
1851/*
1852 * LOP : macro to build a list operator. Its behaviour has been replaced
1853 * with a subroutine, S_lop() for which LOP is just another name.
1854 */
1855
a0d0e21e
LW
1856#define LOP(f,x) return lop(f,x,s)
1857
ffb4593c
NT
1858/*
1859 * S_lop
1860 * Build a list operator (or something that might be one). The rules:
1861 * - if we have a next token, then it's a list operator [why?]
1862 * - if the next thing is an opening paren, then it's a function
1863 * - else it's a list operator
1864 */
1865
76e3520e 1866STATIC I32
a0be28da 1867S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1868{
97aff369 1869 dVAR;
7918f24d
NC
1870
1871 PERL_ARGS_ASSERT_LOP;
1872
6154021b 1873 pl_yylval.ival = f;
35c8bce7 1874 CLINE;
3280af22
NIS
1875 PL_expect = x;
1876 PL_bufptr = s;
1877 PL_last_lop = PL_oldbufptr;
eb160463 1878 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1879#ifdef PERL_MAD
1880 if (PL_lasttoke)
78cdf107 1881 goto lstop;
5db06880 1882#else
3280af22 1883 if (PL_nexttoke)
78cdf107 1884 goto lstop;
5db06880 1885#endif
79072805 1886 if (*s == '(')
bbf60fe6 1887 return REPORT(FUNC);
29595ff2 1888 s = PEEKSPACE(s);
79072805 1889 if (*s == '(')
bbf60fe6 1890 return REPORT(FUNC);
78cdf107
Z
1891 else {
1892 lstop:
1893 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1894 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1895 return REPORT(LSTOP);
78cdf107 1896 }
79072805
LW
1897}
1898
5db06880
NC
1899#ifdef PERL_MAD
1900 /*
1901 * S_start_force
1902 * Sets up for an eventual force_next(). start_force(0) basically does
1903 * an unshift, while start_force(-1) does a push. yylex removes items
1904 * on the "pop" end.
1905 */
1906
1907STATIC void
1908S_start_force(pTHX_ int where)
1909{
1910 int i;
1911
cd81e915 1912 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1913 where = PL_lasttoke;
cd81e915
NC
1914 assert(PL_curforce < 0 || PL_curforce == where);
1915 if (PL_curforce != where) {
5db06880
NC
1916 for (i = PL_lasttoke; i > where; --i) {
1917 PL_nexttoke[i] = PL_nexttoke[i-1];
1918 }
1919 PL_lasttoke++;
1920 }
cd81e915 1921 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1922 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1923 PL_curforce = where;
1924 if (PL_nextwhite) {
5db06880 1925 if (PL_madskills)
6b29d1f5 1926 curmad('^', newSVpvs(""));
cd81e915 1927 CURMAD('_', PL_nextwhite);
5db06880
NC
1928 }
1929}
1930
1931STATIC void
1932S_curmad(pTHX_ char slot, SV *sv)
1933{
1934 MADPROP **where;
1935
1936 if (!sv)
1937 return;
cd81e915
NC
1938 if (PL_curforce < 0)
1939 where = &PL_thismad;
5db06880 1940 else
cd81e915 1941 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1942
cd81e915 1943 if (PL_faketokens)
76f68e9b 1944 sv_setpvs(sv, "");
5db06880
NC
1945 else {
1946 if (!IN_BYTES) {
1947 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1948 SvUTF8_on(sv);
1949 else if (PL_encoding) {
1950 sv_recode_to_utf8(sv, PL_encoding);
1951 }
1952 }
1953 }
1954
1955 /* keep a slot open for the head of the list? */
1956 if (slot != '_' && *where && (*where)->mad_key == '^') {
1957 (*where)->mad_key = slot;
daba3364 1958 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1959 (*where)->mad_val = (void*)sv;
1960 }
1961 else
1962 addmad(newMADsv(slot, sv), where, 0);
1963}
1964#else
b3f24c00
MHM
1965# define start_force(where) NOOP
1966# define curmad(slot, sv) NOOP
5db06880
NC
1967#endif
1968
ffb4593c
NT
1969/*
1970 * S_force_next
9cbb5ea2 1971 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1972 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1973 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1974 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1975 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1976 */
1977
4e553d73 1978STATIC void
cea2e8a9 1979S_force_next(pTHX_ I32 type)
79072805 1980{
97aff369 1981 dVAR;
704d4215
GG
1982#ifdef DEBUGGING
1983 if (DEBUG_T_TEST) {
1984 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1985 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1986 }
1987#endif
5db06880 1988#ifdef PERL_MAD
cd81e915 1989 if (PL_curforce < 0)
5db06880 1990 start_force(PL_lasttoke);
cd81e915 1991 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1992 if (PL_lex_state != LEX_KNOWNEXT)
1993 PL_lex_defer = PL_lex_state;
1994 PL_lex_state = LEX_KNOWNEXT;
1995 PL_lex_expect = PL_expect;
cd81e915 1996 PL_curforce = -1;
5db06880 1997#else
3280af22
NIS
1998 PL_nexttype[PL_nexttoke] = type;
1999 PL_nexttoke++;
2000 if (PL_lex_state != LEX_KNOWNEXT) {
2001 PL_lex_defer = PL_lex_state;
2002 PL_lex_expect = PL_expect;
2003 PL_lex_state = LEX_KNOWNEXT;
79072805 2004 }
5db06880 2005#endif
79072805
LW
2006}
2007
28ac2b49
Z
2008void
2009Perl_yyunlex(pTHX)
2010{
a7aaec61
Z
2011 int yyc = PL_parser->yychar;
2012 if (yyc != YYEMPTY) {
2013 if (yyc) {
2014 start_force(-1);
2015 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2016 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 2017 PL_lex_allbrackets--;
a7aaec61 2018 PL_lex_brackets--;
78cdf107
Z
2019 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2020 } else if (yyc == '('/*)*/) {
2021 PL_lex_allbrackets--;
2022 yyc |= (2<<24);
a7aaec61
Z
2023 }
2024 force_next(yyc);
2025 }
28ac2b49
Z
2026 PL_parser->yychar = YYEMPTY;
2027 }
2028}
2029
d0a148a6 2030STATIC SV *
15f169a1 2031S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2032{
97aff369 2033 dVAR;
740cce10 2034 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2035 !IN_BYTES
2036 && UTF
2037 && !is_ascii_string((const U8*)start, len)
740cce10 2038 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2039 return sv;
2040}
2041
ffb4593c
NT
2042/*
2043 * S_force_word
2044 * When the lexer knows the next thing is a word (for instance, it has
2045 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2046 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2047 * lookahead.
ffb4593c
NT
2048 *
2049 * Arguments:
b1b65b59 2050 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2051 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2052 * int check_keyword : if true, Perl checks to make sure the word isn't
2053 * a keyword (do this if the word is a label, e.g. goto FOO)
2054 * int allow_pack : if true, : characters will also be allowed (require,
2055 * use, etc. do this)
9cbb5ea2 2056 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2057 */
2058
76e3520e 2059STATIC char *
cea2e8a9 2060S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2061{
97aff369 2062 dVAR;
463ee0b2
LW
2063 register char *s;
2064 STRLEN len;
4e553d73 2065
7918f24d
NC
2066 PERL_ARGS_ASSERT_FORCE_WORD;
2067
29595ff2 2068 start = SKIPSPACE1(start);
463ee0b2 2069 s = start;
7e2040f0 2070 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2071 (allow_pack && *s == ':') ||
15f0808c 2072 (allow_initial_tick && *s == '\'') )
a0d0e21e 2073 {
3280af22 2074 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2075 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2076 return start;
cd81e915 2077 start_force(PL_curforce);
5db06880
NC
2078 if (PL_madskills)
2079 curmad('X', newSVpvn(start,s-start));
463ee0b2 2080 if (token == METHOD) {
29595ff2 2081 s = SKIPSPACE1(s);
463ee0b2 2082 if (*s == '(')
3280af22 2083 PL_expect = XTERM;
463ee0b2 2084 else {
3280af22 2085 PL_expect = XOPERATOR;
463ee0b2 2086 }
79072805 2087 }
e74e6b3d 2088 if (PL_madskills)
63575281 2089 curmad('g', newSVpvs( "forced" ));
9ded7720 2090 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2091 = (OP*)newSVOP(OP_CONST,0,
2092 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2093 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2094 force_next(token);
2095 }
2096 return s;
2097}
2098
ffb4593c
NT
2099/*
2100 * S_force_ident
9cbb5ea2 2101 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2102 * text only contains the "foo" portion. The first argument is a pointer
2103 * to the "foo", and the second argument is the type symbol to prefix.
2104 * Forces the next token to be a "WORD".
9cbb5ea2 2105 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2106 */
2107
76e3520e 2108STATIC void
bfed75c6 2109S_force_ident(pTHX_ register const char *s, int kind)
79072805 2110{
97aff369 2111 dVAR;
7918f24d
NC
2112
2113 PERL_ARGS_ASSERT_FORCE_IDENT;
2114
c35e046a 2115 if (*s) {
90e5519e 2116 const STRLEN len = strlen(s);
728847b1
BF
2117 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2118 UTF ? SVf_UTF8 : 0));
cd81e915 2119 start_force(PL_curforce);
9ded7720 2120 NEXTVAL_NEXTTOKE.opval = o;
79072805 2121 force_next(WORD);
748a9306 2122 if (kind) {
11343788 2123 o->op_private = OPpCONST_ENTERED;
55497cff 2124 /* XXX see note in pp_entereval() for why we forgo typo
2125 warnings if the symbol must be introduced in an eval.
2126 GSAR 96-10-12 */
90e5519e 2127 gv_fetchpvn_flags(s, len,
728847b1
BF
2128 (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2129 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
90e5519e
NC
2130 kind == '$' ? SVt_PV :
2131 kind == '@' ? SVt_PVAV :
2132 kind == '%' ? SVt_PVHV :
a0d0e21e 2133 SVt_PVGV
90e5519e 2134 );
748a9306 2135 }
79072805
LW
2136 }
2137}
2138
1571675a
GS
2139NV
2140Perl_str_to_version(pTHX_ SV *sv)
2141{
2142 NV retval = 0.0;
2143 NV nshift = 1.0;
2144 STRLEN len;
cfd0369c 2145 const char *start = SvPV_const(sv,len);
9d4ba2ae 2146 const char * const end = start + len;
504618e9 2147 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2148
2149 PERL_ARGS_ASSERT_STR_TO_VERSION;
2150
1571675a 2151 while (start < end) {
ba210ebe 2152 STRLEN skip;
1571675a
GS
2153 UV n;
2154 if (utf)
9041c2e3 2155 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2156 else {
2157 n = *(U8*)start;
2158 skip = 1;
2159 }
2160 retval += ((NV)n)/nshift;
2161 start += skip;
2162 nshift *= 1000;
2163 }
2164 return retval;
2165}
2166
4e553d73 2167/*
ffb4593c
NT
2168 * S_force_version
2169 * Forces the next token to be a version number.
e759cc13
RGS
2170 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2171 * and if "guessing" is TRUE, then no new token is created (and the caller
2172 * must use an alternative parsing method).
ffb4593c
NT
2173 */
2174
76e3520e 2175STATIC char *
e759cc13 2176S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2177{
97aff369 2178 dVAR;
5f66b61c 2179 OP *version = NULL;
44dcb63b 2180 char *d;
5db06880
NC
2181#ifdef PERL_MAD
2182 I32 startoff = s - SvPVX(PL_linestr);
2183#endif
89bfa8cd 2184
7918f24d
NC
2185 PERL_ARGS_ASSERT_FORCE_VERSION;
2186
29595ff2 2187 s = SKIPSPACE1(s);
89bfa8cd 2188
44dcb63b 2189 d = s;
dd629d5b 2190 if (*d == 'v')
44dcb63b 2191 d++;
44dcb63b 2192 if (isDIGIT(*d)) {
e759cc13
RGS
2193 while (isDIGIT(*d) || *d == '_' || *d == '.')
2194 d++;
5db06880
NC
2195#ifdef PERL_MAD
2196 if (PL_madskills) {
cd81e915 2197 start_force(PL_curforce);
5db06880
NC
2198 curmad('X', newSVpvn(s,d-s));
2199 }
2200#endif
4e4da3ac 2201 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2202 SV *ver;
8d08d9ba 2203#ifdef USE_LOCALE_NUMERIC
909d3787
KW
2204 char *loc = savepv(setlocale(LC_NUMERIC, NULL));
2205 setlocale(LC_NUMERIC, "C");
8d08d9ba 2206#endif
6154021b 2207 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2208#ifdef USE_LOCALE_NUMERIC
2209 setlocale(LC_NUMERIC, loc);
909d3787 2210 Safefree(loc);
8d08d9ba 2211#endif
6154021b 2212 version = pl_yylval.opval;
dd629d5b
GS
2213 ver = cSVOPx(version)->op_sv;
2214 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2215 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2216 SvNV_set(ver, str_to_version(ver));
1571675a 2217 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2218 }
89bfa8cd 2219 }
5db06880
NC
2220 else if (guessing) {
2221#ifdef PERL_MAD
2222 if (PL_madskills) {
cd81e915
NC
2223 sv_free(PL_nextwhite); /* let next token collect whitespace */
2224 PL_nextwhite = 0;
5db06880
NC
2225 s = SvPVX(PL_linestr) + startoff;
2226 }
2227#endif
e759cc13 2228 return s;
5db06880 2229 }
89bfa8cd 2230 }
2231
5db06880
NC
2232#ifdef PERL_MAD
2233 if (PL_madskills && !version) {
cd81e915
NC
2234 sv_free(PL_nextwhite); /* let next token collect whitespace */
2235 PL_nextwhite = 0;
5db06880
NC
2236 s = SvPVX(PL_linestr) + startoff;
2237 }
2238#endif
89bfa8cd 2239 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2240 start_force(PL_curforce);
9ded7720 2241 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2242 force_next(WORD);
89bfa8cd 2243
e759cc13 2244 return s;
89bfa8cd 2245}
2246
ffb4593c 2247/*
91152fc1
DG
2248 * S_force_strict_version
2249 * Forces the next token to be a version number using strict syntax rules.
2250 */
2251
2252STATIC char *
2253S_force_strict_version(pTHX_ char *s)
2254{
2255 dVAR;
2256 OP *version = NULL;
2257#ifdef PERL_MAD
2258 I32 startoff = s - SvPVX(PL_linestr);
2259#endif
2260 const char *errstr = NULL;
2261
2262 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2263
2264 while (isSPACE(*s)) /* leading whitespace */
2265 s++;
2266
2267 if (is_STRICT_VERSION(s,&errstr)) {
2268 SV *ver = newSV(0);
2269 s = (char *)scan_version(s, ver, 0);
2270 version = newSVOP(OP_CONST, 0, ver);
2271 }
4e4da3ac
Z
2272 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2273 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2274 {
91152fc1
DG
2275 PL_bufptr = s;
2276 if (errstr)
2277 yyerror(errstr); /* version required */
2278 return s;
2279 }
2280
2281#ifdef PERL_MAD
2282 if (PL_madskills && !version) {
2283 sv_free(PL_nextwhite); /* let next token collect whitespace */
2284 PL_nextwhite = 0;
2285 s = SvPVX(PL_linestr) + startoff;
2286 }
2287#endif
2288 /* NOTE: The parser sees the package name and the VERSION swapped */
2289 start_force(PL_curforce);
2290 NEXTVAL_NEXTTOKE.opval = version;
2291 force_next(WORD);
2292
2293 return s;
2294}
2295
2296/*
ffb4593c
NT
2297 * S_tokeq
2298 * Tokenize a quoted string passed in as an SV. It finds the next
2299 * chunk, up to end of string or a backslash. It may make a new
2300 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2301 * turns \\ into \.
2302 */
2303
76e3520e 2304STATIC SV *
cea2e8a9 2305S_tokeq(pTHX_ SV *sv)
79072805 2306{
97aff369 2307 dVAR;
79072805
LW
2308 register char *s;
2309 register char *send;
2310 register char *d;
b3ac6de7
IZ
2311 STRLEN len = 0;
2312 SV *pv = sv;
79072805 2313
7918f24d
NC
2314 PERL_ARGS_ASSERT_TOKEQ;
2315
79072805 2316 if (!SvLEN(sv))
b3ac6de7 2317 goto finish;
79072805 2318
a0d0e21e 2319 s = SvPV_force(sv, len);
21a311ee 2320 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2321 goto finish;
463ee0b2 2322 send = s + len;
dcb21ed6
NC
2323 /* This is relying on the SV being "well formed" with a trailing '\0' */
2324 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2325 s++;
2326 if (s == send)
b3ac6de7 2327 goto finish;
79072805 2328 d = s;
be4731d2 2329 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2330 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2331 }
79072805
LW
2332 while (s < send) {
2333 if (*s == '\\') {
a0d0e21e 2334 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2335 s++; /* all that, just for this */
2336 }
2337 *d++ = *s++;
2338 }
2339 *d = '\0';
95a20fc0 2340 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2341 finish:
3280af22 2342 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2343 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2344 return sv;
2345}
2346
ffb4593c
NT
2347/*
2348 * Now come three functions related to double-quote context,
2349 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2350 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2351 * interact with PL_lex_state, and create fake ( ... ) argument lists
2352 * to handle functions and concatenation.
ecd24171
DM
2353 * For example,
2354 * "foo\lbar"
2355 * is tokenised as
2356 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
ffb4593c
NT
2357 */
2358
2359/*
2360 * S_sublex_start
6154021b 2361 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2362 *
2363 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2364 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2365 *
2366 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2367 *
2368 * Everything else becomes a FUNC.
2369 *
2370 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2371 * had an OP_CONST or OP_READLINE). This just sets us up for a
2372 * call to S_sublex_push().
2373 */
2374
76e3520e 2375STATIC I32
cea2e8a9 2376S_sublex_start(pTHX)
79072805 2377{
97aff369 2378 dVAR;
6154021b 2379 register const I32 op_type = pl_yylval.ival;
79072805
LW
2380
2381 if (op_type == OP_NULL) {
6154021b 2382 pl_yylval.opval = PL_lex_op;
5f66b61c 2383 PL_lex_op = NULL;
79072805
LW
2384 return THING;
2385 }
2386 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2387 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2388
2389 if (SvTYPE(sv) == SVt_PVIV) {
2390 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2391 STRLEN len;
96a5add6 2392 const char * const p = SvPV_const(sv, len);
740cce10 2393 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2394 SvREFCNT_dec(sv);
2395 sv = nsv;
4e553d73 2396 }
6154021b 2397 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2398 PL_lex_stuff = NULL;
6f33ba73
RGS
2399 /* Allow <FH> // "foo" */
2400 if (op_type == OP_READLINE)
2401 PL_expect = XTERMORDORDOR;
79072805
LW
2402 return THING;
2403 }
e3f73d4e
RGS
2404 else if (op_type == OP_BACKTICK && PL_lex_op) {
2405 /* readpipe() vas overriden */
2406 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2407 pl_yylval.opval = PL_lex_op;
9b201d7d 2408 PL_lex_op = NULL;
e3f73d4e
RGS
2409 PL_lex_stuff = NULL;
2410 return THING;
2411 }
79072805 2412
3280af22 2413 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2414 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2415 PL_sublex_info.sub_op = PL_lex_op;
2416 PL_lex_state = LEX_INTERPPUSH;
55497cff 2417
3280af22
NIS
2418 PL_expect = XTERM;
2419 if (PL_lex_op) {
6154021b 2420 pl_yylval.opval = PL_lex_op;
5f66b61c 2421 PL_lex_op = NULL;
55497cff 2422 return PMFUNC;
2423 }
2424 else
2425 return FUNC;
2426}
2427
ffb4593c
NT
2428/*
2429 * S_sublex_push
2430 * Create a new scope to save the lexing state. The scope will be
2431 * ended in S_sublex_done. Returns a '(', starting the function arguments
2432 * to the uc, lc, etc. found before.
2433 * Sets PL_lex_state to LEX_INTERPCONCAT.
2434 */
2435
76e3520e 2436STATIC I32
cea2e8a9 2437S_sublex_push(pTHX)
55497cff 2438{
27da23d5 2439 dVAR;
f46d017c 2440 ENTER;
55497cff 2441
3280af22 2442 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2443 SAVEBOOL(PL_lex_dojoin);
3280af22 2444 SAVEI32(PL_lex_brackets);
78cdf107
Z
2445 SAVEI32(PL_lex_allbrackets);
2446 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2447 SAVEI32(PL_lex_casemods);
2448 SAVEI32(PL_lex_starts);
651b5b28 2449 SAVEI8(PL_lex_state);
9da1dd8f 2450 SAVEPPTR(PL_sublex_info.re_eval_start);
7766f137 2451 SAVEVPTR(PL_lex_inpat);
98246f1e 2452 SAVEI16(PL_lex_inwhat);
57843af0 2453 SAVECOPLINE(PL_curcop);
3280af22 2454 SAVEPPTR(PL_bufptr);
8452ff4b 2455 SAVEPPTR(PL_bufend);
3280af22
NIS
2456 SAVEPPTR(PL_oldbufptr);
2457 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2458 SAVEPPTR(PL_last_lop);
2459 SAVEPPTR(PL_last_uni);
3280af22
NIS
2460 SAVEPPTR(PL_linestart);
2461 SAVESPTR(PL_linestr);
8edd5f42
RGS
2462 SAVEGENERICPV(PL_lex_brackstack);
2463 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2464
2465 PL_linestr = PL_lex_stuff;
a0714e2c 2466 PL_lex_stuff = NULL;
9da1dd8f 2467 PL_sublex_info.re_eval_start = NULL;
3280af22 2468
9cbb5ea2
GS
2469 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2470 = SvPVX(PL_linestr);
3280af22 2471 PL_bufend += SvCUR(PL_linestr);
bd61b366 2472 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2473 SAVEFREESV(PL_linestr);
2474
2475 PL_lex_dojoin = FALSE;
2476 PL_lex_brackets = 0;
78cdf107
Z
2477 PL_lex_allbrackets = 0;
2478 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2479 Newx(PL_lex_brackstack, 120, char);
2480 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2481 PL_lex_casemods = 0;
2482 *PL_lex_casestack = '\0';
2483 PL_lex_starts = 0;
2484 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2485 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2486
2487 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2488 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2489 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2490 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2491 else
5f66b61c 2492 PL_lex_inpat = NULL;
79072805 2493
55497cff 2494 return '(';
79072805
LW
2495}
2496
ffb4593c
NT
2497/*
2498 * S_sublex_done
2499 * Restores lexer state after a S_sublex_push.
2500 */
2501
76e3520e 2502STATIC I32
cea2e8a9 2503S_sublex_done(pTHX)
79072805 2504{
27da23d5 2505 dVAR;
3280af22 2506 if (!PL_lex_starts++) {
396482e1 2507 SV * const sv = newSVpvs("");
9aa983d2
JH
2508 if (SvUTF8(PL_linestr))
2509 SvUTF8_on(sv);
3280af22 2510 PL_expect = XOPERATOR;
6154021b 2511 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2512 return THING;
2513 }
2514
3280af22
NIS
2515 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2516 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2517 return yylex();
79072805
LW
2518 }
2519
ffb4593c 2520 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2521 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2522 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2523 PL_linestr = PL_lex_repl;
2524 PL_lex_inpat = 0;
2525 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2526 PL_bufend += SvCUR(PL_linestr);
bd61b366 2527 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2528 SAVEFREESV(PL_linestr);
2529 PL_lex_dojoin = FALSE;
2530 PL_lex_brackets = 0;
78cdf107
Z
2531 PL_lex_allbrackets = 0;
2532 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2533 PL_lex_casemods = 0;
2534 *PL_lex_casestack = '\0';
2535 PL_lex_starts = 0;
25da4f38 2536 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2537 PL_lex_state = LEX_INTERPNORMAL;
2538 PL_lex_starts++;
e9fa98b2
HS
2539 /* we don't clear PL_lex_repl here, so that we can check later
2540 whether this is an evalled subst; that means we rely on the
2541 logic to ensure sublex_done() is called again only via the
2542 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2543 }
e9fa98b2 2544 else {
3280af22 2545 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2546 PL_lex_repl = NULL;
e9fa98b2 2547 }
79072805 2548 return ',';
ffed7fef
LW
2549 }
2550 else {
5db06880
NC
2551#ifdef PERL_MAD
2552 if (PL_madskills) {
cd81e915
NC
2553 if (PL_thiswhite) {
2554 if (!PL_endwhite)
6b29d1f5 2555 PL_endwhite = newSVpvs("");
cd81e915
NC
2556 sv_catsv(PL_endwhite, PL_thiswhite);
2557 PL_thiswhite = 0;
2558 }
2559 if (PL_thistoken)
76f68e9b 2560 sv_setpvs(PL_thistoken,"");
5db06880 2561 else
cd81e915 2562 PL_realtokenstart = -1;
5db06880
NC
2563 }
2564#endif
f46d017c 2565 LEAVE;
3280af22
NIS
2566 PL_bufend = SvPVX(PL_linestr);
2567 PL_bufend += SvCUR(PL_linestr);
2568 PL_expect = XOPERATOR;
09bef843 2569 PL_sublex_info.sub_inwhat = 0;
79072805 2570 return ')';
ffed7fef
LW
2571 }
2572}
2573
02aa26ce
NT
2574/*
2575 scan_const
2576
9da1dd8f
DM
2577 Extracts the next constant part of a pattern, double-quoted string,
2578 or transliteration. This is terrifying code.
2579
2580 For example, in parsing the double-quoted string "ab\x63$d", it would
2581 stop at the '$' and return an OP_CONST containing 'abc'.
02aa26ce 2582
94def140 2583 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2584 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2585 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2586
94def140
TS
2587 Returns a pointer to the character scanned up to. If this is
2588 advanced from the start pointer supplied (i.e. if anything was
9da1dd8f 2589 successfully parsed), will leave an OP_CONST for the substring scanned
6154021b 2590 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2591 by looking at the next characters herself.
2592
02aa26ce 2593 In patterns:
9da1dd8f
DM
2594 expand:
2595 \N{ABC} => \N{U+41.42.43}
2596
2597 pass through:
2598 all other \-char, including \N and \N{ apart from \N{ABC}
2599
2600 stops on:
2601 @ and $ where it appears to be a var, but not for $ as tail anchor
2602 \l \L \u \U \Q \E
2603 (?{ or (??{
2604
02aa26ce
NT
2605
2606 In transliterations:
2607 characters are VERY literal, except for - not at the start or end
94def140
TS
2608 of the string, which indicates a range. If the range is in bytes,
2609 scan_const expands the range to the full set of intermediate
2610 characters. If the range is in utf8, the hyphen is replaced with
2611 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2612
2613 In double-quoted strings:
2614 backslashes:
2615 double-quoted style: \r and \n
ff3f963a 2616 constants: \x31, etc.
94def140 2617 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2618 case and quoting: \U \Q \E
2619 stops on @ and $
2620
2621 scan_const does *not* construct ops to handle interpolated strings.
2622 It stops processing as soon as it finds an embedded $ or @ variable
2623 and leaves it to the caller to work out what's going on.
2624
94def140
TS
2625 embedded arrays (whether in pattern or not) could be:
2626 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2627
2628 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2629
2630 $ in pattern could be $foo or could be tail anchor. Assumption:
2631 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2632 followed by one of "()| \r\n\t"
02aa26ce 2633
9da1dd8f 2634 \1 (backreferences) are turned into $1 in substitutions
02aa26ce
NT
2635
2636 The structure of the code is
2637 while (there's a character to process) {
94def140
TS
2638 handle transliteration ranges
2639 skip regexp comments /(?#comment)/ and codes /(?{code})/
2640 skip #-initiated comments in //x patterns
2641 check for embedded arrays
02aa26ce
NT
2642 check for embedded scalars
2643 if (backslash) {
94def140 2644 deprecate \1 in substitution replacements
02aa26ce
NT
2645 handle string-changing backslashes \l \U \Q \E, etc.
2646 switch (what was escaped) {
94def140 2647 handle \- in a transliteration (becomes a literal -)
ff3f963a 2648 if a pattern and not \N{, go treat as regular character
94def140
TS
2649 handle \132 (octal characters)
2650 handle \x15 and \x{1234} (hex characters)
ff3f963a 2651 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2652 handle \cV (control characters)
2653 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2654 } (end switch)
77a135fe 2655 continue
02aa26ce 2656 } (end if backslash)
77a135fe 2657 handle regular character
02aa26ce 2658 } (end while character to read)
4e553d73 2659
02aa26ce
NT
2660*/
2661
76e3520e 2662STATIC char *
cea2e8a9 2663S_scan_const(pTHX_ char *start)
79072805 2664{
97aff369 2665 dVAR;
3280af22 2666 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2667 SV *sv = newSV(send - start); /* sv for the constant. See
2668 note below on sizing. */
02aa26ce
NT
2669 register char *s = start; /* start of the constant */
2670 register char *d = SvPVX(sv); /* destination for copies */
2671 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2672 bool didrange = FALSE; /* did we just finish a range? */
2866decb 2673 bool in_charclass = FALSE; /* within /[...]/ */
b953e60c
KW
2674 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2675 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2676 to be UTF8? But, this can
2677 show as true when the source
2678 isn't utf8, as for example
2679 when it is entirely composed
2680 of hex constants */
2681
2682 /* Note on sizing: The scanned constant is placed into sv, which is
2683 * initialized by newSV() assuming one byte of output for every byte of
2684 * input. This routine expects newSV() to allocate an extra byte for a
2685 * trailing NUL, which this routine will append if it gets to the end of
2686 * the input. There may be more bytes of input than output (eg., \N{LATIN
2687 * CAPITAL LETTER A}), or more output than input if the constant ends up
2688 * recoded to utf8, but each time a construct is found that might increase
2689 * the needed size, SvGROW() is called. Its size parameter each time is
2690 * based on the best guess estimate at the time, namely the length used so
2691 * far, plus the length the current construct will occupy, plus room for
2692 * the trailing NUL, plus one byte for every input byte still unscanned */
2693
012bcf8d 2694 UV uv;
4c3a8340
TS
2695#ifdef EBCDIC
2696 UV literal_endpoint = 0;
e294cc5d 2697 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2698#endif
012bcf8d 2699
7918f24d
NC
2700 PERL_ARGS_ASSERT_SCAN_CONST;
2701
bb16bae8 2702 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2703 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2704 /* If we are doing a trans and we know we want UTF8 set expectation */
2705 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2706 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2707 }
2708
2709
79072805 2710 while (s < send || dorange) {
ff3f963a 2711
02aa26ce 2712 /* get transliterations out of the way (they're most literal) */
3280af22 2713 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2714 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2715 if (dorange) {
1ba5c669
JH
2716 I32 i; /* current expanded character */
2717 I32 min; /* first character in range */
2718 I32 max; /* last character in range */
02aa26ce 2719
e294cc5d
JH
2720#ifdef EBCDIC
2721 UV uvmax = 0;
2722#endif
2723
2724 if (has_utf8
2725#ifdef EBCDIC
2726 && !native_range
2727#endif
2728 ) {
9d4ba2ae 2729 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2730 char *e = d++;
2731 while (e-- > c)
2732 *(e + 1) = *e;
25716404 2733 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2734 /* mark the range as done, and continue */
2735 dorange = FALSE;
2736 didrange = TRUE;
2737 continue;
2738 }
2b9d42f0 2739
95a20fc0 2740 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2741#ifdef EBCDIC
2742 SvGROW(sv,
2743 SvLEN(sv) + (has_utf8 ?
2744 (512 - UTF_CONTINUATION_MARK +
2745 UNISKIP(0x100))
2746 : 256));
2747 /* How many two-byte within 0..255: 128 in UTF-8,
2748 * 96 in UTF-8-mod. */
2749#else
9cbb5ea2 2750 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2751#endif
9cbb5ea2 2752 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2753#ifdef EBCDIC
2754 if (has_utf8) {
2755 int j;
2756 for (j = 0; j <= 1; j++) {
2757 char * const c = (char*)utf8_hop((U8*)d, -1);
2758 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2759 if (j)
2760 min = (U8)uv;
2761 else if (uv < 256)
2762 max = (U8)uv;
2763 else {
2764 max = (U8)0xff; /* only to \xff */
2765 uvmax = uv; /* \x{100} to uvmax */
2766 }
2767 d = c; /* eat endpoint chars */
2768 }
2769 }
2770 else {
2771#endif
2772 d -= 2; /* eat the first char and the - */
2773 min = (U8)*d; /* first char in range */
2774 max = (U8)d[1]; /* last char in range */
2775#ifdef EBCDIC
2776 }
2777#endif
8ada0baa 2778
c2e66d9e 2779 if (min > max) {
01ec43d0 2780 Perl_croak(aTHX_
d1573ac7 2781 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2782 (char)min, (char)max);
c2e66d9e
GS
2783 }
2784
c7f1f016 2785#ifdef EBCDIC
4c3a8340
TS
2786 if (literal_endpoint == 2 &&
2787 ((isLOWER(min) && isLOWER(max)) ||
2788 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2789 if (isLOWER(min)) {
2790 for (i = min; i <= max; i++)
2791 if (isLOWER(i))
db42d148 2792 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2793 } else {
2794 for (i = min; i <= max; i++)
2795 if (isUPPER(i))
db42d148 2796 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2797 }
2798 }
2799 else
2800#endif
2801 for (i = min; i <= max; i++)
e294cc5d
JH
2802#ifdef EBCDIC
2803 if (has_utf8) {
2804 const U8 ch = (U8)NATIVE_TO_UTF(i);
2805 if (UNI_IS_INVARIANT(ch))
2806 *d++ = (U8)i;
2807 else {
2808 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2809 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2810 }
2811 }
2812 else
2813#endif
2814 *d++ = (char)i;
2815
2816#ifdef EBCDIC
2817 if (uvmax) {
2818 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2819 if (uvmax > 0x101)
2820 *d++ = (char)UTF_TO_NATIVE(0xff);
2821 if (uvmax > 0x100)
2822 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2823 }
2824#endif
02aa26ce
NT
2825
2826 /* mark the range as done, and continue */
79072805 2827 dorange = FALSE;
01ec43d0 2828 didrange = TRUE;
4c3a8340
TS
2829#ifdef EBCDIC
2830 literal_endpoint = 0;
2831#endif
79072805 2832 continue;
4e553d73 2833 }
02aa26ce
NT
2834
2835 /* range begins (ignore - as first or last char) */
79072805 2836 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2837 if (didrange) {
1fafa243 2838 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2839 }
e294cc5d
JH
2840 if (has_utf8
2841#ifdef EBCDIC
2842 && !native_range
2843#endif
2844 ) {
25716404 2845 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2846 s++;
2847 continue;
2848 }
79072805
LW
2849 dorange = TRUE;
2850 s++;
01ec43d0
GS
2851 }
2852 else {
2853 didrange = FALSE;
4c3a8340
TS
2854#ifdef EBCDIC
2855 literal_endpoint = 0;
e294cc5d 2856 native_range = TRUE;
4c3a8340 2857#endif
01ec43d0 2858 }
79072805 2859 }
02aa26ce
NT
2860
2861 /* if we get here, we're not doing a transliteration */
2862
e4a2df84
DM
2863 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
2864 char *s1 = s-1;
2865 int esc = 0;
2866 while (s1 >= start && *s1-- == '\\')
2867 esc = !esc;
2868 if (!esc)
2869 in_charclass = TRUE;
2870 }
2866decb 2871
e4a2df84
DM
2872 else if (*s == ']' && PL_lex_inpat && in_charclass) {
2873 char *s1 = s-1;
2874 int esc = 0;
2875 while (s1 >= start && *s1-- == '\\')
2876 esc = !esc;
2877 if (!esc)
2878 in_charclass = FALSE;
2879 }
2866decb 2880
9da1dd8f
DM
2881 /* skip for regexp comments /(?#comment)/, except for the last
2882 * char, which will be done separately.
2883 * Stop on (?{..}) and friends */
2884
3280af22 2885 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2886 if (s[2] == '#') {
e994fd66 2887 while (s+1 < send && *s != ')')
db42d148 2888 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94 2889 }
2866decb 2890 else if (!PL_lex_casemods && !in_charclass &&
d3cec5e5
DM
2891 ( s[2] == '{' /* This should match regcomp.c */
2892 || (s[2] == '?' && s[3] == '{')))
155aba94 2893 {
9da1dd8f 2894 break;
cc6b7395 2895 }
748a9306 2896 }
02aa26ce
NT
2897
2898 /* likewise skip #-initiated comments in //x patterns */
3280af22 2899 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2900 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2901 while (s+1 < send && *s != '\n')
db42d148 2902 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2903 }
02aa26ce 2904
9da1dd8f
DM
2905 /* no further processing of single-quoted regex */
2906 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
2907 goto default_action;
2908
5d1d4326 2909 /* check for embedded arrays
da6eedaa 2910 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2911 */
1749ea0d
TS
2912 else if (*s == '@' && s[1]) {
2913 if (isALNUM_lazy_if(s+1,UTF))
2914 break;
2915 if (strchr(":'{$", s[1]))
2916 break;
2917 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2918 break; /* in regexp, neither @+ nor @- are interpolated */
2919 }
02aa26ce
NT
2920
2921 /* check for embedded scalars. only stop if we're sure it's a
2922 variable.
2923 */
79072805 2924 else if (*s == '$') {
3280af22 2925 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2926 break;
77772344 2927 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2928 if (s[1] == '\\') {
2929 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2930 "Possible unintended interpolation of $\\ in regex");
77772344 2931 }
79072805 2932 break; /* in regexp, $ might be tail anchor */
77772344 2933 }
79072805 2934 }
02aa26ce 2935
2b9d42f0
NIS
2936 /* End of else if chain - OP_TRANS rejoin rest */
2937
02aa26ce 2938 /* backslashes */
79072805 2939 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2940 char* e; /* Can be used for ending '}', etc. */
2941
79072805 2942 s++;
02aa26ce 2943
7d0fc23c
KW
2944 /* warn on \1 - \9 in substitution replacements, but note that \11
2945 * is an octal; and \19 is \1 followed by '9' */
3280af22 2946 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2947 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2948 {
a2a5de95 2949 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2950 *--s = '$';
2951 break;
2952 }
02aa26ce
NT
2953
2954 /* string-change backslash escapes */
838f2281 2955 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQF", *s)) {
79072805
LW
2956 --s;
2957 break;
2958 }
ff3f963a
KW
2959 /* In a pattern, process \N, but skip any other backslash escapes.
2960 * This is because we don't want to translate an escape sequence
2961 * into a meta symbol and have the regex compiler use the meta
2962 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2963 * in spite of this, we do have to process \N here while the proper
2964 * charnames handler is in scope. See bugs #56444 and #62056.
2965 * There is a complication because \N in a pattern may also stand
2966 * for 'match a non-nl', and not mean a charname, in which case its
2967 * processing should be deferred to the regex compiler. To be a
2968 * charname it must be followed immediately by a '{', and not look
2969 * like \N followed by a curly quantifier, i.e., not something like
2970 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2971 * quantifier */
2972 else if (PL_lex_inpat
2973 && (*s != 'N'
2974 || s[1] != '{'
2975 || regcurly(s + 1)))
2976 {
cc74c5bd
TS
2977 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2978 goto default_action;
2979 }
02aa26ce 2980
79072805 2981 switch (*s) {
02aa26ce
NT
2982
2983 /* quoted - in transliterations */
79072805 2984 case '-':
3280af22 2985 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2986 *d++ = *s++;
2987 continue;
2988 }
2989 /* FALL THROUGH */
2990 default:
11b8faa4 2991 {
e4ca4584 2992 if ((isALNUMC(*s)))
a2a5de95
NC
2993 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2994 "Unrecognized escape \\%c passed through",
2995 *s);
11b8faa4 2996 /* default action is to copy the quoted character */
f9a63242 2997 goto default_action;
11b8faa4 2998 }
02aa26ce 2999
632403cc 3000 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
3001 case '0': case '1': case '2': case '3':
3002 case '4': case '5': case '6': case '7':
ba210ebe 3003 {
53305cf1
NC
3004 I32 flags = 0;
3005 STRLEN len = 3;
77a135fe 3006 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
3007 s += len;
3008 }
012bcf8d 3009 goto NUM_ESCAPE_INSERT;
02aa26ce 3010
f0a2b745
KW
3011 /* eg. \o{24} indicates the octal constant \024 */
3012 case 'o':
3013 {
3014 STRLEN len;
454155d9 3015 const char* error;
f0a2b745 3016
454155d9 3017 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 3018 s += len;
454155d9 3019 if (! valid) {
f0a2b745
KW
3020 yyerror(error);
3021 continue;
3022 }
3023 goto NUM_ESCAPE_INSERT;
3024 }
3025
77a135fe 3026 /* eg. \x24 indicates the hex constant 0x24 */
79072805 3027 case 'x':
a0481293 3028 {
53305cf1 3029 STRLEN len;
a0481293 3030 const char* error;
355860ce 3031
a0481293
KW
3032 bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
3033 s += len;
3034 if (! valid) {
3035 yyerror(error);
355860ce 3036 continue;
ba210ebe 3037 }
012bcf8d
GS
3038 }
3039
3040 NUM_ESCAPE_INSERT:
ff3f963a
KW
3041 /* Insert oct or hex escaped character. There will always be
3042 * enough room in sv since such escapes will be longer than any
3043 * UTF-8 sequence they can end up as, except if they force us
3044 * to recode the rest of the string into utf8 */
ba7cea30 3045
77a135fe 3046 /* Here uv is the ordinal of the next character being added in
ff3f963a 3047 * unicode (converted from native). */
77a135fe 3048 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3049 if (!has_utf8 && uv > 255) {
77a135fe
KW
3050 /* Might need to recode whatever we have accumulated so
3051 * far if it contains any chars variant in utf8 or
3052 * utf-ebcdic. */
3053
3054 SvCUR_set(sv, d - SvPVX_const(sv));
3055 SvPOK_on(sv);
3056 *d = '\0';
77a135fe 3057 /* See Note on sizing above. */
7bf79863
KW
3058 sv_utf8_upgrade_flags_grow(sv,
3059 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3060 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3061 d = SvPVX(sv) + SvCUR(sv);
3062 has_utf8 = TRUE;
012bcf8d
GS
3063 }
3064
77a135fe
KW
3065 if (has_utf8) {
3066 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3067 if (PL_lex_inwhat == OP_TRANS &&
3068 PL_sublex_info.sub_op) {
3069 PL_sublex_info.sub_op->op_private |=
3070 (PL_lex_repl ? OPpTRANS_FROM_UTF
3071 : OPpTRANS_TO_UTF);
f9a63242 3072 }
e294cc5d
JH
3073#ifdef EBCDIC
3074 if (uv > 255 && !dorange)
3075 native_range = FALSE;
3076#endif
012bcf8d 3077 }
a0ed51b3 3078 else {
012bcf8d 3079 *d++ = (char)uv;
a0ed51b3 3080 }
012bcf8d
GS
3081 }
3082 else {
c4d5f83a 3083 *d++ = (char) uv;
a0ed51b3 3084 }
79072805 3085 continue;
02aa26ce 3086
4a2d328f 3087 case 'N':
ff3f963a
KW
3088 /* In a non-pattern \N must be a named character, like \N{LATIN
3089 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3090 * mean to match a non-newline. For non-patterns, named
3091 * characters are converted to their string equivalents. In
3092 * patterns, named characters are not converted to their
3093 * ultimate forms for the same reasons that other escapes
3094 * aren't. Instead, they are converted to the \N{U+...} form
3095 * to get the value from the charnames that is in effect right
3096 * now, while preserving the fact that it was a named character
3097 * so that the regex compiler knows this */
3098
3099 /* This section of code doesn't generally use the
3100 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3101 * a close examination of this macro and determined it is a
3102 * no-op except on utfebcdic variant characters. Every
3103 * character generated by this that would normally need to be
3104 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3105 * needed, and would complicate use of copy(). XXX There are
3106 * other parts of this file where the macro is used
3107 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3108
3109 /* The structure of this section of code (besides checking for
3110 * errors and upgrading to utf8) is:
3111 * Further disambiguate between the two meanings of \N, and if
3112 * not a charname, go process it elsewhere
0a96133f
KW
3113 * If of form \N{U+...}, pass it through if a pattern;
3114 * otherwise convert to utf8
3115 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3116 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3117
3118 /* Here, s points to the 'N'; the test below is guaranteed to
3119 * succeed if we are being called on a pattern as we already
3120 * know from a test above that the next character is a '{'.
3121 * On a non-pattern \N must mean 'named sequence, which
3122 * requires braces */
3123 s++;
3124 if (*s != '{') {
3125 yyerror("Missing braces on \\N{}");
3126 continue;
3127 }
3128 s++;
3129
0a96133f 3130 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3131 if (! (e = strchr(s, '}'))) {
3132 if (! PL_lex_inpat) {
5777a3f7 3133 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3134 } else {
3135 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3136 }
0a96133f 3137 continue;
ff3f963a 3138 }
cddc7ef4 3139
ff3f963a 3140 /* Here it looks like a named character */
cddc7ef4 3141
ff3f963a
KW
3142 if (PL_lex_inpat) {
3143
3144 /* XXX This block is temporary code. \N{} implies that the
3145 * pattern is to have Unicode semantics, and therefore
3146 * currently has to be encoded in utf8. By putting it in
3147 * utf8 now, we save a whole pass in the regular expression
3148 * compiler. Once that code is changed so Unicode
3149 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3150 * block should be removed. However, the code that parses
3151 * the output of this would have to be changed to not
3152 * necessarily expect utf8 */
ff3f963a 3153 if (!has_utf8) {
77a135fe 3154 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3155 SvPOK_on(sv);
e4f3eed8 3156 *d = '\0';
77a135fe 3157 /* See Note on sizing above. */
7bf79863 3158 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3159 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3160 /* 5 = '\N{' + cur char + NUL */
3161 (STRLEN)(send - s) + 5);
f08d6ad9 3162 d = SvPVX(sv) + SvCUR(sv);
89491803 3163 has_utf8 = TRUE;
ff3f963a
KW
3164 }
3165 }
423cee85 3166
ff3f963a
KW
3167 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3168 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3169 | PERL_SCAN_DISALLOW_PREFIX;
3170 STRLEN len;
3171
3172 /* For \N{U+...}, the '...' is a unicode value even on
3173 * EBCDIC machines */
3174 s += 2; /* Skip to next char after the 'U+' */
3175 len = e - s;
3176 uv = grok_hex(s, &len, &flags, NULL);
3177 if (len == 0 || len != (STRLEN)(e - s)) {
3178 yyerror("Invalid hexadecimal number in \\N{U+...}");
3179 s = e + 1;
3180 continue;
3181 }
3182
3183 if (PL_lex_inpat) {
3184
e2a7e165
KW
3185 /* On non-EBCDIC platforms, pass through to the regex
3186 * compiler unchanged. The reason we evaluated the
3187 * number above is to make sure there wasn't a syntax
3188 * error. But on EBCDIC we convert to native so
3189 * downstream code can continue to assume it's native
3190 */
ff3f963a 3191 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3192#ifdef EBCDIC
3193 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3194 and the \0 */
3195 "\\N{U+%X}",
3196 (unsigned int) UNI_TO_NATIVE(uv));
3197#else
ff3f963a
KW
3198 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3199 d += e - s + 1;
e2a7e165 3200#endif
ff3f963a
KW
3201 }
3202 else { /* Not a pattern: convert the hex to string */
3203
3204 /* If destination is not in utf8, unconditionally
3205 * recode it to be so. This is because \N{} implies
3206 * Unicode semantics, and scalars have to be in utf8
3207 * to guarantee those semantics */
3208 if (! has_utf8) {
3209 SvCUR_set(sv, d - SvPVX_const(sv));
3210 SvPOK_on(sv);
3211 *d = '\0';
3212 /* See Note on sizing above. */
3213 sv_utf8_upgrade_flags_grow(
3214 sv,
3215 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3216 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3217 d = SvPVX(sv) + SvCUR(sv);
3218 has_utf8 = TRUE;
3219 }
3220
3221 /* Add the string to the output */
3222 if (UNI_IS_INVARIANT(uv)) {
3223 *d++ = (char) uv;
3224 }
3225 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3226 }
3227 }
3228 else { /* Here is \N{NAME} but not \N{U+...}. */
3229
3230 SV *res; /* result from charnames */
3231 const char *str; /* the string in 'res' */
3232 STRLEN len; /* its length */
3233
3234 /* Get the value for NAME */
3235 res = newSVpvn(s, e - s);
3236 res = new_constant( NULL, 0, "charnames",
3237 /* includes all of: \N{...} */
3238 res, NULL, s - 3, e - s + 4 );
3239
3240 /* Most likely res will be in utf8 already since the
3241 * standard charnames uses pack U, but a custom translator
3242 * can leave it otherwise, so make sure. XXX This can be
3243 * revisited to not have charnames use utf8 for characters
3244 * that don't need it when regexes don't have to be in utf8
3245 * for Unicode semantics. If doing so, remember EBCDIC */
3246 sv_utf8_upgrade(res);
3247 str = SvPV_const(res, len);
3248
3249 /* Don't accept malformed input */
3250 if (! is_utf8_string((U8 *) str, len)) {
3251 yyerror("Malformed UTF-8 returned by \\N");
3252 }
3253 else if (PL_lex_inpat) {
3254
3255 if (! len) { /* The name resolved to an empty string */
3256 Copy("\\N{}", d, 4, char);
3257 d += 4;
3258 }
3259 else {
3260 /* In order to not lose information for the regex
3261 * compiler, pass the result in the specially made
3262 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3263 * the code points in hex of each character
3264 * returned by charnames */
3265
3266 const char *str_end = str + len;
3267 STRLEN char_length; /* cur char's byte length */
3268 STRLEN output_length; /* and the number of bytes
3269 after this is translated
3270 into hex digits */
3271 const STRLEN off = d - SvPVX_const(sv);
3272
3273 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3274 * max('U+', '.'); and 1 for NUL */
3275 char hex_string[2 * UTF8_MAXBYTES + 5];
3276
3277 /* Get the first character of the result. */
3278 U32 uv = utf8n_to_uvuni((U8 *) str,
3279 len,
3280 &char_length,
3281 UTF8_ALLOW_ANYUV);
3282
3283 /* The call to is_utf8_string() above hopefully
3284 * guarantees that there won't be an error. But
3285 * it's easy here to make sure. The function just
3286 * above warns and returns 0 if invalid utf8, but
3287 * it can also return 0 if the input is validly a
3288 * NUL. Disambiguate */
3289 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3290 uv = UNICODE_REPLACEMENT;
3291 }
3292
3293 /* Convert first code point to hex, including the
e2a7e165
KW
3294 * boiler plate before it. For all these, we
3295 * convert to native format so that downstream code
3296 * can continue to assume the input is native */
78c35590 3297 output_length =
3353de27 3298 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3299 "\\N{U+%X",
3300 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3301
3302 /* Make sure there is enough space to hold it */
3303 d = off + SvGROW(sv, off
3304 + output_length
3305 + (STRLEN)(send - e)
3306 + 2); /* '}' + NUL */
3307 /* And output it */
3308 Copy(hex_string, d, output_length, char);
3309 d += output_length;
3310
3311 /* For each subsequent character, append dot and
3312 * its ordinal in hex */
3313 while ((str += char_length) < str_end) {
3314 const STRLEN off = d - SvPVX_const(sv);
3315 U32 uv = utf8n_to_uvuni((U8 *) str,
3316 str_end - str,
3317 &char_length,
3318 UTF8_ALLOW_ANYUV);
3319 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3320 uv = UNICODE_REPLACEMENT;
3321 }
3322
78c35590 3323 output_length =
3353de27 3324 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3325 ".%X",
3326 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3327
3328 d = off + SvGROW(sv, off
3329 + output_length
3330 + (STRLEN)(send - e)
3331 + 2); /* '}' + NUL */
3332 Copy(hex_string, d, output_length, char);
3333 d += output_length;
3334 }
3335
3336 *d++ = '}'; /* Done. Add the trailing brace */
3337 }
3338 }
3339 else { /* Here, not in a pattern. Convert the name to a
3340 * string. */
3341
3342 /* If destination is not in utf8, unconditionally
3343 * recode it to be so. This is because \N{} implies
3344 * Unicode semantics, and scalars have to be in utf8
3345 * to guarantee those semantics */
3346 if (! has_utf8) {
3347 SvCUR_set(sv, d - SvPVX_const(sv));
3348 SvPOK_on(sv);
3349 *d = '\0';
3350 /* See Note on sizing above. */
3351 sv_utf8_upgrade_flags_grow(sv,
3352 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3353 len + (STRLEN)(send - s) + 1);
3354 d = SvPVX(sv) + SvCUR(sv);
3355 has_utf8 = TRUE;
3356 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3357
3358 /* See Note on sizing above. (NOTE: SvCUR() is not
3359 * set correctly here). */
3360 const STRLEN off = d - SvPVX_const(sv);
3361 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3362 }
3363 Copy(str, d, len, char);
3364 d += len;
423cee85 3365 }
423cee85 3366 SvREFCNT_dec(res);
cb233ae3
KW
3367
3368 /* Deprecate non-approved name syntax */
3369 if (ckWARN_d(WARN_DEPRECATED)) {
3370 bool problematic = FALSE;
3371 char* i = s;
3372
3373 /* For non-ut8 input, look to see that the first
3374 * character is an alpha, then loop through the rest
3375 * checking that each is a continuation */
3376 if (! this_utf8) {
3377 if (! isALPHAU(*i)) problematic = TRUE;
3378 else for (i = s + 1; i < e; i++) {
3379 if (isCHARNAME_CONT(*i)) continue;
3380 problematic = TRUE;
3381 break;
3382 }
3383 }
3384 else {
3385 /* Similarly for utf8. For invariants can check
3386 * directly. We accept anything above the latin1
3387 * range because it is immaterial to Perl if it is
3388 * correct or not, and is expensive to check. But
3389 * it is fairly easy in the latin1 range to convert
3390 * the variants into a single character and check
3391 * those */
3392 if (UTF8_IS_INVARIANT(*i)) {
3393 if (! isALPHAU(*i)) problematic = TRUE;
3394 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3395 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3396 *(i+1)))))
3397 {
3398 problematic = TRUE;
3399 }
3400 }
3401 if (! problematic) for (i = s + UTF8SKIP(s);
3402 i < e;
3403 i+= UTF8SKIP(i))
3404 {
3405 if (UTF8_IS_INVARIANT(*i)) {
3406 if (isCHARNAME_CONT(*i)) continue;
3407 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3408 continue;
3409 } else if (isCHARNAME_CONT(
3410 UNI_TO_NATIVE(
81c14aa2 3411 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3412 {
3413 continue;
3414 }
3415 problematic = TRUE;
3416 break;
3417 }
3418 }
3419 if (problematic) {
6e1bad6c
KW
3420 /* The e-i passed to the final %.*s makes sure that
3421 * should the trailing NUL be missing that this
3422 * print won't run off the end of the string */
cb233ae3 3423 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3424 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3425 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3426 }
3427 }
3428 } /* End \N{NAME} */
ff3f963a
KW
3429#ifdef EBCDIC
3430 if (!dorange)
3431 native_range = FALSE; /* \N{} is defined to be Unicode */
3432#endif
3433 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3434 continue;
3435
02aa26ce 3436 /* \c is a control character */
79072805
LW
3437 case 'c':
3438 s++;
961ce445 3439 if (s < send) {
17a3df4c 3440 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3441 }
961ce445
RGS
3442 else {
3443 yyerror("Missing control char name in \\c");
3444 }
79072805 3445 continue;
02aa26ce
NT
3446
3447 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3448 case 'b':
db42d148 3449 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3450 break;
3451 case 'n':
db42d148 3452 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3453 break;
3454 case 'r':
db42d148 3455 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3456 break;
3457 case 'f':
db42d148 3458 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3459 break;
3460 case 't':
db42d148 3461 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3462 break;
34a3fe2a 3463 case 'e':
db42d148 3464 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3465 break;
3466 case 'a':
db42d148 3467 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3468 break;
02aa26ce
NT
3469 } /* end switch */
3470
79072805
LW
3471 s++;
3472 continue;
02aa26ce 3473 } /* end if (backslash) */
4c3a8340
TS
3474#ifdef EBCDIC
3475 else
3476 literal_endpoint++;
3477#endif
02aa26ce 3478
f9a63242 3479 default_action:
77a135fe
KW
3480 /* If we started with encoded form, or already know we want it,
3481 then encode the next character */
3482 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3483 STRLEN len = 1;
77a135fe
KW
3484
3485
3486 /* One might think that it is wasted effort in the case of the
3487 * source being utf8 (this_utf8 == TRUE) to take the next character
3488 * in the source, convert it to an unsigned value, and then convert
3489 * it back again. But the source has not been validated here. The
3490 * routine that does the conversion checks for errors like
3491 * malformed utf8 */
3492
5f66b61c
AL
3493 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3494 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3495 if (!has_utf8) {
3496 SvCUR_set(sv, d - SvPVX_const(sv));
3497 SvPOK_on(sv);
3498 *d = '\0';
77a135fe 3499 /* See Note on sizing above. */
7bf79863
KW
3500 sv_utf8_upgrade_flags_grow(sv,
3501 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3502 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3503 d = SvPVX(sv) + SvCUR(sv);
3504 has_utf8 = TRUE;
3505 } else if (need > len) {
3506 /* encoded value larger than old, may need extra space (NOTE:
3507 * SvCUR() is not set correctly here). See Note on sizing
3508 * above. */
9d4ba2ae 3509 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3510 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3511 }
77a135fe
KW
3512 s += len;
3513
5f66b61c 3514 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3515#ifdef EBCDIC
3516 if (uv > 255 && !dorange)
3517 native_range = FALSE;
3518#endif
2b9d42f0
NIS
3519 }
3520 else {
3521 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3522 }
02aa26ce
NT
3523 } /* while loop to process each character */
3524
3525 /* terminate the string and set up the sv */
79072805 3526 *d = '\0';
95a20fc0 3527 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3528 if (SvCUR(sv) >= SvLEN(sv))
5637ef5b
NC
3529 Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf
3530 " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv));
2b9d42f0 3531
79072805 3532 SvPOK_on(sv);
9f4817db 3533 if (PL_encoding && !has_utf8) {
d0063567
DK
3534 sv_recode_to_utf8(sv, PL_encoding);
3535 if (SvUTF8(sv))
3536 has_utf8 = TRUE;
9f4817db 3537 }
2b9d42f0 3538 if (has_utf8) {
7e2040f0 3539 SvUTF8_on(sv);
2b9d42f0 3540 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3541 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3542 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3543 }
3544 }
79072805 3545
02aa26ce 3546 /* shrink the sv if we allocated more than we used */
79072805 3547 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3548 SvPV_shrink_to_cur(sv);
79072805 3549 }
02aa26ce 3550
6154021b 3551 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3552 if (s > PL_bufptr) {
eb0d8d16
NC
3553 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3554 const char *const key = PL_lex_inpat ? "qr" : "q";
3555 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3556 const char *type;
3557 STRLEN typelen;
3558
3559 if (PL_lex_inwhat == OP_TRANS) {
3560 type = "tr";
3561 typelen = 2;
3562 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3563 type = "s";
3564 typelen = 1;
9da1dd8f
DM
3565 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
3566 type = "q";
3567 typelen = 1;
eb0d8d16
NC
3568 } else {
3569 type = "qq";
3570 typelen = 2;
3571 }
3572
3573 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3574 type, typelen);
3575 }
6154021b 3576 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3577 } else
8990e307 3578 SvREFCNT_dec(sv);
79072805
LW
3579 return s;
3580}
3581
ffb4593c
NT
3582/* S_intuit_more
3583 * Returns TRUE if there's more to the expression (e.g., a subscript),
3584 * FALSE otherwise.
ffb4593c
NT
3585 *
3586 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3587 *
3588 * ->[ and ->{ return TRUE
3589 * { and [ outside a pattern are always subscripts, so return TRUE
3590 * if we're outside a pattern and it's not { or [, then return FALSE
3591 * if we're in a pattern and the first char is a {
3592 * {4,5} (any digits around the comma) returns FALSE
3593 * if we're in a pattern and the first char is a [
3594 * [] returns FALSE
3595 * [SOMETHING] has a funky algorithm to decide whether it's a
3596 * character class or not. It has to deal with things like
3597 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3598 * anything else returns TRUE
3599 */
3600
9cbb5ea2
GS
3601/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3602
76e3520e 3603STATIC int
cea2e8a9 3604S_intuit_more(pTHX_ register char *s)
79072805 3605{
97aff369 3606 dVAR;
7918f24d
NC
3607
3608 PERL_ARGS_ASSERT_INTUIT_MORE;
3609
3280af22 3610 if (PL_lex_brackets)
79072805
LW
3611 return TRUE;
3612 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3613 return TRUE;
3614 if (*s != '{' && *s != '[')
3615 return FALSE;
3280af22 3616 if (!PL_lex_inpat)
79072805
LW
3617 return TRUE;
3618
3619 /* In a pattern, so maybe we have {n,m}. */
3620 if (*s == '{') {
b3155d95 3621 if (regcurly(s)) {
79072805 3622 return FALSE;
b3155d95 3623 }
79072805 3624 return TRUE;
79072805
LW
3625 }
3626
3627 /* On the other hand, maybe we have a character class */
3628
3629 s++;
3630 if (*s == ']' || *s == '^')
3631 return FALSE;
3632 else {
ffb4593c 3633 /* this is terrifying, and it works */
79072805
LW
3634 int weight = 2; /* let's weigh the evidence */
3635 char seen[256];
f27ffc4a 3636 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3637 const char * const send = strchr(s,']');
3280af22 3638 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3639
3640 if (!send) /* has to be an expression */
3641 return TRUE;
3642
3643 Zero(seen,256,char);
3644 if (*s == '$')
3645 weight -= 3;
3646 else if (isDIGIT(*s)) {
3647 if (s[1] != ']') {
3648 if (isDIGIT(s[1]) && s[2] == ']')
3649 weight -= 10;
3650 }
3651 else
3652 weight -= 100;
3653 }
3654 for (; s < send; s++) {
3655 last_un_char = un_char;
3656 un_char = (unsigned char)*s;
3657 switch (*s) {
3658 case '@':
3659 case '&':
3660 case '$':
3661 weight -= seen[un_char] * 10;
7e2040f0 3662 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3663 int len;
8903cb82 3664 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e 3665 len = (int)strlen(tmpbuf);
6fbd0d97
BF
3666 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
3667 UTF ? SVf_UTF8 : 0, SVt_PV))
79072805
LW
3668 weight -= 100;
3669 else
3670 weight -= 10;
3671 }
3672 else if (*s == '$' && s[1] &&
93a17b20
LW
3673 strchr("[#!%*<>()-=",s[1])) {
3674 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3675 weight -= 10;
3676 else
3677 weight -= 1;
3678 }
3679 break;
3680 case '\\':
3681 un_char = 254;
3682 if (s[1]) {
93a17b20 3683 if (strchr("wds]",s[1]))
79072805 3684 weight += 100;
10edeb5d 3685 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3686 weight += 1;
93a17b20 3687 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3688 weight += 40;
3689 else if (isDIGIT(s[1])) {
3690 weight += 40;
3691 while (s[1] && isDIGIT(s[1]))
3692 s++;
3693 }
3694 }
3695 else
3696 weight += 100;
3697 break;
3698 case '-':
3699 if (s[1] == '\\')
3700 weight += 50;
93a17b20 3701 if (strchr("aA01! ",last_un_char))
79072805 3702 weight += 30;
93a17b20 3703 if (strchr("zZ79~",s[1]))
79072805 3704 weight += 30;
f27ffc4a
GS
3705 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3706 weight -= 5; /* cope with negative subscript */
79072805
LW
3707 break;
3708 default:
3792a11b
NC
3709 if (!isALNUM(last_un_char)
3710 && !(last_un_char == '$' || last_un_char == '@'
3711 || last_un_char == '&')
3712 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3713 char *d = tmpbuf;
3714 while (isALPHA(*s))
3715 *d++ = *s++;
3716 *d = '\0';
5458a98a 3717 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3718 weight -= 150;
3719 }
3720 if (un_char == last_un_char + 1)
3721 weight += 5;
3722 weight -= seen[un_char];
3723 break;
3724 }
3725 seen[un_char]++;
3726 }
3727 if (weight >= 0) /* probably a character class */
3728 return FALSE;
3729 }
3730
3731 return TRUE;
3732}
ffed7fef 3733
ffb4593c
NT
3734/*
3735 * S_intuit_method
3736 *
3737 * Does all the checking to disambiguate
3738 * foo bar
3739 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3740 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3741 *
3742 * First argument is the stuff after the first token, e.g. "bar".
3743 *
a4fd4a89 3744 * Not a method if foo is a filehandle.
ffb4593c
NT
3745 * Not a method if foo is a subroutine prototyped to take a filehandle.
3746 * Not a method if it's really "Foo $bar"
3747 * Method if it's "foo $bar"
3748 * Not a method if it's really "print foo $bar"
3749 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3750 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3751 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3752 * =>
3753 */
3754
76e3520e 3755STATIC int
62d55b22 3756S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3757{
97aff369 3758 dVAR;
a0d0e21e 3759 char *s = start + (*start == '$');
3280af22 3760 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3761 STRLEN len;
3762 GV* indirgv;
5db06880
NC
3763#ifdef PERL_MAD
3764 int soff;
3765#endif
a0d0e21e