This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Clean up MANIFEST before creating a new one.
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
04e98a4d 42#include "dquote_static.c"
378cc40b 43
eb0d8d16
NC
44#define new_constant(a,b,c,d,e,f,g) \
45 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
46
6154021b 47#define pl_yylval (PL_parser->yylval)
d3b6f988 48
199e78b7
DM
49/* XXX temporary backwards compatibility */
50#define PL_lex_brackets (PL_parser->lex_brackets)
78cdf107
Z
51#define PL_lex_allbrackets (PL_parser->lex_allbrackets)
52#define PL_lex_fakeeof (PL_parser->lex_fakeeof)
199e78b7
DM
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
a7aaec61
Z
129#define XENUMMASK 0x3f
130#define XFAKEEOF 0x40
131#define XFAKEBRACK 0x80
9059aa12 132
39e02b42
JH
133#ifdef USE_UTF8_SCRIPTS
134# define UTF (!IN_BYTES)
2b9d42f0 135#else
746b446a 136# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 137#endif
a0ed51b3 138
b1fc3636
CJ
139/* The maximum number of characters preceding the unrecognized one to display */
140#define UNRECOGNIZED_PRECEDE_COUNT 10
141
61f0cdd9 142/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
143 * 1999-02-27 mjd-perl-patch@plover.com */
144#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
145
bf4acbe4 146#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 147
ffb4593c
NT
148/* LEX_* are values for PL_lex_state, the state of the lexer.
149 * They are arranged oddly so that the guard on the switch statement
79072805
LW
150 * can get by with a single comparison (if the compiler is smart enough).
151 */
152
fb73857a 153/* #define LEX_NOTPARSING 11 is done in perl.h. */
154
b6007c36
DM
155#define LEX_NORMAL 10 /* normal code (ie not within "...") */
156#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
157#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
158#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
159#define LEX_INTERPSTART 6 /* expecting the start of a $var */
160
161 /* at end of code, eg "$x" followed by: */
162#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
163#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
164
165#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
166 string or after \E, $foo, etc */
167#define LEX_INTERPCONST 2 /* NOT USED */
168#define LEX_FORMLINE 1 /* expecting a format line */
169#define LEX_KNOWNEXT 0 /* next token known; just return it */
170
79072805 171
bbf60fe6 172#ifdef DEBUGGING
27da23d5 173static const char* const lex_state_names[] = {
bbf60fe6
DM
174 "KNOWNEXT",
175 "FORMLINE",
176 "INTERPCONST",
177 "INTERPCONCAT",
178 "INTERPENDMAYBE",
179 "INTERPEND",
180 "INTERPSTART",
181 "INTERPPUSH",
182 "INTERPCASEMOD",
183 "INTERPNORMAL",
184 "NORMAL"
185};
186#endif
187
79072805
LW
188#ifdef ff_next
189#undef ff_next
d48672a2
LW
190#endif
191
79072805 192#include "keywords.h"
fe14fcc3 193
ffb4593c
NT
194/* CLINE is a macro that ensures PL_copline has a sane value */
195
ae986130
LW
196#ifdef CLINE
197#undef CLINE
198#endif
57843af0 199#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 200
5db06880 201#ifdef PERL_MAD
29595ff2
NC
202# define SKIPSPACE0(s) skipspace0(s)
203# define SKIPSPACE1(s) skipspace1(s)
204# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
205# define PEEKSPACE(s) skipspace2(s,0)
206#else
207# define SKIPSPACE0(s) skipspace(s)
208# define SKIPSPACE1(s) skipspace(s)
209# define SKIPSPACE2(s,tsv) skipspace(s)
210# define PEEKSPACE(s) skipspace(s)
211#endif
212
ffb4593c
NT
213/*
214 * Convenience functions to return different tokens and prime the
9cbb5ea2 215 * lexer for the next token. They all take an argument.
ffb4593c
NT
216 *
217 * TOKEN : generic token (used for '(', DOLSHARP, etc)
218 * OPERATOR : generic operator
219 * AOPERATOR : assignment operator
220 * PREBLOCK : beginning the block after an if, while, foreach, ...
221 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
222 * PREREF : *EXPR where EXPR is not a simple identifier
223 * TERM : expression term
224 * LOOPX : loop exiting command (goto, last, dump, etc)
225 * FTST : file test operator
226 * FUN0 : zero-argument function
2d2e263d 227 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
228 * BOop : bitwise or or xor
229 * BAop : bitwise and
230 * SHop : shift operator
231 * PWop : power operator
9cbb5ea2 232 * PMop : pattern-matching operator
ffb4593c
NT
233 * Aop : addition-level operator
234 * Mop : multiplication-level operator
235 * Eop : equality-testing operator
e5edeb50 236 * Rop : relational operator <= != gt
ffb4593c
NT
237 *
238 * Also see LOP and lop() below.
239 */
240
998054bd 241#ifdef DEBUGGING /* Serve -DT. */
704d4215 242# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 243#else
bbf60fe6 244# define REPORT(retval) (retval)
998054bd
SC
245#endif
246
bbf60fe6
DM
247#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
248#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
249#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
250#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
251#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
252#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
253#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
254#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
255#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
256#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
257#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
258#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
259#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
260#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
261#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
262#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
263#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
264#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
265#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
266#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 267
a687059c
LW
268/* This bit of chicanery makes a unary function followed by
269 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
270 * The UNIDOR macro is for unary functions that can be followed by the //
271 * operator (such as C<shift // 0>).
a687059c 272 */
376fcdbf 273#define UNI2(f,x) { \
6154021b 274 pl_yylval.ival = f; \
376fcdbf
AL
275 PL_expect = x; \
276 PL_bufptr = s; \
277 PL_last_uni = PL_oldbufptr; \
278 PL_last_lop_op = f; \
279 if (*s == '(') \
280 return REPORT( (int)FUNC1 ); \
29595ff2 281 s = PEEKSPACE(s); \
376fcdbf
AL
282 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
283 }
6f33ba73
RGS
284#define UNI(f) UNI2(f,XTERM)
285#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 286
376fcdbf 287#define UNIBRACK(f) { \
6154021b 288 pl_yylval.ival = f; \
376fcdbf
AL
289 PL_bufptr = s; \
290 PL_last_uni = PL_oldbufptr; \
291 if (*s == '(') \
292 return REPORT( (int)FUNC1 ); \
29595ff2 293 s = PEEKSPACE(s); \
376fcdbf
AL
294 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
295 }
79072805 296
9f68db38 297/* grandfather return to old style */
78cdf107
Z
298#define OLDLOP(f) \
299 do { \
300 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
301 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
302 pl_yylval.ival = (f); \
303 PL_expect = XTERM; \
304 PL_bufptr = s; \
305 return (int)LSTOP; \
306 } while(0)
79072805 307
8fa7f367
JH
308#ifdef DEBUGGING
309
6154021b 310/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
311enum token_type {
312 TOKENTYPE_NONE,
313 TOKENTYPE_IVAL,
6154021b 314 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
315 TOKENTYPE_PVAL,
316 TOKENTYPE_OPVAL,
317 TOKENTYPE_GVVAL
318};
319
6d4a66ac
NC
320static struct debug_tokens {
321 const int token;
322 enum token_type type;
323 const char *name;
324} const debug_tokens[] =
9041c2e3 325{
bbf60fe6
DM
326 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
327 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
328 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
329 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
330 { ARROW, TOKENTYPE_NONE, "ARROW" },
331 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
332 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
333 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
334 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
335 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 336 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
337 { DO, TOKENTYPE_NONE, "DO" },
338 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
339 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
340 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
341 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
342 { ELSE, TOKENTYPE_NONE, "ELSE" },
343 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
344 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
345 { FOR, TOKENTYPE_IVAL, "FOR" },
346 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
347 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
348 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
349 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
350 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
351 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 352 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
353 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
354 { IF, TOKENTYPE_IVAL, "IF" },
355 { LABEL, TOKENTYPE_PVAL, "LABEL" },
356 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
357 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
358 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
359 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
360 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
361 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
362 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
363 { MY, TOKENTYPE_IVAL, "MY" },
364 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
365 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
366 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
367 { OROP, TOKENTYPE_IVAL, "OROP" },
368 { OROR, TOKENTYPE_NONE, "OROR" },
369 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
370 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
371 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
372 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
373 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
374 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
375 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
376 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
377 { PREINC, TOKENTYPE_NONE, "PREINC" },
378 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
379 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
380 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
381 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
382 { SUB, TOKENTYPE_NONE, "SUB" },
383 { THING, TOKENTYPE_OPVAL, "THING" },
384 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
385 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
386 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
387 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
388 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
389 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 390 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
391 { WHILE, TOKENTYPE_IVAL, "WHILE" },
392 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 393 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 394 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
395};
396
6154021b 397/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 398
bbf60fe6 399STATIC int
704d4215 400S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 401{
97aff369 402 dVAR;
7918f24d
NC
403
404 PERL_ARGS_ASSERT_TOKEREPORT;
405
bbf60fe6 406 if (DEBUG_T_TEST) {
bd61b366 407 const char *name = NULL;
bbf60fe6 408 enum token_type type = TOKENTYPE_NONE;
f54cb97a 409 const struct debug_tokens *p;
396482e1 410 SV* const report = newSVpvs("<== ");
bbf60fe6 411
f54cb97a 412 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
413 if (p->token == (int)rv) {
414 name = p->name;
415 type = p->type;
416 break;
417 }
418 }
419 if (name)
54667de8 420 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
421 else if ((char)rv > ' ' && (char)rv < '~')
422 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
423 else if (!rv)
396482e1 424 sv_catpvs(report, "EOF");
bbf60fe6
DM
425 else
426 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
427 switch (type) {
428 case TOKENTYPE_NONE:
429 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
430 break;
431 case TOKENTYPE_IVAL:
704d4215 432 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
433 break;
434 case TOKENTYPE_OPNUM:
435 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 436 PL_op_name[lvalp->ival]);
bbf60fe6
DM
437 break;
438 case TOKENTYPE_PVAL:
704d4215 439 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
440 break;
441 case TOKENTYPE_OPVAL:
704d4215 442 if (lvalp->opval) {
401441c0 443 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
444 PL_op_name[lvalp->opval->op_type]);
445 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 446 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 447 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
448 }
449
450 }
401441c0 451 else
396482e1 452 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
453 break;
454 }
b6007c36 455 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
456 };
457 return (int)rv;
998054bd
SC
458}
459
b6007c36
DM
460
461/* print the buffer with suitable escapes */
462
463STATIC void
15f169a1 464S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 465{
396482e1 466 SV* const tmp = newSVpvs("");
7918f24d
NC
467
468 PERL_ARGS_ASSERT_PRINTBUF;
469
b6007c36
DM
470 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
471 SvREFCNT_dec(tmp);
472}
473
8fa7f367
JH
474#endif
475
8290c323
NC
476static int
477S_deprecate_commaless_var_list(pTHX) {
478 PL_expect = XTERM;
479 deprecate("comma-less variable list");
480 return REPORT(','); /* grandfather non-comma-format format */
481}
482
ffb4593c
NT
483/*
484 * S_ao
485 *
c963b151
BD
486 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
487 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
488 */
489
76e3520e 490STATIC int
cea2e8a9 491S_ao(pTHX_ int toketype)
a0d0e21e 492{
97aff369 493 dVAR;
3280af22
NIS
494 if (*PL_bufptr == '=') {
495 PL_bufptr++;
a0d0e21e 496 if (toketype == ANDAND)
6154021b 497 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 498 else if (toketype == OROR)
6154021b 499 pl_yylval.ival = OP_ORASSIGN;
c963b151 500 else if (toketype == DORDOR)
6154021b 501 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
502 toketype = ASSIGNOP;
503 }
504 return toketype;
505}
506
ffb4593c
NT
507/*
508 * S_no_op
509 * When Perl expects an operator and finds something else, no_op
510 * prints the warning. It always prints "<something> found where
511 * operator expected. It prints "Missing semicolon on previous line?"
512 * if the surprise occurs at the start of the line. "do you need to
513 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
514 * where the compiler doesn't know if foo is a method call or a function.
515 * It prints "Missing operator before end of line" if there's nothing
516 * after the missing operator, or "... before <...>" if there is something
517 * after the missing operator.
518 */
519
76e3520e 520STATIC void
15f169a1 521S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 522{
97aff369 523 dVAR;
9d4ba2ae
AL
524 char * const oldbp = PL_bufptr;
525 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 526
7918f24d
NC
527 PERL_ARGS_ASSERT_NO_OP;
528
1189a94a
GS
529 if (!s)
530 s = oldbp;
07c798fb 531 else
1189a94a 532 PL_bufptr = s;
cea2e8a9 533 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
534 if (ckWARN_d(WARN_SYNTAX)) {
535 if (is_first)
536 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
537 "\t(Missing semicolon on previous line?)\n");
538 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 539 const char *t;
c35e046a
AL
540 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
541 NOOP;
56da5a46
RGS
542 if (t < PL_bufptr && isSPACE(*t))
543 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
544 "\t(Do you need to predeclare %.*s?)\n",
551405c4 545 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
546 }
547 else {
548 assert(s >= oldbp);
549 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 550 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 551 }
07c798fb 552 }
3280af22 553 PL_bufptr = oldbp;
8990e307
LW
554}
555
ffb4593c
NT
556/*
557 * S_missingterm
558 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 559 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
560 * If we're in a delimited string and the delimiter is a control
561 * character, it's reformatted into a two-char sequence like ^C.
562 * This is fatal.
563 */
564
76e3520e 565STATIC void
cea2e8a9 566S_missingterm(pTHX_ char *s)
8990e307 567{
97aff369 568 dVAR;
8990e307
LW
569 char tmpbuf[3];
570 char q;
571 if (s) {
9d4ba2ae 572 char * const nl = strrchr(s,'\n');
d2719217 573 if (nl)
8990e307
LW
574 *nl = '\0';
575 }
463559e7 576 else if (isCNTRL(PL_multi_close)) {
8990e307 577 *tmpbuf = '^';
585ec06d 578 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
579 tmpbuf[2] = '\0';
580 s = tmpbuf;
581 }
582 else {
eb160463 583 *tmpbuf = (char)PL_multi_close;
8990e307
LW
584 tmpbuf[1] = '\0';
585 s = tmpbuf;
586 }
587 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 588 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 589}
79072805 590
0d863452 591/*
0d863452
RH
592 * Check whether the named feature is enabled.
593 */
26ea9e12
NC
594bool
595Perl_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 596{
97aff369 597 dVAR;
0d863452 598 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 599 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
600
601 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
602
26ea9e12
NC
603 if (namelen > MAX_FEATURE_LEN)
604 return FALSE;
4a731d7b 605 memcpy(&he_name[8], name, namelen);
d4c19fe8 606
7b9ef140 607 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
608}
609
ffb4593c 610/*
9cbb5ea2
GS
611 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
612 * utf16-to-utf8-reversed.
ffb4593c
NT
613 */
614
c39cd008
GS
615#ifdef PERL_CR_FILTER
616static void
617strip_return(SV *sv)
618{
95a20fc0 619 register const char *s = SvPVX_const(sv);
9d4ba2ae 620 register const char * const e = s + SvCUR(sv);
7918f24d
NC
621
622 PERL_ARGS_ASSERT_STRIP_RETURN;
623
c39cd008
GS
624 /* outer loop optimized to do nothing if there are no CR-LFs */
625 while (s < e) {
626 if (*s++ == '\r' && *s == '\n') {
627 /* hit a CR-LF, need to copy the rest */
628 register char *d = s - 1;
629 *d++ = *s++;
630 while (s < e) {
631 if (*s == '\r' && s[1] == '\n')
632 s++;
633 *d++ = *s++;
634 }
635 SvCUR(sv) -= s - d;
636 return;
637 }
638 }
639}
a868473f 640
76e3520e 641STATIC I32
c39cd008 642S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 643{
f54cb97a 644 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
645 if (count > 0 && !maxlen)
646 strip_return(sv);
647 return count;
a868473f
NIS
648}
649#endif
650
ffb4593c 651/*
8eaa0acf
Z
652=for apidoc Amx|void|lex_start|SV *line|PerlIO *rsfp|U32 flags
653
654Creates and initialises a new lexer/parser state object, supplying
655a context in which to lex and parse from a new source of Perl code.
656A pointer to the new state object is placed in L</PL_parser>. An entry
657is made on the save stack so that upon unwinding the new state object
658will be destroyed and the former value of L</PL_parser> will be restored.
659Nothing else need be done to clean up the parsing context.
660
661The code to be parsed comes from I<line> and I<rsfp>. I<line>, if
662non-null, provides a string (in SV form) containing code to be parsed.
663A copy of the string is made, so subsequent modification of I<line>
664does not affect parsing. I<rsfp>, if non-null, provides an input stream
665from which code will be read to be parsed. If both are non-null, the
666code in I<line> comes first and must consist of complete lines of input,
667and I<rsfp> supplies the remainder of the source.
668
669The I<flags> parameter is reserved for future use, and must always
27fcb6ee
FC
670be zero, except for one flag that is currently reserved for perl's internal
671use.
8eaa0acf
Z
672
673=cut
674*/
ffb4593c 675
27fcb6ee
FC
676/* LEX_START_SAME_FILTER indicates that this is not a new file, so it
677 can share filters with the current parser. */
678
a0d0e21e 679void
8eaa0acf 680Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
79072805 681{
97aff369 682 dVAR;
6ef55633 683 const char *s = NULL;
8990e307 684 STRLEN len;
5486870f 685 yy_parser *parser, *oparser;
27fcb6ee 686 if (flags && flags != LEX_START_SAME_FILTER)
8eaa0acf 687 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
acdf0a21
DM
688
689 /* create and initialise a parser */
690
199e78b7 691 Newxz(parser, 1, yy_parser);
5486870f 692 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
693 PL_parser = parser;
694
28ac2b49
Z
695 parser->stack = NULL;
696 parser->ps = NULL;
697 parser->stack_size = 0;
acdf0a21 698
e3abe207
DM
699 /* on scope exit, free this parser and restore any outer one */
700 SAVEPARSER(parser);
7c4baf47 701 parser->saved_curcop = PL_curcop;
e3abe207 702
acdf0a21 703 /* initialise lexer state */
8990e307 704
fb205e7a
DM
705#ifdef PERL_MAD
706 parser->curforce = -1;
707#else
708 parser->nexttoke = 0;
709#endif
ca4cfd28 710 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 711 parser->copline = NOLINE;
5afb0a62 712 parser->lex_state = LEX_NORMAL;
c2598295 713 parser->expect = XSTATE;
2f9285f8 714 parser->rsfp = rsfp;
27fcb6ee
FC
715 parser->rsfp_filters =
716 !(flags & LEX_START_SAME_FILTER) || !oparser
717 ? newAV()
718 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 719
199e78b7
DM
720 Newx(parser->lex_brackstack, 120, char);
721 Newx(parser->lex_casestack, 12, char);
722 *parser->lex_casestack = '\0';
02b34bbe 723
10efb74f
NC
724 if (line) {
725 s = SvPV_const(line, len);
726 } else {
727 len = 0;
728 }
bdc0bf6f 729
10efb74f 730 if (!len) {
bdc0bf6f 731 parser->linestr = newSVpvs("\n;");
805700c1 732 } else {
719a9bb0 733 parser->linestr = newSVpvn_flags(s, len, SvUTF8(line));
10efb74f 734 if (s[len-1] != ';')
bdc0bf6f 735 sv_catpvs(parser->linestr, "\n;");
8990e307 736 }
f06b5848
DM
737 parser->oldoldbufptr =
738 parser->oldbufptr =
739 parser->bufptr =
740 parser->linestart = SvPVX(parser->linestr);
741 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
742 parser->last_lop = parser->last_uni = NULL;
737c24fc
Z
743
744 parser->in_pod = 0;
79072805 745}
a687059c 746
e3abe207
DM
747
748/* delete a parser object */
749
750void
751Perl_parser_free(pTHX_ const yy_parser *parser)
752{
7918f24d
NC
753 PERL_ARGS_ASSERT_PARSER_FREE;
754
7c4baf47 755 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
756 SvREFCNT_dec(parser->linestr);
757
2f9285f8
DM
758 if (parser->rsfp == PerlIO_stdin())
759 PerlIO_clearerr(parser->rsfp);
799361c3
SH
760 else if (parser->rsfp && (!parser->old_parser ||
761 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 762 PerlIO_close(parser->rsfp);
5486870f 763 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 764
e3abe207
DM
765 Safefree(parser->lex_brackstack);
766 Safefree(parser->lex_casestack);
767 PL_parser = parser->old_parser;
768 Safefree(parser);
769}
770
771
ffb4593c 772/*
f0e67a1d
Z
773=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
774
775Buffer scalar containing the chunk currently under consideration of the
776text currently being lexed. This is always a plain string scalar (for
777which C<SvPOK> is true). It is not intended to be used as a scalar by
778normal scalar means; instead refer to the buffer directly by the pointer
779variables described below.
780
781The lexer maintains various C<char*> pointers to things in the
782C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
783reallocated, all of these pointers must be updated. Don't attempt to
784do this manually, but rather use L</lex_grow_linestr> if you need to
785reallocate the buffer.
786
787The content of the text chunk in the buffer is commonly exactly one
788complete line of input, up to and including a newline terminator,
789but there are situations where it is otherwise. The octets of the
790buffer may be intended to be interpreted as either UTF-8 or Latin-1.
791The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
792flag on this scalar, which may disagree with it.
793
794For direct examination of the buffer, the variable
795L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
796lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
797of these pointers is usually preferable to examination of the scalar
798through normal scalar means.
799
800=for apidoc AmxU|char *|PL_parser-E<gt>bufend
801
802Direct pointer to the end of the chunk of text currently being lexed, the
803end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
804+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
805always located at the end of the buffer, and does not count as part of
806the buffer's contents.
807
808=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
809
810Points to the current position of lexing inside the lexer buffer.
811Characters around this point may be freely examined, within
812the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
813L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
814interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
815
816Lexing code (whether in the Perl core or not) moves this pointer past
817the characters that it consumes. It is also expected to perform some
818bookkeeping whenever a newline character is consumed. This movement
819can be more conveniently performed by the function L</lex_read_to>,
820which handles newlines appropriately.
821
822Interpretation of the buffer's octets can be abstracted out by
823using the slightly higher-level functions L</lex_peek_unichar> and
824L</lex_read_unichar>.
825
826=for apidoc AmxU|char *|PL_parser-E<gt>linestart
827
828Points to the start of the current line inside the lexer buffer.
829This is useful for indicating at which column an error occurred, and
830not much else. This must be updated by any lexing code that consumes
831a newline; the function L</lex_read_to> handles this detail.
832
833=cut
834*/
835
836/*
837=for apidoc Amx|bool|lex_bufutf8
838
839Indicates whether the octets in the lexer buffer
840(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
841of Unicode characters. If not, they should be interpreted as Latin-1
842characters. This is analogous to the C<SvUTF8> flag for scalars.
843
844In UTF-8 mode, it is not guaranteed that the lexer buffer actually
845contains valid UTF-8. Lexing code must be robust in the face of invalid
846encoding.
847
848The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
849is significant, but not the whole story regarding the input character
850encoding. Normally, when a file is being read, the scalar contains octets
851and its C<SvUTF8> flag is off, but the octets should be interpreted as
852UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
853however, the scalar may have the C<SvUTF8> flag on, and in this case its
854octets should be interpreted as UTF-8 unless the C<use bytes> pragma
855is in effect. This logic may change in the future; use this function
856instead of implementing the logic yourself.
857
858=cut
859*/
860
861bool
862Perl_lex_bufutf8(pTHX)
863{
864 return UTF;
865}
866
867/*
868=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
869
870Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
871at least I<len> octets (including terminating NUL). Returns a
872pointer to the reallocated buffer. This is necessary before making
873any direct modification of the buffer that would increase its length.
874L</lex_stuff_pvn> provides a more convenient way to insert text into
875the buffer.
876
877Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
878this function updates all of the lexer's variables that point directly
879into the buffer.
880
881=cut
882*/
883
884char *
885Perl_lex_grow_linestr(pTHX_ STRLEN len)
886{
887 SV *linestr;
888 char *buf;
889 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
890 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
891 linestr = PL_parser->linestr;
892 buf = SvPVX(linestr);
893 if (len <= SvLEN(linestr))
894 return buf;
895 bufend_pos = PL_parser->bufend - buf;
896 bufptr_pos = PL_parser->bufptr - buf;
897 oldbufptr_pos = PL_parser->oldbufptr - buf;
898 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
899 linestart_pos = PL_parser->linestart - buf;
900 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
901 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
902 buf = sv_grow(linestr, len);
903 PL_parser->bufend = buf + bufend_pos;
904 PL_parser->bufptr = buf + bufptr_pos;
905 PL_parser->oldbufptr = buf + oldbufptr_pos;
906 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
907 PL_parser->linestart = buf + linestart_pos;
908 if (PL_parser->last_uni)
909 PL_parser->last_uni = buf + last_uni_pos;
910 if (PL_parser->last_lop)
911 PL_parser->last_lop = buf + last_lop_pos;
912 return buf;
913}
914
915/*
83aa740e 916=for apidoc Amx|void|lex_stuff_pvn|const char *pv|STRLEN len|U32 flags
f0e67a1d
Z
917
918Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
919immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
920reallocating the buffer if necessary. This means that lexing code that
921runs later will see the characters as if they had appeared in the input.
922It is not recommended to do this as part of normal parsing, and most
923uses of this facility run the risk of the inserted characters being
924interpreted in an unintended manner.
925
926The string to be inserted is represented by I<len> octets starting
927at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
928according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
929The characters are recoded for the lexer buffer, according to how the
930buffer is currently being interpreted (L</lex_bufutf8>). If a string
9dcc53ea 931to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
f0e67a1d
Z
932function is more convenient.
933
934=cut
935*/
936
937void
83aa740e 938Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
f0e67a1d 939{
749123ff 940 dVAR;
f0e67a1d
Z
941 char *bufptr;
942 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
943 if (flags & ~(LEX_STUFF_UTF8))
944 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
945 if (UTF) {
946 if (flags & LEX_STUFF_UTF8) {
947 goto plain_copy;
948 } else {
949 STRLEN highhalf = 0;
83aa740e 950 const char *p, *e = pv+len;
f0e67a1d
Z
951 for (p = pv; p != e; p++)
952 highhalf += !!(((U8)*p) & 0x80);
953 if (!highhalf)
954 goto plain_copy;
955 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
956 bufptr = PL_parser->bufptr;
957 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
958 SvCUR_set(PL_parser->linestr,
959 SvCUR(PL_parser->linestr) + len+highhalf);
f0e67a1d
Z
960 PL_parser->bufend += len+highhalf;
961 for (p = pv; p != e; p++) {
962 U8 c = (U8)*p;
963 if (c & 0x80) {
964 *bufptr++ = (char)(0xc0 | (c >> 6));
965 *bufptr++ = (char)(0x80 | (c & 0x3f));
966 } else {
967 *bufptr++ = (char)c;
968 }
969 }
970 }
971 } else {
972 if (flags & LEX_STUFF_UTF8) {
973 STRLEN highhalf = 0;
83aa740e 974 const char *p, *e = pv+len;
f0e67a1d
Z
975 for (p = pv; p != e; p++) {
976 U8 c = (U8)*p;
977 if (c >= 0xc4) {
978 Perl_croak(aTHX_ "Lexing code attempted to stuff "
979 "non-Latin-1 character into Latin-1 input");
980 } else if (c >= 0xc2 && p+1 != e &&
981 (((U8)p[1]) & 0xc0) == 0x80) {
982 p++;
983 highhalf++;
984 } else if (c >= 0x80) {
985 /* malformed UTF-8 */
986 ENTER;
987 SAVESPTR(PL_warnhook);
988 PL_warnhook = PERL_WARNHOOK_FATAL;
989 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
990 LEAVE;
991 }
992 }
993 if (!highhalf)
994 goto plain_copy;
995 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
996 bufptr = PL_parser->bufptr;
997 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
255fdf19
Z
998 SvCUR_set(PL_parser->linestr,
999 SvCUR(PL_parser->linestr) + len-highhalf);
f0e67a1d
Z
1000 PL_parser->bufend += len-highhalf;
1001 for (p = pv; p != e; p++) {
1002 U8 c = (U8)*p;
1003 if (c & 0x80) {
1004 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1005 p++;
1006 } else {
1007 *bufptr++ = (char)c;
1008 }
1009 }
1010 } else {
1011 plain_copy:
1012 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1013 bufptr = PL_parser->bufptr;
1014 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
255fdf19 1015 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
f0e67a1d
Z
1016 PL_parser->bufend += len;
1017 Copy(pv, bufptr, len, char);
1018 }
1019 }
1020}
1021
1022/*
9dcc53ea
Z
1023=for apidoc Amx|void|lex_stuff_pv|const char *pv|U32 flags
1024
1025Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1026immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1027reallocating the buffer if necessary. This means that lexing code that
1028runs later will see the characters as if they had appeared in the input.
1029It is not recommended to do this as part of normal parsing, and most
1030uses of this facility run the risk of the inserted characters being
1031interpreted in an unintended manner.
1032
1033The string to be inserted is represented by octets starting at I<pv>
1034and continuing to the first nul. These octets are interpreted as either
1035UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1036in I<flags>. The characters are recoded for the lexer buffer, according
1037to how the buffer is currently being interpreted (L</lex_bufutf8>).
1038If it is not convenient to nul-terminate a string to be inserted, the
1039L</lex_stuff_pvn> function is more appropriate.
1040
1041=cut
1042*/
1043
1044void
1045Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1046{
1047 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1048 lex_stuff_pvn(pv, strlen(pv), flags);
1049}
1050
1051/*
f0e67a1d
Z
1052=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1053
1054Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1055immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1056reallocating the buffer if necessary. This means that lexing code that
1057runs later will see the characters as if they had appeared in the input.
1058It is not recommended to do this as part of normal parsing, and most
1059uses of this facility run the risk of the inserted characters being
1060interpreted in an unintended manner.
1061
1062The string to be inserted is the string value of I<sv>. The characters
1063are recoded for the lexer buffer, according to how the buffer is currently
9dcc53ea 1064being interpreted (L</lex_bufutf8>). If a string to be inserted is
f0e67a1d
Z
1065not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1066need to construct a scalar.
1067
1068=cut
1069*/
1070
1071void
1072Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1073{
1074 char *pv;
1075 STRLEN len;
1076 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1077 if (flags)
1078 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1079 pv = SvPV(sv, len);
1080 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1081}
1082
1083/*
1084=for apidoc Amx|void|lex_unstuff|char *ptr
1085
1086Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1087I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1088This hides the discarded text from any lexing code that runs later,
1089as if the text had never appeared.
1090
1091This is not the normal way to consume lexed text. For that, use
1092L</lex_read_to>.
1093
1094=cut
1095*/
1096
1097void
1098Perl_lex_unstuff(pTHX_ char *ptr)
1099{
1100 char *buf, *bufend;
1101 STRLEN unstuff_len;
1102 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1103 buf = PL_parser->bufptr;
1104 if (ptr < buf)
1105 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1106 if (ptr == buf)
1107 return;
1108 bufend = PL_parser->bufend;
1109 if (ptr > bufend)
1110 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1111 unstuff_len = ptr - buf;
1112 Move(ptr, buf, bufend+1-ptr, char);
1113 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1114 PL_parser->bufend = bufend - unstuff_len;
1115}
1116
1117/*
1118=for apidoc Amx|void|lex_read_to|char *ptr
1119
1120Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1121to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1122performing the correct bookkeeping whenever a newline character is passed.
1123This is the normal way to consume lexed text.
1124
1125Interpretation of the buffer's octets can be abstracted out by
1126using the slightly higher-level functions L</lex_peek_unichar> and
1127L</lex_read_unichar>.
1128
1129=cut
1130*/
1131
1132void
1133Perl_lex_read_to(pTHX_ char *ptr)
1134{
1135 char *s;
1136 PERL_ARGS_ASSERT_LEX_READ_TO;
1137 s = PL_parser->bufptr;
1138 if (ptr < s || ptr > PL_parser->bufend)
1139 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1140 for (; s != ptr; s++)
1141 if (*s == '\n') {
1142 CopLINE_inc(PL_curcop);
1143 PL_parser->linestart = s+1;
1144 }
1145 PL_parser->bufptr = ptr;
1146}
1147
1148/*
1149=for apidoc Amx|void|lex_discard_to|char *ptr
1150
1151Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1152up to I<ptr>. The remaining content of the buffer will be moved, and
1153all pointers into the buffer updated appropriately. I<ptr> must not
1154be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1155it is not permitted to discard text that has yet to be lexed.
1156
1157Normally it is not necessarily to do this directly, because it suffices to
1158use the implicit discarding behaviour of L</lex_next_chunk> and things
1159based on it. However, if a token stretches across multiple lines,
1f317c95 1160and the lexing code has kept multiple lines of text in the buffer for
f0e67a1d
Z
1161that purpose, then after completion of the token it would be wise to
1162explicitly discard the now-unneeded earlier lines, to avoid future
1163multi-line tokens growing the buffer without bound.
1164
1165=cut
1166*/
1167
1168void
1169Perl_lex_discard_to(pTHX_ char *ptr)
1170{
1171 char *buf;
1172 STRLEN discard_len;
1173 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1174 buf = SvPVX(PL_parser->linestr);
1175 if (ptr < buf)
1176 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1177 if (ptr == buf)
1178 return;
1179 if (ptr > PL_parser->bufptr)
1180 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1181 discard_len = ptr - buf;
1182 if (PL_parser->oldbufptr < ptr)
1183 PL_parser->oldbufptr = ptr;
1184 if (PL_parser->oldoldbufptr < ptr)
1185 PL_parser->oldoldbufptr = ptr;
1186 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1187 PL_parser->last_uni = NULL;
1188 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1189 PL_parser->last_lop = NULL;
1190 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1191 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1192 PL_parser->bufend -= discard_len;
1193 PL_parser->bufptr -= discard_len;
1194 PL_parser->oldbufptr -= discard_len;
1195 PL_parser->oldoldbufptr -= discard_len;
1196 if (PL_parser->last_uni)
1197 PL_parser->last_uni -= discard_len;
1198 if (PL_parser->last_lop)
1199 PL_parser->last_lop -= discard_len;
1200}
1201
1202/*
1203=for apidoc Amx|bool|lex_next_chunk|U32 flags
1204
1205Reads in the next chunk of text to be lexed, appending it to
1206L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1207looked to the end of the current chunk and wants to know more. It is
1208usual, but not necessary, for lexing to have consumed the entirety of
1209the current chunk at this time.
1210
1211If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1212chunk (i.e., the current chunk has been entirely consumed), normally the
1213current chunk will be discarded at the same time that the new chunk is
1214read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1215will not be discarded. If the current chunk has not been entirely
1216consumed, then it will not be discarded regardless of the flag.
1217
1218Returns true if some new text was added to the buffer, or false if the
1219buffer has reached the end of the input text.
1220
1221=cut
1222*/
1223
1224#define LEX_FAKE_EOF 0x80000000
1225
1226bool
1227Perl_lex_next_chunk(pTHX_ U32 flags)
1228{
1229 SV *linestr;
1230 char *buf;
1231 STRLEN old_bufend_pos, new_bufend_pos;
1232 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1233 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1234 bool got_some_for_debugger = 0;
f0e67a1d
Z
1235 bool got_some;
1236 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1237 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1238 linestr = PL_parser->linestr;
1239 buf = SvPVX(linestr);
1240 if (!(flags & LEX_KEEP_PREVIOUS) &&
1241 PL_parser->bufptr == PL_parser->bufend) {
1242 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1243 linestart_pos = 0;
1244 if (PL_parser->last_uni != PL_parser->bufend)
1245 PL_parser->last_uni = NULL;
1246 if (PL_parser->last_lop != PL_parser->bufend)
1247 PL_parser->last_lop = NULL;
1248 last_uni_pos = last_lop_pos = 0;
1249 *buf = 0;
1250 SvCUR(linestr) = 0;
1251 } else {
1252 old_bufend_pos = PL_parser->bufend - buf;
1253 bufptr_pos = PL_parser->bufptr - buf;
1254 oldbufptr_pos = PL_parser->oldbufptr - buf;
1255 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1256 linestart_pos = PL_parser->linestart - buf;
1257 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1258 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1259 }
1260 if (flags & LEX_FAKE_EOF) {
1261 goto eof;
1262 } else if (!PL_parser->rsfp) {
1263 got_some = 0;
1264 } else if (filter_gets(linestr, old_bufend_pos)) {
1265 got_some = 1;
17cc9359 1266 got_some_for_debugger = 1;
f0e67a1d 1267 } else {
580561a3
Z
1268 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1269 sv_setpvs(linestr, "");
f0e67a1d
Z
1270 eof:
1271 /* End of real input. Close filehandle (unless it was STDIN),
1272 * then add implicit termination.
1273 */
1274 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1275 PerlIO_clearerr(PL_parser->rsfp);
1276 else if (PL_parser->rsfp)
1277 (void)PerlIO_close(PL_parser->rsfp);
1278 PL_parser->rsfp = NULL;
737c24fc 1279 PL_parser->in_pod = 0;
f0e67a1d
Z
1280#ifdef PERL_MAD
1281 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1282 PL_faketokens = 1;
1283#endif
1284 if (!PL_in_eval && PL_minus_p) {
1285 sv_catpvs(linestr,
1286 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1287 PL_minus_n = PL_minus_p = 0;
1288 } else if (!PL_in_eval && PL_minus_n) {
1289 sv_catpvs(linestr, /*{*/";}");
1290 PL_minus_n = 0;
1291 } else
1292 sv_catpvs(linestr, ";");
1293 got_some = 1;
1294 }
1295 buf = SvPVX(linestr);
1296 new_bufend_pos = SvCUR(linestr);
1297 PL_parser->bufend = buf + new_bufend_pos;
1298 PL_parser->bufptr = buf + bufptr_pos;
1299 PL_parser->oldbufptr = buf + oldbufptr_pos;
1300 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1301 PL_parser->linestart = buf + linestart_pos;
1302 if (PL_parser->last_uni)
1303 PL_parser->last_uni = buf + last_uni_pos;
1304 if (PL_parser->last_lop)
1305 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1306 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1307 PL_curstash != PL_debstash) {
1308 /* debugger active and we're not compiling the debugger code,
1309 * so store the line into the debugger's array of lines
1310 */
1311 update_debugger_info(NULL, buf+old_bufend_pos,
1312 new_bufend_pos-old_bufend_pos);
1313 }
1314 return got_some;
1315}
1316
1317/*
1318=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1319
1320Looks ahead one (Unicode) character in the text currently being lexed.
1321Returns the codepoint (unsigned integer value) of the next character,
1322or -1 if lexing has reached the end of the input text. To consume the
1323peeked character, use L</lex_read_unichar>.
1324
1325If the next character is in (or extends into) the next chunk of input
1326text, the next chunk will be read in. Normally the current chunk will be
1327discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1328then the current chunk will not be discarded.
1329
1330If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1331is encountered, an exception is generated.
1332
1333=cut
1334*/
1335
1336I32
1337Perl_lex_peek_unichar(pTHX_ U32 flags)
1338{
749123ff 1339 dVAR;
f0e67a1d
Z
1340 char *s, *bufend;
1341 if (flags & ~(LEX_KEEP_PREVIOUS))
1342 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1343 s = PL_parser->bufptr;
1344 bufend = PL_parser->bufend;
1345 if (UTF) {
1346 U8 head;
1347 I32 unichar;
1348 STRLEN len, retlen;
1349 if (s == bufend) {
1350 if (!lex_next_chunk(flags))
1351 return -1;
1352 s = PL_parser->bufptr;
1353 bufend = PL_parser->bufend;
1354 }
1355 head = (U8)*s;
1356 if (!(head & 0x80))
1357 return head;
1358 if (head & 0x40) {
1359 len = PL_utf8skip[head];
1360 while ((STRLEN)(bufend-s) < len) {
1361 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1362 break;
1363 s = PL_parser->bufptr;
1364 bufend = PL_parser->bufend;
1365 }
1366 }
1367 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1368 if (retlen == (STRLEN)-1) {
1369 /* malformed UTF-8 */
1370 ENTER;
1371 SAVESPTR(PL_warnhook);
1372 PL_warnhook = PERL_WARNHOOK_FATAL;
1373 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1374 LEAVE;
1375 }
1376 return unichar;
1377 } else {
1378 if (s == bufend) {
1379 if (!lex_next_chunk(flags))
1380 return -1;
1381 s = PL_parser->bufptr;
1382 }
1383 return (U8)*s;
1384 }
1385}
1386
1387/*
1388=for apidoc Amx|I32|lex_read_unichar|U32 flags
1389
1390Reads the next (Unicode) character in the text currently being lexed.
1391Returns the codepoint (unsigned integer value) of the character read,
1392and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1393if lexing has reached the end of the input text. To non-destructively
1394examine the next character, use L</lex_peek_unichar> instead.
1395
1396If the next character is in (or extends into) the next chunk of input
1397text, the next chunk will be read in. Normally the current chunk will be
1398discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1399then the current chunk will not be discarded.
1400
1401If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1402is encountered, an exception is generated.
1403
1404=cut
1405*/
1406
1407I32
1408Perl_lex_read_unichar(pTHX_ U32 flags)
1409{
1410 I32 c;
1411 if (flags & ~(LEX_KEEP_PREVIOUS))
1412 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1413 c = lex_peek_unichar(flags);
1414 if (c != -1) {
1415 if (c == '\n')
1416 CopLINE_inc(PL_curcop);
1417 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1418 }
1419 return c;
1420}
1421
1422/*
1423=for apidoc Amx|void|lex_read_space|U32 flags
1424
1425Reads optional spaces, in Perl style, in the text currently being
1426lexed. The spaces may include ordinary whitespace characters and
1427Perl-style comments. C<#line> directives are processed if encountered.
1428L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1429at a non-space character (or the end of the input text).
1430
1431If spaces extend into the next chunk of input text, the next chunk will
1432be read in. Normally the current chunk will be discarded at the same
1433time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1434chunk will not be discarded.
1435
1436=cut
1437*/
1438
f0998909
Z
1439#define LEX_NO_NEXT_CHUNK 0x80000000
1440
f0e67a1d
Z
1441void
1442Perl_lex_read_space(pTHX_ U32 flags)
1443{
1444 char *s, *bufend;
1445 bool need_incline = 0;
f0998909 1446 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1447 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1448#ifdef PERL_MAD
1449 if (PL_skipwhite) {
1450 sv_free(PL_skipwhite);
1451 PL_skipwhite = NULL;
1452 }
1453 if (PL_madskills)
1454 PL_skipwhite = newSVpvs("");
1455#endif /* PERL_MAD */
1456 s = PL_parser->bufptr;
1457 bufend = PL_parser->bufend;
1458 while (1) {
1459 char c = *s;
1460 if (c == '#') {
1461 do {
1462 c = *++s;
1463 } while (!(c == '\n' || (c == 0 && s == bufend)));
1464 } else if (c == '\n') {
1465 s++;
1466 PL_parser->linestart = s;
1467 if (s == bufend)
1468 need_incline = 1;
1469 else
1470 incline(s);
1471 } else if (isSPACE(c)) {
1472 s++;
1473 } else if (c == 0 && s == bufend) {
1474 bool got_more;
1475#ifdef PERL_MAD
1476 if (PL_madskills)
1477 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1478#endif /* PERL_MAD */
f0998909
Z
1479 if (flags & LEX_NO_NEXT_CHUNK)
1480 break;
f0e67a1d
Z
1481 PL_parser->bufptr = s;
1482 CopLINE_inc(PL_curcop);
1483 got_more = lex_next_chunk(flags);
1484 CopLINE_dec(PL_curcop);
1485 s = PL_parser->bufptr;
1486 bufend = PL_parser->bufend;
1487 if (!got_more)
1488 break;
1489 if (need_incline && PL_parser->rsfp) {
1490 incline(s);
1491 need_incline = 0;
1492 }
1493 } else {
1494 break;
1495 }
1496 }
1497#ifdef PERL_MAD
1498 if (PL_madskills)
1499 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1500#endif /* PERL_MAD */
1501 PL_parser->bufptr = s;
1502}
1503
1504/*
ffb4593c
NT
1505 * S_incline
1506 * This subroutine has nothing to do with tilting, whether at windmills
1507 * or pinball tables. Its name is short for "increment line". It
57843af0 1508 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1509 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1510 * # line 500 "foo.pm"
1511 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1512 */
1513
76e3520e 1514STATIC void
d9095cec 1515S_incline(pTHX_ const char *s)
463ee0b2 1516{
97aff369 1517 dVAR;
d9095cec
NC
1518 const char *t;
1519 const char *n;
1520 const char *e;
8818d409 1521 line_t line_num;
463ee0b2 1522
7918f24d
NC
1523 PERL_ARGS_ASSERT_INCLINE;
1524
57843af0 1525 CopLINE_inc(PL_curcop);
463ee0b2
LW
1526 if (*s++ != '#')
1527 return;
d4c19fe8
AL
1528 while (SPACE_OR_TAB(*s))
1529 s++;
73659bf1
GS
1530 if (strnEQ(s, "line", 4))
1531 s += 4;
1532 else
1533 return;
084592ab 1534 if (SPACE_OR_TAB(*s))
73659bf1 1535 s++;
4e553d73 1536 else
73659bf1 1537 return;
d4c19fe8
AL
1538 while (SPACE_OR_TAB(*s))
1539 s++;
463ee0b2
LW
1540 if (!isDIGIT(*s))
1541 return;
d4c19fe8 1542
463ee0b2
LW
1543 n = s;
1544 while (isDIGIT(*s))
1545 s++;
07714eb4 1546 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1547 return;
bf4acbe4 1548 while (SPACE_OR_TAB(*s))
463ee0b2 1549 s++;
73659bf1 1550 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1551 s++;
73659bf1
GS
1552 e = t + 1;
1553 }
463ee0b2 1554 else {
c35e046a
AL
1555 t = s;
1556 while (!isSPACE(*t))
1557 t++;
73659bf1 1558 e = t;
463ee0b2 1559 }
bf4acbe4 1560 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1561 e++;
1562 if (*e != '\n' && *e != '\0')
1563 return; /* false alarm */
1564
8818d409
FC
1565 line_num = atoi(n)-1;
1566
f4dd75d9 1567 if (t - s > 0) {
d9095cec 1568 const STRLEN len = t - s;
19bad673
NC
1569 SV *const temp_sv = CopFILESV(PL_curcop);
1570 const char *cf;
1571 STRLEN tmplen;
1572
1573 if (temp_sv) {
1574 cf = SvPVX(temp_sv);
1575 tmplen = SvCUR(temp_sv);
1576 } else {
1577 cf = NULL;
1578 tmplen = 0;
1579 }
1580
42d9b98d 1581 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1582 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1583 * to *{"::_<newfilename"} */
44867030
NC
1584 /* However, the long form of evals is only turned on by the
1585 debugger - usually they're "(eval %lu)" */
1586 char smallbuf[128];
1587 char *tmpbuf;
1588 GV **gvp;
d9095cec 1589 STRLEN tmplen2 = len;
798b63bc 1590 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1591 tmpbuf = smallbuf;
1592 else
2ae0db35 1593 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1594 tmpbuf[0] = '_';
1595 tmpbuf[1] = '<';
2ae0db35 1596 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1597 tmplen += 2;
8a5ee598
RGS
1598 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1599 if (gvp) {
44867030
NC
1600 char *tmpbuf2;
1601 GV *gv2;
1602
1603 if (tmplen2 + 2 <= sizeof smallbuf)
1604 tmpbuf2 = smallbuf;
1605 else
1606 Newx(tmpbuf2, tmplen2 + 2, char);
1607
1608 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1609 /* Either they malloc'd it, or we malloc'd it,
1610 so no prefix is present in ours. */
1611 tmpbuf2[0] = '_';
1612 tmpbuf2[1] = '<';
1613 }
1614
1615 memcpy(tmpbuf2 + 2, s, tmplen2);
1616 tmplen2 += 2;
1617
8a5ee598 1618 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1619 if (!isGV(gv2)) {
8a5ee598 1620 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1621 /* adjust ${"::_<newfilename"} to store the new file name */
1622 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
8818d409
FC
1623 /* The line number may differ. If that is the case,
1624 alias the saved lines that are in the array.
1625 Otherwise alias the whole array. */
1626 if (CopLINE(PL_curcop) == line_num) {
1627 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1628 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
1629 }
1630 else if (GvAV(*gvp)) {
1631 AV * const av = GvAV(*gvp);
1632 const I32 start = CopLINE(PL_curcop)+1;
1633 I32 items = AvFILLp(av) - start;
1634 if (items > 0) {
1635 AV * const av2 = GvAVn(gv2);
1636 SV **svp = AvARRAY(av) + start;
1637 I32 l = (I32)line_num+1;
1638 while (items--)
1639 av_store(av2, l++, SvREFCNT_inc(*svp++));
1640 }
1641 }
e5527e4b 1642 }
44867030
NC
1643
1644 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1645 }
e66cf94c 1646 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1647 }
05ec9bb3 1648 CopFILE_free(PL_curcop);
d9095cec 1649 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1650 }
8818d409 1651 CopLINE_set(PL_curcop, line_num);
463ee0b2
LW
1652}
1653
29595ff2 1654#ifdef PERL_MAD
cd81e915 1655/* skip space before PL_thistoken */
29595ff2
NC
1656
1657STATIC char *
1658S_skipspace0(pTHX_ register char *s)
1659{
7918f24d
NC
1660 PERL_ARGS_ASSERT_SKIPSPACE0;
1661
29595ff2
NC
1662 s = skipspace(s);
1663 if (!PL_madskills)
1664 return s;
cd81e915
NC
1665 if (PL_skipwhite) {
1666 if (!PL_thiswhite)
6b29d1f5 1667 PL_thiswhite = newSVpvs("");
cd81e915
NC
1668 sv_catsv(PL_thiswhite, PL_skipwhite);
1669 sv_free(PL_skipwhite);
1670 PL_skipwhite = 0;
1671 }
1672 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1673 return s;
1674}
1675
cd81e915 1676/* skip space after PL_thistoken */
29595ff2
NC
1677
1678STATIC char *
1679S_skipspace1(pTHX_ register char *s)
1680{
d4c19fe8 1681 const char *start = s;
29595ff2
NC
1682 I32 startoff = start - SvPVX(PL_linestr);
1683
7918f24d
NC
1684 PERL_ARGS_ASSERT_SKIPSPACE1;
1685
29595ff2
NC
1686 s = skipspace(s);
1687 if (!PL_madskills)
1688 return s;
1689 start = SvPVX(PL_linestr) + startoff;
cd81e915 1690 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1691 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1692 PL_thistoken = newSVpvn(tstart, start - tstart);
1693 }
1694 PL_realtokenstart = -1;
1695 if (PL_skipwhite) {
1696 if (!PL_nextwhite)
6b29d1f5 1697 PL_nextwhite = newSVpvs("");
cd81e915
NC
1698 sv_catsv(PL_nextwhite, PL_skipwhite);
1699 sv_free(PL_skipwhite);
1700 PL_skipwhite = 0;
29595ff2
NC
1701 }
1702 return s;
1703}
1704
1705STATIC char *
1706S_skipspace2(pTHX_ register char *s, SV **svp)
1707{
c35e046a
AL
1708 char *start;
1709 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1710 const I32 startoff = s - SvPVX(PL_linestr);
1711
7918f24d
NC
1712 PERL_ARGS_ASSERT_SKIPSPACE2;
1713
29595ff2
NC
1714 s = skipspace(s);
1715 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1716 if (!PL_madskills || !svp)
1717 return s;
1718 start = SvPVX(PL_linestr) + startoff;
cd81e915 1719 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1720 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1721 PL_thistoken = newSVpvn(tstart, start - tstart);
1722 PL_realtokenstart = -1;
29595ff2 1723 }
cd81e915 1724 if (PL_skipwhite) {
29595ff2 1725 if (!*svp)
6b29d1f5 1726 *svp = newSVpvs("");
cd81e915
NC
1727 sv_setsv(*svp, PL_skipwhite);
1728 sv_free(PL_skipwhite);
1729 PL_skipwhite = 0;
29595ff2
NC
1730 }
1731
1732 return s;
1733}
1734#endif
1735
80a702cd 1736STATIC void
15f169a1 1737S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1738{
1739 AV *av = CopFILEAVx(PL_curcop);
1740 if (av) {
b9f83d2f 1741 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1742 if (orig_sv)
1743 sv_setsv(sv, orig_sv);
1744 else
1745 sv_setpvn(sv, buf, len);
80a702cd
RGS
1746 (void)SvIOK_on(sv);
1747 SvIV_set(sv, 0);
1748 av_store(av, (I32)CopLINE(PL_curcop), sv);
1749 }
1750}
1751
ffb4593c
NT
1752/*
1753 * S_skipspace
1754 * Called to gobble the appropriate amount and type of whitespace.
1755 * Skips comments as well.
1756 */
1757
76e3520e 1758STATIC char *
cea2e8a9 1759S_skipspace(pTHX_ register char *s)
a687059c 1760{
5db06880 1761#ifdef PERL_MAD
f0e67a1d
Z
1762 char *start = s;
1763#endif /* PERL_MAD */
7918f24d 1764 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1765#ifdef PERL_MAD
cd81e915
NC
1766 if (PL_skipwhite) {
1767 sv_free(PL_skipwhite);
f0e67a1d 1768 PL_skipwhite = NULL;
5db06880 1769 }
f0e67a1d 1770#endif /* PERL_MAD */
3280af22 1771 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1772 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1773 s++;
f0e67a1d
Z
1774 } else {
1775 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1776 PL_bufptr = s;
f0998909
Z
1777 lex_read_space(LEX_KEEP_PREVIOUS |
1778 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1779 LEX_NO_NEXT_CHUNK : 0));
3280af22 1780 s = PL_bufptr;
f0e67a1d
Z
1781 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1782 if (PL_linestart > PL_bufptr)
1783 PL_bufptr = PL_linestart;
1784 return s;
463ee0b2 1785 }
5db06880 1786#ifdef PERL_MAD
f0e67a1d
Z
1787 if (PL_madskills)
1788 PL_skipwhite = newSVpvn(start, s-start);
1789#endif /* PERL_MAD */
5db06880 1790 return s;
a687059c 1791}
378cc40b 1792
ffb4593c
NT
1793/*
1794 * S_check_uni
1795 * Check the unary operators to ensure there's no ambiguity in how they're
1796 * used. An ambiguous piece of code would be:
1797 * rand + 5
1798 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1799 * the +5 is its argument.
1800 */
1801
76e3520e 1802STATIC void
cea2e8a9 1803S_check_uni(pTHX)
ba106d47 1804{
97aff369 1805 dVAR;
d4c19fe8
AL
1806 const char *s;
1807 const char *t;
2f3197b3 1808
3280af22 1809 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1810 return;
3280af22
NIS
1811 while (isSPACE(*PL_last_uni))
1812 PL_last_uni++;
c35e046a
AL
1813 s = PL_last_uni;
1814 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1815 s++;
3280af22 1816 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1817 return;
6136c704 1818
9b387841
NC
1819 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1820 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1821 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1822}
1823
ffb4593c
NT
1824/*
1825 * LOP : macro to build a list operator. Its behaviour has been replaced
1826 * with a subroutine, S_lop() for which LOP is just another name.
1827 */
1828
a0d0e21e
LW
1829#define LOP(f,x) return lop(f,x,s)
1830
ffb4593c
NT
1831/*
1832 * S_lop
1833 * Build a list operator (or something that might be one). The rules:
1834 * - if we have a next token, then it's a list operator [why?]
1835 * - if the next thing is an opening paren, then it's a function
1836 * - else it's a list operator
1837 */
1838
76e3520e 1839STATIC I32
a0be28da 1840S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1841{
97aff369 1842 dVAR;
7918f24d
NC
1843
1844 PERL_ARGS_ASSERT_LOP;
1845
6154021b 1846 pl_yylval.ival = f;
35c8bce7 1847 CLINE;
3280af22
NIS
1848 PL_expect = x;
1849 PL_bufptr = s;
1850 PL_last_lop = PL_oldbufptr;
eb160463 1851 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1852#ifdef PERL_MAD
1853 if (PL_lasttoke)
78cdf107 1854 goto lstop;
5db06880 1855#else
3280af22 1856 if (PL_nexttoke)
78cdf107 1857 goto lstop;
5db06880 1858#endif
79072805 1859 if (*s == '(')
bbf60fe6 1860 return REPORT(FUNC);
29595ff2 1861 s = PEEKSPACE(s);
79072805 1862 if (*s == '(')
bbf60fe6 1863 return REPORT(FUNC);
78cdf107
Z
1864 else {
1865 lstop:
1866 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
1867 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
bbf60fe6 1868 return REPORT(LSTOP);
78cdf107 1869 }
79072805
LW
1870}
1871
5db06880
NC
1872#ifdef PERL_MAD
1873 /*
1874 * S_start_force
1875 * Sets up for an eventual force_next(). start_force(0) basically does
1876 * an unshift, while start_force(-1) does a push. yylex removes items
1877 * on the "pop" end.
1878 */
1879
1880STATIC void
1881S_start_force(pTHX_ int where)
1882{
1883 int i;
1884
cd81e915 1885 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1886 where = PL_lasttoke;
cd81e915
NC
1887 assert(PL_curforce < 0 || PL_curforce == where);
1888 if (PL_curforce != where) {
5db06880
NC
1889 for (i = PL_lasttoke; i > where; --i) {
1890 PL_nexttoke[i] = PL_nexttoke[i-1];
1891 }
1892 PL_lasttoke++;
1893 }
cd81e915 1894 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1895 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1896 PL_curforce = where;
1897 if (PL_nextwhite) {
5db06880 1898 if (PL_madskills)
6b29d1f5 1899 curmad('^', newSVpvs(""));
cd81e915 1900 CURMAD('_', PL_nextwhite);
5db06880
NC
1901 }
1902}
1903
1904STATIC void
1905S_curmad(pTHX_ char slot, SV *sv)
1906{
1907 MADPROP **where;
1908
1909 if (!sv)
1910 return;
cd81e915
NC
1911 if (PL_curforce < 0)
1912 where = &PL_thismad;
5db06880 1913 else
cd81e915 1914 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1915
cd81e915 1916 if (PL_faketokens)
76f68e9b 1917 sv_setpvs(sv, "");
5db06880
NC
1918 else {
1919 if (!IN_BYTES) {
1920 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1921 SvUTF8_on(sv);
1922 else if (PL_encoding) {
1923 sv_recode_to_utf8(sv, PL_encoding);
1924 }
1925 }
1926 }
1927
1928 /* keep a slot open for the head of the list? */
1929 if (slot != '_' && *where && (*where)->mad_key == '^') {
1930 (*where)->mad_key = slot;
daba3364 1931 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1932 (*where)->mad_val = (void*)sv;
1933 }
1934 else
1935 addmad(newMADsv(slot, sv), where, 0);
1936}
1937#else
b3f24c00
MHM
1938# define start_force(where) NOOP
1939# define curmad(slot, sv) NOOP
5db06880
NC
1940#endif
1941
ffb4593c
NT
1942/*
1943 * S_force_next
9cbb5ea2 1944 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1945 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1946 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1947 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1948 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1949 */
1950
4e553d73 1951STATIC void
cea2e8a9 1952S_force_next(pTHX_ I32 type)
79072805 1953{
97aff369 1954 dVAR;
704d4215
GG
1955#ifdef DEBUGGING
1956 if (DEBUG_T_TEST) {
1957 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1958 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1959 }
1960#endif
5db06880 1961#ifdef PERL_MAD
cd81e915 1962 if (PL_curforce < 0)
5db06880 1963 start_force(PL_lasttoke);
cd81e915 1964 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1965 if (PL_lex_state != LEX_KNOWNEXT)
1966 PL_lex_defer = PL_lex_state;
1967 PL_lex_state = LEX_KNOWNEXT;
1968 PL_lex_expect = PL_expect;
cd81e915 1969 PL_curforce = -1;
5db06880 1970#else
3280af22
NIS
1971 PL_nexttype[PL_nexttoke] = type;
1972 PL_nexttoke++;
1973 if (PL_lex_state != LEX_KNOWNEXT) {
1974 PL_lex_defer = PL_lex_state;
1975 PL_lex_expect = PL_expect;
1976 PL_lex_state = LEX_KNOWNEXT;
79072805 1977 }
5db06880 1978#endif
79072805
LW
1979}
1980
28ac2b49
Z
1981void
1982Perl_yyunlex(pTHX)
1983{
a7aaec61
Z
1984 int yyc = PL_parser->yychar;
1985 if (yyc != YYEMPTY) {
1986 if (yyc) {
1987 start_force(-1);
1988 NEXTVAL_NEXTTOKE = PL_parser->yylval;
1989 if (yyc == '{'/*}*/ || yyc == HASHBRACK || yyc == '['/*]*/) {
78cdf107 1990 PL_lex_allbrackets--;
a7aaec61 1991 PL_lex_brackets--;
78cdf107
Z
1992 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
1993 } else if (yyc == '('/*)*/) {
1994 PL_lex_allbrackets--;
1995 yyc |= (2<<24);
a7aaec61
Z
1996 }
1997 force_next(yyc);
1998 }
28ac2b49
Z
1999 PL_parser->yychar = YYEMPTY;
2000 }
2001}
2002
d0a148a6 2003STATIC SV *
15f169a1 2004S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 2005{
97aff369 2006 dVAR;
740cce10 2007 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
2008 !IN_BYTES
2009 && UTF
2010 && !is_ascii_string((const U8*)start, len)
740cce10 2011 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
2012 return sv;
2013}
2014
ffb4593c
NT
2015/*
2016 * S_force_word
2017 * When the lexer knows the next thing is a word (for instance, it has
2018 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
2019 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2020 * lookahead.
ffb4593c
NT
2021 *
2022 * Arguments:
b1b65b59 2023 * char *start : buffer position (must be within PL_linestr)
02b34bbe 2024 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
2025 * int check_keyword : if true, Perl checks to make sure the word isn't
2026 * a keyword (do this if the word is a label, e.g. goto FOO)
2027 * int allow_pack : if true, : characters will also be allowed (require,
2028 * use, etc. do this)
9cbb5ea2 2029 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
2030 */
2031
76e3520e 2032STATIC char *
cea2e8a9 2033S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 2034{
97aff369 2035 dVAR;
463ee0b2
LW
2036 register char *s;
2037 STRLEN len;
4e553d73 2038
7918f24d
NC
2039 PERL_ARGS_ASSERT_FORCE_WORD;
2040
29595ff2 2041 start = SKIPSPACE1(start);
463ee0b2 2042 s = start;
7e2040f0 2043 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 2044 (allow_pack && *s == ':') ||
15f0808c 2045 (allow_initial_tick && *s == '\'') )
a0d0e21e 2046 {
3280af22 2047 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 2048 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 2049 return start;
cd81e915 2050 start_force(PL_curforce);
5db06880
NC
2051 if (PL_madskills)
2052 curmad('X', newSVpvn(start,s-start));
463ee0b2 2053 if (token == METHOD) {
29595ff2 2054 s = SKIPSPACE1(s);
463ee0b2 2055 if (*s == '(')
3280af22 2056 PL_expect = XTERM;
463ee0b2 2057 else {
3280af22 2058 PL_expect = XOPERATOR;
463ee0b2 2059 }
79072805 2060 }
e74e6b3d 2061 if (PL_madskills)
63575281 2062 curmad('g', newSVpvs( "forced" ));
9ded7720 2063 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
2064 = (OP*)newSVOP(OP_CONST,0,
2065 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 2066 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
2067 force_next(token);
2068 }
2069 return s;
2070}
2071
ffb4593c
NT
2072/*
2073 * S_force_ident
9cbb5ea2 2074 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
2075 * text only contains the "foo" portion. The first argument is a pointer
2076 * to the "foo", and the second argument is the type symbol to prefix.
2077 * Forces the next token to be a "WORD".
9cbb5ea2 2078 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2079 */
2080
76e3520e 2081STATIC void
bfed75c6 2082S_force_ident(pTHX_ register const char *s, int kind)
79072805 2083{
97aff369 2084 dVAR;
7918f24d
NC
2085
2086 PERL_ARGS_ASSERT_FORCE_IDENT;
2087
c35e046a 2088 if (*s) {
90e5519e
NC
2089 const STRLEN len = strlen(s);
2090 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2091 start_force(PL_curforce);
9ded7720 2092 NEXTVAL_NEXTTOKE.opval = o;
79072805 2093 force_next(WORD);
748a9306 2094 if (kind) {
11343788 2095 o->op_private = OPpCONST_ENTERED;
55497cff 2096 /* XXX see note in pp_entereval() for why we forgo typo
2097 warnings if the symbol must be introduced in an eval.
2098 GSAR 96-10-12 */
90e5519e
NC
2099 gv_fetchpvn_flags(s, len,
2100 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2101 : GV_ADD,
2102 kind == '$' ? SVt_PV :
2103 kind == '@' ? SVt_PVAV :
2104 kind == '%' ? SVt_PVHV :
a0d0e21e 2105 SVt_PVGV
90e5519e 2106 );
748a9306 2107 }
79072805
LW
2108 }
2109}
2110
1571675a
GS
2111NV
2112Perl_str_to_version(pTHX_ SV *sv)
2113{
2114 NV retval = 0.0;
2115 NV nshift = 1.0;
2116 STRLEN len;
cfd0369c 2117 const char *start = SvPV_const(sv,len);
9d4ba2ae 2118 const char * const end = start + len;
504618e9 2119 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2120
2121 PERL_ARGS_ASSERT_STR_TO_VERSION;
2122
1571675a 2123 while (start < end) {
ba210ebe 2124 STRLEN skip;
1571675a
GS
2125 UV n;
2126 if (utf)
9041c2e3 2127 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2128 else {
2129 n = *(U8*)start;
2130 skip = 1;
2131 }
2132 retval += ((NV)n)/nshift;
2133 start += skip;
2134 nshift *= 1000;
2135 }
2136 return retval;
2137}
2138
4e553d73 2139/*
ffb4593c
NT
2140 * S_force_version
2141 * Forces the next token to be a version number.
e759cc13
RGS
2142 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2143 * and if "guessing" is TRUE, then no new token is created (and the caller
2144 * must use an alternative parsing method).
ffb4593c
NT
2145 */
2146
76e3520e 2147STATIC char *
e759cc13 2148S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2149{
97aff369 2150 dVAR;
5f66b61c 2151 OP *version = NULL;
44dcb63b 2152 char *d;
5db06880
NC
2153#ifdef PERL_MAD
2154 I32 startoff = s - SvPVX(PL_linestr);
2155#endif
89bfa8cd 2156
7918f24d
NC
2157 PERL_ARGS_ASSERT_FORCE_VERSION;
2158
29595ff2 2159 s = SKIPSPACE1(s);
89bfa8cd 2160
44dcb63b 2161 d = s;
dd629d5b 2162 if (*d == 'v')
44dcb63b 2163 d++;
44dcb63b 2164 if (isDIGIT(*d)) {
e759cc13
RGS
2165 while (isDIGIT(*d) || *d == '_' || *d == '.')
2166 d++;
5db06880
NC
2167#ifdef PERL_MAD
2168 if (PL_madskills) {
cd81e915 2169 start_force(PL_curforce);
5db06880
NC
2170 curmad('X', newSVpvn(s,d-s));
2171 }
2172#endif
4e4da3ac 2173 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
dd629d5b 2174 SV *ver;
8d08d9ba
DG
2175#ifdef USE_LOCALE_NUMERIC
2176 char *loc = setlocale(LC_NUMERIC, "C");
2177#endif
6154021b 2178 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2179#ifdef USE_LOCALE_NUMERIC
2180 setlocale(LC_NUMERIC, loc);
2181#endif
6154021b 2182 version = pl_yylval.opval;
dd629d5b
GS
2183 ver = cSVOPx(version)->op_sv;
2184 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2185 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2186 SvNV_set(ver, str_to_version(ver));
1571675a 2187 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2188 }
89bfa8cd 2189 }
5db06880
NC
2190 else if (guessing) {
2191#ifdef PERL_MAD
2192 if (PL_madskills) {
cd81e915
NC
2193 sv_free(PL_nextwhite); /* let next token collect whitespace */
2194 PL_nextwhite = 0;
5db06880
NC
2195 s = SvPVX(PL_linestr) + startoff;
2196 }
2197#endif
e759cc13 2198 return s;
5db06880 2199 }
89bfa8cd 2200 }
2201
5db06880
NC
2202#ifdef PERL_MAD
2203 if (PL_madskills && !version) {
cd81e915
NC
2204 sv_free(PL_nextwhite); /* let next token collect whitespace */
2205 PL_nextwhite = 0;
5db06880
NC
2206 s = SvPVX(PL_linestr) + startoff;
2207 }
2208#endif
89bfa8cd 2209 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2210 start_force(PL_curforce);
9ded7720 2211 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2212 force_next(WORD);
89bfa8cd 2213
e759cc13 2214 return s;
89bfa8cd 2215}
2216
ffb4593c 2217/*
91152fc1
DG
2218 * S_force_strict_version
2219 * Forces the next token to be a version number using strict syntax rules.
2220 */
2221
2222STATIC char *
2223S_force_strict_version(pTHX_ char *s)
2224{
2225 dVAR;
2226 OP *version = NULL;
2227#ifdef PERL_MAD
2228 I32 startoff = s - SvPVX(PL_linestr);
2229#endif
2230 const char *errstr = NULL;
2231
2232 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2233
2234 while (isSPACE(*s)) /* leading whitespace */
2235 s++;
2236
2237 if (is_STRICT_VERSION(s,&errstr)) {
2238 SV *ver = newSV(0);
2239 s = (char *)scan_version(s, ver, 0);
2240 version = newSVOP(OP_CONST, 0, ver);
2241 }
4e4da3ac
Z
2242 else if ( (*s != ';' && *s != '{' && *s != '}' ) &&
2243 (s = SKIPSPACE1(s), (*s != ';' && *s != '{' && *s != '}' )))
2244 {
91152fc1
DG
2245 PL_bufptr = s;
2246 if (errstr)
2247 yyerror(errstr); /* version required */
2248 return s;
2249 }
2250
2251#ifdef PERL_MAD
2252 if (PL_madskills && !version) {
2253 sv_free(PL_nextwhite); /* let next token collect whitespace */
2254 PL_nextwhite = 0;
2255 s = SvPVX(PL_linestr) + startoff;
2256 }
2257#endif
2258 /* NOTE: The parser sees the package name and the VERSION swapped */
2259 start_force(PL_curforce);
2260 NEXTVAL_NEXTTOKE.opval = version;
2261 force_next(WORD);
2262
2263 return s;
2264}
2265
2266/*
ffb4593c
NT
2267 * S_tokeq
2268 * Tokenize a quoted string passed in as an SV. It finds the next
2269 * chunk, up to end of string or a backslash. It may make a new
2270 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2271 * turns \\ into \.
2272 */
2273
76e3520e 2274STATIC SV *
cea2e8a9 2275S_tokeq(pTHX_ SV *sv)
79072805 2276{
97aff369 2277 dVAR;
79072805
LW
2278 register char *s;
2279 register char *send;
2280 register char *d;
b3ac6de7
IZ
2281 STRLEN len = 0;
2282 SV *pv = sv;
79072805 2283
7918f24d
NC
2284 PERL_ARGS_ASSERT_TOKEQ;
2285
79072805 2286 if (!SvLEN(sv))
b3ac6de7 2287 goto finish;
79072805 2288
a0d0e21e 2289 s = SvPV_force(sv, len);
21a311ee 2290 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2291 goto finish;
463ee0b2 2292 send = s + len;
dcb21ed6
NC
2293 /* This is relying on the SV being "well formed" with a trailing '\0' */
2294 while (s < send && !(*s == '\\' && s[1] == '\\'))
79072805
LW
2295 s++;
2296 if (s == send)
b3ac6de7 2297 goto finish;
79072805 2298 d = s;
be4731d2 2299 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2300 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2301 }
79072805
LW
2302 while (s < send) {
2303 if (*s == '\\') {
a0d0e21e 2304 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2305 s++; /* all that, just for this */
2306 }
2307 *d++ = *s++;
2308 }
2309 *d = '\0';
95a20fc0 2310 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2311 finish:
3280af22 2312 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2313 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2314 return sv;
2315}
2316
ffb4593c
NT
2317/*
2318 * Now come three functions related to double-quote context,
2319 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2320 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2321 * interact with PL_lex_state, and create fake ( ... ) argument lists
2322 * to handle functions and concatenation.
2323 * They assume that whoever calls them will be setting up a fake
2324 * join call, because each subthing puts a ',' after it. This lets
2325 * "lower \luPpEr"
2326 * become
2327 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2328 *
2329 * (I'm not sure whether the spurious commas at the end of lcfirst's
2330 * arguments and join's arguments are created or not).
2331 */
2332
2333/*
2334 * S_sublex_start
6154021b 2335 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2336 *
2337 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2338 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2339 *
2340 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2341 *
2342 * Everything else becomes a FUNC.
2343 *
2344 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2345 * had an OP_CONST or OP_READLINE). This just sets us up for a
2346 * call to S_sublex_push().
2347 */
2348
76e3520e 2349STATIC I32
cea2e8a9 2350S_sublex_start(pTHX)
79072805 2351{
97aff369 2352 dVAR;
6154021b 2353 register const I32 op_type = pl_yylval.ival;
79072805
LW
2354
2355 if (op_type == OP_NULL) {
6154021b 2356 pl_yylval.opval = PL_lex_op;
5f66b61c 2357 PL_lex_op = NULL;
79072805
LW
2358 return THING;
2359 }
2360 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2361 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2362
2363 if (SvTYPE(sv) == SVt_PVIV) {
2364 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2365 STRLEN len;
96a5add6 2366 const char * const p = SvPV_const(sv, len);
740cce10 2367 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2368 SvREFCNT_dec(sv);
2369 sv = nsv;
4e553d73 2370 }
6154021b 2371 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2372 PL_lex_stuff = NULL;
6f33ba73
RGS
2373 /* Allow <FH> // "foo" */
2374 if (op_type == OP_READLINE)
2375 PL_expect = XTERMORDORDOR;
79072805
LW
2376 return THING;
2377 }
e3f73d4e
RGS
2378 else if (op_type == OP_BACKTICK && PL_lex_op) {
2379 /* readpipe() vas overriden */
2380 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2381 pl_yylval.opval = PL_lex_op;
9b201d7d 2382 PL_lex_op = NULL;
e3f73d4e
RGS
2383 PL_lex_stuff = NULL;
2384 return THING;
2385 }
79072805 2386
3280af22 2387 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2388 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2389 PL_sublex_info.sub_op = PL_lex_op;
2390 PL_lex_state = LEX_INTERPPUSH;
55497cff 2391
3280af22
NIS
2392 PL_expect = XTERM;
2393 if (PL_lex_op) {
6154021b 2394 pl_yylval.opval = PL_lex_op;
5f66b61c 2395 PL_lex_op = NULL;
55497cff 2396 return PMFUNC;
2397 }
2398 else
2399 return FUNC;
2400}
2401
ffb4593c
NT
2402/*
2403 * S_sublex_push
2404 * Create a new scope to save the lexing state. The scope will be
2405 * ended in S_sublex_done. Returns a '(', starting the function arguments
2406 * to the uc, lc, etc. found before.
2407 * Sets PL_lex_state to LEX_INTERPCONCAT.
2408 */
2409
76e3520e 2410STATIC I32
cea2e8a9 2411S_sublex_push(pTHX)
55497cff 2412{
27da23d5 2413 dVAR;
f46d017c 2414 ENTER;
55497cff 2415
3280af22 2416 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2417 SAVEBOOL(PL_lex_dojoin);
3280af22 2418 SAVEI32(PL_lex_brackets);
78cdf107
Z
2419 SAVEI32(PL_lex_allbrackets);
2420 SAVEI8(PL_lex_fakeeof);
3280af22
NIS
2421 SAVEI32(PL_lex_casemods);
2422 SAVEI32(PL_lex_starts);
651b5b28 2423 SAVEI8(PL_lex_state);
7766f137 2424 SAVEVPTR(PL_lex_inpat);
98246f1e 2425 SAVEI16(PL_lex_inwhat);
57843af0 2426 SAVECOPLINE(PL_curcop);
3280af22 2427 SAVEPPTR(PL_bufptr);
8452ff4b 2428 SAVEPPTR(PL_bufend);
3280af22
NIS
2429 SAVEPPTR(PL_oldbufptr);
2430 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2431 SAVEPPTR(PL_last_lop);
2432 SAVEPPTR(PL_last_uni);
3280af22
NIS
2433 SAVEPPTR(PL_linestart);
2434 SAVESPTR(PL_linestr);
8edd5f42
RGS
2435 SAVEGENERICPV(PL_lex_brackstack);
2436 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2437
2438 PL_linestr = PL_lex_stuff;
a0714e2c 2439 PL_lex_stuff = NULL;
3280af22 2440
9cbb5ea2
GS
2441 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2442 = SvPVX(PL_linestr);
3280af22 2443 PL_bufend += SvCUR(PL_linestr);
bd61b366 2444 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2445 SAVEFREESV(PL_linestr);
2446
2447 PL_lex_dojoin = FALSE;
2448 PL_lex_brackets = 0;
78cdf107
Z
2449 PL_lex_allbrackets = 0;
2450 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
a02a5408
JC
2451 Newx(PL_lex_brackstack, 120, char);
2452 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2453 PL_lex_casemods = 0;
2454 *PL_lex_casestack = '\0';
2455 PL_lex_starts = 0;
2456 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2457 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2458
2459 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
bb16bae8 2460 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
3280af22
NIS
2461 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2462 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2463 else
5f66b61c 2464 PL_lex_inpat = NULL;
79072805 2465
55497cff 2466 return '(';
79072805
LW
2467}
2468
ffb4593c
NT
2469/*
2470 * S_sublex_done
2471 * Restores lexer state after a S_sublex_push.
2472 */
2473
76e3520e 2474STATIC I32
cea2e8a9 2475S_sublex_done(pTHX)
79072805 2476{
27da23d5 2477 dVAR;
3280af22 2478 if (!PL_lex_starts++) {
396482e1 2479 SV * const sv = newSVpvs("");
9aa983d2
JH
2480 if (SvUTF8(PL_linestr))
2481 SvUTF8_on(sv);
3280af22 2482 PL_expect = XOPERATOR;
6154021b 2483 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2484 return THING;
2485 }
2486
3280af22
NIS
2487 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2488 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2489 return yylex();
79072805
LW
2490 }
2491
ffb4593c 2492 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
bb16bae8 2493 assert(PL_lex_inwhat != OP_TRANSR);
3280af22
NIS
2494 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2495 PL_linestr = PL_lex_repl;
2496 PL_lex_inpat = 0;
2497 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2498 PL_bufend += SvCUR(PL_linestr);
bd61b366 2499 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2500 SAVEFREESV(PL_linestr);
2501 PL_lex_dojoin = FALSE;
2502 PL_lex_brackets = 0;
78cdf107
Z
2503 PL_lex_allbrackets = 0;
2504 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
3280af22
NIS
2505 PL_lex_casemods = 0;
2506 *PL_lex_casestack = '\0';
2507 PL_lex_starts = 0;
25da4f38 2508 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2509 PL_lex_state = LEX_INTERPNORMAL;
2510 PL_lex_starts++;
e9fa98b2
HS
2511 /* we don't clear PL_lex_repl here, so that we can check later
2512 whether this is an evalled subst; that means we rely on the
2513 logic to ensure sublex_done() is called again only via the
2514 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2515 }
e9fa98b2 2516 else {
3280af22 2517 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2518 PL_lex_repl = NULL;
e9fa98b2 2519 }
79072805 2520 return ',';
ffed7fef
LW
2521 }
2522 else {
5db06880
NC
2523#ifdef PERL_MAD
2524 if (PL_madskills) {
cd81e915
NC
2525 if (PL_thiswhite) {
2526 if (!PL_endwhite)
6b29d1f5 2527 PL_endwhite = newSVpvs("");
cd81e915
NC
2528 sv_catsv(PL_endwhite, PL_thiswhite);
2529 PL_thiswhite = 0;
2530 }
2531 if (PL_thistoken)
76f68e9b 2532 sv_setpvs(PL_thistoken,"");
5db06880 2533 else
cd81e915 2534 PL_realtokenstart = -1;
5db06880
NC
2535 }
2536#endif
f46d017c 2537 LEAVE;
3280af22
NIS
2538 PL_bufend = SvPVX(PL_linestr);
2539 PL_bufend += SvCUR(PL_linestr);
2540 PL_expect = XOPERATOR;
09bef843 2541 PL_sublex_info.sub_inwhat = 0;
79072805 2542 return ')';
ffed7fef
LW
2543 }
2544}
2545
02aa26ce
NT
2546/*
2547 scan_const
2548
2549 Extracts a pattern, double-quoted string, or transliteration. This
2550 is terrifying code.
2551
94def140 2552 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2553 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2554 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2555
94def140
TS
2556 Returns a pointer to the character scanned up to. If this is
2557 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2558 successfully parsed), will leave an OP for the substring scanned
6154021b 2559 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2560 by looking at the next characters herself.
2561
02aa26ce
NT
2562 In patterns:
2563 backslashes:
ff3f963a 2564 constants: \N{NAME} only
02aa26ce
NT
2565 case and quoting: \U \Q \E
2566 stops on @ and $, but not for $ as tail anchor
2567
2568 In transliterations:
2569 characters are VERY literal, except for - not at the start or end
94def140
TS
2570 of the string, which indicates a range. If the range is in bytes,
2571 scan_const expands the range to the full set of intermediate
2572 characters. If the range is in utf8, the hyphen is replaced with
2573 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2574
2575 In double-quoted strings:
2576 backslashes:
2577 double-quoted style: \r and \n
ff3f963a 2578 constants: \x31, etc.
94def140 2579 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2580 case and quoting: \U \Q \E
2581 stops on @ and $
2582
2583 scan_const does *not* construct ops to handle interpolated strings.
2584 It stops processing as soon as it finds an embedded $ or @ variable
2585 and leaves it to the caller to work out what's going on.
2586
94def140
TS
2587 embedded arrays (whether in pattern or not) could be:
2588 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2589
2590 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2591
2592 $ in pattern could be $foo or could be tail anchor. Assumption:
2593 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2594 followed by one of "()| \r\n\t"
02aa26ce
NT
2595
2596 \1 (backreferences) are turned into $1
2597
2598 The structure of the code is
2599 while (there's a character to process) {
94def140
TS
2600 handle transliteration ranges
2601 skip regexp comments /(?#comment)/ and codes /(?{code})/
2602 skip #-initiated comments in //x patterns
2603 check for embedded arrays
02aa26ce
NT
2604 check for embedded scalars
2605 if (backslash) {
94def140 2606 deprecate \1 in substitution replacements
02aa26ce
NT
2607 handle string-changing backslashes \l \U \Q \E, etc.
2608 switch (what was escaped) {
94def140 2609 handle \- in a transliteration (becomes a literal -)
ff3f963a 2610 if a pattern and not \N{, go treat as regular character
94def140
TS
2611 handle \132 (octal characters)
2612 handle \x15 and \x{1234} (hex characters)
ff3f963a 2613 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2614 handle \cV (control characters)
2615 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2616 } (end switch)
77a135fe 2617 continue
02aa26ce 2618 } (end if backslash)
77a135fe 2619 handle regular character
02aa26ce 2620 } (end while character to read)
4e553d73 2621
02aa26ce
NT
2622*/
2623
76e3520e 2624STATIC char *
cea2e8a9 2625S_scan_const(pTHX_ char *start)
79072805 2626{
97aff369 2627 dVAR;
3280af22 2628 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2629 SV *sv = newSV(send - start); /* sv for the constant. See
2630 note below on sizing. */
02aa26ce
NT
2631 register char *s = start; /* start of the constant */
2632 register char *d = SvPVX(sv); /* destination for copies */
2633 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2634 bool didrange = FALSE; /* did we just finish a range? */
b953e60c
KW
2635 bool has_utf8 = FALSE; /* Output constant is UTF8 */
2636 bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
77a135fe
KW
2637 to be UTF8? But, this can
2638 show as true when the source
2639 isn't utf8, as for example
2640 when it is entirely composed
2641 of hex constants */
2642
2643 /* Note on sizing: The scanned constant is placed into sv, which is
2644 * initialized by newSV() assuming one byte of output for every byte of
2645 * input. This routine expects newSV() to allocate an extra byte for a
2646 * trailing NUL, which this routine will append if it gets to the end of
2647 * the input. There may be more bytes of input than output (eg., \N{LATIN
2648 * CAPITAL LETTER A}), or more output than input if the constant ends up
2649 * recoded to utf8, but each time a construct is found that might increase
2650 * the needed size, SvGROW() is called. Its size parameter each time is
2651 * based on the best guess estimate at the time, namely the length used so
2652 * far, plus the length the current construct will occupy, plus room for
2653 * the trailing NUL, plus one byte for every input byte still unscanned */
2654
012bcf8d 2655 UV uv;
4c3a8340
TS
2656#ifdef EBCDIC
2657 UV literal_endpoint = 0;
e294cc5d 2658 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2659#endif
012bcf8d 2660
7918f24d
NC
2661 PERL_ARGS_ASSERT_SCAN_CONST;
2662
bb16bae8 2663 assert(PL_lex_inwhat != OP_TRANSR);
2b9d42f0
NIS
2664 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2665 /* If we are doing a trans and we know we want UTF8 set expectation */
2666 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2667 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2668 }
2669
2670
79072805 2671 while (s < send || dorange) {
ff3f963a 2672
02aa26ce 2673 /* get transliterations out of the way (they're most literal) */
3280af22 2674 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2675 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2676 if (dorange) {
1ba5c669
JH
2677 I32 i; /* current expanded character */
2678 I32 min; /* first character in range */
2679 I32 max; /* last character in range */
02aa26ce 2680
e294cc5d
JH
2681#ifdef EBCDIC
2682 UV uvmax = 0;
2683#endif
2684
2685 if (has_utf8
2686#ifdef EBCDIC
2687 && !native_range
2688#endif
2689 ) {
9d4ba2ae 2690 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2691 char *e = d++;
2692 while (e-- > c)
2693 *(e + 1) = *e;
25716404 2694 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2695 /* mark the range as done, and continue */
2696 dorange = FALSE;
2697 didrange = TRUE;
2698 continue;
2699 }
2b9d42f0 2700
95a20fc0 2701 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2702#ifdef EBCDIC
2703 SvGROW(sv,
2704 SvLEN(sv) + (has_utf8 ?
2705 (512 - UTF_CONTINUATION_MARK +
2706 UNISKIP(0x100))
2707 : 256));
2708 /* How many two-byte within 0..255: 128 in UTF-8,
2709 * 96 in UTF-8-mod. */
2710#else
9cbb5ea2 2711 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2712#endif
9cbb5ea2 2713 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2714#ifdef EBCDIC
2715 if (has_utf8) {
2716 int j;
2717 for (j = 0; j <= 1; j++) {
2718 char * const c = (char*)utf8_hop((U8*)d, -1);
2719 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2720 if (j)
2721 min = (U8)uv;
2722 else if (uv < 256)
2723 max = (U8)uv;
2724 else {
2725 max = (U8)0xff; /* only to \xff */
2726 uvmax = uv; /* \x{100} to uvmax */
2727 }
2728 d = c; /* eat endpoint chars */
2729 }
2730 }
2731 else {
2732#endif
2733 d -= 2; /* eat the first char and the - */
2734 min = (U8)*d; /* first char in range */
2735 max = (U8)d[1]; /* last char in range */
2736#ifdef EBCDIC
2737 }
2738#endif
8ada0baa 2739
c2e66d9e 2740 if (min > max) {
01ec43d0 2741 Perl_croak(aTHX_
d1573ac7 2742 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2743 (char)min, (char)max);
c2e66d9e
GS
2744 }
2745
c7f1f016 2746#ifdef EBCDIC
4c3a8340
TS
2747 if (literal_endpoint == 2 &&
2748 ((isLOWER(min) && isLOWER(max)) ||
2749 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2750 if (isLOWER(min)) {
2751 for (i = min; i <= max; i++)
2752 if (isLOWER(i))
db42d148 2753 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2754 } else {
2755 for (i = min; i <= max; i++)
2756 if (isUPPER(i))
db42d148 2757 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2758 }
2759 }
2760 else
2761#endif
2762 for (i = min; i <= max; i++)
e294cc5d
JH
2763#ifdef EBCDIC
2764 if (has_utf8) {
2765 const U8 ch = (U8)NATIVE_TO_UTF(i);
2766 if (UNI_IS_INVARIANT(ch))
2767 *d++ = (U8)i;
2768 else {
2769 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2770 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2771 }
2772 }
2773 else
2774#endif
2775 *d++ = (char)i;
2776
2777#ifdef EBCDIC
2778 if (uvmax) {
2779 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2780 if (uvmax > 0x101)
2781 *d++ = (char)UTF_TO_NATIVE(0xff);
2782 if (uvmax > 0x100)
2783 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2784 }
2785#endif
02aa26ce
NT
2786
2787 /* mark the range as done, and continue */
79072805 2788 dorange = FALSE;
01ec43d0 2789 didrange = TRUE;
4c3a8340
TS
2790#ifdef EBCDIC
2791 literal_endpoint = 0;
2792#endif
79072805 2793 continue;
4e553d73 2794 }
02aa26ce
NT
2795
2796 /* range begins (ignore - as first or last char) */
79072805 2797 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2798 if (didrange) {
1fafa243 2799 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2800 }
e294cc5d
JH
2801 if (has_utf8
2802#ifdef EBCDIC
2803 && !native_range
2804#endif
2805 ) {
25716404 2806 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2807 s++;
2808 continue;
2809 }
79072805
LW
2810 dorange = TRUE;
2811 s++;
01ec43d0
GS
2812 }
2813 else {
2814 didrange = FALSE;
4c3a8340
TS
2815#ifdef EBCDIC
2816 literal_endpoint = 0;
e294cc5d 2817 native_range = TRUE;
4c3a8340 2818#endif
01ec43d0 2819 }
79072805 2820 }
02aa26ce
NT
2821
2822 /* if we get here, we're not doing a transliteration */
2823
0f5d15d6
IZ
2824 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2825 except for the last char, which will be done separately. */
3280af22 2826 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2827 if (s[2] == '#') {
e994fd66 2828 while (s+1 < send && *s != ')')
db42d148 2829 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2830 }
2831 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2832 || (s[2] == '?' && s[3] == '{'))
155aba94 2833 {
cc6b7395 2834 I32 count = 1;
0f5d15d6 2835 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2836 char c;
2837
d9f97599
GS
2838 while (count && (c = *regparse)) {
2839 if (c == '\\' && regparse[1])
2840 regparse++;
4e553d73 2841 else if (c == '{')
cc6b7395 2842 count++;
4e553d73 2843 else if (c == '}')
cc6b7395 2844 count--;
d9f97599 2845 regparse++;
cc6b7395 2846 }
e994fd66 2847 if (*regparse != ')')
5bdf89e7 2848 regparse--; /* Leave one char for continuation. */
0f5d15d6 2849 while (s < regparse)
db42d148 2850 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2851 }
748a9306 2852 }
02aa26ce
NT
2853
2854 /* likewise skip #-initiated comments in //x patterns */
3280af22 2855 else if (*s == '#' && PL_lex_inpat &&
73134a2e 2856 ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) {
748a9306 2857 while (s+1 < send && *s != '\n')
db42d148 2858 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2859 }
02aa26ce 2860
5d1d4326 2861 /* check for embedded arrays
da6eedaa 2862 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2863 */
1749ea0d
TS
2864 else if (*s == '@' && s[1]) {
2865 if (isALNUM_lazy_if(s+1,UTF))
2866 break;
2867 if (strchr(":'{$", s[1]))
2868 break;
2869 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2870 break; /* in regexp, neither @+ nor @- are interpolated */
2871 }
02aa26ce
NT
2872
2873 /* check for embedded scalars. only stop if we're sure it's a
2874 variable.
2875 */
79072805 2876 else if (*s == '$') {
3280af22 2877 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2878 break;
77772344 2879 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2880 if (s[1] == '\\') {
2881 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2882 "Possible unintended interpolation of $\\ in regex");
77772344 2883 }
79072805 2884 break; /* in regexp, $ might be tail anchor */
77772344 2885 }
79072805 2886 }
02aa26ce 2887
2b9d42f0
NIS
2888 /* End of else if chain - OP_TRANS rejoin rest */
2889
02aa26ce 2890 /* backslashes */
79072805 2891 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2892 char* e; /* Can be used for ending '}', etc. */
2893
79072805 2894 s++;
02aa26ce 2895
7d0fc23c
KW
2896 /* warn on \1 - \9 in substitution replacements, but note that \11
2897 * is an octal; and \19 is \1 followed by '9' */
3280af22 2898 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2899 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2900 {
a2a5de95 2901 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2902 *--s = '$';
2903 break;
2904 }
02aa26ce
NT
2905
2906 /* string-change backslash escapes */
3280af22 2907 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2908 --s;
2909 break;
2910 }
ff3f963a
KW
2911 /* In a pattern, process \N, but skip any other backslash escapes.
2912 * This is because we don't want to translate an escape sequence
2913 * into a meta symbol and have the regex compiler use the meta
2914 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2915 * in spite of this, we do have to process \N here while the proper
2916 * charnames handler is in scope. See bugs #56444 and #62056.
2917 * There is a complication because \N in a pattern may also stand
2918 * for 'match a non-nl', and not mean a charname, in which case its
2919 * processing should be deferred to the regex compiler. To be a
2920 * charname it must be followed immediately by a '{', and not look
2921 * like \N followed by a curly quantifier, i.e., not something like
2922 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2923 * quantifier */
2924 else if (PL_lex_inpat
2925 && (*s != 'N'
2926 || s[1] != '{'
2927 || regcurly(s + 1)))
2928 {
cc74c5bd
TS
2929 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2930 goto default_action;
2931 }
02aa26ce 2932
79072805 2933 switch (*s) {
02aa26ce
NT
2934
2935 /* quoted - in transliterations */
79072805 2936 case '-':
3280af22 2937 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2938 *d++ = *s++;
2939 continue;
2940 }
2941 /* FALL THROUGH */
2942 default:
11b8faa4 2943 {
a2a5de95
NC
2944 if ((isALPHA(*s) || isDIGIT(*s)))
2945 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2946 "Unrecognized escape \\%c passed through",
2947 *s);
11b8faa4 2948 /* default action is to copy the quoted character */
f9a63242 2949 goto default_action;
11b8faa4 2950 }
02aa26ce 2951
632403cc 2952 /* eg. \132 indicates the octal constant 0132 */
79072805
LW
2953 case '0': case '1': case '2': case '3':
2954 case '4': case '5': case '6': case '7':
ba210ebe 2955 {
53305cf1
NC
2956 I32 flags = 0;
2957 STRLEN len = 3;
77a135fe 2958 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2959 s += len;
2960 }
012bcf8d 2961 goto NUM_ESCAPE_INSERT;
02aa26ce 2962
f0a2b745
KW
2963 /* eg. \o{24} indicates the octal constant \024 */
2964 case 'o':
2965 {
2966 STRLEN len;
454155d9 2967 const char* error;
f0a2b745 2968
454155d9 2969 bool valid = grok_bslash_o(s, &uv, &len, &error, 1);
f0a2b745 2970 s += len;
454155d9 2971 if (! valid) {
f0a2b745
KW
2972 yyerror(error);
2973 continue;
2974 }
2975 goto NUM_ESCAPE_INSERT;
2976 }
2977
77a135fe 2978 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2979 case 'x':
a0ed51b3
LW
2980 ++s;
2981 if (*s == '{') {
9d4ba2ae 2982 char* const e = strchr(s, '}');
a4c04bdc
NC
2983 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2984 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2985 STRLEN len;
355860ce 2986
53305cf1 2987 ++s;
adaeee49 2988 if (!e) {
a0ed51b3 2989 yyerror("Missing right brace on \\x{}");
355860ce 2990 continue;
ba210ebe 2991 }
53305cf1 2992 len = e - s;
77a135fe 2993 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2994 s = e + 1;
a0ed51b3
LW
2995 }
2996 else {
ba210ebe 2997 {
53305cf1 2998 STRLEN len = 2;
a4c04bdc 2999 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 3000 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
3001 s += len;
3002 }
012bcf8d
GS
3003 }
3004
3005 NUM_ESCAPE_INSERT:
ff3f963a
KW
3006 /* Insert oct or hex escaped character. There will always be
3007 * enough room in sv since such escapes will be longer than any
3008 * UTF-8 sequence they can end up as, except if they force us
3009 * to recode the rest of the string into utf8 */
ba7cea30 3010
77a135fe 3011 /* Here uv is the ordinal of the next character being added in
ff3f963a 3012 * unicode (converted from native). */
77a135fe 3013 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 3014 if (!has_utf8 && uv > 255) {
77a135fe
KW
3015 /* Might need to recode whatever we have accumulated so
3016 * far if it contains any chars variant in utf8 or
3017 * utf-ebcdic. */
3018
3019 SvCUR_set(sv, d - SvPVX_const(sv));
3020 SvPOK_on(sv);
3021 *d = '\0';
77a135fe 3022 /* See Note on sizing above. */
7bf79863
KW
3023 sv_utf8_upgrade_flags_grow(sv,
3024 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3025 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
3026 d = SvPVX(sv) + SvCUR(sv);
3027 has_utf8 = TRUE;
012bcf8d
GS
3028 }
3029
77a135fe
KW
3030 if (has_utf8) {
3031 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
3032 if (PL_lex_inwhat == OP_TRANS &&
3033 PL_sublex_info.sub_op) {
3034 PL_sublex_info.sub_op->op_private |=
3035 (PL_lex_repl ? OPpTRANS_FROM_UTF
3036 : OPpTRANS_TO_UTF);
f9a63242 3037 }
e294cc5d
JH
3038#ifdef EBCDIC
3039 if (uv > 255 && !dorange)
3040 native_range = FALSE;
3041#endif
012bcf8d 3042 }
a0ed51b3 3043 else {
012bcf8d 3044 *d++ = (char)uv;
a0ed51b3 3045 }
012bcf8d
GS
3046 }
3047 else {
c4d5f83a 3048 *d++ = (char) uv;
a0ed51b3 3049 }
79072805 3050 continue;
02aa26ce 3051
4a2d328f 3052 case 'N':
ff3f963a
KW
3053 /* In a non-pattern \N must be a named character, like \N{LATIN
3054 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
3055 * mean to match a non-newline. For non-patterns, named
3056 * characters are converted to their string equivalents. In
3057 * patterns, named characters are not converted to their
3058 * ultimate forms for the same reasons that other escapes
3059 * aren't. Instead, they are converted to the \N{U+...} form
3060 * to get the value from the charnames that is in effect right
3061 * now, while preserving the fact that it was a named character
3062 * so that the regex compiler knows this */
3063
3064 /* This section of code doesn't generally use the
3065 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
3066 * a close examination of this macro and determined it is a
3067 * no-op except on utfebcdic variant characters. Every
3068 * character generated by this that would normally need to be
3069 * enclosed by this macro is invariant, so the macro is not
7538f724
KW
3070 * needed, and would complicate use of copy(). XXX There are
3071 * other parts of this file where the macro is used
3072 * inconsistently, but are saved by it being a no-op */
ff3f963a
KW
3073
3074 /* The structure of this section of code (besides checking for
3075 * errors and upgrading to utf8) is:
3076 * Further disambiguate between the two meanings of \N, and if
3077 * not a charname, go process it elsewhere
0a96133f
KW
3078 * If of form \N{U+...}, pass it through if a pattern;
3079 * otherwise convert to utf8
3080 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
3081 * pattern; otherwise convert to utf8 */
ff3f963a
KW
3082
3083 /* Here, s points to the 'N'; the test below is guaranteed to
3084 * succeed if we are being called on a pattern as we already
3085 * know from a test above that the next character is a '{'.
3086 * On a non-pattern \N must mean 'named sequence, which
3087 * requires braces */
3088 s++;
3089 if (*s != '{') {
3090 yyerror("Missing braces on \\N{}");
3091 continue;
3092 }
3093 s++;
3094
0a96133f 3095 /* If there is no matching '}', it is an error. */
ff3f963a
KW
3096 if (! (e = strchr(s, '}'))) {
3097 if (! PL_lex_inpat) {
5777a3f7 3098 yyerror("Missing right brace on \\N{}");
0a96133f
KW
3099 } else {
3100 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 3101 }
0a96133f 3102 continue;
ff3f963a 3103 }
cddc7ef4 3104
ff3f963a 3105 /* Here it looks like a named character */
cddc7ef4 3106
ff3f963a
KW
3107 if (PL_lex_inpat) {
3108
3109 /* XXX This block is temporary code. \N{} implies that the
3110 * pattern is to have Unicode semantics, and therefore
3111 * currently has to be encoded in utf8. By putting it in
3112 * utf8 now, we save a whole pass in the regular expression
3113 * compiler. Once that code is changed so Unicode
3114 * semantics doesn't necessarily have to be in utf8, this
da3a4baf
KW
3115 * block should be removed. However, the code that parses
3116 * the output of this would have to be changed to not
3117 * necessarily expect utf8 */
ff3f963a 3118 if (!has_utf8) {
77a135fe 3119 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3120 SvPOK_on(sv);
e4f3eed8 3121 *d = '\0';
77a135fe 3122 /* See Note on sizing above. */
7bf79863 3123 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3124 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3125 /* 5 = '\N{' + cur char + NUL */
3126 (STRLEN)(send - s) + 5);
f08d6ad9 3127 d = SvPVX(sv) + SvCUR(sv);
89491803 3128 has_utf8 = TRUE;
ff3f963a
KW
3129 }
3130 }
423cee85 3131
ff3f963a
KW
3132 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3133 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3134 | PERL_SCAN_DISALLOW_PREFIX;
3135 STRLEN len;
3136
3137 /* For \N{U+...}, the '...' is a unicode value even on
3138 * EBCDIC machines */
3139 s += 2; /* Skip to next char after the 'U+' */
3140 len = e - s;
3141 uv = grok_hex(s, &len, &flags, NULL);
3142 if (len == 0 || len != (STRLEN)(e - s)) {
3143 yyerror("Invalid hexadecimal number in \\N{U+...}");
3144 s = e + 1;
3145 continue;
3146 }
3147
3148 if (PL_lex_inpat) {
3149
e2a7e165
KW
3150 /* On non-EBCDIC platforms, pass through to the regex
3151 * compiler unchanged. The reason we evaluated the
3152 * number above is to make sure there wasn't a syntax
3153 * error. But on EBCDIC we convert to native so
3154 * downstream code can continue to assume it's native
3155 */
ff3f963a 3156 s -= 5; /* Include the '\N{U+' */
e2a7e165
KW
3157#ifdef EBCDIC
3158 d += my_snprintf(d, e - s + 1 + 1, /* includes the }
3159 and the \0 */
3160 "\\N{U+%X}",
3161 (unsigned int) UNI_TO_NATIVE(uv));
3162#else
ff3f963a
KW
3163 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3164 d += e - s + 1;
e2a7e165 3165#endif
ff3f963a
KW
3166 }
3167 else { /* Not a pattern: convert the hex to string */
3168
3169 /* If destination is not in utf8, unconditionally
3170 * recode it to be so. This is because \N{} implies
3171 * Unicode semantics, and scalars have to be in utf8
3172 * to guarantee those semantics */
3173 if (! has_utf8) {
3174 SvCUR_set(sv, d - SvPVX_const(sv));
3175 SvPOK_on(sv);
3176 *d = '\0';
3177 /* See Note on sizing above. */
3178 sv_utf8_upgrade_flags_grow(
3179 sv,
3180 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3181 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3182 d = SvPVX(sv) + SvCUR(sv);
3183 has_utf8 = TRUE;
3184 }
3185
3186 /* Add the string to the output */
3187 if (UNI_IS_INVARIANT(uv)) {
3188 *d++ = (char) uv;
3189 }
3190 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3191 }
3192 }
3193 else { /* Here is \N{NAME} but not \N{U+...}. */
3194
3195 SV *res; /* result from charnames */
3196 const char *str; /* the string in 'res' */
3197 STRLEN len; /* its length */
3198
3199 /* Get the value for NAME */
3200 res = newSVpvn(s, e - s);
3201 res = new_constant( NULL, 0, "charnames",
3202 /* includes all of: \N{...} */
3203 res, NULL, s - 3, e - s + 4 );
3204
3205 /* Most likely res will be in utf8 already since the
3206 * standard charnames uses pack U, but a custom translator
3207 * can leave it otherwise, so make sure. XXX This can be
3208 * revisited to not have charnames use utf8 for characters
3209 * that don't need it when regexes don't have to be in utf8
3210 * for Unicode semantics. If doing so, remember EBCDIC */
3211 sv_utf8_upgrade(res);
3212 str = SvPV_const(res, len);
3213
3214 /* Don't accept malformed input */
3215 if (! is_utf8_string((U8 *) str, len)) {
3216 yyerror("Malformed UTF-8 returned by \\N");
3217 }
3218 else if (PL_lex_inpat) {
3219
3220 if (! len) { /* The name resolved to an empty string */
3221 Copy("\\N{}", d, 4, char);
3222 d += 4;
3223 }
3224 else {
3225 /* In order to not lose information for the regex
3226 * compiler, pass the result in the specially made
3227 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3228 * the code points in hex of each character
3229 * returned by charnames */
3230
3231 const char *str_end = str + len;
3232 STRLEN char_length; /* cur char's byte length */
3233 STRLEN output_length; /* and the number of bytes
3234 after this is translated
3235 into hex digits */
3236 const STRLEN off = d - SvPVX_const(sv);
3237
3238 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3239 * max('U+', '.'); and 1 for NUL */
3240 char hex_string[2 * UTF8_MAXBYTES + 5];
3241
3242 /* Get the first character of the result. */
3243 U32 uv = utf8n_to_uvuni((U8 *) str,
3244 len,
3245 &char_length,
3246 UTF8_ALLOW_ANYUV);
3247
3248 /* The call to is_utf8_string() above hopefully
3249 * guarantees that there won't be an error. But
3250 * it's easy here to make sure. The function just
3251 * above warns and returns 0 if invalid utf8, but
3252 * it can also return 0 if the input is validly a
3253 * NUL. Disambiguate */
3254 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3255 uv = UNICODE_REPLACEMENT;
3256 }
3257
3258 /* Convert first code point to hex, including the
e2a7e165
KW
3259 * boiler plate before it. For all these, we
3260 * convert to native format so that downstream code
3261 * can continue to assume the input is native */
78c35590 3262 output_length =
3353de27 3263 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3264 "\\N{U+%X",
3265 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3266
3267 /* Make sure there is enough space to hold it */
3268 d = off + SvGROW(sv, off
3269 + output_length
3270 + (STRLEN)(send - e)
3271 + 2); /* '}' + NUL */
3272 /* And output it */
3273 Copy(hex_string, d, output_length, char);
3274 d += output_length;
3275
3276 /* For each subsequent character, append dot and
3277 * its ordinal in hex */
3278 while ((str += char_length) < str_end) {
3279 const STRLEN off = d - SvPVX_const(sv);
3280 U32 uv = utf8n_to_uvuni((U8 *) str,
3281 str_end - str,
3282 &char_length,
3283 UTF8_ALLOW_ANYUV);
3284 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3285 uv = UNICODE_REPLACEMENT;
3286 }
3287
78c35590 3288 output_length =
3353de27 3289 my_snprintf(hex_string, sizeof(hex_string),
e2a7e165
KW
3290 ".%X",
3291 (unsigned int) UNI_TO_NATIVE(uv));
ff3f963a
KW
3292
3293 d = off + SvGROW(sv, off
3294 + output_length
3295 + (STRLEN)(send - e)
3296 + 2); /* '}' + NUL */
3297 Copy(hex_string, d, output_length, char);
3298 d += output_length;
3299 }
3300
3301 *d++ = '}'; /* Done. Add the trailing brace */
3302 }
3303 }
3304 else { /* Here, not in a pattern. Convert the name to a
3305 * string. */
3306
3307 /* If destination is not in utf8, unconditionally
3308 * recode it to be so. This is because \N{} implies
3309 * Unicode semantics, and scalars have to be in utf8
3310 * to guarantee those semantics */
3311 if (! has_utf8) {
3312 SvCUR_set(sv, d - SvPVX_const(sv));
3313 SvPOK_on(sv);
3314 *d = '\0';
3315 /* See Note on sizing above. */
3316 sv_utf8_upgrade_flags_grow(sv,
3317 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3318 len + (STRLEN)(send - s) + 1);
3319 d = SvPVX(sv) + SvCUR(sv);
3320 has_utf8 = TRUE;
3321 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3322
3323 /* See Note on sizing above. (NOTE: SvCUR() is not
3324 * set correctly here). */
3325 const STRLEN off = d - SvPVX_const(sv);
3326 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3327 }
3328 Copy(str, d, len, char);
3329 d += len;
423cee85 3330 }
423cee85 3331 SvREFCNT_dec(res);
cb233ae3
KW
3332
3333 /* Deprecate non-approved name syntax */
3334 if (ckWARN_d(WARN_DEPRECATED)) {
3335 bool problematic = FALSE;
3336 char* i = s;
3337
3338 /* For non-ut8 input, look to see that the first
3339 * character is an alpha, then loop through the rest
3340 * checking that each is a continuation */
3341 if (! this_utf8) {
3342 if (! isALPHAU(*i)) problematic = TRUE;
3343 else for (i = s + 1; i < e; i++) {
3344 if (isCHARNAME_CONT(*i)) continue;
3345 problematic = TRUE;
3346 break;
3347 }
3348 }
3349 else {
3350 /* Similarly for utf8. For invariants can check
3351 * directly. We accept anything above the latin1
3352 * range because it is immaterial to Perl if it is
3353 * correct or not, and is expensive to check. But
3354 * it is fairly easy in the latin1 range to convert
3355 * the variants into a single character and check
3356 * those */
3357 if (UTF8_IS_INVARIANT(*i)) {
3358 if (! isALPHAU(*i)) problematic = TRUE;
3359 } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
81c14aa2 3360 if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
cb233ae3
KW
3361 *(i+1)))))
3362 {
3363 problematic = TRUE;
3364 }
3365 }
3366 if (! problematic) for (i = s + UTF8SKIP(s);
3367 i < e;
3368 i+= UTF8SKIP(i))
3369 {
3370 if (UTF8_IS_INVARIANT(*i)) {
3371 if (isCHARNAME_CONT(*i)) continue;
3372 } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) {
3373 continue;
3374 } else if (isCHARNAME_CONT(
3375 UNI_TO_NATIVE(
81c14aa2 3376 TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
cb233ae3
KW
3377 {
3378 continue;
3379 }
3380 problematic = TRUE;
3381 break;
3382 }
3383 }
3384 if (problematic) {
6e1bad6c
KW
3385 /* The e-i passed to the final %.*s makes sure that
3386 * should the trailing NUL be missing that this
3387 * print won't run off the end of the string */
cb233ae3 3388 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
b00fc8d4
NC
3389 "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s",
3390 (int)(i - s + 1), s, (int)(e - i), i + 1);
cb233ae3
KW
3391 }
3392 }
3393 } /* End \N{NAME} */
ff3f963a
KW
3394#ifdef EBCDIC
3395 if (!dorange)
3396 native_range = FALSE; /* \N{} is defined to be Unicode */
3397#endif
3398 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3399 continue;
3400
02aa26ce 3401 /* \c is a control character */
79072805
LW
3402 case 'c':
3403 s++;
961ce445 3404 if (s < send) {
17a3df4c 3405 *d++ = grok_bslash_c(*s++, has_utf8, 1);
ba210ebe 3406 }
961ce445
RGS
3407 else {
3408 yyerror("Missing control char name in \\c");
3409 }
79072805 3410 continue;
02aa26ce
NT
3411
3412 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3413 case 'b':
db42d148 3414 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3415 break;
3416 case 'n':
db42d148 3417 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3418 break;
3419 case 'r':
db42d148 3420 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3421 break;
3422 case 'f':
db42d148 3423 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3424 break;
3425 case 't':
db42d148 3426 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3427 break;
34a3fe2a 3428 case 'e':
db42d148 3429 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3430 break;
3431 case 'a':
db42d148 3432 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3433 break;
02aa26ce
NT
3434 } /* end switch */
3435
79072805
LW
3436 s++;
3437 continue;
02aa26ce 3438 } /* end if (backslash) */
4c3a8340
TS
3439#ifdef EBCDIC
3440 else
3441 literal_endpoint++;
3442#endif
02aa26ce 3443
f9a63242 3444 default_action:
77a135fe
KW
3445 /* If we started with encoded form, or already know we want it,
3446 then encode the next character */
3447 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3448 STRLEN len = 1;
77a135fe
KW
3449
3450
3451 /* One might think that it is wasted effort in the case of the
3452 * source being utf8 (this_utf8 == TRUE) to take the next character
3453 * in the source, convert it to an unsigned value, and then convert
3454 * it back again. But the source has not been validated here. The
3455 * routine that does the conversion checks for errors like
3456 * malformed utf8 */
3457
5f66b61c
AL
3458 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3459 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3460 if (!has_utf8) {
3461 SvCUR_set(sv, d - SvPVX_const(sv));
3462 SvPOK_on(sv);
3463 *d = '\0';
77a135fe 3464 /* See Note on sizing above. */
7bf79863
KW
3465 sv_utf8_upgrade_flags_grow(sv,
3466 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3467 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3468 d = SvPVX(sv) + SvCUR(sv);
3469 has_utf8 = TRUE;
3470 } else if (need > len) {
3471 /* encoded value larger than old, may need extra space (NOTE:
3472 * SvCUR() is not set correctly here). See Note on sizing
3473 * above. */
9d4ba2ae 3474 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3475 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3476 }
77a135fe
KW
3477 s += len;
3478
5f66b61c 3479 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3480#ifdef EBCDIC
3481 if (uv > 255 && !dorange)
3482 native_range = FALSE;
3483#endif
2b9d42f0
NIS
3484 }
3485 else {
3486 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3487 }
02aa26ce
NT
3488 } /* while loop to process each character */
3489
3490 /* terminate the string and set up the sv */
79072805 3491 *d = '\0';
95a20fc0 3492 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3493 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3494 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3495
79072805 3496 SvPOK_on(sv);
9f4817db 3497 if (PL_encoding && !has_utf8) {
d0063567
DK
3498 sv_recode_to_utf8(sv, PL_encoding);
3499 if (SvUTF8(sv))
3500 has_utf8 = TRUE;
9f4817db 3501 }
2b9d42f0 3502 if (has_utf8) {
7e2040f0 3503 SvUTF8_on(sv);
2b9d42f0 3504 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3505 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3506 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3507 }
3508 }
79072805 3509
02aa26ce 3510 /* shrink the sv if we allocated more than we used */
79072805 3511 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3512 SvPV_shrink_to_cur(sv);
79072805 3513 }
02aa26ce 3514
6154021b 3515 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3516 if (s > PL_bufptr) {
eb0d8d16
NC
3517 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3518 const char *const key = PL_lex_inpat ? "qr" : "q";
3519 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3520 const char *type;
3521 STRLEN typelen;
3522
3523 if (PL_lex_inwhat == OP_TRANS) {
3524 type = "tr";
3525 typelen = 2;
3526 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3527 type = "s";
3528 typelen = 1;
3529 } else {
3530 type = "qq";
3531 typelen = 2;
3532 }
3533
3534 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3535 type, typelen);
3536 }
6154021b 3537 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3538 } else
8990e307 3539 SvREFCNT_dec(sv);
79072805
LW
3540 return s;
3541}
3542
ffb4593c
NT
3543/* S_intuit_more
3544 * Returns TRUE if there's more to the expression (e.g., a subscript),
3545 * FALSE otherwise.
ffb4593c
NT
3546 *
3547 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3548 *
3549 * ->[ and ->{ return TRUE
3550 * { and [ outside a pattern are always subscripts, so return TRUE
3551 * if we're outside a pattern and it's not { or [, then return FALSE
3552 * if we're in a pattern and the first char is a {
3553 * {4,5} (any digits around the comma) returns FALSE
3554 * if we're in a pattern and the first char is a [
3555 * [] returns FALSE
3556 * [SOMETHING] has a funky algorithm to decide whether it's a
3557 * character class or not. It has to deal with things like
3558 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3559 * anything else returns TRUE
3560 */
3561
9cbb5ea2
GS
3562/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3563
76e3520e 3564STATIC int
cea2e8a9 3565S_intuit_more(pTHX_ register char *s)
79072805 3566{
97aff369 3567 dVAR;
7918f24d
NC
3568
3569 PERL_ARGS_ASSERT_INTUIT_MORE;
3570
3280af22 3571 if (PL_lex_brackets)
79072805
LW
3572 return TRUE;
3573 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3574 return TRUE;
3575 if (*s != '{' && *s != '[')
3576 return FALSE;
3280af22 3577 if (!PL_lex_inpat)
79072805
LW
3578 return TRUE;
3579
3580 /* In a pattern, so maybe we have {n,m}. */
3581 if (*s == '{') {
b3155d95 3582 if (regcurly(s)) {
79072805 3583 return FALSE;
b3155d95 3584 }
79072805 3585 return TRUE;
79072805
LW
3586 }
3587
3588 /* On the other hand, maybe we have a character class */
3589
3590 s++;
3591 if (*s == ']' || *s == '^')
3592 return FALSE;
3593 else {
ffb4593c 3594 /* this is terrifying, and it works */
79072805
LW
3595 int weight = 2; /* let's weigh the evidence */
3596 char seen[256];
f27ffc4a 3597 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3598 const char * const send = strchr(s,']');
3280af22 3599 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3600
3601 if (!send) /* has to be an expression */
3602 return TRUE;
3603
3604 Zero(seen,256,char);
3605 if (*s == '$')
3606 weight -= 3;
3607 else if (isDIGIT(*s)) {
3608 if (s[1] != ']') {
3609 if (isDIGIT(s[1]) && s[2] == ']')
3610 weight -= 10;
3611 }
3612 else
3613 weight -= 100;
3614 }
3615 for (; s < send; s++) {
3616 last_un_char = un_char;
3617 un_char = (unsigned char)*s;
3618 switch (*s) {
3619 case '@':
3620 case '&':
3621 case '$':
3622 weight -= seen[un_char] * 10;
7e2040f0 3623 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3624 int len;
8903cb82 3625 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3626 len = (int)strlen(tmpbuf);
3627 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3628 weight -= 100;
3629 else
3630 weight -= 10;
3631 }
3632 else if (*s == '$' && s[1] &&
93a17b20
LW
3633 strchr("[#!%*<>()-=",s[1])) {
3634 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3635 weight -= 10;
3636 else
3637 weight -= 1;
3638 }
3639 break;
3640 case '\\':
3641 un_char = 254;
3642 if (s[1]) {
93a17b20 3643 if (strchr("wds]",s[1]))
79072805 3644 weight += 100;
10edeb5d 3645 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3646 weight += 1;
93a17b20 3647 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3648 weight += 40;
3649 else if (isDIGIT(s[1])) {
3650 weight += 40;
3651 while (s[1] && isDIGIT(s[1]))
3652 s++;
3653 }
3654 }
3655 else
3656 weight += 100;
3657 break;
3658 case '-':
3659 if (s[1] == '\\')
3660 weight += 50;
93a17b20 3661 if (strchr("aA01! ",last_un_char))
79072805 3662 weight += 30;
93a17b20 3663 if (strchr("zZ79~",s[1]))
79072805 3664 weight += 30;
f27ffc4a
GS
3665 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3666 weight -= 5; /* cope with negative subscript */
79072805
LW
3667 break;
3668 default:
3792a11b
NC
3669 if (!isALNUM(last_un_char)
3670 && !(last_un_char == '$' || last_un_char == '@'
3671 || last_un_char == '&')
3672 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3673 char *d = tmpbuf;
3674 while (isALPHA(*s))
3675 *d++ = *s++;
3676 *d = '\0';
5458a98a 3677 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3678 weight -= 150;
3679 }
3680 if (un_char == last_un_char + 1)
3681 weight += 5;
3682 weight -= seen[un_char];
3683 break;
3684 }
3685 seen[un_char]++;
3686 }
3687 if (weight >= 0) /* probably a character class */
3688 return FALSE;
3689 }
3690
3691 return TRUE;
3692}
ffed7fef 3693
ffb4593c
NT
3694/*
3695 * S_intuit_method
3696 *
3697 * Does all the checking to disambiguate
3698 * foo bar
3699 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3700 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3701 *
3702 * First argument is the stuff after the first token, e.g. "bar".
3703 *
3704 * Not a method if bar is a filehandle.
3705 * Not a method if foo is a subroutine prototyped to take a filehandle.
3706 * Not a method if it's really "Foo $bar"
3707 * Method if it's "foo $bar"
3708 * Not a method if it's really "print foo $bar"
3709 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3710 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3711 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3712 * =>
3713 */
3714
76e3520e 3715STATIC int
62d55b22 3716S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3717{
97aff369 3718 dVAR;
a0d0e21e 3719 char *s = start + (*start == '$');
3280af22 3720 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3721 STRLEN len;
3722 GV* indirgv;
5db06880
NC
3723#ifdef PERL_MAD
3724 int soff;
3725#endif
a0d0e21e 3726
7918f24d
NC
3727 PERL_ARGS_ASSERT_INTUIT_METHOD;
3728
a0d0e21e 3729 if (gv) {
62d55b22 3730 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3731 return 0;
62d55b22
NC
3732 if (cv) {
3733 if (SvPOK(cv)) {
3734 const char *proto = SvPVX_const(cv);
3735 if (proto) {
3736 if (*proto == ';')
3737 proto++;
3738 if (*proto == '*')
3739 return 0;
3740 }
b6c543e3
IZ
3741 }
3742 } else
c35e046a 3743 gv = NULL;
a0d0e21e 3744 }
8903cb82 3745 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3746 /* start is the beginning of the possible filehandle/object,
3747 * and s is the end of it
3748 * tmpbuf is a copy of it
3749 */
3750
a0d0e21e 3751 if (*start == '$') {
3ef1310e
RGS
3752 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3753 isUPPER(*PL_tokenbuf))
a0d0e21e 3754 return 0;
5db06880
NC
3755#ifdef PERL_MAD
3756 len = start - SvPVX(PL_linestr);
3757#endif
29595ff2 3758 s = PEEKSPACE(s);
f0092767 3759#ifdef PERL_MAD
5db06880
NC
3760 start = SvPVX(PL_linestr) + len;
3761#endif
3280af22
NIS
3762 PL_bufptr = start;
3763 PL_expect = XREF;
a0d0e21e
LW
3764 return *s == '(' ? FUNCMETH : METHOD;
3765 }
5458a98a 3766 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3767 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3768 len -= 2;
3769 tmpbuf[len] = '\0';
5db06880
NC
3770#ifdef PERL_MAD
3771 soff = s - SvPVX(PL_linestr);
3772#endif
c3e0f903
GS
3773 goto bare_package;
3774 }
90e5519e 3775 indirgv = gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PVCV);
8ebc5c01 3776 if (indirgv && GvCVu(indirgv))
a0d0e21e
LW
3777 return 0;
3778 /* filehandle or package name makes it a method */