This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
PATCH: [perl #56444] delayed interpolation of \N{...}
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
acdf0a21
DM
48/* YYINITDEPTH -- initial size of the parser's stacks. */
49#define YYINITDEPTH 200
50
199e78b7
DM
51/* XXX temporary backwards compatibility */
52#define PL_lex_brackets (PL_parser->lex_brackets)
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
9059aa12
LW
129#define XFAKEBRACK 128
130#define XENUMMASK 127
131
39e02b42
JH
132#ifdef USE_UTF8_SCRIPTS
133# define UTF (!IN_BYTES)
2b9d42f0 134#else
746b446a 135# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 136#endif
a0ed51b3 137
b1fc3636
CJ
138/* The maximum number of characters preceding the unrecognized one to display */
139#define UNRECOGNIZED_PRECEDE_COUNT 10
140
61f0cdd9 141/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
142 * 1999-02-27 mjd-perl-patch@plover.com */
143#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
144
bf4acbe4 145#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 146
ffb4593c
NT
147/* LEX_* are values for PL_lex_state, the state of the lexer.
148 * They are arranged oddly so that the guard on the switch statement
79072805
LW
149 * can get by with a single comparison (if the compiler is smart enough).
150 */
151
fb73857a
PP
152/* #define LEX_NOTPARSING 11 is done in perl.h. */
153
b6007c36
DM
154#define LEX_NORMAL 10 /* normal code (ie not within "...") */
155#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
156#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
157#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
158#define LEX_INTERPSTART 6 /* expecting the start of a $var */
159
160 /* at end of code, eg "$x" followed by: */
161#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
162#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
163
164#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
165 string or after \E, $foo, etc */
166#define LEX_INTERPCONST 2 /* NOT USED */
167#define LEX_FORMLINE 1 /* expecting a format line */
168#define LEX_KNOWNEXT 0 /* next token known; just return it */
169
79072805 170
bbf60fe6 171#ifdef DEBUGGING
27da23d5 172static const char* const lex_state_names[] = {
bbf60fe6
DM
173 "KNOWNEXT",
174 "FORMLINE",
175 "INTERPCONST",
176 "INTERPCONCAT",
177 "INTERPENDMAYBE",
178 "INTERPEND",
179 "INTERPSTART",
180 "INTERPPUSH",
181 "INTERPCASEMOD",
182 "INTERPNORMAL",
183 "NORMAL"
184};
185#endif
186
79072805
LW
187#ifdef ff_next
188#undef ff_next
d48672a2
LW
189#endif
190
79072805 191#include "keywords.h"
fe14fcc3 192
ffb4593c
NT
193/* CLINE is a macro that ensures PL_copline has a sane value */
194
ae986130
LW
195#ifdef CLINE
196#undef CLINE
197#endif
57843af0 198#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 199
5db06880 200#ifdef PERL_MAD
29595ff2
NC
201# define SKIPSPACE0(s) skipspace0(s)
202# define SKIPSPACE1(s) skipspace1(s)
203# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
204# define PEEKSPACE(s) skipspace2(s,0)
205#else
206# define SKIPSPACE0(s) skipspace(s)
207# define SKIPSPACE1(s) skipspace(s)
208# define SKIPSPACE2(s,tsv) skipspace(s)
209# define PEEKSPACE(s) skipspace(s)
210#endif
211
ffb4593c
NT
212/*
213 * Convenience functions to return different tokens and prime the
9cbb5ea2 214 * lexer for the next token. They all take an argument.
ffb4593c
NT
215 *
216 * TOKEN : generic token (used for '(', DOLSHARP, etc)
217 * OPERATOR : generic operator
218 * AOPERATOR : assignment operator
219 * PREBLOCK : beginning the block after an if, while, foreach, ...
220 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
221 * PREREF : *EXPR where EXPR is not a simple identifier
222 * TERM : expression term
223 * LOOPX : loop exiting command (goto, last, dump, etc)
224 * FTST : file test operator
225 * FUN0 : zero-argument function
2d2e263d 226 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
227 * BOop : bitwise or or xor
228 * BAop : bitwise and
229 * SHop : shift operator
230 * PWop : power operator
9cbb5ea2 231 * PMop : pattern-matching operator
ffb4593c
NT
232 * Aop : addition-level operator
233 * Mop : multiplication-level operator
234 * Eop : equality-testing operator
e5edeb50 235 * Rop : relational operator <= != gt
ffb4593c
NT
236 *
237 * Also see LOP and lop() below.
238 */
239
998054bd 240#ifdef DEBUGGING /* Serve -DT. */
704d4215 241# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 242#else
bbf60fe6 243# define REPORT(retval) (retval)
998054bd
SC
244#endif
245
bbf60fe6
DM
246#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
247#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
248#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
249#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
250#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
251#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
252#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
253#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
254#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
255#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
256#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
257#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
258#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
259#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
260#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
261#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
262#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
263#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
264#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
265#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 266
a687059c
LW
267/* This bit of chicanery makes a unary function followed by
268 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
269 * The UNIDOR macro is for unary functions that can be followed by the //
270 * operator (such as C<shift // 0>).
a687059c 271 */
376fcdbf 272#define UNI2(f,x) { \
6154021b 273 pl_yylval.ival = f; \
376fcdbf
AL
274 PL_expect = x; \
275 PL_bufptr = s; \
276 PL_last_uni = PL_oldbufptr; \
277 PL_last_lop_op = f; \
278 if (*s == '(') \
279 return REPORT( (int)FUNC1 ); \
29595ff2 280 s = PEEKSPACE(s); \
376fcdbf
AL
281 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
282 }
6f33ba73
RGS
283#define UNI(f) UNI2(f,XTERM)
284#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 285
376fcdbf 286#define UNIBRACK(f) { \
6154021b 287 pl_yylval.ival = f; \
376fcdbf
AL
288 PL_bufptr = s; \
289 PL_last_uni = PL_oldbufptr; \
290 if (*s == '(') \
291 return REPORT( (int)FUNC1 ); \
29595ff2 292 s = PEEKSPACE(s); \
376fcdbf
AL
293 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
294 }
79072805 295
9f68db38 296/* grandfather return to old style */
6154021b 297#define OLDLOP(f) return(pl_yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP)
79072805 298
8fa7f367
JH
299#ifdef DEBUGGING
300
6154021b 301/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
302enum token_type {
303 TOKENTYPE_NONE,
304 TOKENTYPE_IVAL,
6154021b 305 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
306 TOKENTYPE_PVAL,
307 TOKENTYPE_OPVAL,
308 TOKENTYPE_GVVAL
309};
310
6d4a66ac
NC
311static struct debug_tokens {
312 const int token;
313 enum token_type type;
314 const char *name;
315} const debug_tokens[] =
9041c2e3 316{
bbf60fe6
DM
317 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
318 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
319 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
320 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
321 { ARROW, TOKENTYPE_NONE, "ARROW" },
322 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
323 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
324 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
325 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
326 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 327 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
328 { DO, TOKENTYPE_NONE, "DO" },
329 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
330 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
331 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
332 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
333 { ELSE, TOKENTYPE_NONE, "ELSE" },
334 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
335 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
336 { FOR, TOKENTYPE_IVAL, "FOR" },
337 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
338 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
339 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
340 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
341 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
342 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 343 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
344 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
345 { IF, TOKENTYPE_IVAL, "IF" },
346 { LABEL, TOKENTYPE_PVAL, "LABEL" },
347 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
348 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
349 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
350 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
351 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
352 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
353 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
354 { MY, TOKENTYPE_IVAL, "MY" },
355 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
356 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
357 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
358 { OROP, TOKENTYPE_IVAL, "OROP" },
359 { OROR, TOKENTYPE_NONE, "OROR" },
360 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
361 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
362 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
363 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
364 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
365 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
366 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
367 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
368 { PREINC, TOKENTYPE_NONE, "PREINC" },
369 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
370 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
371 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
372 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
373 { SUB, TOKENTYPE_NONE, "SUB" },
374 { THING, TOKENTYPE_OPVAL, "THING" },
375 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
376 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
377 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
378 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
379 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
380 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 381 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
382 { WHILE, TOKENTYPE_IVAL, "WHILE" },
383 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 384 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 385 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
386};
387
6154021b 388/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 389
bbf60fe6 390STATIC int
704d4215 391S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 392{
97aff369 393 dVAR;
7918f24d
NC
394
395 PERL_ARGS_ASSERT_TOKEREPORT;
396
bbf60fe6 397 if (DEBUG_T_TEST) {
bd61b366 398 const char *name = NULL;
bbf60fe6 399 enum token_type type = TOKENTYPE_NONE;
f54cb97a 400 const struct debug_tokens *p;
396482e1 401 SV* const report = newSVpvs("<== ");
bbf60fe6 402
f54cb97a 403 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
404 if (p->token == (int)rv) {
405 name = p->name;
406 type = p->type;
407 break;
408 }
409 }
410 if (name)
54667de8 411 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
412 else if ((char)rv > ' ' && (char)rv < '~')
413 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
414 else if (!rv)
396482e1 415 sv_catpvs(report, "EOF");
bbf60fe6
DM
416 else
417 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
418 switch (type) {
419 case TOKENTYPE_NONE:
420 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
421 break;
422 case TOKENTYPE_IVAL:
704d4215 423 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
424 break;
425 case TOKENTYPE_OPNUM:
426 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 427 PL_op_name[lvalp->ival]);
bbf60fe6
DM
428 break;
429 case TOKENTYPE_PVAL:
704d4215 430 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
431 break;
432 case TOKENTYPE_OPVAL:
704d4215 433 if (lvalp->opval) {
401441c0 434 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
435 PL_op_name[lvalp->opval->op_type]);
436 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 437 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 438 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
439 }
440
441 }
401441c0 442 else
396482e1 443 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
444 break;
445 }
b6007c36 446 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
447 };
448 return (int)rv;
998054bd
SC
449}
450
b6007c36
DM
451
452/* print the buffer with suitable escapes */
453
454STATIC void
15f169a1 455S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 456{
396482e1 457 SV* const tmp = newSVpvs("");
7918f24d
NC
458
459 PERL_ARGS_ASSERT_PRINTBUF;
460
b6007c36
DM
461 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
462 SvREFCNT_dec(tmp);
463}
464
8fa7f367
JH
465#endif
466
8290c323
NC
467static int
468S_deprecate_commaless_var_list(pTHX) {
469 PL_expect = XTERM;
470 deprecate("comma-less variable list");
471 return REPORT(','); /* grandfather non-comma-format format */
472}
473
ffb4593c
NT
474/*
475 * S_ao
476 *
c963b151
BD
477 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
478 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
479 */
480
76e3520e 481STATIC int
cea2e8a9 482S_ao(pTHX_ int toketype)
a0d0e21e 483{
97aff369 484 dVAR;
3280af22
NIS
485 if (*PL_bufptr == '=') {
486 PL_bufptr++;
a0d0e21e 487 if (toketype == ANDAND)
6154021b 488 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 489 else if (toketype == OROR)
6154021b 490 pl_yylval.ival = OP_ORASSIGN;
c963b151 491 else if (toketype == DORDOR)
6154021b 492 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
493 toketype = ASSIGNOP;
494 }
495 return toketype;
496}
497
ffb4593c
NT
498/*
499 * S_no_op
500 * When Perl expects an operator and finds something else, no_op
501 * prints the warning. It always prints "<something> found where
502 * operator expected. It prints "Missing semicolon on previous line?"
503 * if the surprise occurs at the start of the line. "do you need to
504 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
505 * where the compiler doesn't know if foo is a method call or a function.
506 * It prints "Missing operator before end of line" if there's nothing
507 * after the missing operator, or "... before <...>" if there is something
508 * after the missing operator.
509 */
510
76e3520e 511STATIC void
15f169a1 512S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 513{
97aff369 514 dVAR;
9d4ba2ae
AL
515 char * const oldbp = PL_bufptr;
516 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 517
7918f24d
NC
518 PERL_ARGS_ASSERT_NO_OP;
519
1189a94a
GS
520 if (!s)
521 s = oldbp;
07c798fb 522 else
1189a94a 523 PL_bufptr = s;
cea2e8a9 524 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
525 if (ckWARN_d(WARN_SYNTAX)) {
526 if (is_first)
527 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
528 "\t(Missing semicolon on previous line?)\n");
529 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 530 const char *t;
c35e046a
AL
531 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
532 NOOP;
56da5a46
RGS
533 if (t < PL_bufptr && isSPACE(*t))
534 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
535 "\t(Do you need to predeclare %.*s?)\n",
551405c4 536 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
537 }
538 else {
539 assert(s >= oldbp);
540 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 541 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 542 }
07c798fb 543 }
3280af22 544 PL_bufptr = oldbp;
8990e307
LW
545}
546
ffb4593c
NT
547/*
548 * S_missingterm
549 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 550 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
551 * If we're in a delimited string and the delimiter is a control
552 * character, it's reformatted into a two-char sequence like ^C.
553 * This is fatal.
554 */
555
76e3520e 556STATIC void
cea2e8a9 557S_missingterm(pTHX_ char *s)
8990e307 558{
97aff369 559 dVAR;
8990e307
LW
560 char tmpbuf[3];
561 char q;
562 if (s) {
9d4ba2ae 563 char * const nl = strrchr(s,'\n');
d2719217 564 if (nl)
8990e307
LW
565 *nl = '\0';
566 }
463559e7 567 else if (isCNTRL(PL_multi_close)) {
8990e307 568 *tmpbuf = '^';
585ec06d 569 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
570 tmpbuf[2] = '\0';
571 s = tmpbuf;
572 }
573 else {
eb160463 574 *tmpbuf = (char)PL_multi_close;
8990e307
LW
575 tmpbuf[1] = '\0';
576 s = tmpbuf;
577 }
578 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 579 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 580}
79072805 581
ef89dcc3 582#define FEATURE_IS_ENABLED(name) \
0d863452 583 ((0 != (PL_hints & HINT_LOCALIZE_HH)) \
89529cee 584 && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name)))
4a731d7b 585/* The longest string we pass in. */
1863b879 586#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1)
4a731d7b 587
0d863452
RH
588/*
589 * S_feature_is_enabled
590 * Check whether the named feature is enabled.
591 */
592STATIC bool
15f169a1 593S_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 594{
97aff369 595 dVAR;
0d863452 596 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 597 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
598
599 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
600
4a731d7b
NC
601 assert(namelen <= MAX_FEATURE_LEN);
602 memcpy(&he_name[8], name, namelen);
d4c19fe8 603
7b9ef140 604 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
605}
606
ffb4593c 607/*
9cbb5ea2
GS
608 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
609 * utf16-to-utf8-reversed.
ffb4593c
NT
610 */
611
c39cd008
GS
612#ifdef PERL_CR_FILTER
613static void
614strip_return(SV *sv)
615{
95a20fc0 616 register const char *s = SvPVX_const(sv);
9d4ba2ae 617 register const char * const e = s + SvCUR(sv);
7918f24d
NC
618
619 PERL_ARGS_ASSERT_STRIP_RETURN;
620
c39cd008
GS
621 /* outer loop optimized to do nothing if there are no CR-LFs */
622 while (s < e) {
623 if (*s++ == '\r' && *s == '\n') {
624 /* hit a CR-LF, need to copy the rest */
625 register char *d = s - 1;
626 *d++ = *s++;
627 while (s < e) {
628 if (*s == '\r' && s[1] == '\n')
629 s++;
630 *d++ = *s++;
631 }
632 SvCUR(sv) -= s - d;
633 return;
634 }
635 }
636}
a868473f 637
76e3520e 638STATIC I32
c39cd008 639S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 640{
f54cb97a 641 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
642 if (count > 0 && !maxlen)
643 strip_return(sv);
644 return count;
a868473f
NIS
645}
646#endif
647
199e78b7
DM
648
649
ffb4593c
NT
650/*
651 * Perl_lex_start
5486870f 652 *
e3abe207 653 * Create a parser object and initialise its parser and lexer fields
5486870f
DM
654 *
655 * rsfp is the opened file handle to read from (if any),
656 *
657 * line holds any initial content already read from the file (or in
658 * the case of no file, such as an eval, the whole contents);
659 *
660 * new_filter indicates that this is a new file and it shouldn't inherit
661 * the filters from the current parser (ie require).
ffb4593c
NT
662 */
663
a0d0e21e 664void
5486870f 665Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, bool new_filter)
79072805 666{
97aff369 667 dVAR;
6ef55633 668 const char *s = NULL;
8990e307 669 STRLEN len;
5486870f 670 yy_parser *parser, *oparser;
acdf0a21
DM
671
672 /* create and initialise a parser */
673
199e78b7 674 Newxz(parser, 1, yy_parser);
5486870f 675 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
676 PL_parser = parser;
677
678 Newx(parser->stack, YYINITDEPTH, yy_stack_frame);
679 parser->ps = parser->stack;
680 parser->stack_size = YYINITDEPTH;
681
682 parser->stack->state = 0;
683 parser->yyerrstatus = 0;
684 parser->yychar = YYEMPTY; /* Cause a token to be read. */
685
e3abe207
DM
686 /* on scope exit, free this parser and restore any outer one */
687 SAVEPARSER(parser);
7c4baf47 688 parser->saved_curcop = PL_curcop;
e3abe207 689
acdf0a21 690 /* initialise lexer state */
8990e307 691
fb205e7a
DM
692#ifdef PERL_MAD
693 parser->curforce = -1;
694#else
695 parser->nexttoke = 0;
696#endif
ca4cfd28 697 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 698 parser->copline = NOLINE;
5afb0a62 699 parser->lex_state = LEX_NORMAL;
c2598295 700 parser->expect = XSTATE;
2f9285f8 701 parser->rsfp = rsfp;
56b27c9a 702 parser->rsfp_filters = (new_filter || !oparser) ? newAV()
502c6561 703 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 704
199e78b7
DM
705 Newx(parser->lex_brackstack, 120, char);
706 Newx(parser->lex_casestack, 12, char);
707 *parser->lex_casestack = '\0';
02b34bbe 708
10efb74f
NC
709 if (line) {
710 s = SvPV_const(line, len);
711 } else {
712 len = 0;
713 }
bdc0bf6f 714
10efb74f 715 if (!len) {
bdc0bf6f 716 parser->linestr = newSVpvs("\n;");
10efb74f 717 } else if (SvREADONLY(line) || s[len-1] != ';') {
bdc0bf6f 718 parser->linestr = newSVsv(line);
10efb74f 719 if (s[len-1] != ';')
bdc0bf6f 720 sv_catpvs(parser->linestr, "\n;");
6c5ce11d
NC
721 } else {
722 SvTEMP_off(line);
723 SvREFCNT_inc_simple_void_NN(line);
bdc0bf6f 724 parser->linestr = line;
8990e307 725 }
f06b5848
DM
726 parser->oldoldbufptr =
727 parser->oldbufptr =
728 parser->bufptr =
729 parser->linestart = SvPVX(parser->linestr);
730 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
731 parser->last_lop = parser->last_uni = NULL;
79072805 732}
a687059c 733
e3abe207
DM
734
735/* delete a parser object */
736
737void
738Perl_parser_free(pTHX_ const yy_parser *parser)
739{
7918f24d
NC
740 PERL_ARGS_ASSERT_PARSER_FREE;
741
7c4baf47 742 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
743 SvREFCNT_dec(parser->linestr);
744
2f9285f8
DM
745 if (parser->rsfp == PerlIO_stdin())
746 PerlIO_clearerr(parser->rsfp);
799361c3
SH
747 else if (parser->rsfp && (!parser->old_parser ||
748 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 749 PerlIO_close(parser->rsfp);
5486870f 750 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 751
e3abe207
DM
752 Safefree(parser->stack);
753 Safefree(parser->lex_brackstack);
754 Safefree(parser->lex_casestack);
755 PL_parser = parser->old_parser;
756 Safefree(parser);
757}
758
759
ffb4593c
NT
760/*
761 * Perl_lex_end
9cbb5ea2
GS
762 * Finalizer for lexing operations. Must be called when the parser is
763 * done with the lexer.
ffb4593c
NT
764 */
765
463ee0b2 766void
864dbfa3 767Perl_lex_end(pTHX)
463ee0b2 768{
97aff369 769 dVAR;
3280af22 770 PL_doextract = FALSE;
463ee0b2
LW
771}
772
ffb4593c 773/*
f0e67a1d
Z
774=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
775
776Buffer scalar containing the chunk currently under consideration of the
777text currently being lexed. This is always a plain string scalar (for
778which C<SvPOK> is true). It is not intended to be used as a scalar by
779normal scalar means; instead refer to the buffer directly by the pointer
780variables described below.
781
782The lexer maintains various C<char*> pointers to things in the
783C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
784reallocated, all of these pointers must be updated. Don't attempt to
785do this manually, but rather use L</lex_grow_linestr> if you need to
786reallocate the buffer.
787
788The content of the text chunk in the buffer is commonly exactly one
789complete line of input, up to and including a newline terminator,
790but there are situations where it is otherwise. The octets of the
791buffer may be intended to be interpreted as either UTF-8 or Latin-1.
792The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
793flag on this scalar, which may disagree with it.
794
795For direct examination of the buffer, the variable
796L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
797lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
798of these pointers is usually preferable to examination of the scalar
799through normal scalar means.
800
801=for apidoc AmxU|char *|PL_parser-E<gt>bufend
802
803Direct pointer to the end of the chunk of text currently being lexed, the
804end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
805+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
806always located at the end of the buffer, and does not count as part of
807the buffer's contents.
808
809=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
810
811Points to the current position of lexing inside the lexer buffer.
812Characters around this point may be freely examined, within
813the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
814L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
815interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
816
817Lexing code (whether in the Perl core or not) moves this pointer past
818the characters that it consumes. It is also expected to perform some
819bookkeeping whenever a newline character is consumed. This movement
820can be more conveniently performed by the function L</lex_read_to>,
821which handles newlines appropriately.
822
823Interpretation of the buffer's octets can be abstracted out by
824using the slightly higher-level functions L</lex_peek_unichar> and
825L</lex_read_unichar>.
826
827=for apidoc AmxU|char *|PL_parser-E<gt>linestart
828
829Points to the start of the current line inside the lexer buffer.
830This is useful for indicating at which column an error occurred, and
831not much else. This must be updated by any lexing code that consumes
832a newline; the function L</lex_read_to> handles this detail.
833
834=cut
835*/
836
837/*
838=for apidoc Amx|bool|lex_bufutf8
839
840Indicates whether the octets in the lexer buffer
841(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
842of Unicode characters. If not, they should be interpreted as Latin-1
843characters. This is analogous to the C<SvUTF8> flag for scalars.
844
845In UTF-8 mode, it is not guaranteed that the lexer buffer actually
846contains valid UTF-8. Lexing code must be robust in the face of invalid
847encoding.
848
849The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
850is significant, but not the whole story regarding the input character
851encoding. Normally, when a file is being read, the scalar contains octets
852and its C<SvUTF8> flag is off, but the octets should be interpreted as
853UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
854however, the scalar may have the C<SvUTF8> flag on, and in this case its
855octets should be interpreted as UTF-8 unless the C<use bytes> pragma
856is in effect. This logic may change in the future; use this function
857instead of implementing the logic yourself.
858
859=cut
860*/
861
862bool
863Perl_lex_bufutf8(pTHX)
864{
865 return UTF;
866}
867
868/*
869=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
870
871Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
872at least I<len> octets (including terminating NUL). Returns a
873pointer to the reallocated buffer. This is necessary before making
874any direct modification of the buffer that would increase its length.
875L</lex_stuff_pvn> provides a more convenient way to insert text into
876the buffer.
877
878Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
879this function updates all of the lexer's variables that point directly
880into the buffer.
881
882=cut
883*/
884
885char *
886Perl_lex_grow_linestr(pTHX_ STRLEN len)
887{
888 SV *linestr;
889 char *buf;
890 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
891 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
892 linestr = PL_parser->linestr;
893 buf = SvPVX(linestr);
894 if (len <= SvLEN(linestr))
895 return buf;
896 bufend_pos = PL_parser->bufend - buf;
897 bufptr_pos = PL_parser->bufptr - buf;
898 oldbufptr_pos = PL_parser->oldbufptr - buf;
899 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
900 linestart_pos = PL_parser->linestart - buf;
901 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
902 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
903 buf = sv_grow(linestr, len);
904 PL_parser->bufend = buf + bufend_pos;
905 PL_parser->bufptr = buf + bufptr_pos;
906 PL_parser->oldbufptr = buf + oldbufptr_pos;
907 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
908 PL_parser->linestart = buf + linestart_pos;
909 if (PL_parser->last_uni)
910 PL_parser->last_uni = buf + last_uni_pos;
911 if (PL_parser->last_lop)
912 PL_parser->last_lop = buf + last_lop_pos;
913 return buf;
914}
915
916/*
917=for apidoc Amx|void|lex_stuff_pvn|char *pv|STRLEN len|U32 flags
918
919Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
920immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
921reallocating the buffer if necessary. This means that lexing code that
922runs later will see the characters as if they had appeared in the input.
923It is not recommended to do this as part of normal parsing, and most
924uses of this facility run the risk of the inserted characters being
925interpreted in an unintended manner.
926
927The string to be inserted is represented by I<len> octets starting
928at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
929according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
930The characters are recoded for the lexer buffer, according to how the
931buffer is currently being interpreted (L</lex_bufutf8>). If a string
932to be interpreted is available as a Perl scalar, the L</lex_stuff_sv>
933function is more convenient.
934
935=cut
936*/
937
938void
939Perl_lex_stuff_pvn(pTHX_ char *pv, STRLEN len, U32 flags)
940{
941 char *bufptr;
942 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
943 if (flags & ~(LEX_STUFF_UTF8))
944 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
945 if (UTF) {
946 if (flags & LEX_STUFF_UTF8) {
947 goto plain_copy;
948 } else {
949 STRLEN highhalf = 0;
950 char *p, *e = pv+len;
951 for (p = pv; p != e; p++)
952 highhalf += !!(((U8)*p) & 0x80);
953 if (!highhalf)
954 goto plain_copy;
955 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
956 bufptr = PL_parser->bufptr;
957 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
958 PL_parser->bufend += len+highhalf;
959 for (p = pv; p != e; p++) {
960 U8 c = (U8)*p;
961 if (c & 0x80) {
962 *bufptr++ = (char)(0xc0 | (c >> 6));
963 *bufptr++ = (char)(0x80 | (c & 0x3f));
964 } else {
965 *bufptr++ = (char)c;
966 }
967 }
968 }
969 } else {
970 if (flags & LEX_STUFF_UTF8) {
971 STRLEN highhalf = 0;
972 char *p, *e = pv+len;
973 for (p = pv; p != e; p++) {
974 U8 c = (U8)*p;
975 if (c >= 0xc4) {
976 Perl_croak(aTHX_ "Lexing code attempted to stuff "
977 "non-Latin-1 character into Latin-1 input");
978 } else if (c >= 0xc2 && p+1 != e &&
979 (((U8)p[1]) & 0xc0) == 0x80) {
980 p++;
981 highhalf++;
982 } else if (c >= 0x80) {
983 /* malformed UTF-8 */
984 ENTER;
985 SAVESPTR(PL_warnhook);
986 PL_warnhook = PERL_WARNHOOK_FATAL;
987 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
988 LEAVE;
989 }
990 }
991 if (!highhalf)
992 goto plain_copy;
993 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
994 bufptr = PL_parser->bufptr;
995 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
996 PL_parser->bufend += len-highhalf;
997 for (p = pv; p != e; p++) {
998 U8 c = (U8)*p;
999 if (c & 0x80) {
1000 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1001 p++;
1002 } else {
1003 *bufptr++ = (char)c;
1004 }
1005 }
1006 } else {
1007 plain_copy:
1008 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1009 bufptr = PL_parser->bufptr;
1010 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1011 PL_parser->bufend += len;
1012 Copy(pv, bufptr, len, char);
1013 }
1014 }
1015}
1016
1017/*
1018=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1019
1020Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1021immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1022reallocating the buffer if necessary. This means that lexing code that
1023runs later will see the characters as if they had appeared in the input.
1024It is not recommended to do this as part of normal parsing, and most
1025uses of this facility run the risk of the inserted characters being
1026interpreted in an unintended manner.
1027
1028The string to be inserted is the string value of I<sv>. The characters
1029are recoded for the lexer buffer, according to how the buffer is currently
1030being interpreted (L</lex_bufutf8>). If a string to be interpreted is
1031not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1032need to construct a scalar.
1033
1034=cut
1035*/
1036
1037void
1038Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1039{
1040 char *pv;
1041 STRLEN len;
1042 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1043 if (flags)
1044 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1045 pv = SvPV(sv, len);
1046 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1047}
1048
1049/*
1050=for apidoc Amx|void|lex_unstuff|char *ptr
1051
1052Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1053I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1054This hides the discarded text from any lexing code that runs later,
1055as if the text had never appeared.
1056
1057This is not the normal way to consume lexed text. For that, use
1058L</lex_read_to>.
1059
1060=cut
1061*/
1062
1063void
1064Perl_lex_unstuff(pTHX_ char *ptr)
1065{
1066 char *buf, *bufend;
1067 STRLEN unstuff_len;
1068 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1069 buf = PL_parser->bufptr;
1070 if (ptr < buf)
1071 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1072 if (ptr == buf)
1073 return;
1074 bufend = PL_parser->bufend;
1075 if (ptr > bufend)
1076 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1077 unstuff_len = ptr - buf;
1078 Move(ptr, buf, bufend+1-ptr, char);
1079 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1080 PL_parser->bufend = bufend - unstuff_len;
1081}
1082
1083/*
1084=for apidoc Amx|void|lex_read_to|char *ptr
1085
1086Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1087to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1088performing the correct bookkeeping whenever a newline character is passed.
1089This is the normal way to consume lexed text.
1090
1091Interpretation of the buffer's octets can be abstracted out by
1092using the slightly higher-level functions L</lex_peek_unichar> and
1093L</lex_read_unichar>.
1094
1095=cut
1096*/
1097
1098void
1099Perl_lex_read_to(pTHX_ char *ptr)
1100{
1101 char *s;
1102 PERL_ARGS_ASSERT_LEX_READ_TO;
1103 s = PL_parser->bufptr;
1104 if (ptr < s || ptr > PL_parser->bufend)
1105 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1106 for (; s != ptr; s++)
1107 if (*s == '\n') {
1108 CopLINE_inc(PL_curcop);
1109 PL_parser->linestart = s+1;
1110 }
1111 PL_parser->bufptr = ptr;
1112}
1113
1114/*
1115=for apidoc Amx|void|lex_discard_to|char *ptr
1116
1117Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1118up to I<ptr>. The remaining content of the buffer will be moved, and
1119all pointers into the buffer updated appropriately. I<ptr> must not
1120be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1121it is not permitted to discard text that has yet to be lexed.
1122
1123Normally it is not necessarily to do this directly, because it suffices to
1124use the implicit discarding behaviour of L</lex_next_chunk> and things
1125based on it. However, if a token stretches across multiple lines,
1126and the lexing code has kept multiple lines of text in the buffer fof
1127that purpose, then after completion of the token it would be wise to
1128explicitly discard the now-unneeded earlier lines, to avoid future
1129multi-line tokens growing the buffer without bound.
1130
1131=cut
1132*/
1133
1134void
1135Perl_lex_discard_to(pTHX_ char *ptr)
1136{
1137 char *buf;
1138 STRLEN discard_len;
1139 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1140 buf = SvPVX(PL_parser->linestr);
1141 if (ptr < buf)
1142 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1143 if (ptr == buf)
1144 return;
1145 if (ptr > PL_parser->bufptr)
1146 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1147 discard_len = ptr - buf;
1148 if (PL_parser->oldbufptr < ptr)
1149 PL_parser->oldbufptr = ptr;
1150 if (PL_parser->oldoldbufptr < ptr)
1151 PL_parser->oldoldbufptr = ptr;
1152 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1153 PL_parser->last_uni = NULL;
1154 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1155 PL_parser->last_lop = NULL;
1156 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1157 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1158 PL_parser->bufend -= discard_len;
1159 PL_parser->bufptr -= discard_len;
1160 PL_parser->oldbufptr -= discard_len;
1161 PL_parser->oldoldbufptr -= discard_len;
1162 if (PL_parser->last_uni)
1163 PL_parser->last_uni -= discard_len;
1164 if (PL_parser->last_lop)
1165 PL_parser->last_lop -= discard_len;
1166}
1167
1168/*
1169=for apidoc Amx|bool|lex_next_chunk|U32 flags
1170
1171Reads in the next chunk of text to be lexed, appending it to
1172L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1173looked to the end of the current chunk and wants to know more. It is
1174usual, but not necessary, for lexing to have consumed the entirety of
1175the current chunk at this time.
1176
1177If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1178chunk (i.e., the current chunk has been entirely consumed), normally the
1179current chunk will be discarded at the same time that the new chunk is
1180read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1181will not be discarded. If the current chunk has not been entirely
1182consumed, then it will not be discarded regardless of the flag.
1183
1184Returns true if some new text was added to the buffer, or false if the
1185buffer has reached the end of the input text.
1186
1187=cut
1188*/
1189
1190#define LEX_FAKE_EOF 0x80000000
1191
1192bool
1193Perl_lex_next_chunk(pTHX_ U32 flags)
1194{
1195 SV *linestr;
1196 char *buf;
1197 STRLEN old_bufend_pos, new_bufend_pos;
1198 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1199 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1200 bool got_some_for_debugger = 0;
f0e67a1d
Z
1201 bool got_some;
1202 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1203 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1204 linestr = PL_parser->linestr;
1205 buf = SvPVX(linestr);
1206 if (!(flags & LEX_KEEP_PREVIOUS) &&
1207 PL_parser->bufptr == PL_parser->bufend) {
1208 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1209 linestart_pos = 0;
1210 if (PL_parser->last_uni != PL_parser->bufend)
1211 PL_parser->last_uni = NULL;
1212 if (PL_parser->last_lop != PL_parser->bufend)
1213 PL_parser->last_lop = NULL;
1214 last_uni_pos = last_lop_pos = 0;
1215 *buf = 0;
1216 SvCUR(linestr) = 0;
1217 } else {
1218 old_bufend_pos = PL_parser->bufend - buf;
1219 bufptr_pos = PL_parser->bufptr - buf;
1220 oldbufptr_pos = PL_parser->oldbufptr - buf;
1221 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1222 linestart_pos = PL_parser->linestart - buf;
1223 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1224 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1225 }
1226 if (flags & LEX_FAKE_EOF) {
1227 goto eof;
1228 } else if (!PL_parser->rsfp) {
1229 got_some = 0;
1230 } else if (filter_gets(linestr, old_bufend_pos)) {
1231 got_some = 1;
17cc9359 1232 got_some_for_debugger = 1;
f0e67a1d 1233 } else {
580561a3
Z
1234 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1235 sv_setpvs(linestr, "");
f0e67a1d
Z
1236 eof:
1237 /* End of real input. Close filehandle (unless it was STDIN),
1238 * then add implicit termination.
1239 */
1240 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1241 PerlIO_clearerr(PL_parser->rsfp);
1242 else if (PL_parser->rsfp)
1243 (void)PerlIO_close(PL_parser->rsfp);
1244 PL_parser->rsfp = NULL;
1245 PL_doextract = FALSE;
1246#ifdef PERL_MAD
1247 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1248 PL_faketokens = 1;
1249#endif
1250 if (!PL_in_eval && PL_minus_p) {
1251 sv_catpvs(linestr,
1252 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1253 PL_minus_n = PL_minus_p = 0;
1254 } else if (!PL_in_eval && PL_minus_n) {
1255 sv_catpvs(linestr, /*{*/";}");
1256 PL_minus_n = 0;
1257 } else
1258 sv_catpvs(linestr, ";");
1259 got_some = 1;
1260 }
1261 buf = SvPVX(linestr);
1262 new_bufend_pos = SvCUR(linestr);
1263 PL_parser->bufend = buf + new_bufend_pos;
1264 PL_parser->bufptr = buf + bufptr_pos;
1265 PL_parser->oldbufptr = buf + oldbufptr_pos;
1266 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1267 PL_parser->linestart = buf + linestart_pos;
1268 if (PL_parser->last_uni)
1269 PL_parser->last_uni = buf + last_uni_pos;
1270 if (PL_parser->last_lop)
1271 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1272 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1273 PL_curstash != PL_debstash) {
1274 /* debugger active and we're not compiling the debugger code,
1275 * so store the line into the debugger's array of lines
1276 */
1277 update_debugger_info(NULL, buf+old_bufend_pos,
1278 new_bufend_pos-old_bufend_pos);
1279 }
1280 return got_some;
1281}
1282
1283/*
1284=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1285
1286Looks ahead one (Unicode) character in the text currently being lexed.
1287Returns the codepoint (unsigned integer value) of the next character,
1288or -1 if lexing has reached the end of the input text. To consume the
1289peeked character, use L</lex_read_unichar>.
1290
1291If the next character is in (or extends into) the next chunk of input
1292text, the next chunk will be read in. Normally the current chunk will be
1293discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1294then the current chunk will not be discarded.
1295
1296If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1297is encountered, an exception is generated.
1298
1299=cut
1300*/
1301
1302I32
1303Perl_lex_peek_unichar(pTHX_ U32 flags)
1304{
1305 char *s, *bufend;
1306 if (flags & ~(LEX_KEEP_PREVIOUS))
1307 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1308 s = PL_parser->bufptr;
1309 bufend = PL_parser->bufend;
1310 if (UTF) {
1311 U8 head;
1312 I32 unichar;
1313 STRLEN len, retlen;
1314 if (s == bufend) {
1315 if (!lex_next_chunk(flags))
1316 return -1;
1317 s = PL_parser->bufptr;
1318 bufend = PL_parser->bufend;
1319 }
1320 head = (U8)*s;
1321 if (!(head & 0x80))
1322 return head;
1323 if (head & 0x40) {
1324 len = PL_utf8skip[head];
1325 while ((STRLEN)(bufend-s) < len) {
1326 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1327 break;
1328 s = PL_parser->bufptr;
1329 bufend = PL_parser->bufend;
1330 }
1331 }
1332 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1333 if (retlen == (STRLEN)-1) {
1334 /* malformed UTF-8 */
1335 ENTER;
1336 SAVESPTR(PL_warnhook);
1337 PL_warnhook = PERL_WARNHOOK_FATAL;
1338 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1339 LEAVE;
1340 }
1341 return unichar;
1342 } else {
1343 if (s == bufend) {
1344 if (!lex_next_chunk(flags))
1345 return -1;
1346 s = PL_parser->bufptr;
1347 }
1348 return (U8)*s;
1349 }
1350}
1351
1352/*
1353=for apidoc Amx|I32|lex_read_unichar|U32 flags
1354
1355Reads the next (Unicode) character in the text currently being lexed.
1356Returns the codepoint (unsigned integer value) of the character read,
1357and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1358if lexing has reached the end of the input text. To non-destructively
1359examine the next character, use L</lex_peek_unichar> instead.
1360
1361If the next character is in (or extends into) the next chunk of input
1362text, the next chunk will be read in. Normally the current chunk will be
1363discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1364then the current chunk will not be discarded.
1365
1366If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1367is encountered, an exception is generated.
1368
1369=cut
1370*/
1371
1372I32
1373Perl_lex_read_unichar(pTHX_ U32 flags)
1374{
1375 I32 c;
1376 if (flags & ~(LEX_KEEP_PREVIOUS))
1377 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1378 c = lex_peek_unichar(flags);
1379 if (c != -1) {
1380 if (c == '\n')
1381 CopLINE_inc(PL_curcop);
1382 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1383 }
1384 return c;
1385}
1386
1387/*
1388=for apidoc Amx|void|lex_read_space|U32 flags
1389
1390Reads optional spaces, in Perl style, in the text currently being
1391lexed. The spaces may include ordinary whitespace characters and
1392Perl-style comments. C<#line> directives are processed if encountered.
1393L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1394at a non-space character (or the end of the input text).
1395
1396If spaces extend into the next chunk of input text, the next chunk will
1397be read in. Normally the current chunk will be discarded at the same
1398time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1399chunk will not be discarded.
1400
1401=cut
1402*/
1403
f0998909
Z
1404#define LEX_NO_NEXT_CHUNK 0x80000000
1405
f0e67a1d
Z
1406void
1407Perl_lex_read_space(pTHX_ U32 flags)
1408{
1409 char *s, *bufend;
1410 bool need_incline = 0;
f0998909 1411 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1412 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1413#ifdef PERL_MAD
1414 if (PL_skipwhite) {
1415 sv_free(PL_skipwhite);
1416 PL_skipwhite = NULL;
1417 }
1418 if (PL_madskills)
1419 PL_skipwhite = newSVpvs("");
1420#endif /* PERL_MAD */
1421 s = PL_parser->bufptr;
1422 bufend = PL_parser->bufend;
1423 while (1) {
1424 char c = *s;
1425 if (c == '#') {
1426 do {
1427 c = *++s;
1428 } while (!(c == '\n' || (c == 0 && s == bufend)));
1429 } else if (c == '\n') {
1430 s++;
1431 PL_parser->linestart = s;
1432 if (s == bufend)
1433 need_incline = 1;
1434 else
1435 incline(s);
1436 } else if (isSPACE(c)) {
1437 s++;
1438 } else if (c == 0 && s == bufend) {
1439 bool got_more;
1440#ifdef PERL_MAD
1441 if (PL_madskills)
1442 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1443#endif /* PERL_MAD */
f0998909
Z
1444 if (flags & LEX_NO_NEXT_CHUNK)
1445 break;
f0e67a1d
Z
1446 PL_parser->bufptr = s;
1447 CopLINE_inc(PL_curcop);
1448 got_more = lex_next_chunk(flags);
1449 CopLINE_dec(PL_curcop);
1450 s = PL_parser->bufptr;
1451 bufend = PL_parser->bufend;
1452 if (!got_more)
1453 break;
1454 if (need_incline && PL_parser->rsfp) {
1455 incline(s);
1456 need_incline = 0;
1457 }
1458 } else {
1459 break;
1460 }
1461 }
1462#ifdef PERL_MAD
1463 if (PL_madskills)
1464 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1465#endif /* PERL_MAD */
1466 PL_parser->bufptr = s;
1467}
1468
1469/*
ffb4593c
NT
1470 * S_incline
1471 * This subroutine has nothing to do with tilting, whether at windmills
1472 * or pinball tables. Its name is short for "increment line". It
57843af0 1473 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1474 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1475 * # line 500 "foo.pm"
1476 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1477 */
1478
76e3520e 1479STATIC void
d9095cec 1480S_incline(pTHX_ const char *s)
463ee0b2 1481{
97aff369 1482 dVAR;
d9095cec
NC
1483 const char *t;
1484 const char *n;
1485 const char *e;
463ee0b2 1486
7918f24d
NC
1487 PERL_ARGS_ASSERT_INCLINE;
1488
57843af0 1489 CopLINE_inc(PL_curcop);
463ee0b2
LW
1490 if (*s++ != '#')
1491 return;
d4c19fe8
AL
1492 while (SPACE_OR_TAB(*s))
1493 s++;
73659bf1
GS
1494 if (strnEQ(s, "line", 4))
1495 s += 4;
1496 else
1497 return;
084592ab 1498 if (SPACE_OR_TAB(*s))
73659bf1 1499 s++;
4e553d73 1500 else
73659bf1 1501 return;
d4c19fe8
AL
1502 while (SPACE_OR_TAB(*s))
1503 s++;
463ee0b2
LW
1504 if (!isDIGIT(*s))
1505 return;
d4c19fe8 1506
463ee0b2
LW
1507 n = s;
1508 while (isDIGIT(*s))
1509 s++;
07714eb4 1510 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1511 return;
bf4acbe4 1512 while (SPACE_OR_TAB(*s))
463ee0b2 1513 s++;
73659bf1 1514 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1515 s++;
73659bf1
GS
1516 e = t + 1;
1517 }
463ee0b2 1518 else {
c35e046a
AL
1519 t = s;
1520 while (!isSPACE(*t))
1521 t++;
73659bf1 1522 e = t;
463ee0b2 1523 }
bf4acbe4 1524 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1525 e++;
1526 if (*e != '\n' && *e != '\0')
1527 return; /* false alarm */
1528
f4dd75d9 1529 if (t - s > 0) {
d9095cec 1530 const STRLEN len = t - s;
8a5ee598 1531#ifndef USE_ITHREADS
19bad673
NC
1532 SV *const temp_sv = CopFILESV(PL_curcop);
1533 const char *cf;
1534 STRLEN tmplen;
1535
1536 if (temp_sv) {
1537 cf = SvPVX(temp_sv);
1538 tmplen = SvCUR(temp_sv);
1539 } else {
1540 cf = NULL;
1541 tmplen = 0;
1542 }
1543
42d9b98d 1544 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1545 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1546 * to *{"::_<newfilename"} */
44867030
NC
1547 /* However, the long form of evals is only turned on by the
1548 debugger - usually they're "(eval %lu)" */
1549 char smallbuf[128];
1550 char *tmpbuf;
1551 GV **gvp;
d9095cec 1552 STRLEN tmplen2 = len;
798b63bc 1553 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1554 tmpbuf = smallbuf;
1555 else
2ae0db35 1556 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1557 tmpbuf[0] = '_';
1558 tmpbuf[1] = '<';
2ae0db35 1559 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1560 tmplen += 2;
8a5ee598
RGS
1561 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1562 if (gvp) {
44867030
NC
1563 char *tmpbuf2;
1564 GV *gv2;
1565
1566 if (tmplen2 + 2 <= sizeof smallbuf)
1567 tmpbuf2 = smallbuf;
1568 else
1569 Newx(tmpbuf2, tmplen2 + 2, char);
1570
1571 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1572 /* Either they malloc'd it, or we malloc'd it,
1573 so no prefix is present in ours. */
1574 tmpbuf2[0] = '_';
1575 tmpbuf2[1] = '<';
1576 }
1577
1578 memcpy(tmpbuf2 + 2, s, tmplen2);
1579 tmplen2 += 2;
1580
8a5ee598 1581 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1582 if (!isGV(gv2)) {
8a5ee598 1583 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1584 /* adjust ${"::_<newfilename"} to store the new file name */
1585 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
3cb1dbc6
NC
1586 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1587 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
e5527e4b 1588 }
44867030
NC
1589
1590 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1591 }
e66cf94c 1592 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1593 }
8a5ee598 1594#endif
05ec9bb3 1595 CopFILE_free(PL_curcop);
d9095cec 1596 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1597 }
57843af0 1598 CopLINE_set(PL_curcop, atoi(n)-1);
463ee0b2
LW
1599}
1600
29595ff2 1601#ifdef PERL_MAD
cd81e915 1602/* skip space before PL_thistoken */
29595ff2
NC
1603
1604STATIC char *
1605S_skipspace0(pTHX_ register char *s)
1606{
7918f24d
NC
1607 PERL_ARGS_ASSERT_SKIPSPACE0;
1608
29595ff2
NC
1609 s = skipspace(s);
1610 if (!PL_madskills)
1611 return s;
cd81e915
NC
1612 if (PL_skipwhite) {
1613 if (!PL_thiswhite)
6b29d1f5 1614 PL_thiswhite = newSVpvs("");
cd81e915
NC
1615 sv_catsv(PL_thiswhite, PL_skipwhite);
1616 sv_free(PL_skipwhite);
1617 PL_skipwhite = 0;
1618 }
1619 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1620 return s;
1621}
1622
cd81e915 1623/* skip space after PL_thistoken */
29595ff2
NC
1624
1625STATIC char *
1626S_skipspace1(pTHX_ register char *s)
1627{
d4c19fe8 1628 const char *start = s;
29595ff2
NC
1629 I32 startoff = start - SvPVX(PL_linestr);
1630
7918f24d
NC
1631 PERL_ARGS_ASSERT_SKIPSPACE1;
1632
29595ff2
NC
1633 s = skipspace(s);
1634 if (!PL_madskills)
1635 return s;
1636 start = SvPVX(PL_linestr) + startoff;
cd81e915 1637 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1638 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1639 PL_thistoken = newSVpvn(tstart, start - tstart);
1640 }
1641 PL_realtokenstart = -1;
1642 if (PL_skipwhite) {
1643 if (!PL_nextwhite)
6b29d1f5 1644 PL_nextwhite = newSVpvs("");
cd81e915
NC
1645 sv_catsv(PL_nextwhite, PL_skipwhite);
1646 sv_free(PL_skipwhite);
1647 PL_skipwhite = 0;
29595ff2
NC
1648 }
1649 return s;
1650}
1651
1652STATIC char *
1653S_skipspace2(pTHX_ register char *s, SV **svp)
1654{
c35e046a
AL
1655 char *start;
1656 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1657 const I32 startoff = s - SvPVX(PL_linestr);
1658
7918f24d
NC
1659 PERL_ARGS_ASSERT_SKIPSPACE2;
1660
29595ff2
NC
1661 s = skipspace(s);
1662 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1663 if (!PL_madskills || !svp)
1664 return s;
1665 start = SvPVX(PL_linestr) + startoff;
cd81e915 1666 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1667 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1668 PL_thistoken = newSVpvn(tstart, start - tstart);
1669 PL_realtokenstart = -1;
29595ff2 1670 }
cd81e915 1671 if (PL_skipwhite) {
29595ff2 1672 if (!*svp)
6b29d1f5 1673 *svp = newSVpvs("");
cd81e915
NC
1674 sv_setsv(*svp, PL_skipwhite);
1675 sv_free(PL_skipwhite);
1676 PL_skipwhite = 0;
29595ff2
NC
1677 }
1678
1679 return s;
1680}
1681#endif
1682
80a702cd 1683STATIC void
15f169a1 1684S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1685{
1686 AV *av = CopFILEAVx(PL_curcop);
1687 if (av) {
b9f83d2f 1688 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1689 if (orig_sv)
1690 sv_setsv(sv, orig_sv);
1691 else
1692 sv_setpvn(sv, buf, len);
80a702cd
RGS
1693 (void)SvIOK_on(sv);
1694 SvIV_set(sv, 0);
1695 av_store(av, (I32)CopLINE(PL_curcop), sv);
1696 }
1697}
1698
ffb4593c
NT
1699/*
1700 * S_skipspace
1701 * Called to gobble the appropriate amount and type of whitespace.
1702 * Skips comments as well.
1703 */
1704
76e3520e 1705STATIC char *
cea2e8a9 1706S_skipspace(pTHX_ register char *s)
a687059c 1707{
5db06880 1708#ifdef PERL_MAD
f0e67a1d
Z
1709 char *start = s;
1710#endif /* PERL_MAD */
7918f24d 1711 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1712#ifdef PERL_MAD
cd81e915
NC
1713 if (PL_skipwhite) {
1714 sv_free(PL_skipwhite);
f0e67a1d 1715 PL_skipwhite = NULL;
5db06880 1716 }
f0e67a1d 1717#endif /* PERL_MAD */
3280af22 1718 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1719 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1720 s++;
f0e67a1d
Z
1721 } else {
1722 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1723 PL_bufptr = s;
f0998909
Z
1724 lex_read_space(LEX_KEEP_PREVIOUS |
1725 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1726 LEX_NO_NEXT_CHUNK : 0));
3280af22 1727 s = PL_bufptr;
f0e67a1d
Z
1728 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1729 if (PL_linestart > PL_bufptr)
1730 PL_bufptr = PL_linestart;
1731 return s;
463ee0b2 1732 }
5db06880 1733#ifdef PERL_MAD
f0e67a1d
Z
1734 if (PL_madskills)
1735 PL_skipwhite = newSVpvn(start, s-start);
1736#endif /* PERL_MAD */
5db06880 1737 return s;
a687059c 1738}
378cc40b 1739
ffb4593c
NT
1740/*
1741 * S_check_uni
1742 * Check the unary operators to ensure there's no ambiguity in how they're
1743 * used. An ambiguous piece of code would be:
1744 * rand + 5
1745 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1746 * the +5 is its argument.
1747 */
1748
76e3520e 1749STATIC void
cea2e8a9 1750S_check_uni(pTHX)
ba106d47 1751{
97aff369 1752 dVAR;
d4c19fe8
AL
1753 const char *s;
1754 const char *t;
2f3197b3 1755
3280af22 1756 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1757 return;
3280af22
NIS
1758 while (isSPACE(*PL_last_uni))
1759 PL_last_uni++;
c35e046a
AL
1760 s = PL_last_uni;
1761 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1762 s++;
3280af22 1763 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1764 return;
6136c704 1765
9b387841
NC
1766 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1767 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1768 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1769}
1770
ffb4593c
NT
1771/*
1772 * LOP : macro to build a list operator. Its behaviour has been replaced
1773 * with a subroutine, S_lop() for which LOP is just another name.
1774 */
1775
a0d0e21e
LW
1776#define LOP(f,x) return lop(f,x,s)
1777
ffb4593c
NT
1778/*
1779 * S_lop
1780 * Build a list operator (or something that might be one). The rules:
1781 * - if we have a next token, then it's a list operator [why?]
1782 * - if the next thing is an opening paren, then it's a function
1783 * - else it's a list operator
1784 */
1785
76e3520e 1786STATIC I32
a0be28da 1787S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1788{
97aff369 1789 dVAR;
7918f24d
NC
1790
1791 PERL_ARGS_ASSERT_LOP;
1792
6154021b 1793 pl_yylval.ival = f;
35c8bce7 1794 CLINE;
3280af22
NIS
1795 PL_expect = x;
1796 PL_bufptr = s;
1797 PL_last_lop = PL_oldbufptr;
eb160463 1798 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1799#ifdef PERL_MAD
1800 if (PL_lasttoke)
1801 return REPORT(LSTOP);
1802#else
3280af22 1803 if (PL_nexttoke)
bbf60fe6 1804 return REPORT(LSTOP);
5db06880 1805#endif
79072805 1806 if (*s == '(')
bbf60fe6 1807 return REPORT(FUNC);
29595ff2 1808 s = PEEKSPACE(s);
79072805 1809 if (*s == '(')
bbf60fe6 1810 return REPORT(FUNC);
79072805 1811 else
bbf60fe6 1812 return REPORT(LSTOP);
79072805
LW
1813}
1814
5db06880
NC
1815#ifdef PERL_MAD
1816 /*
1817 * S_start_force
1818 * Sets up for an eventual force_next(). start_force(0) basically does
1819 * an unshift, while start_force(-1) does a push. yylex removes items
1820 * on the "pop" end.
1821 */
1822
1823STATIC void
1824S_start_force(pTHX_ int where)
1825{
1826 int i;
1827
cd81e915 1828 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1829 where = PL_lasttoke;
cd81e915
NC
1830 assert(PL_curforce < 0 || PL_curforce == where);
1831 if (PL_curforce != where) {
5db06880
NC
1832 for (i = PL_lasttoke; i > where; --i) {
1833 PL_nexttoke[i] = PL_nexttoke[i-1];
1834 }
1835 PL_lasttoke++;
1836 }
cd81e915 1837 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1838 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1839 PL_curforce = where;
1840 if (PL_nextwhite) {
5db06880 1841 if (PL_madskills)
6b29d1f5 1842 curmad('^', newSVpvs(""));
cd81e915 1843 CURMAD('_', PL_nextwhite);
5db06880
NC
1844 }
1845}
1846
1847STATIC void
1848S_curmad(pTHX_ char slot, SV *sv)
1849{
1850 MADPROP **where;
1851
1852 if (!sv)
1853 return;
cd81e915
NC
1854 if (PL_curforce < 0)
1855 where = &PL_thismad;
5db06880 1856 else
cd81e915 1857 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1858
cd81e915 1859 if (PL_faketokens)
76f68e9b 1860 sv_setpvs(sv, "");
5db06880
NC
1861 else {
1862 if (!IN_BYTES) {
1863 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1864 SvUTF8_on(sv);
1865 else if (PL_encoding) {
1866 sv_recode_to_utf8(sv, PL_encoding);
1867 }
1868 }
1869 }
1870
1871 /* keep a slot open for the head of the list? */
1872 if (slot != '_' && *where && (*where)->mad_key == '^') {
1873 (*where)->mad_key = slot;
daba3364 1874 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1875 (*where)->mad_val = (void*)sv;
1876 }
1877 else
1878 addmad(newMADsv(slot, sv), where, 0);
1879}
1880#else
b3f24c00
MHM
1881# define start_force(where) NOOP
1882# define curmad(slot, sv) NOOP
5db06880
NC
1883#endif
1884
ffb4593c
NT
1885/*
1886 * S_force_next
9cbb5ea2 1887 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1888 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1889 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1890 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1891 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1892 */
1893
4e553d73 1894STATIC void
cea2e8a9 1895S_force_next(pTHX_ I32 type)
79072805 1896{
97aff369 1897 dVAR;
704d4215
GG
1898#ifdef DEBUGGING
1899 if (DEBUG_T_TEST) {
1900 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1901 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1902 }
1903#endif
5db06880 1904#ifdef PERL_MAD
cd81e915 1905 if (PL_curforce < 0)
5db06880 1906 start_force(PL_lasttoke);
cd81e915 1907 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1908 if (PL_lex_state != LEX_KNOWNEXT)
1909 PL_lex_defer = PL_lex_state;
1910 PL_lex_state = LEX_KNOWNEXT;
1911 PL_lex_expect = PL_expect;
cd81e915 1912 PL_curforce = -1;
5db06880 1913#else
3280af22
NIS
1914 PL_nexttype[PL_nexttoke] = type;
1915 PL_nexttoke++;
1916 if (PL_lex_state != LEX_KNOWNEXT) {
1917 PL_lex_defer = PL_lex_state;
1918 PL_lex_expect = PL_expect;
1919 PL_lex_state = LEX_KNOWNEXT;
79072805 1920 }
5db06880 1921#endif
79072805
LW
1922}
1923
d0a148a6 1924STATIC SV *
15f169a1 1925S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1926{
97aff369 1927 dVAR;
740cce10 1928 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1929 !IN_BYTES
1930 && UTF
1931 && !is_ascii_string((const U8*)start, len)
740cce10 1932 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1933 return sv;
1934}
1935
ffb4593c
NT
1936/*
1937 * S_force_word
1938 * When the lexer knows the next thing is a word (for instance, it has
1939 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1940 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1941 * lookahead.
ffb4593c
NT
1942 *
1943 * Arguments:
b1b65b59 1944 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1945 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1946 * int check_keyword : if true, Perl checks to make sure the word isn't
1947 * a keyword (do this if the word is a label, e.g. goto FOO)
1948 * int allow_pack : if true, : characters will also be allowed (require,
1949 * use, etc. do this)
9cbb5ea2 1950 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1951 */
1952
76e3520e 1953STATIC char *
cea2e8a9 1954S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 1955{
97aff369 1956 dVAR;
463ee0b2
LW
1957 register char *s;
1958 STRLEN len;
4e553d73 1959
7918f24d
NC
1960 PERL_ARGS_ASSERT_FORCE_WORD;
1961
29595ff2 1962 start = SKIPSPACE1(start);
463ee0b2 1963 s = start;
7e2040f0 1964 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 1965 (allow_pack && *s == ':') ||
15f0808c 1966 (allow_initial_tick && *s == '\'') )
a0d0e21e 1967 {
3280af22 1968 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 1969 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 1970 return start;
cd81e915 1971 start_force(PL_curforce);
5db06880
NC
1972 if (PL_madskills)
1973 curmad('X', newSVpvn(start,s-start));
463ee0b2 1974 if (token == METHOD) {
29595ff2 1975 s = SKIPSPACE1(s);
463ee0b2 1976 if (*s == '(')
3280af22 1977 PL_expect = XTERM;
463ee0b2 1978 else {
3280af22 1979 PL_expect = XOPERATOR;
463ee0b2 1980 }
79072805 1981 }
e74e6b3d 1982 if (PL_madskills)
63575281 1983 curmad('g', newSVpvs( "forced" ));
9ded7720 1984 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
1985 = (OP*)newSVOP(OP_CONST,0,
1986 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 1987 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
1988 force_next(token);
1989 }
1990 return s;
1991}
1992
ffb4593c
NT
1993/*
1994 * S_force_ident
9cbb5ea2 1995 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
1996 * text only contains the "foo" portion. The first argument is a pointer
1997 * to the "foo", and the second argument is the type symbol to prefix.
1998 * Forces the next token to be a "WORD".
9cbb5ea2 1999 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2000 */
2001
76e3520e 2002STATIC void
bfed75c6 2003S_force_ident(pTHX_ register const char *s, int kind)
79072805 2004{
97aff369 2005 dVAR;
7918f24d
NC
2006
2007 PERL_ARGS_ASSERT_FORCE_IDENT;
2008
c35e046a 2009 if (*s) {
90e5519e
NC
2010 const STRLEN len = strlen(s);
2011 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2012 start_force(PL_curforce);
9ded7720 2013 NEXTVAL_NEXTTOKE.opval = o;
79072805 2014 force_next(WORD);
748a9306 2015 if (kind) {
11343788 2016 o->op_private = OPpCONST_ENTERED;
55497cff
PP
2017 /* XXX see note in pp_entereval() for why we forgo typo
2018 warnings if the symbol must be introduced in an eval.
2019 GSAR 96-10-12 */
90e5519e
NC
2020 gv_fetchpvn_flags(s, len,
2021 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2022 : GV_ADD,
2023 kind == '$' ? SVt_PV :
2024 kind == '@' ? SVt_PVAV :
2025 kind == '%' ? SVt_PVHV :
a0d0e21e 2026 SVt_PVGV
90e5519e 2027 );
748a9306 2028 }
79072805
LW
2029 }
2030}
2031
1571675a
GS
2032NV
2033Perl_str_to_version(pTHX_ SV *sv)
2034{
2035 NV retval = 0.0;
2036 NV nshift = 1.0;
2037 STRLEN len;
cfd0369c 2038 const char *start = SvPV_const(sv,len);
9d4ba2ae 2039 const char * const end = start + len;
504618e9 2040 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2041
2042 PERL_ARGS_ASSERT_STR_TO_VERSION;
2043
1571675a 2044 while (start < end) {
ba210ebe 2045 STRLEN skip;
1571675a
GS
2046 UV n;
2047 if (utf)
9041c2e3 2048 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2049 else {
2050 n = *(U8*)start;
2051 skip = 1;
2052 }
2053 retval += ((NV)n)/nshift;
2054 start += skip;
2055 nshift *= 1000;
2056 }
2057 return retval;
2058}
2059
4e553d73 2060/*
ffb4593c
NT
2061 * S_force_version
2062 * Forces the next token to be a version number.
e759cc13
RGS
2063 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2064 * and if "guessing" is TRUE, then no new token is created (and the caller
2065 * must use an alternative parsing method).
ffb4593c
NT
2066 */
2067
76e3520e 2068STATIC char *
e759cc13 2069S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2070{
97aff369 2071 dVAR;
5f66b61c 2072 OP *version = NULL;
44dcb63b 2073 char *d;
5db06880
NC
2074#ifdef PERL_MAD
2075 I32 startoff = s - SvPVX(PL_linestr);
2076#endif
89bfa8cd 2077
7918f24d
NC
2078 PERL_ARGS_ASSERT_FORCE_VERSION;
2079
29595ff2 2080 s = SKIPSPACE1(s);
89bfa8cd 2081
44dcb63b 2082 d = s;
dd629d5b 2083 if (*d == 'v')
44dcb63b 2084 d++;
44dcb63b 2085 if (isDIGIT(*d)) {
e759cc13
RGS
2086 while (isDIGIT(*d) || *d == '_' || *d == '.')
2087 d++;
5db06880
NC
2088#ifdef PERL_MAD
2089 if (PL_madskills) {
cd81e915 2090 start_force(PL_curforce);
5db06880
NC
2091 curmad('X', newSVpvn(s,d-s));
2092 }
2093#endif
9f3d182e 2094 if (*d == ';' || isSPACE(*d) || *d == '}' || !*d) {
dd629d5b 2095 SV *ver;
8d08d9ba
DG
2096#ifdef USE_LOCALE_NUMERIC
2097 char *loc = setlocale(LC_NUMERIC, "C");
2098#endif
6154021b 2099 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2100#ifdef USE_LOCALE_NUMERIC
2101 setlocale(LC_NUMERIC, loc);
2102#endif
6154021b 2103 version = pl_yylval.opval;
dd629d5b
GS
2104 ver = cSVOPx(version)->op_sv;
2105 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2106 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2107 SvNV_set(ver, str_to_version(ver));
1571675a 2108 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2109 }
89bfa8cd 2110 }
5db06880
NC
2111 else if (guessing) {
2112#ifdef PERL_MAD
2113 if (PL_madskills) {
cd81e915
NC
2114 sv_free(PL_nextwhite); /* let next token collect whitespace */
2115 PL_nextwhite = 0;
5db06880
NC
2116 s = SvPVX(PL_linestr) + startoff;
2117 }
2118#endif
e759cc13 2119 return s;
5db06880 2120 }
89bfa8cd
PP
2121 }
2122
5db06880
NC
2123#ifdef PERL_MAD
2124 if (PL_madskills && !version) {
cd81e915
NC
2125 sv_free(PL_nextwhite); /* let next token collect whitespace */
2126 PL_nextwhite = 0;
5db06880
NC
2127 s = SvPVX(PL_linestr) + startoff;
2128 }
2129#endif
89bfa8cd 2130 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2131 start_force(PL_curforce);
9ded7720 2132 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2133 force_next(WORD);
89bfa8cd 2134
e759cc13 2135 return s;
89bfa8cd
PP
2136}
2137
ffb4593c 2138/*
91152fc1
DG
2139 * S_force_strict_version
2140 * Forces the next token to be a version number using strict syntax rules.
2141 */
2142
2143STATIC char *
2144S_force_strict_version(pTHX_ char *s)
2145{
2146 dVAR;
2147 OP *version = NULL;
2148#ifdef PERL_MAD
2149 I32 startoff = s - SvPVX(PL_linestr);
2150#endif
2151 const char *errstr = NULL;
2152
2153 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2154
2155 while (isSPACE(*s)) /* leading whitespace */
2156 s++;
2157
2158 if (is_STRICT_VERSION(s,&errstr)) {
2159 SV *ver = newSV(0);
2160 s = (char *)scan_version(s, ver, 0);
2161 version = newSVOP(OP_CONST, 0, ver);
2162 }
2163 else if ( (*s != ';' && *s != '}' ) && (s = SKIPSPACE1(s), (*s != ';' && *s !='}' ))) {
2164 PL_bufptr = s;
2165 if (errstr)
2166 yyerror(errstr); /* version required */
2167 return s;
2168 }
2169
2170#ifdef PERL_MAD
2171 if (PL_madskills && !version) {
2172 sv_free(PL_nextwhite); /* let next token collect whitespace */
2173 PL_nextwhite = 0;
2174 s = SvPVX(PL_linestr) + startoff;
2175 }
2176#endif
2177 /* NOTE: The parser sees the package name and the VERSION swapped */
2178 start_force(PL_curforce);
2179 NEXTVAL_NEXTTOKE.opval = version;
2180 force_next(WORD);
2181
2182 return s;
2183}
2184
2185/*
ffb4593c
NT
2186 * S_tokeq
2187 * Tokenize a quoted string passed in as an SV. It finds the next
2188 * chunk, up to end of string or a backslash. It may make a new
2189 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2190 * turns \\ into \.
2191 */
2192
76e3520e 2193STATIC SV *
cea2e8a9 2194S_tokeq(pTHX_ SV *sv)
79072805 2195{
97aff369 2196 dVAR;
79072805
LW
2197 register char *s;
2198 register char *send;
2199 register char *d;
b3ac6de7
IZ
2200 STRLEN len = 0;
2201 SV *pv = sv;
79072805 2202
7918f24d
NC
2203 PERL_ARGS_ASSERT_TOKEQ;
2204
79072805 2205 if (!SvLEN(sv))
b3ac6de7 2206 goto finish;
79072805 2207
a0d0e21e 2208 s = SvPV_force(sv, len);
21a311ee 2209 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2210 goto finish;
463ee0b2 2211 send = s + len;
79072805
LW
2212 while (s < send && *s != '\\')
2213 s++;
2214 if (s == send)
b3ac6de7 2215 goto finish;
79072805 2216 d = s;
be4731d2 2217 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2218 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2219 }
79072805
LW
2220 while (s < send) {
2221 if (*s == '\\') {
a0d0e21e 2222 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2223 s++; /* all that, just for this */
2224 }
2225 *d++ = *s++;
2226 }
2227 *d = '\0';
95a20fc0 2228 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2229 finish:
3280af22 2230 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2231 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2232 return sv;
2233}
2234
ffb4593c
NT
2235/*
2236 * Now come three functions related to double-quote context,
2237 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2238 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2239 * interact with PL_lex_state, and create fake ( ... ) argument lists
2240 * to handle functions and concatenation.
2241 * They assume that whoever calls them will be setting up a fake
2242 * join call, because each subthing puts a ',' after it. This lets
2243 * "lower \luPpEr"
2244 * become
2245 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2246 *
2247 * (I'm not sure whether the spurious commas at the end of lcfirst's
2248 * arguments and join's arguments are created or not).
2249 */
2250
2251/*
2252 * S_sublex_start
6154021b 2253 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2254 *
2255 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2256 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2257 *
2258 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2259 *
2260 * Everything else becomes a FUNC.
2261 *
2262 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2263 * had an OP_CONST or OP_READLINE). This just sets us up for a
2264 * call to S_sublex_push().
2265 */
2266
76e3520e 2267STATIC I32
cea2e8a9 2268S_sublex_start(pTHX)
79072805 2269{
97aff369 2270 dVAR;
6154021b 2271 register const I32 op_type = pl_yylval.ival;
79072805
LW
2272
2273 if (op_type == OP_NULL) {
6154021b 2274 pl_yylval.opval = PL_lex_op;
5f66b61c 2275 PL_lex_op = NULL;
79072805
LW
2276 return THING;
2277 }
2278 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2279 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2280
2281 if (SvTYPE(sv) == SVt_PVIV) {
2282 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2283 STRLEN len;
96a5add6 2284 const char * const p = SvPV_const(sv, len);
740cce10 2285 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2286 SvREFCNT_dec(sv);
2287 sv = nsv;
4e553d73 2288 }
6154021b 2289 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2290 PL_lex_stuff = NULL;
6f33ba73
RGS
2291 /* Allow <FH> // "foo" */
2292 if (op_type == OP_READLINE)
2293 PL_expect = XTERMORDORDOR;
79072805
LW
2294 return THING;
2295 }
e3f73d4e
RGS
2296 else if (op_type == OP_BACKTICK && PL_lex_op) {
2297 /* readpipe() vas overriden */
2298 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2299 pl_yylval.opval = PL_lex_op;
9b201d7d 2300 PL_lex_op = NULL;
e3f73d4e
RGS
2301 PL_lex_stuff = NULL;
2302 return THING;
2303 }
79072805 2304
3280af22 2305 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2306 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2307 PL_sublex_info.sub_op = PL_lex_op;
2308 PL_lex_state = LEX_INTERPPUSH;
55497cff 2309
3280af22
NIS
2310 PL_expect = XTERM;
2311 if (PL_lex_op) {
6154021b 2312 pl_yylval.opval = PL_lex_op;
5f66b61c 2313 PL_lex_op = NULL;
55497cff
PP
2314 return PMFUNC;
2315 }
2316 else
2317 return FUNC;
2318}
2319
ffb4593c
NT
2320/*
2321 * S_sublex_push
2322 * Create a new scope to save the lexing state. The scope will be
2323 * ended in S_sublex_done. Returns a '(', starting the function arguments
2324 * to the uc, lc, etc. found before.
2325 * Sets PL_lex_state to LEX_INTERPCONCAT.
2326 */
2327
76e3520e 2328STATIC I32
cea2e8a9 2329S_sublex_push(pTHX)
55497cff 2330{
27da23d5 2331 dVAR;
f46d017c 2332 ENTER;
55497cff 2333
3280af22 2334 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2335 SAVEBOOL(PL_lex_dojoin);
3280af22 2336 SAVEI32(PL_lex_brackets);
3280af22
NIS
2337 SAVEI32(PL_lex_casemods);
2338 SAVEI32(PL_lex_starts);
651b5b28 2339 SAVEI8(PL_lex_state);
7766f137 2340 SAVEVPTR(PL_lex_inpat);
98246f1e 2341 SAVEI16(PL_lex_inwhat);
57843af0 2342 SAVECOPLINE(PL_curcop);
3280af22 2343 SAVEPPTR(PL_bufptr);
8452ff4b 2344 SAVEPPTR(PL_bufend);
3280af22
NIS
2345 SAVEPPTR(PL_oldbufptr);
2346 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2347 SAVEPPTR(PL_last_lop);
2348 SAVEPPTR(PL_last_uni);
3280af22
NIS
2349 SAVEPPTR(PL_linestart);
2350 SAVESPTR(PL_linestr);
8edd5f42
RGS
2351 SAVEGENERICPV(PL_lex_brackstack);
2352 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2353
2354 PL_linestr = PL_lex_stuff;
a0714e2c 2355 PL_lex_stuff = NULL;
3280af22 2356
9cbb5ea2
GS
2357 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2358 = SvPVX(PL_linestr);
3280af22 2359 PL_bufend += SvCUR(PL_linestr);
bd61b366 2360 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2361 SAVEFREESV(PL_linestr);
2362
2363 PL_lex_dojoin = FALSE;
2364 PL_lex_brackets = 0;
a02a5408
JC
2365 Newx(PL_lex_brackstack, 120, char);
2366 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2367 PL_lex_casemods = 0;
2368 *PL_lex_casestack = '\0';
2369 PL_lex_starts = 0;
2370 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2371 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2372
2373 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
2374 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2375 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2376 else
5f66b61c 2377 PL_lex_inpat = NULL;
79072805 2378
55497cff 2379 return '(';
79072805
LW
2380}
2381
ffb4593c
NT
2382/*
2383 * S_sublex_done
2384 * Restores lexer state after a S_sublex_push.
2385 */
2386
76e3520e 2387STATIC I32
cea2e8a9 2388S_sublex_done(pTHX)
79072805 2389{
27da23d5 2390 dVAR;
3280af22 2391 if (!PL_lex_starts++) {
396482e1 2392 SV * const sv = newSVpvs("");
9aa983d2
JH
2393 if (SvUTF8(PL_linestr))
2394 SvUTF8_on(sv);
3280af22 2395 PL_expect = XOPERATOR;
6154021b 2396 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2397 return THING;
2398 }
2399
3280af22
NIS
2400 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2401 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2402 return yylex();
79072805
LW
2403 }
2404
ffb4593c 2405 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
3280af22
NIS
2406 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2407 PL_linestr = PL_lex_repl;
2408 PL_lex_inpat = 0;
2409 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2410 PL_bufend += SvCUR(PL_linestr);
bd61b366 2411 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2412 SAVEFREESV(PL_linestr);
2413 PL_lex_dojoin = FALSE;
2414 PL_lex_brackets = 0;
3280af22
NIS
2415 PL_lex_casemods = 0;
2416 *PL_lex_casestack = '\0';
2417 PL_lex_starts = 0;
25da4f38 2418 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2419 PL_lex_state = LEX_INTERPNORMAL;
2420 PL_lex_starts++;
e9fa98b2
HS
2421 /* we don't clear PL_lex_repl here, so that we can check later
2422 whether this is an evalled subst; that means we rely on the
2423 logic to ensure sublex_done() is called again only via the
2424 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2425 }
e9fa98b2 2426 else {
3280af22 2427 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2428 PL_lex_repl = NULL;
e9fa98b2 2429 }
79072805 2430 return ',';
ffed7fef
LW
2431 }
2432 else {
5db06880
NC
2433#ifdef PERL_MAD
2434 if (PL_madskills) {
cd81e915
NC
2435 if (PL_thiswhite) {
2436 if (!PL_endwhite)
6b29d1f5 2437 PL_endwhite = newSVpvs("");
cd81e915
NC
2438 sv_catsv(PL_endwhite, PL_thiswhite);
2439 PL_thiswhite = 0;
2440 }
2441 if (PL_thistoken)
76f68e9b 2442 sv_setpvs(PL_thistoken,"");
5db06880 2443 else
cd81e915 2444 PL_realtokenstart = -1;
5db06880
NC
2445 }
2446#endif
f46d017c 2447 LEAVE;
3280af22
NIS
2448 PL_bufend = SvPVX(PL_linestr);
2449 PL_bufend += SvCUR(PL_linestr);
2450 PL_expect = XOPERATOR;
09bef843 2451 PL_sublex_info.sub_inwhat = 0;
79072805 2452 return ')';
ffed7fef
LW
2453 }
2454}
2455
02aa26ce
NT
2456/*
2457 scan_const
2458
2459 Extracts a pattern, double-quoted string, or transliteration. This
2460 is terrifying code.
2461
94def140 2462 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2463 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2464 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2465
94def140
ST
2466 Returns a pointer to the character scanned up to. If this is
2467 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2468 successfully parsed), will leave an OP for the substring scanned
6154021b 2469 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2470 by looking at the next characters herself.
2471
02aa26ce
NT
2472 In patterns:
2473 backslashes:
ff3f963a 2474 constants: \N{NAME} only
02aa26ce
NT
2475 case and quoting: \U \Q \E
2476 stops on @ and $, but not for $ as tail anchor
2477
2478 In transliterations:
2479 characters are VERY literal, except for - not at the start or end
94def140
ST
2480 of the string, which indicates a range. If the range is in bytes,
2481 scan_const expands the range to the full set of intermediate
2482 characters. If the range is in utf8, the hyphen is replaced with
2483 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2484
2485 In double-quoted strings:
2486 backslashes:
2487 double-quoted style: \r and \n
ff3f963a 2488 constants: \x31, etc.
94def140 2489 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2490 case and quoting: \U \Q \E
2491 stops on @ and $
2492
2493 scan_const does *not* construct ops to handle interpolated strings.
2494 It stops processing as soon as it finds an embedded $ or @ variable
2495 and leaves it to the caller to work out what's going on.
2496
94def140
ST
2497 embedded arrays (whether in pattern or not) could be:
2498 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2499
2500 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2501
2502 $ in pattern could be $foo or could be tail anchor. Assumption:
2503 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2504 followed by one of "()| \r\n\t"
02aa26ce
NT
2505
2506 \1 (backreferences) are turned into $1
2507
2508 The structure of the code is
2509 while (there's a character to process) {
94def140
ST
2510 handle transliteration ranges
2511 skip regexp comments /(?#comment)/ and codes /(?{code})/
2512 skip #-initiated comments in //x patterns
2513 check for embedded arrays
02aa26ce
NT
2514 check for embedded scalars
2515 if (backslash) {
94def140 2516 deprecate \1 in substitution replacements
02aa26ce
NT
2517 handle string-changing backslashes \l \U \Q \E, etc.
2518 switch (what was escaped) {
94def140 2519 handle \- in a transliteration (becomes a literal -)
ff3f963a 2520 if a pattern and not \N{, go treat as regular character
94def140
ST
2521 handle \132 (octal characters)
2522 handle \x15 and \x{1234} (hex characters)
ff3f963a 2523 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
ST
2524 handle \cV (control characters)
2525 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2526 } (end switch)
77a135fe 2527 continue
02aa26ce 2528 } (end if backslash)
77a135fe 2529 handle regular character
02aa26ce 2530 } (end while character to read)
4e553d73 2531
02aa26ce
NT
2532*/
2533
76e3520e 2534STATIC char *
cea2e8a9 2535S_scan_const(pTHX_ char *start)
79072805 2536{
97aff369 2537 dVAR;
3280af22 2538 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2539 SV *sv = newSV(send - start); /* sv for the constant. See
2540 note below on sizing. */
02aa26ce
NT
2541 register char *s = start; /* start of the constant */
2542 register char *d = SvPVX(sv); /* destination for copies */
2543 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2544 bool didrange = FALSE; /* did we just finish a range? */
2b9d42f0 2545 I32 has_utf8 = FALSE; /* Output constant is UTF8 */
77a135fe
KW
2546 I32 this_utf8 = UTF; /* Is the source string assumed
2547 to be UTF8? But, this can
2548 show as true when the source
2549 isn't utf8, as for example
2550 when it is entirely composed
2551 of hex constants */
2552
2553 /* Note on sizing: The scanned constant is placed into sv, which is
2554 * initialized by newSV() assuming one byte of output for every byte of
2555 * input. This routine expects newSV() to allocate an extra byte for a
2556 * trailing NUL, which this routine will append if it gets to the end of
2557 * the input. There may be more bytes of input than output (eg., \N{LATIN
2558 * CAPITAL LETTER A}), or more output than input if the constant ends up
2559 * recoded to utf8, but each time a construct is found that might increase
2560 * the needed size, SvGROW() is called. Its size parameter each time is
2561 * based on the best guess estimate at the time, namely the length used so
2562 * far, plus the length the current construct will occupy, plus room for
2563 * the trailing NUL, plus one byte for every input byte still unscanned */
2564
012bcf8d 2565 UV uv;
4c3a8340
ST
2566#ifdef EBCDIC
2567 UV literal_endpoint = 0;
e294cc5d 2568 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2569#endif
012bcf8d 2570
7918f24d
NC
2571 PERL_ARGS_ASSERT_SCAN_CONST;
2572
2b9d42f0
NIS
2573 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2574 /* If we are doing a trans and we know we want UTF8 set expectation */
2575 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2576 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2577 }
2578
2579
79072805 2580 while (s < send || dorange) {
ff3f963a 2581
02aa26ce 2582 /* get transliterations out of the way (they're most literal) */
3280af22 2583 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2584 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2585 if (dorange) {
1ba5c669
JH
2586 I32 i; /* current expanded character */
2587 I32 min; /* first character in range */
2588 I32 max; /* last character in range */
02aa26ce 2589
e294cc5d
JH
2590#ifdef EBCDIC
2591 UV uvmax = 0;
2592#endif
2593
2594 if (has_utf8
2595#ifdef EBCDIC
2596 && !native_range
2597#endif
2598 ) {
9d4ba2ae 2599 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2600 char *e = d++;
2601 while (e-- > c)
2602 *(e + 1) = *e;
25716404 2603 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2604 /* mark the range as done, and continue */
2605 dorange = FALSE;
2606 didrange = TRUE;
2607 continue;
2608 }
2b9d42f0 2609
95a20fc0 2610 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2611#ifdef EBCDIC
2612 SvGROW(sv,
2613 SvLEN(sv) + (has_utf8 ?
2614 (512 - UTF_CONTINUATION_MARK +
2615 UNISKIP(0x100))
2616 : 256));
2617 /* How many two-byte within 0..255: 128 in UTF-8,
2618 * 96 in UTF-8-mod. */
2619#else
9cbb5ea2 2620 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2621#endif
9cbb5ea2 2622 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2623#ifdef EBCDIC
2624 if (has_utf8) {
2625 int j;
2626 for (j = 0; j <= 1; j++) {
2627 char * const c = (char*)utf8_hop((U8*)d, -1);
2628 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2629 if (j)
2630 min = (U8)uv;
2631 else if (uv < 256)
2632 max = (U8)uv;
2633 else {
2634 max = (U8)0xff; /* only to \xff */
2635 uvmax = uv; /* \x{100} to uvmax */
2636 }
2637 d = c; /* eat endpoint chars */
2638 }
2639 }
2640 else {
2641#endif
2642 d -= 2; /* eat the first char and the - */
2643 min = (U8)*d; /* first char in range */
2644 max = (U8)d[1]; /* last char in range */
2645#ifdef EBCDIC
2646 }
2647#endif
8ada0baa 2648
c2e66d9e 2649 if (min > max) {
01ec43d0 2650 Perl_croak(aTHX_
d1573ac7 2651 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2652 (char)min, (char)max);
c2e66d9e
GS
2653 }
2654
c7f1f016 2655#ifdef EBCDIC
4c3a8340
ST
2656 if (literal_endpoint == 2 &&
2657 ((isLOWER(min) && isLOWER(max)) ||
2658 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2659 if (isLOWER(min)) {
2660 for (i = min; i <= max; i++)
2661 if (isLOWER(i))
db42d148 2662 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2663 } else {
2664 for (i = min; i <= max; i++)
2665 if (isUPPER(i))
db42d148 2666 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2667 }
2668 }
2669 else
2670#endif
2671 for (i = min; i <= max; i++)
e294cc5d
JH
2672#ifdef EBCDIC
2673 if (has_utf8) {
2674 const U8 ch = (U8)NATIVE_TO_UTF(i);
2675 if (UNI_IS_INVARIANT(ch))
2676 *d++ = (U8)i;
2677 else {
2678 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2679 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2680 }
2681 }
2682 else
2683#endif
2684 *d++ = (char)i;
2685
2686#ifdef EBCDIC
2687 if (uvmax) {
2688 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2689 if (uvmax > 0x101)
2690 *d++ = (char)UTF_TO_NATIVE(0xff);
2691 if (uvmax > 0x100)
2692 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2693 }
2694#endif
02aa26ce
NT
2695
2696 /* mark the range as done, and continue */
79072805 2697 dorange = FALSE;
01ec43d0 2698 didrange = TRUE;
4c3a8340
ST
2699#ifdef EBCDIC
2700 literal_endpoint = 0;
2701#endif
79072805 2702 continue;
4e553d73 2703 }
02aa26ce
NT
2704
2705 /* range begins (ignore - as first or last char) */
79072805 2706 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2707 if (didrange) {
1fafa243 2708 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2709 }
e294cc5d
JH
2710 if (has_utf8
2711#ifdef EBCDIC
2712 && !native_range
2713#endif
2714 ) {
25716404 2715 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2716 s++;
2717 continue;
2718 }
79072805
LW
2719 dorange = TRUE;
2720 s++;
01ec43d0
GS
2721 }
2722 else {
2723 didrange = FALSE;
4c3a8340
ST
2724#ifdef EBCDIC
2725 literal_endpoint = 0;
e294cc5d 2726 native_range = TRUE;
4c3a8340 2727#endif
01ec43d0 2728 }
79072805 2729 }
02aa26ce
NT
2730
2731 /* if we get here, we're not doing a transliteration */
2732
0f5d15d6
IZ
2733 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2734 except for the last char, which will be done separately. */
3280af22 2735 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2736 if (s[2] == '#') {
e994fd66 2737 while (s+1 < send && *s != ')')
db42d148 2738 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2739 }
2740 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2741 || (s[2] == '?' && s[3] == '{'))
155aba94 2742 {
cc6b7395 2743 I32 count = 1;
0f5d15d6 2744 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2745 char c;
2746
d9f97599
GS
2747 while (count && (c = *regparse)) {
2748 if (c == '\\' && regparse[1])
2749 regparse++;
4e553d73 2750 else if (c == '{')
cc6b7395 2751 count++;
4e553d73 2752 else if (c == '}')
cc6b7395 2753 count--;
d9f97599 2754 regparse++;
cc6b7395 2755 }
e994fd66 2756 if (*regparse != ')')
5bdf89e7 2757 regparse--; /* Leave one char for continuation. */
0f5d15d6 2758 while (s < regparse)
db42d148 2759 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2760 }
748a9306 2761 }
02aa26ce
NT
2762
2763 /* likewise skip #-initiated comments in //x patterns */
3280af22
NIS
2764 else if (*s == '#' && PL_lex_inpat &&
2765 ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
748a9306 2766 while (s+1 < send && *s != '\n')
db42d148 2767 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2768 }
02aa26ce 2769
5d1d4326 2770 /* check for embedded arrays
da6eedaa 2771 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2772 */
1749ea0d
ST
2773 else if (*s == '@' && s[1]) {
2774 if (isALNUM_lazy_if(s+1,UTF))
2775 break;
2776 if (strchr(":'{$", s[1]))
2777 break;
2778 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2779 break; /* in regexp, neither @+ nor @- are interpolated */
2780 }
02aa26ce
NT
2781
2782 /* check for embedded scalars. only stop if we're sure it's a
2783 variable.
2784 */
79072805 2785 else if (*s == '$') {
3280af22 2786 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2787 break;
77772344 2788 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2789 if (s[1] == '\\') {
2790 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2791 "Possible unintended interpolation of $\\ in regex");
77772344 2792 }
79072805 2793 break; /* in regexp, $ might be tail anchor */
77772344 2794 }
79072805 2795 }
02aa26ce 2796
2b9d42f0
NIS
2797 /* End of else if chain - OP_TRANS rejoin rest */
2798
02aa26ce 2799 /* backslashes */
79072805 2800 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2801 char* e; /* Can be used for ending '}', etc. */
2802
79072805 2803 s++;
02aa26ce 2804
02aa26ce 2805 /* deprecate \1 in strings and substitution replacements */
3280af22 2806 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2807 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2808 {
a2a5de95 2809 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2810 *--s = '$';
2811 break;
2812 }
02aa26ce
NT
2813
2814 /* string-change backslash escapes */
3280af22 2815 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2816 --s;
2817 break;
2818 }
ff3f963a
KW
2819 /* In a pattern, process \N, but skip any other backslash escapes.
2820 * This is because we don't want to translate an escape sequence
2821 * into a meta symbol and have the regex compiler use the meta
2822 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2823 * in spite of this, we do have to process \N here while the proper
2824 * charnames handler is in scope. See bugs #56444 and #62056.
2825 * There is a complication because \N in a pattern may also stand
2826 * for 'match a non-nl', and not mean a charname, in which case its
2827 * processing should be deferred to the regex compiler. To be a
2828 * charname it must be followed immediately by a '{', and not look
2829 * like \N followed by a curly quantifier, i.e., not something like
2830 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2831 * quantifier */
2832 else if (PL_lex_inpat
2833 && (*s != 'N'
2834 || s[1] != '{'
2835 || regcurly(s + 1)))
2836 {
cc74c5bd
ST
2837 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2838 goto default_action;
2839 }
02aa26ce 2840
79072805 2841 switch (*s) {
02aa26ce
NT
2842
2843 /* quoted - in transliterations */
79072805 2844 case '-':
3280af22 2845 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2846 *d++ = *s++;
2847 continue;
2848 }
2849 /* FALL THROUGH */
2850 default:
11b8faa4 2851 {
a2a5de95
NC
2852 if ((isALPHA(*s) || isDIGIT(*s)))
2853 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2854 "Unrecognized escape \\%c passed through",
2855 *s);
11b8faa4 2856 /* default action is to copy the quoted character */
f9a63242 2857 goto default_action;
11b8faa4 2858 }
02aa26ce 2859
77a135fe 2860 /* eg. \132 indicates the octal constant 0x132 */
79072805
LW
2861 case '0': case '1': case '2': case '3':
2862 case '4': case '5': case '6': case '7':
ba210ebe 2863 {
53305cf1
NC
2864 I32 flags = 0;
2865 STRLEN len = 3;
77a135fe 2866 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2867 s += len;
2868 }
012bcf8d 2869 goto NUM_ESCAPE_INSERT;
02aa26ce 2870
77a135fe 2871 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2872 case 'x':
a0ed51b3
LW
2873 ++s;
2874 if (*s == '{') {
9d4ba2ae 2875 char* const e = strchr(s, '}');
a4c04bdc
NC
2876 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2877 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2878 STRLEN len;
355860ce 2879
53305cf1 2880 ++s;
adaeee49 2881 if (!e) {
a0ed51b3 2882 yyerror("Missing right brace on \\x{}");
355860ce 2883 continue;
ba210ebe 2884 }
53305cf1 2885 len = e - s;
77a135fe 2886 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2887 s = e + 1;
a0ed51b3
LW
2888 }
2889 else {
ba210ebe 2890 {
53305cf1 2891 STRLEN len = 2;
a4c04bdc 2892 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 2893 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
2894 s += len;
2895 }
012bcf8d
GS
2896 }
2897
2898 NUM_ESCAPE_INSERT:
ff3f963a
KW
2899 /* Insert oct or hex escaped character. There will always be
2900 * enough room in sv since such escapes will be longer than any
2901 * UTF-8 sequence they can end up as, except if they force us
2902 * to recode the rest of the string into utf8 */
ba7cea30 2903
77a135fe 2904 /* Here uv is the ordinal of the next character being added in
ff3f963a 2905 * unicode (converted from native). */
77a135fe 2906 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 2907 if (!has_utf8 && uv > 255) {
77a135fe
KW
2908 /* Might need to recode whatever we have accumulated so
2909 * far if it contains any chars variant in utf8 or
2910 * utf-ebcdic. */
2911
2912 SvCUR_set(sv, d - SvPVX_const(sv));
2913 SvPOK_on(sv);
2914 *d = '\0';
77a135fe 2915 /* See Note on sizing above. */
7bf79863
KW
2916 sv_utf8_upgrade_flags_grow(sv,
2917 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
2918 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
2919 d = SvPVX(sv) + SvCUR(sv);
2920 has_utf8 = TRUE;
012bcf8d
GS
2921 }
2922
77a135fe
KW
2923 if (has_utf8) {
2924 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
2925 if (PL_lex_inwhat == OP_TRANS &&
2926 PL_sublex_info.sub_op) {
2927 PL_sublex_info.sub_op->op_private |=
2928 (PL_lex_repl ? OPpTRANS_FROM_UTF
2929 : OPpTRANS_TO_UTF);
f9a63242 2930 }
e294cc5d
JH
2931#ifdef EBCDIC
2932 if (uv > 255 && !dorange)
2933 native_range = FALSE;
2934#endif
012bcf8d 2935 }
a0ed51b3 2936 else {
012bcf8d 2937 *d++ = (char)uv;
a0ed51b3 2938 }
012bcf8d
GS
2939 }
2940 else {
c4d5f83a 2941 *d++ = (char) uv;
a0ed51b3 2942 }
79072805 2943 continue;
02aa26ce 2944
4a2d328f 2945 case 'N':
ff3f963a
KW
2946 /* In a non-pattern \N must be a named character, like \N{LATIN
2947 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
2948 * mean to match a non-newline. For non-patterns, named
2949 * characters are converted to their string equivalents. In
2950 * patterns, named characters are not converted to their
2951 * ultimate forms for the same reasons that other escapes
2952 * aren't. Instead, they are converted to the \N{U+...} form
2953 * to get the value from the charnames that is in effect right
2954 * now, while preserving the fact that it was a named character
2955 * so that the regex compiler knows this */
2956
2957 /* This section of code doesn't generally use the
2958 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
2959 * a close examination of this macro and determined it is a
2960 * no-op except on utfebcdic variant characters. Every
2961 * character generated by this that would normally need to be
2962 * enclosed by this macro is invariant, so the macro is not
2963 * needed, and would complicate use of copy(). There are other
2964 * parts of this file where the macro is used inconsistently,
2965 * but are saved by it being a no-op */
2966
2967 /* The structure of this section of code (besides checking for
2968 * errors and upgrading to utf8) is:
2969 * Further disambiguate between the two meanings of \N, and if
2970 * not a charname, go process it elsewhere
2971 * If of form \N{U+...}, pass it through if a pattern; otherwise
2972 * convert to utf8
2973 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a pattern;
2974 * otherwise convert to utf8 */
2975
2976 /* Here, s points to the 'N'; the test below is guaranteed to
2977 * succeed if we are being called on a pattern as we already
2978 * know from a test above that the next character is a '{'.
2979 * On a non-pattern \N must mean 'named sequence, which
2980 * requires braces */
2981 s++;
2982 if (*s != '{') {
2983 yyerror("Missing braces on \\N{}");
2984 continue;
2985 }
2986 s++;
2987
2988 /* If there is no matching '}', it is an error outside of a
2989 * pattern, or ambiguous inside. */
2990 if (! (e = strchr(s, '}'))) {
2991 if (! PL_lex_inpat) {
5777a3f7 2992 yyerror("Missing right brace on \\N{}");
ff3f963a 2993 continue;
423cee85 2994 }
ff3f963a
KW
2995 else {
2996
2997 /* A missing brace means it can't be a legal character
2998 * name, and it could be a legal "match non-newline".
2999 * But it's kind of weird without an unescaped left
3000 * brace, so warn. */
3001 if (ckWARN(WARN_SYNTAX)) {
3002 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
3003 "Missing right brace on \\N{} or unescaped left brace after \\N. Assuming the latter");
fc8cd66c 3004 }
ff3f963a
KW
3005 s -= 3; /* Backup over cur char, {, N, to the '\' */
3006 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
3007 goto default_action;
dbc0d4f2 3008 }
ff3f963a 3009 }
cddc7ef4 3010
ff3f963a 3011 /* Here it looks like a named character */
cddc7ef4 3012
ff3f963a
KW
3013 if (PL_lex_inpat) {
3014
3015 /* XXX This block is temporary code. \N{} implies that the
3016 * pattern is to have Unicode semantics, and therefore
3017 * currently has to be encoded in utf8. By putting it in
3018 * utf8 now, we save a whole pass in the regular expression
3019 * compiler. Once that code is changed so Unicode
3020 * semantics doesn't necessarily have to be in utf8, this
3021 * block should be removed */
3022 if (!has_utf8) {
77a135fe 3023 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3024 SvPOK_on(sv);
e4f3eed8 3025 *d = '\0';
77a135fe 3026 /* See Note on sizing above. */
7bf79863 3027 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3028 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3029 /* 5 = '\N{' + cur char + NUL */
3030 (STRLEN)(send - s) + 5);
f08d6ad9 3031 d = SvPVX(sv) + SvCUR(sv);
89491803 3032 has_utf8 = TRUE;
ff3f963a
KW
3033 }
3034 }
423cee85 3035
ff3f963a
KW
3036 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3037 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3038 | PERL_SCAN_DISALLOW_PREFIX;
3039 STRLEN len;
3040
3041 /* For \N{U+...}, the '...' is a unicode value even on
3042 * EBCDIC machines */
3043 s += 2; /* Skip to next char after the 'U+' */
3044 len = e - s;
3045 uv = grok_hex(s, &len, &flags, NULL);
3046 if (len == 0 || len != (STRLEN)(e - s)) {
3047 yyerror("Invalid hexadecimal number in \\N{U+...}");
3048 s = e + 1;
3049 continue;
3050 }
3051
3052 if (PL_lex_inpat) {
3053
3054 /* Pass through to the regex compiler unchanged. The
3055 * reason we evaluated the number above is to make sure
3056 * there wasn't a syntax error. It also makes sure
3057 * that the syntax created below, \N{Uc1.c2...}, is
3058 * internal-only */
3059 s -= 5; /* Include the '\N{U+' */
3060 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3061 d += e - s + 1;
3062 }
3063 else { /* Not a pattern: convert the hex to string */
3064
3065 /* If destination is not in utf8, unconditionally
3066 * recode it to be so. This is because \N{} implies
3067 * Unicode semantics, and scalars have to be in utf8
3068 * to guarantee those semantics */
3069 if (! has_utf8) {
3070 SvCUR_set(sv, d - SvPVX_const(sv));
3071 SvPOK_on(sv);
3072 *d = '\0';
3073 /* See Note on sizing above. */
3074 sv_utf8_upgrade_flags_grow(
3075 sv,
3076 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3077 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3078 d = SvPVX(sv) + SvCUR(sv);
3079 has_utf8 = TRUE;
3080 }
3081
3082 /* Add the string to the output */
3083 if (UNI_IS_INVARIANT(uv)) {
3084 *d++ = (char) uv;
3085 }
3086 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3087 }
3088 }
3089 else { /* Here is \N{NAME} but not \N{U+...}. */
3090
3091 SV *res; /* result from charnames */
3092 const char *str; /* the string in 'res' */
3093 STRLEN len; /* its length */
3094
3095 /* Get the value for NAME */
3096 res = newSVpvn(s, e - s);
3097 res = new_constant( NULL, 0, "charnames",
3098 /* includes all of: \N{...} */
3099 res, NULL, s - 3, e - s + 4 );
3100
3101 /* Most likely res will be in utf8 already since the
3102 * standard charnames uses pack U, but a custom translator
3103 * can leave it otherwise, so make sure. XXX This can be
3104 * revisited to not have charnames use utf8 for characters
3105 * that don't need it when regexes don't have to be in utf8
3106 * for Unicode semantics. If doing so, remember EBCDIC */
3107 sv_utf8_upgrade(res);
3108 str = SvPV_const(res, len);
3109
3110 /* Don't accept malformed input */
3111 if (! is_utf8_string((U8 *) str, len)) {
3112 yyerror("Malformed UTF-8 returned by \\N");
3113 }
3114 else if (PL_lex_inpat) {
3115
3116 if (! len) { /* The name resolved to an empty string */
3117 Copy("\\N{}", d, 4, char);
3118 d += 4;
3119 }
3120 else {
3121 /* In order to not lose information for the regex
3122 * compiler, pass the result in the specially made
3123 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3124 * the code points in hex of each character
3125 * returned by charnames */
3126
3127 const char *str_end = str + len;
3128 STRLEN char_length; /* cur char's byte length */
3129 STRLEN output_length; /* and the number of bytes
3130 after this is translated
3131 into hex digits */
3132 const STRLEN off = d - SvPVX_const(sv);
3133
3134 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3135 * max('U+', '.'); and 1 for NUL */
3136 char hex_string[2 * UTF8_MAXBYTES + 5];
3137
3138 /* Get the first character of the result. */
3139 U32 uv = utf8n_to_uvuni((U8 *) str,
3140 len,
3141 &char_length,
3142 UTF8_ALLOW_ANYUV);
3143
3144 /* The call to is_utf8_string() above hopefully
3145 * guarantees that there won't be an error. But
3146 * it's easy here to make sure. The function just
3147 * above warns and returns 0 if invalid utf8, but
3148 * it can also return 0 if the input is validly a
3149 * NUL. Disambiguate */
3150 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3151 uv = UNICODE_REPLACEMENT;
3152 }
3153
3154 /* Convert first code point to hex, including the
3155 * boiler plate before it */
3156 sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
3157 output_length = strlen(hex_string);
3158
3159 /* Make sure there is enough space to hold it */
3160 d = off + SvGROW(sv, off
3161 + output_length
3162 + (STRLEN)(send - e)
3163 + 2); /* '}' + NUL */
3164 /* And output it */
3165 Copy(hex_string, d, output_length, char);
3166 d += output_length;
3167
3168 /* For each subsequent character, append dot and
3169 * its ordinal in hex */
3170 while ((str += char_length) < str_end) {
3171 const STRLEN off = d - SvPVX_const(sv);
3172 U32 uv = utf8n_to_uvuni((U8 *) str,
3173 str_end - str,
3174 &char_length,
3175 UTF8_ALLOW_ANYUV);
3176 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3177 uv = UNICODE_REPLACEMENT;
3178 }
3179
3180 sprintf(hex_string, ".%X", (unsigned int) uv);
3181 output_length = strlen(hex_string);
3182
3183 d = off + SvGROW(sv, off
3184 + output_length
3185 + (STRLEN)(send - e)
3186 + 2); /* '}' + NUL */
3187 Copy(hex_string, d, output_length, char);
3188 d += output_length;
3189 }
3190
3191 *d++ = '}'; /* Done. Add the trailing brace */
3192 }
3193 }
3194 else { /* Here, not in a pattern. Convert the name to a
3195 * string. */
3196
3197 /* If destination is not in utf8, unconditionally
3198 * recode it to be so. This is because \N{} implies
3199 * Unicode semantics, and scalars have to be in utf8
3200 * to guarantee those semantics */
3201 if (! has_utf8) {
3202 SvCUR_set(sv, d - SvPVX_const(sv));
3203 SvPOK_on(sv);
3204 *d = '\0';
3205 /* See Note on sizing above. */
3206 sv_utf8_upgrade_flags_grow(sv,
3207 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3208 len + (STRLEN)(send - s) + 1);
3209 d = SvPVX(sv) + SvCUR(sv);
3210 has_utf8 = TRUE;
3211 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3212
3213 /* See Note on sizing above. (NOTE: SvCUR() is not
3214 * set correctly here). */
3215 const STRLEN off = d - SvPVX_const(sv);
3216 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3217 }
3218 Copy(str, d, len, char);
3219 d += len;
423cee85 3220 }
423cee85 3221 SvREFCNT_dec(res);
423cee85 3222 }
ff3f963a
KW
3223#ifdef EBCDIC
3224 if (!dorange)
3225 native_range = FALSE; /* \N{} is defined to be Unicode */
3226#endif
3227 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3228 continue;
3229
02aa26ce 3230 /* \c is a control character */
79072805
LW
3231 case 'c':
3232 s++;
961ce445 3233 if (s < send) {
ba210ebe 3234 U8 c = *s++;
c7f1f016
NIS
3235#ifdef EBCDIC
3236 if (isLOWER(c))
3237 c = toUPPER(c);
3238#endif
db42d148 3239 *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c));
ba210ebe 3240 }
961ce445
RGS
3241 else {
3242 yyerror("Missing control char name in \\c");
3243 }
79072805 3244 continue;
02aa26ce
NT
3245
3246 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3247 case 'b':
db42d148 3248 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3249 break;
3250 case 'n':
db42d148 3251 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3252 break;
3253 case 'r':
db42d148 3254 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3255 break;
3256 case 'f':
db42d148 3257 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3258 break;
3259 case 't':
db42d148 3260 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3261 break;
34a3fe2a 3262 case 'e':
db42d148 3263 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3264 break;
3265 case 'a':
db42d148 3266 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3267 break;
02aa26ce
NT
3268 } /* end switch */
3269
79072805
LW
3270 s++;
3271 continue;
02aa26ce 3272 } /* end if (backslash) */
4c3a8340
ST
3273#ifdef EBCDIC
3274 else
3275 literal_endpoint++;
3276#endif
02aa26ce 3277
f9a63242 3278 default_action:
77a135fe
KW
3279 /* If we started with encoded form, or already know we want it,
3280 then encode the next character */
3281 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3282 STRLEN len = 1;
77a135fe
KW
3283
3284
3285 /* One might think that it is wasted effort in the case of the
3286 * source being utf8 (this_utf8 == TRUE) to take the next character
3287 * in the source, convert it to an unsigned value, and then convert
3288 * it back again. But the source has not been validated here. The
3289 * routine that does the conversion checks for errors like
3290 * malformed utf8 */
3291
5f66b61c
AL
3292 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3293 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3294 if (!has_utf8) {
3295 SvCUR_set(sv, d - SvPVX_const(sv));
3296 SvPOK_on(sv);
3297 *d = '\0';
77a135fe 3298 /* See Note on sizing above. */
7bf79863
KW
3299 sv_utf8_upgrade_flags_grow(sv,
3300 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3301 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3302 d = SvPVX(sv) + SvCUR(sv);
3303 has_utf8 = TRUE;
3304 } else if (need > len) {
3305 /* encoded value larger than old, may need extra space (NOTE:
3306 * SvCUR() is not set correctly here). See Note on sizing
3307 * above. */
9d4ba2ae 3308 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3309 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3310 }
77a135fe
KW
3311 s += len;
3312
5f66b61c 3313 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3314#ifdef EBCDIC
3315 if (uv > 255 && !dorange)
3316 native_range = FALSE;
3317#endif
2b9d42f0
NIS
3318 }
3319 else {
3320 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3321 }
02aa26ce
NT
3322 } /* while loop to process each character */
3323
3324 /* terminate the string and set up the sv */
79072805 3325 *d = '\0';
95a20fc0 3326 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3327 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3328 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3329
79072805 3330 SvPOK_on(sv);
9f4817db 3331 if (PL_encoding && !has_utf8) {
d0063567
DK
3332 sv_recode_to_utf8(sv, PL_encoding);
3333 if (SvUTF8(sv))
3334 has_utf8 = TRUE;
9f4817db 3335 }
2b9d42f0 3336 if (has_utf8) {
7e2040f0 3337 SvUTF8_on(sv);
2b9d42f0 3338 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3339 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3340 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3341 }
3342 }
79072805 3343
02aa26ce 3344 /* shrink the sv if we allocated more than we used */
79072805 3345 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3346 SvPV_shrink_to_cur(sv);
79072805 3347 }
02aa26ce 3348
6154021b 3349 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3350 if (s > PL_bufptr) {
eb0d8d16
NC
3351 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3352 const char *const key = PL_lex_inpat ? "qr" : "q";
3353 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3354 const char *type;
3355 STRLEN typelen;
3356
3357 if (PL_lex_inwhat == OP_TRANS) {
3358 type = "tr";
3359 typelen = 2;
3360 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3361 type = "s";
3362 typelen = 1;
3363 } else {
3364 type = "qq";
3365 typelen = 2;
3366 }
3367
3368 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3369 type, typelen);
3370 }
6154021b 3371 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3372 } else
8990e307 3373 SvREFCNT_dec(sv);
79072805
LW
3374 return s;
3375}
3376
ffb4593c
NT
3377/* S_intuit_more
3378 * Returns TRUE if there's more to the expression (e.g., a subscript),
3379 * FALSE otherwise.
ffb4593c
NT
3380 *
3381 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3382 *
3383 * ->[ and ->{ return TRUE
3384 * { and [ outside a pattern are always subscripts, so return TRUE
3385 * if we're outside a pattern and it's not { or [, then return FALSE
3386 * if we're in a pattern and the first char is a {
3387 * {4,5} (any digits around the comma) returns FALSE
3388 * if we're in a pattern and the first char is a [
3389 * [] returns FALSE
3390 * [SOMETHING] has a funky algorithm to decide whether it's a
3391 * character class or not. It has to deal with things like
3392 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3393 * anything else returns TRUE
3394 */
3395
9cbb5ea2
GS
3396/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3397
76e3520e 3398STATIC int
cea2e8a9 3399S_intuit_more(pTHX_ register char *s)
79072805 3400{
97aff369 3401 dVAR;
7918f24d
NC
3402
3403 PERL_ARGS_ASSERT_INTUIT_MORE;
3404
3280af22 3405 if (PL_lex_brackets)
79072805
LW
3406 return TRUE;
3407 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3408 return TRUE;
3409 if (*s != '{' && *s != '[')
3410 return FALSE;
3280af22 3411 if (!PL_lex_inpat)
79072805
LW
3412 return TRUE;
3413
3414 /* In a pattern, so maybe we have {n,m}. */
3415 if (*s == '{') {
3416 s++;
3417 if (!isDIGIT(*s))
3418 return TRUE;
3419 while (isDIGIT(*s))
3420 s++;
3421 if (*s == ',')
3422 s++;
3423 while (isDIGIT(*s))
3424 s++;
3425 if (*s == '}')
3426 return FALSE;
3427 return TRUE;
3428
3429 }
3430
3431 /* On the other hand, maybe we have a character class */
3432
3433 s++;
3434 if (*s == ']' || *s == '^')
3435 return FALSE;
3436 else {
ffb4593c 3437 /* this is terrifying, and it works */
79072805
LW
3438 int weight = 2; /* let's weigh the evidence */
3439 char seen[256];
f27ffc4a 3440 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3441 const char * const send = strchr(s,']');
3280af22 3442 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3443
3444 if (!send) /* has to be an expression */
3445 return TRUE;
3446
3447 Zero(seen,256,char);
3448 if (*s == '$')
3449 weight -= 3;
3450 else if (isDIGIT(*s)) {
3451 if (s[1] != ']') {
3452 if (isDIGIT(s[1]) && s[2] == ']')
3453 weight -= 10;
3454 }
3455 else
3456 weight -= 100;
3457 }
3458 for (; s < send; s++) {
3459 last_un_char = un_char;
3460 un_char = (unsigned char)*s;
3461 switch (*s) {
3462 case '@':
3463 case '&':
3464 case '$':
3465 weight -= seen[un_char] * 10;
7e2040f0 3466 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3467 int len;
8903cb82 3468 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3469 len = (int)strlen(tmpbuf);
3470 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3471 weight -= 100;
3472 else
3473 weight -= 10;
3474 }
3475 else if (*s == '$' && s[1] &&
93a17b20
LW
3476 strchr("[#!%*<>()-=",s[1])) {
3477 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3478 weight -= 10;
3479 else
3480 weight -= 1;
3481 }
3482 break;
3483 case '\\':
3484 un_char = 254;
3485 if (s[1]) {
93a17b20 3486 if (strchr("wds]",s[1]))
79072805 3487 weight += 100;
10edeb5d 3488 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3489 weight += 1;
93a17b20 3490 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3491 weight += 40;
3492 else if (isDIGIT(s[1])) {
3493 weight += 40;
3494 while (s[1] && isDIGIT(s[1]))
3495 s++;
3496 }
3497 }
3498 else
3499 weight += 100;
3500 break;
3501 case '-':
3502 if (s[1] == '\\')
3503 weight += 50;
93a17b20 3504 if (strchr("aA01! ",last_un_char))
79072805 3505 weight += 30;
93a17b20 3506 if (strchr("zZ79~",s[1]))
79072805 3507 weight += 30;
f27ffc4a
GS
3508 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3509 weight -= 5; /* cope with negative subscript */
79072805
LW
3510 break;
3511 default:
3792a11b
NC
3512 if (!isALNUM(last_un_char)
3513 && !(last_un_char == '$' || last_un_char == '@'
3514 || last_un_char == '&')
3515 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3516 char *d = tmpbuf;
3517 while (isALPHA(*s))
3518 *d++ = *s++;
3519 *d = '\0';
5458a98a 3520 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3521 weight -= 150;
3522 }
3523 if (un_char == last_un_char + 1)
3524 weight += 5;
3525 weight -= seen[un_char];
3526 break;
3527 }
3528 seen[un_char]++;
3529 }
3530 if (weight >= 0) /* probably a character class */
3531 return FALSE;
3532 }
3533
3534 return TRUE;
3535}
ffed7fef 3536
ffb4593c
NT
3537/*
3538 * S_intuit_method
3539 *
3540 * Does all the checking to disambiguate
3541 * foo bar
3542 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3543 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3544 *
3545 * First argument is the stuff after the first token, e.g. "bar".
3546 *
3547 * Not a method if bar is a filehandle.
3548 * Not a method if foo is a subroutine prototyped to take a filehandle.
3549 * Not a method if it's really "Foo $bar"
3550 * Method if it's "foo $bar"
3551 * Not a method if it's really "print foo $bar"
3552 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3553 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3554 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3555 * =>
3556 */
3557
76e3520e 3558STATIC int
62d55b22 3559S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3560{
97aff369 3561 dVAR;
a0d0e21e 3562 char *s = start + (*start == '$');
3280af22 3563 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3564 STRLEN len;
3565 GV* indirgv;
5db06880
NC
3566#ifdef PERL_MAD
3567 int soff;
3568#endif
a0d0e21e 3569
7918f24d
NC
3570 PERL_ARGS_ASSERT_INTUIT_METHOD;
3571
a0d0e21e 3572 if (gv) {
62d55b22 3573 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3574 return 0;
62d55b22
NC
3575 if (cv) {
3576 if (SvPOK(cv)) {
3577 const char *proto = SvPVX_const(cv);
3578 if (proto) {
3579 if (*proto == ';')
3580 proto++;
3581 if (*proto == '*')
3582 return 0;
3583 }
b6c543e3
IZ
3584 }
3585 } else
c35e046a 3586 gv = NULL;
a0d0e21e 3587 }
8903cb82 3588 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3589 /* start is the beginning of the possible filehandle/object,
3590 * and s is the end of it
3591 * tmpbuf is a copy of it
3592 */
3593
a0d0e21e 3594 if (*start == '$') {
3ef1310e
RGS
3595 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3596 isUPPER(*PL_tokenbuf))
a0d0e21e 3597 return 0;
5db06880
NC
3598#ifdef PERL_MAD
3599 len = start - SvPVX(PL_linestr);
3600#endif
29595ff2 3601 s = PEEKSPACE(s);
f0092767 3602#ifdef PERL_MAD
5db06880
NC
3603 start = SvPVX(PL_linestr) + len;
3604#endif
3280af22
NIS
3605 PL_bufptr = start;
3606 PL_expect = XREF;
a0d0e21e
LW
3607 return *s == '(' ? FUNCMETH : METHOD;
3608 }
5458a98a 3609 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3610 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3611 len -= 2;
3612 tmpbuf[len] = '\0';
5db06880
NC
3613#ifdef PERL_MAD
3614 soff = s - SvPVX(PL_linestr);
3615#endif
c3e0f903
GS
3616 goto bare_package;
3617 }
90e5519e 3618 indirgv = gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PVCV);
8ebc5c01 3619 if (indirgv && GvCVu(indirgv))
a0d0e21e
LW
3620 return 0;
3621 /* filehandle or package name makes it a method */
da51bb9b 3622 if (!gv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, 0)) {
5db06880
NC
3623#ifdef PERL_MAD
3624 soff = s - SvPVX(PL_linestr);
3625#endif
29595ff2 3626 s = PEEKSPACE(s);
3280af22 3627 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
55497cff 3628 return 0; /* no assumptions -- "=>" quotes bearword */
c3e0f903 3629 bare_package:
cd81e915 3630 start_force(PL_curforce);
9ded7720 3631 NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
64142370 3632 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
9ded7720 3633 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
5db06880
NC
3634 if (PL_madskills)
3635 curmad('X', newSVpvn(start,SvPVX(PL_linestr) + soff - start));
3280af22 3636 PL_expect = XTERM;
a0d0e21e 3637 force_next(WORD);
3280af22 3638 PL_bufptr = s;
5db06880
NC
3639#ifdef PERL_MAD
3640 PL_bufptr = SvPVX(PL_linestr) + soff; /* restart before space */
3641#endif
a0d0e21e
LW
3642 return *s == '(' ? FUNCMETH : METHOD;
3643 }
3644 }
3645 return 0;
3646}
3647
16d20bd9 3648/* Encoded script support. filter_add() effectively inserts a
4e553d73 3649 * 'pre-processing' function into the current source input stream.
16d20bd9
AD
3650 * Note that the filter function only applies to the current source file
3651 * (e.g., it will not affect files 'require'd or 'use'd by this one).
3652 *
3653 * The datasv parameter (which may be NULL) can be used to pass
3654 * private data to this instance of the filter. The filter function
3655 * can recover the SV using the FILTER_DATA macro and use it to
3656 * store private buffers and state information.
3657 *
3658 * The supplied datasv parameter is upgraded to a PVIO type
4755096e 3659 * and the IoDIRP/IoANY field is used to store the function pointer,
e0c19803 3660 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
16d20bd9
AD
3661 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
3662 * private use must be set using malloc'd pointers.
3663 */
16d20bd9
AD
3664
3665SV *
864dbfa3 3666Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
16d20bd9 3667{
97aff369 3668 dVAR;
f4c556ac 3669 if (!funcp)
a0714e2c 3670 return NULL;
f4c556ac 3671
5486870f
DM
3672 if (!PL_parser)
3673 return NULL;
3674
3280af22
NIS
3675 if (!PL_rsfp_filters)
3676 PL_rsfp_filters = newAV();
16d20bd9 3677 if (!datasv)
561b68a9 3678 datasv = newSV(0);
862a34c6 3679 SvUPGRADE(datasv, SVt_PVIO);
8141890a 3680 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
e0c19803 3681 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
f4c556ac 3682 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
55662e27
JH
3683 FPTR2DPTR(void *, IoANY(datasv)),
3684 SvPV_nolen(datasv)));
3280af22
NIS
3685 av_unshift(PL_rsfp_filters, 1);
3686 av_store(PL_rsfp_filters, 0, datasv) ;
16d20bd9
AD
3687 return(datasv);
3688}
4e553d73 3689
16d20bd9
AD
3690
3691/* Delete most recently added instance of this filter function. */
a0d0e21e 3692void
864dbfa3 3693Perl_filter_del(pTHX_ filter_t funcp)
16d20bd9 3694{
97aff369 3695 dVAR;
e0c19803 3696 SV *datasv;
24801a4b 3697
7918f24d
NC
3698 PERL_ARGS_ASSERT_FILTER_DEL;
3699
33073adb 3700#ifdef DEBUGGING
55662e27
JH
3701 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
3702 FPTR2DPTR(void*, funcp)));
33073adb 3703#endif
5486870f 3704 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
16d20bd9
AD
3705 return;
3706 /* if filter is on top of stack (usual case) just pop it off */
e0c19803 3707 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
8141890a 3708 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
e0c19803 3709 IoFLAGS(datasv) &= ~IOf_FAKE_DIRP;
4755096e 3710 IoANY(datasv) = (void *)NULL;
3280af22 3711 sv_free(av_pop(PL_rsfp_filters));
e50aee73 3712
16d20bd9
AD
3713 return;
3714 }
3715 /* we need to search for the correct entry and clear it */
cea2e8a9 3716 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
16d20bd9
AD
3717}
3718
3719
1de9afcd
RGS
3720/* Invoke the idxth filter function for the current rsfp. */
3721/* maxlen 0 = read one text line */
16d20bd9 3722I32
864dbfa3 3723Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
a0d0e21e 3724{
97aff369 3725 dVAR;
16d20bd9
AD
3726 filter_t funcp;
3727 SV *datasv = NULL;
f482118e
NC
3728 /* This API is bad. It should have been using unsigned int for maxlen.
3729 Not sure if we want to change the API, but if not we should sanity
3730 check the value here. */
39cd7a59
NC
3731 const unsigned int correct_length
3732 = maxlen < 0 ?
3733#ifdef PERL_MICRO
3734 0x7FFFFFFF
3735#else
3736 INT_MAX
3737#endif
3738 : maxlen;
e50aee73 3739
7918f24d
NC
3740 PERL_ARGS_ASSERT_FILTER_READ;
3741
5486870f 3742 if (!PL_parser || !PL_rsfp_filters)
16d20bd9 3743 return -1;
1de9afcd 3744 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
16d20bd9
AD
3745 /* Provide a default input filter to make life easy. */
3746 /* Note that we append to the line. This is handy. */
f4c556ac
GS
3747 DEBUG_P(PerlIO_printf(Perl_debug_log,
3748 "filter_read %d: from rsfp\n", idx));
f482118e 3749 if (correct_length) {
16d20bd9
AD
3750 /* Want a block */
3751 int len ;
f54cb97a 3752 const int old_len = SvCUR(buf_sv);
16d20bd9
AD
3753
3754 /* ensure buf_sv is large enough */
881d8f0a 3755 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
f482118e
NC
3756 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
3757 correct_length)) <= 0) {
3280af22 3758 if (PerlIO_error(PL_rsfp))
37120919
AD
3759 return -1; /* error */
3760 else
3761 return 0 ; /* end of file */
3762 }
16d20bd9 3763 SvCUR_set(buf_sv, old_len + len) ;
881d8f0a 3764 SvPVX(buf_sv)[old_len + len] = '\0';
16d20bd9
AD
3765 } else {
3766 /* Want a line */
3280af22
NIS
3767 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
3768 if (PerlIO_error(PL_rsfp))
37120919
AD
3769 return -1; /* error */
3770 else
3771 return 0 ; /* end of file */
3772 }