This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Improve handling of qq(\N{...}); and /x
[perl5.git] / toke.c
CommitLineData
a0d0e21e 1/* toke.c
a687059c 2 *
1129b882
NC
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
a687059c 5 *
d48672a2
LW
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
378cc40b 8 *
a0d0e21e
LW
9 */
10
11/*
4ac71550
TC
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
378cc40b
LW
15 */
16
9cbb5ea2
GS
17/*
18 * This file is the lexer for Perl. It's closely linked to the
4e553d73 19 * parser, perly.y.
ffb4593c
NT
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
f0e67a1d
Z
24/*
25=head1 Lexer interface
26
27This is the lower layer of the Perl parser, managing characters and tokens.
28
29=for apidoc AmU|yy_parser *|PL_parser
30
31Pointer to a structure encapsulating the state of the parsing operation
32currently in progress. The pointer can be locally changed to perform
33a nested parse without interfering with the state of an outer parse.
34Individual members of C<PL_parser> have their own documentation.
35
36=cut
37*/
38
378cc40b 39#include "EXTERN.h"
864dbfa3 40#define PERL_IN_TOKE_C
378cc40b 41#include "perl.h"
378cc40b 42
eb0d8d16
NC
43#define new_constant(a,b,c,d,e,f,g) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g)
45
6154021b 46#define pl_yylval (PL_parser->yylval)
d3b6f988 47
acdf0a21
DM
48/* YYINITDEPTH -- initial size of the parser's stacks. */
49#define YYINITDEPTH 200
50
199e78b7
DM
51/* XXX temporary backwards compatibility */
52#define PL_lex_brackets (PL_parser->lex_brackets)
53#define PL_lex_brackstack (PL_parser->lex_brackstack)
54#define PL_lex_casemods (PL_parser->lex_casemods)
55#define PL_lex_casestack (PL_parser->lex_casestack)
56#define PL_lex_defer (PL_parser->lex_defer)
57#define PL_lex_dojoin (PL_parser->lex_dojoin)
58#define PL_lex_expect (PL_parser->lex_expect)
59#define PL_lex_formbrack (PL_parser->lex_formbrack)
60#define PL_lex_inpat (PL_parser->lex_inpat)
61#define PL_lex_inwhat (PL_parser->lex_inwhat)
62#define PL_lex_op (PL_parser->lex_op)
63#define PL_lex_repl (PL_parser->lex_repl)
64#define PL_lex_starts (PL_parser->lex_starts)
65#define PL_lex_stuff (PL_parser->lex_stuff)
66#define PL_multi_start (PL_parser->multi_start)
67#define PL_multi_open (PL_parser->multi_open)
68#define PL_multi_close (PL_parser->multi_close)
69#define PL_pending_ident (PL_parser->pending_ident)
70#define PL_preambled (PL_parser->preambled)
71#define PL_sublex_info (PL_parser->sublex_info)
bdc0bf6f 72#define PL_linestr (PL_parser->linestr)
c2598295
DM
73#define PL_expect (PL_parser->expect)
74#define PL_copline (PL_parser->copline)
f06b5848
DM
75#define PL_bufptr (PL_parser->bufptr)
76#define PL_oldbufptr (PL_parser->oldbufptr)
77#define PL_oldoldbufptr (PL_parser->oldoldbufptr)
78#define PL_linestart (PL_parser->linestart)
79#define PL_bufend (PL_parser->bufend)
80#define PL_last_uni (PL_parser->last_uni)
81#define PL_last_lop (PL_parser->last_lop)
82#define PL_last_lop_op (PL_parser->last_lop_op)
bc177e6b 83#define PL_lex_state (PL_parser->lex_state)
2f9285f8 84#define PL_rsfp (PL_parser->rsfp)
5486870f 85#define PL_rsfp_filters (PL_parser->rsfp_filters)
12bd6ede
DM
86#define PL_in_my (PL_parser->in_my)
87#define PL_in_my_stash (PL_parser->in_my_stash)
14047fc9 88#define PL_tokenbuf (PL_parser->tokenbuf)
670a9cb2 89#define PL_multi_end (PL_parser->multi_end)
13765c85 90#define PL_error_count (PL_parser->error_count)
199e78b7
DM
91
92#ifdef PERL_MAD
93# define PL_endwhite (PL_parser->endwhite)
94# define PL_faketokens (PL_parser->faketokens)
95# define PL_lasttoke (PL_parser->lasttoke)
96# define PL_nextwhite (PL_parser->nextwhite)
97# define PL_realtokenstart (PL_parser->realtokenstart)
98# define PL_skipwhite (PL_parser->skipwhite)
99# define PL_thisclose (PL_parser->thisclose)
100# define PL_thismad (PL_parser->thismad)
101# define PL_thisopen (PL_parser->thisopen)
102# define PL_thisstuff (PL_parser->thisstuff)
103# define PL_thistoken (PL_parser->thistoken)
104# define PL_thiswhite (PL_parser->thiswhite)
fb205e7a
DM
105# define PL_thiswhite (PL_parser->thiswhite)
106# define PL_nexttoke (PL_parser->nexttoke)
107# define PL_curforce (PL_parser->curforce)
108#else
109# define PL_nexttoke (PL_parser->nexttoke)
110# define PL_nexttype (PL_parser->nexttype)
111# define PL_nextval (PL_parser->nextval)
199e78b7
DM
112#endif
113
16173588
NC
114/* This can't be done with embed.fnc, because struct yy_parser contains a
115 member named pending_ident, which clashes with the generated #define */
3cbf51f5
DM
116static int
117S_pending_ident(pTHX);
199e78b7 118
0bd48802 119static const char ident_too_long[] = "Identifier too long";
8903cb82 120
29595ff2 121#ifdef PERL_MAD
29595ff2 122# define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; }
cd81e915 123# define NEXTVAL_NEXTTOKE PL_nexttoke[PL_curforce].next_val
9ded7720 124#else
5db06880 125# define CURMAD(slot,sv)
9ded7720 126# define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
29595ff2
NC
127#endif
128
9059aa12
LW
129#define XFAKEBRACK 128
130#define XENUMMASK 127
131
39e02b42
JH
132#ifdef USE_UTF8_SCRIPTS
133# define UTF (!IN_BYTES)
2b9d42f0 134#else
746b446a 135# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
2b9d42f0 136#endif
a0ed51b3 137
b1fc3636
CJ
138/* The maximum number of characters preceding the unrecognized one to display */
139#define UNRECOGNIZED_PRECEDE_COUNT 10
140
61f0cdd9 141/* In variables named $^X, these are the legal values for X.
2b92dfce
GS
142 * 1999-02-27 mjd-perl-patch@plover.com */
143#define isCONTROLVAR(x) (isUPPER(x) || strchr("[\\]^_?", (x)))
144
bf4acbe4 145#define SPACE_OR_TAB(c) ((c)==' '||(c)=='\t')
bf4acbe4 146
ffb4593c
NT
147/* LEX_* are values for PL_lex_state, the state of the lexer.
148 * They are arranged oddly so that the guard on the switch statement
79072805
LW
149 * can get by with a single comparison (if the compiler is smart enough).
150 */
151
fb73857a 152/* #define LEX_NOTPARSING 11 is done in perl.h. */
153
b6007c36
DM
154#define LEX_NORMAL 10 /* normal code (ie not within "...") */
155#define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
156#define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
157#define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
158#define LEX_INTERPSTART 6 /* expecting the start of a $var */
159
160 /* at end of code, eg "$x" followed by: */
161#define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
162#define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
163
164#define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
165 string or after \E, $foo, etc */
166#define LEX_INTERPCONST 2 /* NOT USED */
167#define LEX_FORMLINE 1 /* expecting a format line */
168#define LEX_KNOWNEXT 0 /* next token known; just return it */
169
79072805 170
bbf60fe6 171#ifdef DEBUGGING
27da23d5 172static const char* const lex_state_names[] = {
bbf60fe6
DM
173 "KNOWNEXT",
174 "FORMLINE",
175 "INTERPCONST",
176 "INTERPCONCAT",
177 "INTERPENDMAYBE",
178 "INTERPEND",
179 "INTERPSTART",
180 "INTERPPUSH",
181 "INTERPCASEMOD",
182 "INTERPNORMAL",
183 "NORMAL"
184};
185#endif
186
79072805
LW
187#ifdef ff_next
188#undef ff_next
d48672a2
LW
189#endif
190
79072805 191#include "keywords.h"
fe14fcc3 192
ffb4593c
NT
193/* CLINE is a macro that ensures PL_copline has a sane value */
194
ae986130
LW
195#ifdef CLINE
196#undef CLINE
197#endif
57843af0 198#define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
3280af22 199
5db06880 200#ifdef PERL_MAD
29595ff2
NC
201# define SKIPSPACE0(s) skipspace0(s)
202# define SKIPSPACE1(s) skipspace1(s)
203# define SKIPSPACE2(s,tsv) skipspace2(s,&tsv)
204# define PEEKSPACE(s) skipspace2(s,0)
205#else
206# define SKIPSPACE0(s) skipspace(s)
207# define SKIPSPACE1(s) skipspace(s)
208# define SKIPSPACE2(s,tsv) skipspace(s)
209# define PEEKSPACE(s) skipspace(s)
210#endif
211
ffb4593c
NT
212/*
213 * Convenience functions to return different tokens and prime the
9cbb5ea2 214 * lexer for the next token. They all take an argument.
ffb4593c
NT
215 *
216 * TOKEN : generic token (used for '(', DOLSHARP, etc)
217 * OPERATOR : generic operator
218 * AOPERATOR : assignment operator
219 * PREBLOCK : beginning the block after an if, while, foreach, ...
220 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
221 * PREREF : *EXPR where EXPR is not a simple identifier
222 * TERM : expression term
223 * LOOPX : loop exiting command (goto, last, dump, etc)
224 * FTST : file test operator
225 * FUN0 : zero-argument function
2d2e263d 226 * FUN1 : not used, except for not, which isn't a UNIOP
ffb4593c
NT
227 * BOop : bitwise or or xor
228 * BAop : bitwise and
229 * SHop : shift operator
230 * PWop : power operator
9cbb5ea2 231 * PMop : pattern-matching operator
ffb4593c
NT
232 * Aop : addition-level operator
233 * Mop : multiplication-level operator
234 * Eop : equality-testing operator
e5edeb50 235 * Rop : relational operator <= != gt
ffb4593c
NT
236 *
237 * Also see LOP and lop() below.
238 */
239
998054bd 240#ifdef DEBUGGING /* Serve -DT. */
704d4215 241# define REPORT(retval) tokereport((I32)retval, &pl_yylval)
998054bd 242#else
bbf60fe6 243# define REPORT(retval) (retval)
998054bd
SC
244#endif
245
bbf60fe6
DM
246#define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
247#define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
248#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval)))
249#define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
250#define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
251#define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
252#define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
6154021b
RGS
253#define LOOPX(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)LOOPEX))
254#define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
255#define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
256#define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
257#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP)))
258#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP)))
259#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP)))
260#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP)))
261#define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
262#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP)))
263#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP)))
264#define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP))
265#define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP))
2f3197b3 266
a687059c
LW
267/* This bit of chicanery makes a unary function followed by
268 * a parenthesis into a function with one argument, highest precedence.
6f33ba73
RGS
269 * The UNIDOR macro is for unary functions that can be followed by the //
270 * operator (such as C<shift // 0>).
a687059c 271 */
376fcdbf 272#define UNI2(f,x) { \
6154021b 273 pl_yylval.ival = f; \
376fcdbf
AL
274 PL_expect = x; \
275 PL_bufptr = s; \
276 PL_last_uni = PL_oldbufptr; \
277 PL_last_lop_op = f; \
278 if (*s == '(') \
279 return REPORT( (int)FUNC1 ); \
29595ff2 280 s = PEEKSPACE(s); \
376fcdbf
AL
281 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
282 }
6f33ba73
RGS
283#define UNI(f) UNI2(f,XTERM)
284#define UNIDOR(f) UNI2(f,XTERMORDORDOR)
a687059c 285
376fcdbf 286#define UNIBRACK(f) { \
6154021b 287 pl_yylval.ival = f; \
376fcdbf
AL
288 PL_bufptr = s; \
289 PL_last_uni = PL_oldbufptr; \
290 if (*s == '(') \
291 return REPORT( (int)FUNC1 ); \
29595ff2 292 s = PEEKSPACE(s); \
376fcdbf
AL
293 return REPORT( (*s == '(') ? (int)FUNC1 : (int)UNIOP ); \
294 }
79072805 295
9f68db38 296/* grandfather return to old style */
6154021b 297#define OLDLOP(f) return(pl_yylval.ival=f,PL_expect = XTERM,PL_bufptr = s,(int)LSTOP)
79072805 298
8fa7f367
JH
299#ifdef DEBUGGING
300
6154021b 301/* how to interpret the pl_yylval associated with the token */
bbf60fe6
DM
302enum token_type {
303 TOKENTYPE_NONE,
304 TOKENTYPE_IVAL,
6154021b 305 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
bbf60fe6
DM
306 TOKENTYPE_PVAL,
307 TOKENTYPE_OPVAL,
308 TOKENTYPE_GVVAL
309};
310
6d4a66ac
NC
311static struct debug_tokens {
312 const int token;
313 enum token_type type;
314 const char *name;
315} const debug_tokens[] =
9041c2e3 316{
bbf60fe6
DM
317 { ADDOP, TOKENTYPE_OPNUM, "ADDOP" },
318 { ANDAND, TOKENTYPE_NONE, "ANDAND" },
319 { ANDOP, TOKENTYPE_NONE, "ANDOP" },
320 { ANONSUB, TOKENTYPE_IVAL, "ANONSUB" },
321 { ARROW, TOKENTYPE_NONE, "ARROW" },
322 { ASSIGNOP, TOKENTYPE_OPNUM, "ASSIGNOP" },
323 { BITANDOP, TOKENTYPE_OPNUM, "BITANDOP" },
324 { BITOROP, TOKENTYPE_OPNUM, "BITOROP" },
325 { COLONATTR, TOKENTYPE_NONE, "COLONATTR" },
326 { CONTINUE, TOKENTYPE_NONE, "CONTINUE" },
0d863452 327 { DEFAULT, TOKENTYPE_NONE, "DEFAULT" },
bbf60fe6
DM
328 { DO, TOKENTYPE_NONE, "DO" },
329 { DOLSHARP, TOKENTYPE_NONE, "DOLSHARP" },
330 { DORDOR, TOKENTYPE_NONE, "DORDOR" },
331 { DOROP, TOKENTYPE_OPNUM, "DOROP" },
332 { DOTDOT, TOKENTYPE_IVAL, "DOTDOT" },
333 { ELSE, TOKENTYPE_NONE, "ELSE" },
334 { ELSIF, TOKENTYPE_IVAL, "ELSIF" },
335 { EQOP, TOKENTYPE_OPNUM, "EQOP" },
336 { FOR, TOKENTYPE_IVAL, "FOR" },
337 { FORMAT, TOKENTYPE_NONE, "FORMAT" },
338 { FUNC, TOKENTYPE_OPNUM, "FUNC" },
339 { FUNC0, TOKENTYPE_OPNUM, "FUNC0" },
340 { FUNC0SUB, TOKENTYPE_OPVAL, "FUNC0SUB" },
341 { FUNC1, TOKENTYPE_OPNUM, "FUNC1" },
342 { FUNCMETH, TOKENTYPE_OPVAL, "FUNCMETH" },
0d863452 343 { GIVEN, TOKENTYPE_IVAL, "GIVEN" },
bbf60fe6
DM
344 { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" },
345 { IF, TOKENTYPE_IVAL, "IF" },
346 { LABEL, TOKENTYPE_PVAL, "LABEL" },
347 { LOCAL, TOKENTYPE_IVAL, "LOCAL" },
348 { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" },
349 { LSTOP, TOKENTYPE_OPNUM, "LSTOP" },
350 { LSTOPSUB, TOKENTYPE_OPVAL, "LSTOPSUB" },
351 { MATCHOP, TOKENTYPE_OPNUM, "MATCHOP" },
352 { METHOD, TOKENTYPE_OPVAL, "METHOD" },
353 { MULOP, TOKENTYPE_OPNUM, "MULOP" },
354 { MY, TOKENTYPE_IVAL, "MY" },
355 { MYSUB, TOKENTYPE_NONE, "MYSUB" },
356 { NOAMP, TOKENTYPE_NONE, "NOAMP" },
357 { NOTOP, TOKENTYPE_NONE, "NOTOP" },
358 { OROP, TOKENTYPE_IVAL, "OROP" },
359 { OROR, TOKENTYPE_NONE, "OROR" },
360 { PACKAGE, TOKENTYPE_NONE, "PACKAGE" },
88e1f1a2
JV
361 { PLUGEXPR, TOKENTYPE_OPVAL, "PLUGEXPR" },
362 { PLUGSTMT, TOKENTYPE_OPVAL, "PLUGSTMT" },
bbf60fe6
DM
363 { PMFUNC, TOKENTYPE_OPVAL, "PMFUNC" },
364 { POSTDEC, TOKENTYPE_NONE, "POSTDEC" },
365 { POSTINC, TOKENTYPE_NONE, "POSTINC" },
366 { POWOP, TOKENTYPE_OPNUM, "POWOP" },
367 { PREDEC, TOKENTYPE_NONE, "PREDEC" },
368 { PREINC, TOKENTYPE_NONE, "PREINC" },
369 { PRIVATEREF, TOKENTYPE_OPVAL, "PRIVATEREF" },
370 { REFGEN, TOKENTYPE_NONE, "REFGEN" },
371 { RELOP, TOKENTYPE_OPNUM, "RELOP" },
372 { SHIFTOP, TOKENTYPE_OPNUM, "SHIFTOP" },
373 { SUB, TOKENTYPE_NONE, "SUB" },
374 { THING, TOKENTYPE_OPVAL, "THING" },
375 { UMINUS, TOKENTYPE_NONE, "UMINUS" },
376 { UNIOP, TOKENTYPE_OPNUM, "UNIOP" },
377 { UNIOPSUB, TOKENTYPE_OPVAL, "UNIOPSUB" },
378 { UNLESS, TOKENTYPE_IVAL, "UNLESS" },
379 { UNTIL, TOKENTYPE_IVAL, "UNTIL" },
380 { USE, TOKENTYPE_IVAL, "USE" },
0d863452 381 { WHEN, TOKENTYPE_IVAL, "WHEN" },
bbf60fe6
DM
382 { WHILE, TOKENTYPE_IVAL, "WHILE" },
383 { WORD, TOKENTYPE_OPVAL, "WORD" },
be25f609 384 { YADAYADA, TOKENTYPE_IVAL, "YADAYADA" },
c35e046a 385 { 0, TOKENTYPE_NONE, NULL }
bbf60fe6
DM
386};
387
6154021b 388/* dump the returned token in rv, plus any optional arg in pl_yylval */
998054bd 389
bbf60fe6 390STATIC int
704d4215 391S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
bbf60fe6 392{
97aff369 393 dVAR;
7918f24d
NC
394
395 PERL_ARGS_ASSERT_TOKEREPORT;
396
bbf60fe6 397 if (DEBUG_T_TEST) {
bd61b366 398 const char *name = NULL;
bbf60fe6 399 enum token_type type = TOKENTYPE_NONE;
f54cb97a 400 const struct debug_tokens *p;
396482e1 401 SV* const report = newSVpvs("<== ");
bbf60fe6 402
f54cb97a 403 for (p = debug_tokens; p->token; p++) {
bbf60fe6
DM
404 if (p->token == (int)rv) {
405 name = p->name;
406 type = p->type;
407 break;
408 }
409 }
410 if (name)
54667de8 411 Perl_sv_catpv(aTHX_ report, name);
bbf60fe6
DM
412 else if ((char)rv > ' ' && (char)rv < '~')
413 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
414 else if (!rv)
396482e1 415 sv_catpvs(report, "EOF");
bbf60fe6
DM
416 else
417 Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv);
418 switch (type) {
419 case TOKENTYPE_NONE:
420 case TOKENTYPE_GVVAL: /* doesn't appear to be used */
421 break;
422 case TOKENTYPE_IVAL:
704d4215 423 Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival);
bbf60fe6
DM
424 break;
425 case TOKENTYPE_OPNUM:
426 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
704d4215 427 PL_op_name[lvalp->ival]);
bbf60fe6
DM
428 break;
429 case TOKENTYPE_PVAL:
704d4215 430 Perl_sv_catpvf(aTHX_ report, "(pval=\"%s\")", lvalp->pval);
bbf60fe6
DM
431 break;
432 case TOKENTYPE_OPVAL:
704d4215 433 if (lvalp->opval) {
401441c0 434 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
704d4215
GG
435 PL_op_name[lvalp->opval->op_type]);
436 if (lvalp->opval->op_type == OP_CONST) {
b6007c36 437 Perl_sv_catpvf(aTHX_ report, " %s",
704d4215 438 SvPEEK(cSVOPx_sv(lvalp->opval)));
b6007c36
DM
439 }
440
441 }
401441c0 442 else
396482e1 443 sv_catpvs(report, "(opval=null)");
bbf60fe6
DM
444 break;
445 }
b6007c36 446 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
bbf60fe6
DM
447 };
448 return (int)rv;
998054bd
SC
449}
450
b6007c36
DM
451
452/* print the buffer with suitable escapes */
453
454STATIC void
15f169a1 455S_printbuf(pTHX_ const char *const fmt, const char *const s)
b6007c36 456{
396482e1 457 SV* const tmp = newSVpvs("");
7918f24d
NC
458
459 PERL_ARGS_ASSERT_PRINTBUF;
460
b6007c36
DM
461 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
462 SvREFCNT_dec(tmp);
463}
464
8fa7f367
JH
465#endif
466
8290c323
NC
467static int
468S_deprecate_commaless_var_list(pTHX) {
469 PL_expect = XTERM;
470 deprecate("comma-less variable list");
471 return REPORT(','); /* grandfather non-comma-format format */
472}
473
ffb4593c
NT
474/*
475 * S_ao
476 *
c963b151
BD
477 * This subroutine detects &&=, ||=, and //= and turns an ANDAND, OROR or DORDOR
478 * into an OP_ANDASSIGN, OP_ORASSIGN, or OP_DORASSIGN
ffb4593c
NT
479 */
480
76e3520e 481STATIC int
cea2e8a9 482S_ao(pTHX_ int toketype)
a0d0e21e 483{
97aff369 484 dVAR;
3280af22
NIS
485 if (*PL_bufptr == '=') {
486 PL_bufptr++;
a0d0e21e 487 if (toketype == ANDAND)
6154021b 488 pl_yylval.ival = OP_ANDASSIGN;
a0d0e21e 489 else if (toketype == OROR)
6154021b 490 pl_yylval.ival = OP_ORASSIGN;
c963b151 491 else if (toketype == DORDOR)
6154021b 492 pl_yylval.ival = OP_DORASSIGN;
a0d0e21e
LW
493 toketype = ASSIGNOP;
494 }
495 return toketype;
496}
497
ffb4593c
NT
498/*
499 * S_no_op
500 * When Perl expects an operator and finds something else, no_op
501 * prints the warning. It always prints "<something> found where
502 * operator expected. It prints "Missing semicolon on previous line?"
503 * if the surprise occurs at the start of the line. "do you need to
504 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
505 * where the compiler doesn't know if foo is a method call or a function.
506 * It prints "Missing operator before end of line" if there's nothing
507 * after the missing operator, or "... before <...>" if there is something
508 * after the missing operator.
509 */
510
76e3520e 511STATIC void
15f169a1 512S_no_op(pTHX_ const char *const what, char *s)
463ee0b2 513{
97aff369 514 dVAR;
9d4ba2ae
AL
515 char * const oldbp = PL_bufptr;
516 const bool is_first = (PL_oldbufptr == PL_linestart);
68dc0745 517
7918f24d
NC
518 PERL_ARGS_ASSERT_NO_OP;
519
1189a94a
GS
520 if (!s)
521 s = oldbp;
07c798fb 522 else
1189a94a 523 PL_bufptr = s;
cea2e8a9 524 yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
56da5a46
RGS
525 if (ckWARN_d(WARN_SYNTAX)) {
526 if (is_first)
527 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
528 "\t(Missing semicolon on previous line?)\n");
529 else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
f54cb97a 530 const char *t;
c35e046a
AL
531 for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); t++)
532 NOOP;
56da5a46
RGS
533 if (t < PL_bufptr && isSPACE(*t))
534 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
535 "\t(Do you need to predeclare %.*s?)\n",
551405c4 536 (int)(t - PL_oldoldbufptr), PL_oldoldbufptr);
56da5a46
RGS
537 }
538 else {
539 assert(s >= oldbp);
540 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
551405c4 541 "\t(Missing operator before %.*s?)\n", (int)(s - oldbp), oldbp);
56da5a46 542 }
07c798fb 543 }
3280af22 544 PL_bufptr = oldbp;
8990e307
LW
545}
546
ffb4593c
NT
547/*
548 * S_missingterm
549 * Complain about missing quote/regexp/heredoc terminator.
d4c19fe8 550 * If it's called with NULL then it cauterizes the line buffer.
ffb4593c
NT
551 * If we're in a delimited string and the delimiter is a control
552 * character, it's reformatted into a two-char sequence like ^C.
553 * This is fatal.
554 */
555
76e3520e 556STATIC void
cea2e8a9 557S_missingterm(pTHX_ char *s)
8990e307 558{
97aff369 559 dVAR;
8990e307
LW
560 char tmpbuf[3];
561 char q;
562 if (s) {
9d4ba2ae 563 char * const nl = strrchr(s,'\n');
d2719217 564 if (nl)
8990e307
LW
565 *nl = '\0';
566 }
463559e7 567 else if (isCNTRL(PL_multi_close)) {
8990e307 568 *tmpbuf = '^';
585ec06d 569 tmpbuf[1] = (char)toCTRL(PL_multi_close);
8990e307
LW
570 tmpbuf[2] = '\0';
571 s = tmpbuf;
572 }
573 else {
eb160463 574 *tmpbuf = (char)PL_multi_close;
8990e307
LW
575 tmpbuf[1] = '\0';
576 s = tmpbuf;
577 }
578 q = strchr(s,'"') ? '\'' : '"';
cea2e8a9 579 Perl_croak(aTHX_ "Can't find string terminator %c%s%c anywhere before EOF",q,s,q);
463ee0b2 580}
79072805 581
ef89dcc3 582#define FEATURE_IS_ENABLED(name) \
0d863452 583 ((0 != (PL_hints & HINT_LOCALIZE_HH)) \
89529cee 584 && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name)))
4a731d7b 585/* The longest string we pass in. */
1863b879 586#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1)
4a731d7b 587
0d863452
RH
588/*
589 * S_feature_is_enabled
590 * Check whether the named feature is enabled.
591 */
592STATIC bool
15f169a1 593S_feature_is_enabled(pTHX_ const char *const name, STRLEN namelen)
0d863452 594{
97aff369 595 dVAR;
0d863452 596 HV * const hinthv = GvHV(PL_hintgv);
4a731d7b 597 char he_name[8 + MAX_FEATURE_LEN] = "feature_";
7918f24d
NC
598
599 PERL_ARGS_ASSERT_FEATURE_IS_ENABLED;
600
4a731d7b
NC
601 assert(namelen <= MAX_FEATURE_LEN);
602 memcpy(&he_name[8], name, namelen);
d4c19fe8 603
7b9ef140 604 return (hinthv && hv_exists(hinthv, he_name, 8 + namelen));
0d863452
RH
605}
606
ffb4593c 607/*
9cbb5ea2
GS
608 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
609 * utf16-to-utf8-reversed.
ffb4593c
NT
610 */
611
c39cd008
GS
612#ifdef PERL_CR_FILTER
613static void
614strip_return(SV *sv)
615{
95a20fc0 616 register const char *s = SvPVX_const(sv);
9d4ba2ae 617 register const char * const e = s + SvCUR(sv);
7918f24d
NC
618
619 PERL_ARGS_ASSERT_STRIP_RETURN;
620
c39cd008
GS
621 /* outer loop optimized to do nothing if there are no CR-LFs */
622 while (s < e) {
623 if (*s++ == '\r' && *s == '\n') {
624 /* hit a CR-LF, need to copy the rest */
625 register char *d = s - 1;
626 *d++ = *s++;
627 while (s < e) {
628 if (*s == '\r' && s[1] == '\n')
629 s++;
630 *d++ = *s++;
631 }
632 SvCUR(sv) -= s - d;
633 return;
634 }
635 }
636}
a868473f 637
76e3520e 638STATIC I32
c39cd008 639S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
a868473f 640{
f54cb97a 641 const I32 count = FILTER_READ(idx+1, sv, maxlen);
c39cd008
GS
642 if (count > 0 && !maxlen)
643 strip_return(sv);
644 return count;
a868473f
NIS
645}
646#endif
647
199e78b7
DM
648
649
ffb4593c
NT
650/*
651 * Perl_lex_start
5486870f 652 *
e3abe207 653 * Create a parser object and initialise its parser and lexer fields
5486870f
DM
654 *
655 * rsfp is the opened file handle to read from (if any),
656 *
657 * line holds any initial content already read from the file (or in
658 * the case of no file, such as an eval, the whole contents);
659 *
660 * new_filter indicates that this is a new file and it shouldn't inherit
661 * the filters from the current parser (ie require).
ffb4593c
NT
662 */
663
a0d0e21e 664void
5486870f 665Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, bool new_filter)
79072805 666{
97aff369 667 dVAR;
6ef55633 668 const char *s = NULL;
8990e307 669 STRLEN len;
5486870f 670 yy_parser *parser, *oparser;
acdf0a21
DM
671
672 /* create and initialise a parser */
673
199e78b7 674 Newxz(parser, 1, yy_parser);
5486870f 675 parser->old_parser = oparser = PL_parser;
acdf0a21
DM
676 PL_parser = parser;
677
678 Newx(parser->stack, YYINITDEPTH, yy_stack_frame);
679 parser->ps = parser->stack;
680 parser->stack_size = YYINITDEPTH;
681
682 parser->stack->state = 0;
683 parser->yyerrstatus = 0;
684 parser->yychar = YYEMPTY; /* Cause a token to be read. */
685
e3abe207
DM
686 /* on scope exit, free this parser and restore any outer one */
687 SAVEPARSER(parser);
7c4baf47 688 parser->saved_curcop = PL_curcop;
e3abe207 689
acdf0a21 690 /* initialise lexer state */
8990e307 691
fb205e7a
DM
692#ifdef PERL_MAD
693 parser->curforce = -1;
694#else
695 parser->nexttoke = 0;
696#endif
ca4cfd28 697 parser->error_count = oparser ? oparser->error_count : 0;
c2598295 698 parser->copline = NOLINE;
5afb0a62 699 parser->lex_state = LEX_NORMAL;
c2598295 700 parser->expect = XSTATE;
2f9285f8 701 parser->rsfp = rsfp;
56b27c9a 702 parser->rsfp_filters = (new_filter || !oparser) ? newAV()
502c6561 703 : MUTABLE_AV(SvREFCNT_inc(oparser->rsfp_filters));
2f9285f8 704
199e78b7
DM
705 Newx(parser->lex_brackstack, 120, char);
706 Newx(parser->lex_casestack, 12, char);
707 *parser->lex_casestack = '\0';
02b34bbe 708
10efb74f
NC
709 if (line) {
710 s = SvPV_const(line, len);
711 } else {
712 len = 0;
713 }
bdc0bf6f 714
10efb74f 715 if (!len) {
bdc0bf6f 716 parser->linestr = newSVpvs("\n;");
10efb74f 717 } else if (SvREADONLY(line) || s[len-1] != ';') {
bdc0bf6f 718 parser->linestr = newSVsv(line);
10efb74f 719 if (s[len-1] != ';')
bdc0bf6f 720 sv_catpvs(parser->linestr, "\n;");
6c5ce11d
NC
721 } else {
722 SvTEMP_off(line);
723 SvREFCNT_inc_simple_void_NN(line);
bdc0bf6f 724 parser->linestr = line;
8990e307 725 }
f06b5848
DM
726 parser->oldoldbufptr =
727 parser->oldbufptr =
728 parser->bufptr =
729 parser->linestart = SvPVX(parser->linestr);
730 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
731 parser->last_lop = parser->last_uni = NULL;
79072805 732}
a687059c 733
e3abe207
DM
734
735/* delete a parser object */
736
737void
738Perl_parser_free(pTHX_ const yy_parser *parser)
739{
7918f24d
NC
740 PERL_ARGS_ASSERT_PARSER_FREE;
741
7c4baf47 742 PL_curcop = parser->saved_curcop;
bdc0bf6f
DM
743 SvREFCNT_dec(parser->linestr);
744
2f9285f8
DM
745 if (parser->rsfp == PerlIO_stdin())
746 PerlIO_clearerr(parser->rsfp);
799361c3
SH
747 else if (parser->rsfp && (!parser->old_parser ||
748 (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
2f9285f8 749 PerlIO_close(parser->rsfp);
5486870f 750 SvREFCNT_dec(parser->rsfp_filters);
2f9285f8 751
e3abe207
DM
752 Safefree(parser->stack);
753 Safefree(parser->lex_brackstack);
754 Safefree(parser->lex_casestack);
755 PL_parser = parser->old_parser;
756 Safefree(parser);
757}
758
759
ffb4593c
NT
760/*
761 * Perl_lex_end
9cbb5ea2
GS
762 * Finalizer for lexing operations. Must be called when the parser is
763 * done with the lexer.
ffb4593c
NT
764 */
765
463ee0b2 766void
864dbfa3 767Perl_lex_end(pTHX)
463ee0b2 768{
97aff369 769 dVAR;
3280af22 770 PL_doextract = FALSE;
463ee0b2
LW
771}
772
ffb4593c 773/*
f0e67a1d
Z
774=for apidoc AmxU|SV *|PL_parser-E<gt>linestr
775
776Buffer scalar containing the chunk currently under consideration of the
777text currently being lexed. This is always a plain string scalar (for
778which C<SvPOK> is true). It is not intended to be used as a scalar by
779normal scalar means; instead refer to the buffer directly by the pointer
780variables described below.
781
782The lexer maintains various C<char*> pointers to things in the
783C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
784reallocated, all of these pointers must be updated. Don't attempt to
785do this manually, but rather use L</lex_grow_linestr> if you need to
786reallocate the buffer.
787
788The content of the text chunk in the buffer is commonly exactly one
789complete line of input, up to and including a newline terminator,
790but there are situations where it is otherwise. The octets of the
791buffer may be intended to be interpreted as either UTF-8 or Latin-1.
792The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
793flag on this scalar, which may disagree with it.
794
795For direct examination of the buffer, the variable
796L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
797lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
798of these pointers is usually preferable to examination of the scalar
799through normal scalar means.
800
801=for apidoc AmxU|char *|PL_parser-E<gt>bufend
802
803Direct pointer to the end of the chunk of text currently being lexed, the
804end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
805+ SvCUR(PL_parser-E<gt>linestr)>. A NUL character (zero octet) is
806always located at the end of the buffer, and does not count as part of
807the buffer's contents.
808
809=for apidoc AmxU|char *|PL_parser-E<gt>bufptr
810
811Points to the current position of lexing inside the lexer buffer.
812Characters around this point may be freely examined, within
813the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
814L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
815interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
816
817Lexing code (whether in the Perl core or not) moves this pointer past
818the characters that it consumes. It is also expected to perform some
819bookkeeping whenever a newline character is consumed. This movement
820can be more conveniently performed by the function L</lex_read_to>,
821which handles newlines appropriately.
822
823Interpretation of the buffer's octets can be abstracted out by
824using the slightly higher-level functions L</lex_peek_unichar> and
825L</lex_read_unichar>.
826
827=for apidoc AmxU|char *|PL_parser-E<gt>linestart
828
829Points to the start of the current line inside the lexer buffer.
830This is useful for indicating at which column an error occurred, and
831not much else. This must be updated by any lexing code that consumes
832a newline; the function L</lex_read_to> handles this detail.
833
834=cut
835*/
836
837/*
838=for apidoc Amx|bool|lex_bufutf8
839
840Indicates whether the octets in the lexer buffer
841(L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
842of Unicode characters. If not, they should be interpreted as Latin-1
843characters. This is analogous to the C<SvUTF8> flag for scalars.
844
845In UTF-8 mode, it is not guaranteed that the lexer buffer actually
846contains valid UTF-8. Lexing code must be robust in the face of invalid
847encoding.
848
849The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
850is significant, but not the whole story regarding the input character
851encoding. Normally, when a file is being read, the scalar contains octets
852and its C<SvUTF8> flag is off, but the octets should be interpreted as
853UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
854however, the scalar may have the C<SvUTF8> flag on, and in this case its
855octets should be interpreted as UTF-8 unless the C<use bytes> pragma
856is in effect. This logic may change in the future; use this function
857instead of implementing the logic yourself.
858
859=cut
860*/
861
862bool
863Perl_lex_bufutf8(pTHX)
864{
865 return UTF;
866}
867
868/*
869=for apidoc Amx|char *|lex_grow_linestr|STRLEN len
870
871Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
872at least I<len> octets (including terminating NUL). Returns a
873pointer to the reallocated buffer. This is necessary before making
874any direct modification of the buffer that would increase its length.
875L</lex_stuff_pvn> provides a more convenient way to insert text into
876the buffer.
877
878Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
879this function updates all of the lexer's variables that point directly
880into the buffer.
881
882=cut
883*/
884
885char *
886Perl_lex_grow_linestr(pTHX_ STRLEN len)
887{
888 SV *linestr;
889 char *buf;
890 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
891 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
892 linestr = PL_parser->linestr;
893 buf = SvPVX(linestr);
894 if (len <= SvLEN(linestr))
895 return buf;
896 bufend_pos = PL_parser->bufend - buf;
897 bufptr_pos = PL_parser->bufptr - buf;
898 oldbufptr_pos = PL_parser->oldbufptr - buf;
899 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
900 linestart_pos = PL_parser->linestart - buf;
901 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
902 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
903 buf = sv_grow(linestr, len);
904 PL_parser->bufend = buf + bufend_pos;
905 PL_parser->bufptr = buf + bufptr_pos;
906 PL_parser->oldbufptr = buf + oldbufptr_pos;
907 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
908 PL_parser->linestart = buf + linestart_pos;
909 if (PL_parser->last_uni)
910 PL_parser->last_uni = buf + last_uni_pos;
911 if (PL_parser->last_lop)
912 PL_parser->last_lop = buf + last_lop_pos;
913 return buf;
914}
915
916/*
917=for apidoc Amx|void|lex_stuff_pvn|char *pv|STRLEN len|U32 flags
918
919Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
920immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
921reallocating the buffer if necessary. This means that lexing code that
922runs later will see the characters as if they had appeared in the input.
923It is not recommended to do this as part of normal parsing, and most
924uses of this facility run the risk of the inserted characters being
925interpreted in an unintended manner.
926
927The string to be inserted is represented by I<len> octets starting
928at I<pv>. These octets are interpreted as either UTF-8 or Latin-1,
929according to whether the C<LEX_STUFF_UTF8> flag is set in I<flags>.
930The characters are recoded for the lexer buffer, according to how the
931buffer is currently being interpreted (L</lex_bufutf8>). If a string
932to be interpreted is available as a Perl scalar, the L</lex_stuff_sv>
933function is more convenient.
934
935=cut
936*/
937
938void
939Perl_lex_stuff_pvn(pTHX_ char *pv, STRLEN len, U32 flags)
940{
749123ff 941 dVAR;
f0e67a1d
Z
942 char *bufptr;
943 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
944 if (flags & ~(LEX_STUFF_UTF8))
945 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
946 if (UTF) {
947 if (flags & LEX_STUFF_UTF8) {
948 goto plain_copy;
949 } else {
950 STRLEN highhalf = 0;
951 char *p, *e = pv+len;
952 for (p = pv; p != e; p++)
953 highhalf += !!(((U8)*p) & 0x80);
954 if (!highhalf)
955 goto plain_copy;
956 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
957 bufptr = PL_parser->bufptr;
958 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
959 PL_parser->bufend += len+highhalf;
960 for (p = pv; p != e; p++) {
961 U8 c = (U8)*p;
962 if (c & 0x80) {
963 *bufptr++ = (char)(0xc0 | (c >> 6));
964 *bufptr++ = (char)(0x80 | (c & 0x3f));
965 } else {
966 *bufptr++ = (char)c;
967 }
968 }
969 }
970 } else {
971 if (flags & LEX_STUFF_UTF8) {
972 STRLEN highhalf = 0;
973 char *p, *e = pv+len;
974 for (p = pv; p != e; p++) {
975 U8 c = (U8)*p;
976 if (c >= 0xc4) {
977 Perl_croak(aTHX_ "Lexing code attempted to stuff "
978 "non-Latin-1 character into Latin-1 input");
979 } else if (c >= 0xc2 && p+1 != e &&
980 (((U8)p[1]) & 0xc0) == 0x80) {
981 p++;
982 highhalf++;
983 } else if (c >= 0x80) {
984 /* malformed UTF-8 */
985 ENTER;
986 SAVESPTR(PL_warnhook);
987 PL_warnhook = PERL_WARNHOOK_FATAL;
988 utf8n_to_uvuni((U8*)p, e-p, NULL, 0);
989 LEAVE;
990 }
991 }
992 if (!highhalf)
993 goto plain_copy;
994 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
995 bufptr = PL_parser->bufptr;
996 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
997 PL_parser->bufend += len-highhalf;
998 for (p = pv; p != e; p++) {
999 U8 c = (U8)*p;
1000 if (c & 0x80) {
1001 *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
1002 p++;
1003 } else {
1004 *bufptr++ = (char)c;
1005 }
1006 }
1007 } else {
1008 plain_copy:
1009 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1010 bufptr = PL_parser->bufptr;
1011 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1012 PL_parser->bufend += len;
1013 Copy(pv, bufptr, len, char);
1014 }
1015 }
1016}
1017
1018/*
1019=for apidoc Amx|void|lex_stuff_sv|SV *sv|U32 flags
1020
1021Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1022immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1023reallocating the buffer if necessary. This means that lexing code that
1024runs later will see the characters as if they had appeared in the input.
1025It is not recommended to do this as part of normal parsing, and most
1026uses of this facility run the risk of the inserted characters being
1027interpreted in an unintended manner.
1028
1029The string to be inserted is the string value of I<sv>. The characters
1030are recoded for the lexer buffer, according to how the buffer is currently
1031being interpreted (L</lex_bufutf8>). If a string to be interpreted is
1032not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1033need to construct a scalar.
1034
1035=cut
1036*/
1037
1038void
1039Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1040{
1041 char *pv;
1042 STRLEN len;
1043 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1044 if (flags)
1045 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1046 pv = SvPV(sv, len);
1047 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1048}
1049
1050/*
1051=for apidoc Amx|void|lex_unstuff|char *ptr
1052
1053Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1054I<ptr>. Text following I<ptr> will be moved, and the buffer shortened.
1055This hides the discarded text from any lexing code that runs later,
1056as if the text had never appeared.
1057
1058This is not the normal way to consume lexed text. For that, use
1059L</lex_read_to>.
1060
1061=cut
1062*/
1063
1064void
1065Perl_lex_unstuff(pTHX_ char *ptr)
1066{
1067 char *buf, *bufend;
1068 STRLEN unstuff_len;
1069 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1070 buf = PL_parser->bufptr;
1071 if (ptr < buf)
1072 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1073 if (ptr == buf)
1074 return;
1075 bufend = PL_parser->bufend;
1076 if (ptr > bufend)
1077 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1078 unstuff_len = ptr - buf;
1079 Move(ptr, buf, bufend+1-ptr, char);
1080 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1081 PL_parser->bufend = bufend - unstuff_len;
1082}
1083
1084/*
1085=for apidoc Amx|void|lex_read_to|char *ptr
1086
1087Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1088to I<ptr>. This advances L</PL_parser-E<gt>bufptr> to match I<ptr>,
1089performing the correct bookkeeping whenever a newline character is passed.
1090This is the normal way to consume lexed text.
1091
1092Interpretation of the buffer's octets can be abstracted out by
1093using the slightly higher-level functions L</lex_peek_unichar> and
1094L</lex_read_unichar>.
1095
1096=cut
1097*/
1098
1099void
1100Perl_lex_read_to(pTHX_ char *ptr)
1101{
1102 char *s;
1103 PERL_ARGS_ASSERT_LEX_READ_TO;
1104 s = PL_parser->bufptr;
1105 if (ptr < s || ptr > PL_parser->bufend)
1106 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1107 for (; s != ptr; s++)
1108 if (*s == '\n') {
1109 CopLINE_inc(PL_curcop);
1110 PL_parser->linestart = s+1;
1111 }
1112 PL_parser->bufptr = ptr;
1113}
1114
1115/*
1116=for apidoc Amx|void|lex_discard_to|char *ptr
1117
1118Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1119up to I<ptr>. The remaining content of the buffer will be moved, and
1120all pointers into the buffer updated appropriately. I<ptr> must not
1121be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1122it is not permitted to discard text that has yet to be lexed.
1123
1124Normally it is not necessarily to do this directly, because it suffices to
1125use the implicit discarding behaviour of L</lex_next_chunk> and things
1126based on it. However, if a token stretches across multiple lines,
1127and the lexing code has kept multiple lines of text in the buffer fof
1128that purpose, then after completion of the token it would be wise to
1129explicitly discard the now-unneeded earlier lines, to avoid future
1130multi-line tokens growing the buffer without bound.
1131
1132=cut
1133*/
1134
1135void
1136Perl_lex_discard_to(pTHX_ char *ptr)
1137{
1138 char *buf;
1139 STRLEN discard_len;
1140 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1141 buf = SvPVX(PL_parser->linestr);
1142 if (ptr < buf)
1143 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1144 if (ptr == buf)
1145 return;
1146 if (ptr > PL_parser->bufptr)
1147 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1148 discard_len = ptr - buf;
1149 if (PL_parser->oldbufptr < ptr)
1150 PL_parser->oldbufptr = ptr;
1151 if (PL_parser->oldoldbufptr < ptr)
1152 PL_parser->oldoldbufptr = ptr;
1153 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1154 PL_parser->last_uni = NULL;
1155 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1156 PL_parser->last_lop = NULL;
1157 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1158 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1159 PL_parser->bufend -= discard_len;
1160 PL_parser->bufptr -= discard_len;
1161 PL_parser->oldbufptr -= discard_len;
1162 PL_parser->oldoldbufptr -= discard_len;
1163 if (PL_parser->last_uni)
1164 PL_parser->last_uni -= discard_len;
1165 if (PL_parser->last_lop)
1166 PL_parser->last_lop -= discard_len;
1167}
1168
1169/*
1170=for apidoc Amx|bool|lex_next_chunk|U32 flags
1171
1172Reads in the next chunk of text to be lexed, appending it to
1173L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1174looked to the end of the current chunk and wants to know more. It is
1175usual, but not necessary, for lexing to have consumed the entirety of
1176the current chunk at this time.
1177
1178If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1179chunk (i.e., the current chunk has been entirely consumed), normally the
1180current chunk will be discarded at the same time that the new chunk is
1181read in. If I<flags> includes C<LEX_KEEP_PREVIOUS>, the current chunk
1182will not be discarded. If the current chunk has not been entirely
1183consumed, then it will not be discarded regardless of the flag.
1184
1185Returns true if some new text was added to the buffer, or false if the
1186buffer has reached the end of the input text.
1187
1188=cut
1189*/
1190
1191#define LEX_FAKE_EOF 0x80000000
1192
1193bool
1194Perl_lex_next_chunk(pTHX_ U32 flags)
1195{
1196 SV *linestr;
1197 char *buf;
1198 STRLEN old_bufend_pos, new_bufend_pos;
1199 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1200 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
17cc9359 1201 bool got_some_for_debugger = 0;
f0e67a1d
Z
1202 bool got_some;
1203 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF))
1204 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
f0e67a1d
Z
1205 linestr = PL_parser->linestr;
1206 buf = SvPVX(linestr);
1207 if (!(flags & LEX_KEEP_PREVIOUS) &&
1208 PL_parser->bufptr == PL_parser->bufend) {
1209 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1210 linestart_pos = 0;
1211 if (PL_parser->last_uni != PL_parser->bufend)
1212 PL_parser->last_uni = NULL;
1213 if (PL_parser->last_lop != PL_parser->bufend)
1214 PL_parser->last_lop = NULL;
1215 last_uni_pos = last_lop_pos = 0;
1216 *buf = 0;
1217 SvCUR(linestr) = 0;
1218 } else {
1219 old_bufend_pos = PL_parser->bufend - buf;
1220 bufptr_pos = PL_parser->bufptr - buf;
1221 oldbufptr_pos = PL_parser->oldbufptr - buf;
1222 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1223 linestart_pos = PL_parser->linestart - buf;
1224 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1225 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1226 }
1227 if (flags & LEX_FAKE_EOF) {
1228 goto eof;
1229 } else if (!PL_parser->rsfp) {
1230 got_some = 0;
1231 } else if (filter_gets(linestr, old_bufend_pos)) {
1232 got_some = 1;
17cc9359 1233 got_some_for_debugger = 1;
f0e67a1d 1234 } else {
580561a3
Z
1235 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1236 sv_setpvs(linestr, "");
f0e67a1d
Z
1237 eof:
1238 /* End of real input. Close filehandle (unless it was STDIN),
1239 * then add implicit termination.
1240 */
1241 if ((PerlIO*)PL_parser->rsfp == PerlIO_stdin())
1242 PerlIO_clearerr(PL_parser->rsfp);
1243 else if (PL_parser->rsfp)
1244 (void)PerlIO_close(PL_parser->rsfp);
1245 PL_parser->rsfp = NULL;
1246 PL_doextract = FALSE;
1247#ifdef PERL_MAD
1248 if (PL_madskills && !PL_in_eval && (PL_minus_p || PL_minus_n))
1249 PL_faketokens = 1;
1250#endif
1251 if (!PL_in_eval && PL_minus_p) {
1252 sv_catpvs(linestr,
1253 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1254 PL_minus_n = PL_minus_p = 0;
1255 } else if (!PL_in_eval && PL_minus_n) {
1256 sv_catpvs(linestr, /*{*/";}");
1257 PL_minus_n = 0;
1258 } else
1259 sv_catpvs(linestr, ";");
1260 got_some = 1;
1261 }
1262 buf = SvPVX(linestr);
1263 new_bufend_pos = SvCUR(linestr);
1264 PL_parser->bufend = buf + new_bufend_pos;
1265 PL_parser->bufptr = buf + bufptr_pos;
1266 PL_parser->oldbufptr = buf + oldbufptr_pos;
1267 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1268 PL_parser->linestart = buf + linestart_pos;
1269 if (PL_parser->last_uni)
1270 PL_parser->last_uni = buf + last_uni_pos;
1271 if (PL_parser->last_lop)
1272 PL_parser->last_lop = buf + last_lop_pos;
17cc9359 1273 if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) &&
f0e67a1d
Z
1274 PL_curstash != PL_debstash) {
1275 /* debugger active and we're not compiling the debugger code,
1276 * so store the line into the debugger's array of lines
1277 */
1278 update_debugger_info(NULL, buf+old_bufend_pos,
1279 new_bufend_pos-old_bufend_pos);
1280 }
1281 return got_some;
1282}
1283
1284/*
1285=for apidoc Amx|I32|lex_peek_unichar|U32 flags
1286
1287Looks ahead one (Unicode) character in the text currently being lexed.
1288Returns the codepoint (unsigned integer value) of the next character,
1289or -1 if lexing has reached the end of the input text. To consume the
1290peeked character, use L</lex_read_unichar>.
1291
1292If the next character is in (or extends into) the next chunk of input
1293text, the next chunk will be read in. Normally the current chunk will be
1294discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1295then the current chunk will not be discarded.
1296
1297If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1298is encountered, an exception is generated.
1299
1300=cut
1301*/
1302
1303I32
1304Perl_lex_peek_unichar(pTHX_ U32 flags)
1305{
749123ff 1306 dVAR;
f0e67a1d
Z
1307 char *s, *bufend;
1308 if (flags & ~(LEX_KEEP_PREVIOUS))
1309 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1310 s = PL_parser->bufptr;
1311 bufend = PL_parser->bufend;
1312 if (UTF) {
1313 U8 head;
1314 I32 unichar;
1315 STRLEN len, retlen;
1316 if (s == bufend) {
1317 if (!lex_next_chunk(flags))
1318 return -1;
1319 s = PL_parser->bufptr;
1320 bufend = PL_parser->bufend;
1321 }
1322 head = (U8)*s;
1323 if (!(head & 0x80))
1324 return head;
1325 if (head & 0x40) {
1326 len = PL_utf8skip[head];
1327 while ((STRLEN)(bufend-s) < len) {
1328 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1329 break;
1330 s = PL_parser->bufptr;
1331 bufend = PL_parser->bufend;
1332 }
1333 }
1334 unichar = utf8n_to_uvuni((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1335 if (retlen == (STRLEN)-1) {
1336 /* malformed UTF-8 */
1337 ENTER;
1338 SAVESPTR(PL_warnhook);
1339 PL_warnhook = PERL_WARNHOOK_FATAL;
1340 utf8n_to_uvuni((U8*)s, bufend-s, NULL, 0);
1341 LEAVE;
1342 }
1343 return unichar;
1344 } else {
1345 if (s == bufend) {
1346 if (!lex_next_chunk(flags))
1347 return -1;
1348 s = PL_parser->bufptr;
1349 }
1350 return (U8)*s;
1351 }
1352}
1353
1354/*
1355=for apidoc Amx|I32|lex_read_unichar|U32 flags
1356
1357Reads the next (Unicode) character in the text currently being lexed.
1358Returns the codepoint (unsigned integer value) of the character read,
1359and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1360if lexing has reached the end of the input text. To non-destructively
1361examine the next character, use L</lex_peek_unichar> instead.
1362
1363If the next character is in (or extends into) the next chunk of input
1364text, the next chunk will be read in. Normally the current chunk will be
1365discarded at the same time, but if I<flags> includes C<LEX_KEEP_PREVIOUS>
1366then the current chunk will not be discarded.
1367
1368If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1369is encountered, an exception is generated.
1370
1371=cut
1372*/
1373
1374I32
1375Perl_lex_read_unichar(pTHX_ U32 flags)
1376{
1377 I32 c;
1378 if (flags & ~(LEX_KEEP_PREVIOUS))
1379 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1380 c = lex_peek_unichar(flags);
1381 if (c != -1) {
1382 if (c == '\n')
1383 CopLINE_inc(PL_curcop);
1384 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1385 }
1386 return c;
1387}
1388
1389/*
1390=for apidoc Amx|void|lex_read_space|U32 flags
1391
1392Reads optional spaces, in Perl style, in the text currently being
1393lexed. The spaces may include ordinary whitespace characters and
1394Perl-style comments. C<#line> directives are processed if encountered.
1395L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1396at a non-space character (or the end of the input text).
1397
1398If spaces extend into the next chunk of input text, the next chunk will
1399be read in. Normally the current chunk will be discarded at the same
1400time, but if I<flags> includes C<LEX_KEEP_PREVIOUS> then the current
1401chunk will not be discarded.
1402
1403=cut
1404*/
1405
f0998909
Z
1406#define LEX_NO_NEXT_CHUNK 0x80000000
1407
f0e67a1d
Z
1408void
1409Perl_lex_read_space(pTHX_ U32 flags)
1410{
1411 char *s, *bufend;
1412 bool need_incline = 0;
f0998909 1413 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK))
f0e67a1d
Z
1414 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1415#ifdef PERL_MAD
1416 if (PL_skipwhite) {
1417 sv_free(PL_skipwhite);
1418 PL_skipwhite = NULL;
1419 }
1420 if (PL_madskills)
1421 PL_skipwhite = newSVpvs("");
1422#endif /* PERL_MAD */
1423 s = PL_parser->bufptr;
1424 bufend = PL_parser->bufend;
1425 while (1) {
1426 char c = *s;
1427 if (c == '#') {
1428 do {
1429 c = *++s;
1430 } while (!(c == '\n' || (c == 0 && s == bufend)));
1431 } else if (c == '\n') {
1432 s++;
1433 PL_parser->linestart = s;
1434 if (s == bufend)
1435 need_incline = 1;
1436 else
1437 incline(s);
1438 } else if (isSPACE(c)) {
1439 s++;
1440 } else if (c == 0 && s == bufend) {
1441 bool got_more;
1442#ifdef PERL_MAD
1443 if (PL_madskills)
1444 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1445#endif /* PERL_MAD */
f0998909
Z
1446 if (flags & LEX_NO_NEXT_CHUNK)
1447 break;
f0e67a1d
Z
1448 PL_parser->bufptr = s;
1449 CopLINE_inc(PL_curcop);
1450 got_more = lex_next_chunk(flags);
1451 CopLINE_dec(PL_curcop);
1452 s = PL_parser->bufptr;
1453 bufend = PL_parser->bufend;
1454 if (!got_more)
1455 break;
1456 if (need_incline && PL_parser->rsfp) {
1457 incline(s);
1458 need_incline = 0;
1459 }
1460 } else {
1461 break;
1462 }
1463 }
1464#ifdef PERL_MAD
1465 if (PL_madskills)
1466 sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr);
1467#endif /* PERL_MAD */
1468 PL_parser->bufptr = s;
1469}
1470
1471/*
ffb4593c
NT
1472 * S_incline
1473 * This subroutine has nothing to do with tilting, whether at windmills
1474 * or pinball tables. Its name is short for "increment line". It
57843af0 1475 * increments the current line number in CopLINE(PL_curcop) and checks
ffb4593c 1476 * to see whether the line starts with a comment of the form
9cbb5ea2
GS
1477 * # line 500 "foo.pm"
1478 * If so, it sets the current line number and file to the values in the comment.
ffb4593c
NT
1479 */
1480
76e3520e 1481STATIC void
d9095cec 1482S_incline(pTHX_ const char *s)
463ee0b2 1483{
97aff369 1484 dVAR;
d9095cec
NC
1485 const char *t;
1486 const char *n;
1487 const char *e;
463ee0b2 1488
7918f24d
NC
1489 PERL_ARGS_ASSERT_INCLINE;
1490
57843af0 1491 CopLINE_inc(PL_curcop);
463ee0b2
LW
1492 if (*s++ != '#')
1493 return;
d4c19fe8
AL
1494 while (SPACE_OR_TAB(*s))
1495 s++;
73659bf1
GS
1496 if (strnEQ(s, "line", 4))
1497 s += 4;
1498 else
1499 return;
084592ab 1500 if (SPACE_OR_TAB(*s))
73659bf1 1501 s++;
4e553d73 1502 else
73659bf1 1503 return;
d4c19fe8
AL
1504 while (SPACE_OR_TAB(*s))
1505 s++;
463ee0b2
LW
1506 if (!isDIGIT(*s))
1507 return;
d4c19fe8 1508
463ee0b2
LW
1509 n = s;
1510 while (isDIGIT(*s))
1511 s++;
07714eb4 1512 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
26b6dc3f 1513 return;
bf4acbe4 1514 while (SPACE_OR_TAB(*s))
463ee0b2 1515 s++;
73659bf1 1516 if (*s == '"' && (t = strchr(s+1, '"'))) {
463ee0b2 1517 s++;
73659bf1
GS
1518 e = t + 1;
1519 }
463ee0b2 1520 else {
c35e046a
AL
1521 t = s;
1522 while (!isSPACE(*t))
1523 t++;
73659bf1 1524 e = t;
463ee0b2 1525 }
bf4acbe4 1526 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
73659bf1
GS
1527 e++;
1528 if (*e != '\n' && *e != '\0')
1529 return; /* false alarm */
1530
f4dd75d9 1531 if (t - s > 0) {
d9095cec 1532 const STRLEN len = t - s;
8a5ee598 1533#ifndef USE_ITHREADS
19bad673
NC
1534 SV *const temp_sv = CopFILESV(PL_curcop);
1535 const char *cf;
1536 STRLEN tmplen;
1537
1538 if (temp_sv) {
1539 cf = SvPVX(temp_sv);
1540 tmplen = SvCUR(temp_sv);
1541 } else {
1542 cf = NULL;
1543 tmplen = 0;
1544 }
1545
42d9b98d 1546 if (tmplen > 7 && strnEQ(cf, "(eval ", 6)) {
e66cf94c
RGS
1547 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1548 * to *{"::_<newfilename"} */
44867030
NC
1549 /* However, the long form of evals is only turned on by the
1550 debugger - usually they're "(eval %lu)" */
1551 char smallbuf[128];
1552 char *tmpbuf;
1553 GV **gvp;
d9095cec 1554 STRLEN tmplen2 = len;
798b63bc 1555 if (tmplen + 2 <= sizeof smallbuf)
e66cf94c
RGS
1556 tmpbuf = smallbuf;
1557 else
2ae0db35 1558 Newx(tmpbuf, tmplen + 2, char);
44867030
NC
1559 tmpbuf[0] = '_';
1560 tmpbuf[1] = '<';
2ae0db35 1561 memcpy(tmpbuf + 2, cf, tmplen);
44867030 1562 tmplen += 2;
8a5ee598
RGS
1563 gvp = (GV**)hv_fetch(PL_defstash, tmpbuf, tmplen, FALSE);
1564 if (gvp) {
44867030
NC
1565 char *tmpbuf2;
1566 GV *gv2;
1567
1568 if (tmplen2 + 2 <= sizeof smallbuf)
1569 tmpbuf2 = smallbuf;
1570 else
1571 Newx(tmpbuf2, tmplen2 + 2, char);
1572
1573 if (tmpbuf2 != smallbuf || tmpbuf != smallbuf) {
1574 /* Either they malloc'd it, or we malloc'd it,
1575 so no prefix is present in ours. */
1576 tmpbuf2[0] = '_';
1577 tmpbuf2[1] = '<';
1578 }
1579
1580 memcpy(tmpbuf2 + 2, s, tmplen2);
1581 tmplen2 += 2;
1582
8a5ee598 1583 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
e5527e4b 1584 if (!isGV(gv2)) {
8a5ee598 1585 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
e5527e4b
RGS
1586 /* adjust ${"::_<newfilename"} to store the new file name */
1587 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
3cb1dbc6
NC
1588 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(*gvp)));
1589 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(*gvp)));
e5527e4b 1590 }
44867030
NC
1591
1592 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
8a5ee598 1593 }
e66cf94c 1594 if (tmpbuf != smallbuf) Safefree(tmpbuf);
e66cf94c 1595 }
8a5ee598 1596#endif
05ec9bb3 1597 CopFILE_free(PL_curcop);
d9095cec 1598 CopFILE_setn(PL_curcop, s, len);
f4dd75d9 1599 }
57843af0 1600 CopLINE_set(PL_curcop, atoi(n)-1);
463ee0b2
LW
1601}
1602
29595ff2 1603#ifdef PERL_MAD
cd81e915 1604/* skip space before PL_thistoken */
29595ff2
NC
1605
1606STATIC char *
1607S_skipspace0(pTHX_ register char *s)
1608{
7918f24d
NC
1609 PERL_ARGS_ASSERT_SKIPSPACE0;
1610
29595ff2
NC
1611 s = skipspace(s);
1612 if (!PL_madskills)
1613 return s;
cd81e915
NC
1614 if (PL_skipwhite) {
1615 if (!PL_thiswhite)
6b29d1f5 1616 PL_thiswhite = newSVpvs("");
cd81e915
NC
1617 sv_catsv(PL_thiswhite, PL_skipwhite);
1618 sv_free(PL_skipwhite);
1619 PL_skipwhite = 0;
1620 }
1621 PL_realtokenstart = s - SvPVX(PL_linestr);
29595ff2
NC
1622 return s;
1623}
1624
cd81e915 1625/* skip space after PL_thistoken */
29595ff2
NC
1626
1627STATIC char *
1628S_skipspace1(pTHX_ register char *s)
1629{
d4c19fe8 1630 const char *start = s;
29595ff2
NC
1631 I32 startoff = start - SvPVX(PL_linestr);
1632
7918f24d
NC
1633 PERL_ARGS_ASSERT_SKIPSPACE1;
1634
29595ff2
NC
1635 s = skipspace(s);
1636 if (!PL_madskills)
1637 return s;
1638 start = SvPVX(PL_linestr) + startoff;
cd81e915 1639 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1640 const char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1641 PL_thistoken = newSVpvn(tstart, start - tstart);
1642 }
1643 PL_realtokenstart = -1;
1644 if (PL_skipwhite) {
1645 if (!PL_nextwhite)
6b29d1f5 1646 PL_nextwhite = newSVpvs("");
cd81e915
NC
1647 sv_catsv(PL_nextwhite, PL_skipwhite);
1648 sv_free(PL_skipwhite);
1649 PL_skipwhite = 0;
29595ff2
NC
1650 }
1651 return s;
1652}
1653
1654STATIC char *
1655S_skipspace2(pTHX_ register char *s, SV **svp)
1656{
c35e046a
AL
1657 char *start;
1658 const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr);
1659 const I32 startoff = s - SvPVX(PL_linestr);
1660
7918f24d
NC
1661 PERL_ARGS_ASSERT_SKIPSPACE2;
1662
29595ff2
NC
1663 s = skipspace(s);
1664 PL_bufptr = SvPVX(PL_linestr) + bufptroff;
1665 if (!PL_madskills || !svp)
1666 return s;
1667 start = SvPVX(PL_linestr) + startoff;
cd81e915 1668 if (!PL_thistoken && PL_realtokenstart >= 0) {
d4c19fe8 1669 char * const tstart = SvPVX(PL_linestr) + PL_realtokenstart;
cd81e915
NC
1670 PL_thistoken = newSVpvn(tstart, start - tstart);
1671 PL_realtokenstart = -1;
29595ff2 1672 }
cd81e915 1673 if (PL_skipwhite) {
29595ff2 1674 if (!*svp)
6b29d1f5 1675 *svp = newSVpvs("");
cd81e915
NC
1676 sv_setsv(*svp, PL_skipwhite);
1677 sv_free(PL_skipwhite);
1678 PL_skipwhite = 0;
29595ff2
NC
1679 }
1680
1681 return s;
1682}
1683#endif
1684
80a702cd 1685STATIC void
15f169a1 1686S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
80a702cd
RGS
1687{
1688 AV *av = CopFILEAVx(PL_curcop);
1689 if (av) {
b9f83d2f 1690 SV * const sv = newSV_type(SVt_PVMG);
5fa550fb
NC
1691 if (orig_sv)
1692 sv_setsv(sv, orig_sv);
1693 else
1694 sv_setpvn(sv, buf, len);
80a702cd
RGS
1695 (void)SvIOK_on(sv);
1696 SvIV_set(sv, 0);
1697 av_store(av, (I32)CopLINE(PL_curcop), sv);
1698 }
1699}
1700
ffb4593c
NT
1701/*
1702 * S_skipspace
1703 * Called to gobble the appropriate amount and type of whitespace.
1704 * Skips comments as well.
1705 */
1706
76e3520e 1707STATIC char *
cea2e8a9 1708S_skipspace(pTHX_ register char *s)
a687059c 1709{
5db06880 1710#ifdef PERL_MAD
f0e67a1d
Z
1711 char *start = s;
1712#endif /* PERL_MAD */
7918f24d 1713 PERL_ARGS_ASSERT_SKIPSPACE;
f0e67a1d 1714#ifdef PERL_MAD
cd81e915
NC
1715 if (PL_skipwhite) {
1716 sv_free(PL_skipwhite);
f0e67a1d 1717 PL_skipwhite = NULL;
5db06880 1718 }
f0e67a1d 1719#endif /* PERL_MAD */
3280af22 1720 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
bf4acbe4 1721 while (s < PL_bufend && SPACE_OR_TAB(*s))
463ee0b2 1722 s++;
f0e67a1d
Z
1723 } else {
1724 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
1725 PL_bufptr = s;
f0998909
Z
1726 lex_read_space(LEX_KEEP_PREVIOUS |
1727 (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
1728 LEX_NO_NEXT_CHUNK : 0));
3280af22 1729 s = PL_bufptr;
f0e67a1d
Z
1730 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
1731 if (PL_linestart > PL_bufptr)
1732 PL_bufptr = PL_linestart;
1733 return s;
463ee0b2 1734 }
5db06880 1735#ifdef PERL_MAD
f0e67a1d
Z
1736 if (PL_madskills)
1737 PL_skipwhite = newSVpvn(start, s-start);
1738#endif /* PERL_MAD */
5db06880 1739 return s;
a687059c 1740}
378cc40b 1741
ffb4593c
NT
1742/*
1743 * S_check_uni
1744 * Check the unary operators to ensure there's no ambiguity in how they're
1745 * used. An ambiguous piece of code would be:
1746 * rand + 5
1747 * This doesn't mean rand() + 5. Because rand() is a unary operator,
1748 * the +5 is its argument.
1749 */
1750
76e3520e 1751STATIC void
cea2e8a9 1752S_check_uni(pTHX)
ba106d47 1753{
97aff369 1754 dVAR;
d4c19fe8
AL
1755 const char *s;
1756 const char *t;
2f3197b3 1757
3280af22 1758 if (PL_oldoldbufptr != PL_last_uni)
2f3197b3 1759 return;
3280af22
NIS
1760 while (isSPACE(*PL_last_uni))
1761 PL_last_uni++;
c35e046a
AL
1762 s = PL_last_uni;
1763 while (isALNUM_lazy_if(s,UTF) || *s == '-')
1764 s++;
3280af22 1765 if ((t = strchr(s, '(')) && t < PL_bufptr)
a0d0e21e 1766 return;
6136c704 1767
9b387841
NC
1768 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
1769 "Warning: Use of \"%.*s\" without parentheses is ambiguous",
1770 (int)(s - PL_last_uni), PL_last_uni);
2f3197b3
LW
1771}
1772
ffb4593c
NT
1773/*
1774 * LOP : macro to build a list operator. Its behaviour has been replaced
1775 * with a subroutine, S_lop() for which LOP is just another name.
1776 */
1777
a0d0e21e
LW
1778#define LOP(f,x) return lop(f,x,s)
1779
ffb4593c
NT
1780/*
1781 * S_lop
1782 * Build a list operator (or something that might be one). The rules:
1783 * - if we have a next token, then it's a list operator [why?]
1784 * - if the next thing is an opening paren, then it's a function
1785 * - else it's a list operator
1786 */
1787
76e3520e 1788STATIC I32
a0be28da 1789S_lop(pTHX_ I32 f, int x, char *s)
ffed7fef 1790{
97aff369 1791 dVAR;
7918f24d
NC
1792
1793 PERL_ARGS_ASSERT_LOP;
1794
6154021b 1795 pl_yylval.ival = f;
35c8bce7 1796 CLINE;
3280af22
NIS
1797 PL_expect = x;
1798 PL_bufptr = s;
1799 PL_last_lop = PL_oldbufptr;
eb160463 1800 PL_last_lop_op = (OPCODE)f;
5db06880
NC
1801#ifdef PERL_MAD
1802 if (PL_lasttoke)
1803 return REPORT(LSTOP);
1804#else
3280af22 1805 if (PL_nexttoke)
bbf60fe6 1806 return REPORT(LSTOP);
5db06880 1807#endif
79072805 1808 if (*s == '(')
bbf60fe6 1809 return REPORT(FUNC);
29595ff2 1810 s = PEEKSPACE(s);
79072805 1811 if (*s == '(')
bbf60fe6 1812 return REPORT(FUNC);
79072805 1813 else
bbf60fe6 1814 return REPORT(LSTOP);
79072805
LW
1815}
1816
5db06880
NC
1817#ifdef PERL_MAD
1818 /*
1819 * S_start_force
1820 * Sets up for an eventual force_next(). start_force(0) basically does
1821 * an unshift, while start_force(-1) does a push. yylex removes items
1822 * on the "pop" end.
1823 */
1824
1825STATIC void
1826S_start_force(pTHX_ int where)
1827{
1828 int i;
1829
cd81e915 1830 if (where < 0) /* so people can duplicate start_force(PL_curforce) */
5db06880 1831 where = PL_lasttoke;
cd81e915
NC
1832 assert(PL_curforce < 0 || PL_curforce == where);
1833 if (PL_curforce != where) {
5db06880
NC
1834 for (i = PL_lasttoke; i > where; --i) {
1835 PL_nexttoke[i] = PL_nexttoke[i-1];
1836 }
1837 PL_lasttoke++;
1838 }
cd81e915 1839 if (PL_curforce < 0) /* in case of duplicate start_force() */
5db06880 1840 Zero(&PL_nexttoke[where], 1, NEXTTOKE);
cd81e915
NC
1841 PL_curforce = where;
1842 if (PL_nextwhite) {
5db06880 1843 if (PL_madskills)
6b29d1f5 1844 curmad('^', newSVpvs(""));
cd81e915 1845 CURMAD('_', PL_nextwhite);
5db06880
NC
1846 }
1847}
1848
1849STATIC void
1850S_curmad(pTHX_ char slot, SV *sv)
1851{
1852 MADPROP **where;
1853
1854 if (!sv)
1855 return;
cd81e915
NC
1856 if (PL_curforce < 0)
1857 where = &PL_thismad;
5db06880 1858 else
cd81e915 1859 where = &PL_nexttoke[PL_curforce].next_mad;
5db06880 1860
cd81e915 1861 if (PL_faketokens)
76f68e9b 1862 sv_setpvs(sv, "");
5db06880
NC
1863 else {
1864 if (!IN_BYTES) {
1865 if (UTF && is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
1866 SvUTF8_on(sv);
1867 else if (PL_encoding) {
1868 sv_recode_to_utf8(sv, PL_encoding);
1869 }
1870 }
1871 }
1872
1873 /* keep a slot open for the head of the list? */
1874 if (slot != '_' && *where && (*where)->mad_key == '^') {
1875 (*where)->mad_key = slot;
daba3364 1876 sv_free(MUTABLE_SV(((*where)->mad_val)));
5db06880
NC
1877 (*where)->mad_val = (void*)sv;
1878 }
1879 else
1880 addmad(newMADsv(slot, sv), where, 0);
1881}
1882#else
b3f24c00
MHM
1883# define start_force(where) NOOP
1884# define curmad(slot, sv) NOOP
5db06880
NC
1885#endif
1886
ffb4593c
NT
1887/*
1888 * S_force_next
9cbb5ea2 1889 * When the lexer realizes it knows the next token (for instance,
ffb4593c 1890 * it is reordering tokens for the parser) then it can call S_force_next
9cbb5ea2 1891 * to know what token to return the next time the lexer is called. Caller
5db06880
NC
1892 * will need to set PL_nextval[] (or PL_nexttoke[].next_val with PERL_MAD),
1893 * and possibly PL_expect to ensure the lexer handles the token correctly.
ffb4593c
NT
1894 */
1895
4e553d73 1896STATIC void
cea2e8a9 1897S_force_next(pTHX_ I32 type)
79072805 1898{
97aff369 1899 dVAR;
704d4215
GG
1900#ifdef DEBUGGING
1901 if (DEBUG_T_TEST) {
1902 PerlIO_printf(Perl_debug_log, "### forced token:\n");
f05d7009 1903 tokereport(type, &NEXTVAL_NEXTTOKE);
704d4215
GG
1904 }
1905#endif
5db06880 1906#ifdef PERL_MAD
cd81e915 1907 if (PL_curforce < 0)
5db06880 1908 start_force(PL_lasttoke);
cd81e915 1909 PL_nexttoke[PL_curforce].next_type = type;
5db06880
NC
1910 if (PL_lex_state != LEX_KNOWNEXT)
1911 PL_lex_defer = PL_lex_state;
1912 PL_lex_state = LEX_KNOWNEXT;
1913 PL_lex_expect = PL_expect;
cd81e915 1914 PL_curforce = -1;
5db06880 1915#else
3280af22
NIS
1916 PL_nexttype[PL_nexttoke] = type;
1917 PL_nexttoke++;
1918 if (PL_lex_state != LEX_KNOWNEXT) {
1919 PL_lex_defer = PL_lex_state;
1920 PL_lex_expect = PL_expect;
1921 PL_lex_state = LEX_KNOWNEXT;
79072805 1922 }
5db06880 1923#endif
79072805
LW
1924}
1925
d0a148a6 1926STATIC SV *
15f169a1 1927S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
d0a148a6 1928{
97aff369 1929 dVAR;
740cce10 1930 SV * const sv = newSVpvn_utf8(start, len,
eaf7a4d2
CS
1931 !IN_BYTES
1932 && UTF
1933 && !is_ascii_string((const U8*)start, len)
740cce10 1934 && is_utf8_string((const U8*)start, len));
d0a148a6
NC
1935 return sv;
1936}
1937
ffb4593c
NT
1938/*
1939 * S_force_word
1940 * When the lexer knows the next thing is a word (for instance, it has
1941 * just seen -> and it knows that the next char is a word char, then
02b34bbe
DM
1942 * it calls S_force_word to stick the next word into the PL_nexttoke/val
1943 * lookahead.
ffb4593c
NT
1944 *
1945 * Arguments:
b1b65b59 1946 * char *start : buffer position (must be within PL_linestr)
02b34bbe 1947 * int token : PL_next* will be this type of bare word (e.g., METHOD,WORD)
ffb4593c
NT
1948 * int check_keyword : if true, Perl checks to make sure the word isn't
1949 * a keyword (do this if the word is a label, e.g. goto FOO)
1950 * int allow_pack : if true, : characters will also be allowed (require,
1951 * use, etc. do this)
9cbb5ea2 1952 * int allow_initial_tick : used by the "sub" lexer only.
ffb4593c
NT
1953 */
1954
76e3520e 1955STATIC char *
cea2e8a9 1956S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick)
79072805 1957{
97aff369 1958 dVAR;
463ee0b2
LW
1959 register char *s;
1960 STRLEN len;
4e553d73 1961
7918f24d
NC
1962 PERL_ARGS_ASSERT_FORCE_WORD;
1963
29595ff2 1964 start = SKIPSPACE1(start);
463ee0b2 1965 s = start;
7e2040f0 1966 if (isIDFIRST_lazy_if(s,UTF) ||
a0d0e21e 1967 (allow_pack && *s == ':') ||
15f0808c 1968 (allow_initial_tick && *s == '\'') )
a0d0e21e 1969 {
3280af22 1970 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
5458a98a 1971 if (check_keyword && keyword(PL_tokenbuf, len, 0))
463ee0b2 1972 return start;
cd81e915 1973 start_force(PL_curforce);
5db06880
NC
1974 if (PL_madskills)
1975 curmad('X', newSVpvn(start,s-start));
463ee0b2 1976 if (token == METHOD) {
29595ff2 1977 s = SKIPSPACE1(s);
463ee0b2 1978 if (*s == '(')
3280af22 1979 PL_expect = XTERM;
463ee0b2 1980 else {
3280af22 1981 PL_expect = XOPERATOR;
463ee0b2 1982 }
79072805 1983 }
e74e6b3d 1984 if (PL_madskills)
63575281 1985 curmad('g', newSVpvs( "forced" ));
9ded7720 1986 NEXTVAL_NEXTTOKE.opval
d0a148a6
NC
1987 = (OP*)newSVOP(OP_CONST,0,
1988 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
9ded7720 1989 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
79072805
LW
1990 force_next(token);
1991 }
1992 return s;
1993}
1994
ffb4593c
NT
1995/*
1996 * S_force_ident
9cbb5ea2 1997 * Called when the lexer wants $foo *foo &foo etc, but the program
ffb4593c
NT
1998 * text only contains the "foo" portion. The first argument is a pointer
1999 * to the "foo", and the second argument is the type symbol to prefix.
2000 * Forces the next token to be a "WORD".
9cbb5ea2 2001 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
ffb4593c
NT
2002 */
2003
76e3520e 2004STATIC void
bfed75c6 2005S_force_ident(pTHX_ register const char *s, int kind)
79072805 2006{
97aff369 2007 dVAR;
7918f24d
NC
2008
2009 PERL_ARGS_ASSERT_FORCE_IDENT;
2010
c35e046a 2011 if (*s) {
90e5519e
NC
2012 const STRLEN len = strlen(s);
2013 OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn(s, len));
cd81e915 2014 start_force(PL_curforce);
9ded7720 2015 NEXTVAL_NEXTTOKE.opval = o;
79072805 2016 force_next(WORD);
748a9306 2017 if (kind) {
11343788 2018 o->op_private = OPpCONST_ENTERED;
55497cff 2019 /* XXX see note in pp_entereval() for why we forgo typo
2020 warnings if the symbol must be introduced in an eval.
2021 GSAR 96-10-12 */
90e5519e
NC
2022 gv_fetchpvn_flags(s, len,
2023 PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL)
2024 : GV_ADD,
2025 kind == '$' ? SVt_PV :
2026 kind == '@' ? SVt_PVAV :
2027 kind == '%' ? SVt_PVHV :
a0d0e21e 2028 SVt_PVGV
90e5519e 2029 );
748a9306 2030 }
79072805
LW
2031 }
2032}
2033
1571675a
GS
2034NV
2035Perl_str_to_version(pTHX_ SV *sv)
2036{
2037 NV retval = 0.0;
2038 NV nshift = 1.0;
2039 STRLEN len;
cfd0369c 2040 const char *start = SvPV_const(sv,len);
9d4ba2ae 2041 const char * const end = start + len;
504618e9 2042 const bool utf = SvUTF8(sv) ? TRUE : FALSE;
7918f24d
NC
2043
2044 PERL_ARGS_ASSERT_STR_TO_VERSION;
2045
1571675a 2046 while (start < end) {
ba210ebe 2047 STRLEN skip;
1571675a
GS
2048 UV n;
2049 if (utf)
9041c2e3 2050 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
1571675a
GS
2051 else {
2052 n = *(U8*)start;
2053 skip = 1;
2054 }
2055 retval += ((NV)n)/nshift;
2056 start += skip;
2057 nshift *= 1000;
2058 }
2059 return retval;
2060}
2061
4e553d73 2062/*
ffb4593c
NT
2063 * S_force_version
2064 * Forces the next token to be a version number.
e759cc13
RGS
2065 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2066 * and if "guessing" is TRUE, then no new token is created (and the caller
2067 * must use an alternative parsing method).
ffb4593c
NT
2068 */
2069
76e3520e 2070STATIC char *
e759cc13 2071S_force_version(pTHX_ char *s, int guessing)
89bfa8cd 2072{
97aff369 2073 dVAR;
5f66b61c 2074 OP *version = NULL;
44dcb63b 2075 char *d;
5db06880
NC
2076#ifdef PERL_MAD
2077 I32 startoff = s - SvPVX(PL_linestr);
2078#endif
89bfa8cd 2079
7918f24d
NC
2080 PERL_ARGS_ASSERT_FORCE_VERSION;
2081
29595ff2 2082 s = SKIPSPACE1(s);
89bfa8cd 2083
44dcb63b 2084 d = s;
dd629d5b 2085 if (*d == 'v')
44dcb63b 2086 d++;
44dcb63b 2087 if (isDIGIT(*d)) {
e759cc13
RGS
2088 while (isDIGIT(*d) || *d == '_' || *d == '.')
2089 d++;
5db06880
NC
2090#ifdef PERL_MAD
2091 if (PL_madskills) {
cd81e915 2092 start_force(PL_curforce);
5db06880
NC
2093 curmad('X', newSVpvn(s,d-s));
2094 }
2095#endif
9f3d182e 2096 if (*d == ';' || isSPACE(*d) || *d == '}' || !*d) {
dd629d5b 2097 SV *ver;
8d08d9ba
DG
2098#ifdef USE_LOCALE_NUMERIC
2099 char *loc = setlocale(LC_NUMERIC, "C");
2100#endif
6154021b 2101 s = scan_num(s, &pl_yylval);
8d08d9ba
DG
2102#ifdef USE_LOCALE_NUMERIC
2103 setlocale(LC_NUMERIC, loc);
2104#endif
6154021b 2105 version = pl_yylval.opval;
dd629d5b
GS
2106 ver = cSVOPx(version)->op_sv;
2107 if (SvPOK(ver) && !SvNIOK(ver)) {
862a34c6 2108 SvUPGRADE(ver, SVt_PVNV);
9d6ce603 2109 SvNV_set(ver, str_to_version(ver));
1571675a 2110 SvNOK_on(ver); /* hint that it is a version */
44dcb63b 2111 }
89bfa8cd 2112 }
5db06880
NC
2113 else if (guessing) {
2114#ifdef PERL_MAD
2115 if (PL_madskills) {
cd81e915
NC
2116 sv_free(PL_nextwhite); /* let next token collect whitespace */
2117 PL_nextwhite = 0;
5db06880
NC
2118 s = SvPVX(PL_linestr) + startoff;
2119 }
2120#endif
e759cc13 2121 return s;
5db06880 2122 }
89bfa8cd 2123 }
2124
5db06880
NC
2125#ifdef PERL_MAD
2126 if (PL_madskills && !version) {
cd81e915
NC
2127 sv_free(PL_nextwhite); /* let next token collect whitespace */
2128 PL_nextwhite = 0;
5db06880
NC
2129 s = SvPVX(PL_linestr) + startoff;
2130 }
2131#endif
89bfa8cd 2132 /* NOTE: The parser sees the package name and the VERSION swapped */
cd81e915 2133 start_force(PL_curforce);
9ded7720 2134 NEXTVAL_NEXTTOKE.opval = version;
4e553d73 2135 force_next(WORD);
89bfa8cd 2136
e759cc13 2137 return s;
89bfa8cd 2138}
2139
ffb4593c 2140/*
91152fc1
DG
2141 * S_force_strict_version
2142 * Forces the next token to be a version number using strict syntax rules.
2143 */
2144
2145STATIC char *
2146S_force_strict_version(pTHX_ char *s)
2147{
2148 dVAR;
2149 OP *version = NULL;
2150#ifdef PERL_MAD
2151 I32 startoff = s - SvPVX(PL_linestr);
2152#endif
2153 const char *errstr = NULL;
2154
2155 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2156
2157 while (isSPACE(*s)) /* leading whitespace */
2158 s++;
2159
2160 if (is_STRICT_VERSION(s,&errstr)) {
2161 SV *ver = newSV(0);
2162 s = (char *)scan_version(s, ver, 0);
2163 version = newSVOP(OP_CONST, 0, ver);
2164 }
2165 else if ( (*s != ';' && *s != '}' ) && (s = SKIPSPACE1(s), (*s != ';' && *s !='}' ))) {
2166 PL_bufptr = s;
2167 if (errstr)
2168 yyerror(errstr); /* version required */
2169 return s;
2170 }
2171
2172#ifdef PERL_MAD
2173 if (PL_madskills && !version) {
2174 sv_free(PL_nextwhite); /* let next token collect whitespace */
2175 PL_nextwhite = 0;
2176 s = SvPVX(PL_linestr) + startoff;
2177 }
2178#endif
2179 /* NOTE: The parser sees the package name and the VERSION swapped */
2180 start_force(PL_curforce);
2181 NEXTVAL_NEXTTOKE.opval = version;
2182 force_next(WORD);
2183
2184 return s;
2185}
2186
2187/*
ffb4593c
NT
2188 * S_tokeq
2189 * Tokenize a quoted string passed in as an SV. It finds the next
2190 * chunk, up to end of string or a backslash. It may make a new
2191 * SV containing that chunk (if HINT_NEW_STRING is on). It also
2192 * turns \\ into \.
2193 */
2194
76e3520e 2195STATIC SV *
cea2e8a9 2196S_tokeq(pTHX_ SV *sv)
79072805 2197{
97aff369 2198 dVAR;
79072805
LW
2199 register char *s;
2200 register char *send;
2201 register char *d;
b3ac6de7
IZ
2202 STRLEN len = 0;
2203 SV *pv = sv;
79072805 2204
7918f24d
NC
2205 PERL_ARGS_ASSERT_TOKEQ;
2206
79072805 2207 if (!SvLEN(sv))
b3ac6de7 2208 goto finish;
79072805 2209
a0d0e21e 2210 s = SvPV_force(sv, len);
21a311ee 2211 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1)
b3ac6de7 2212 goto finish;
463ee0b2 2213 send = s + len;
79072805
LW
2214 while (s < send && *s != '\\')
2215 s++;
2216 if (s == send)
b3ac6de7 2217 goto finish;
79072805 2218 d = s;
be4731d2 2219 if ( PL_hints & HINT_NEW_STRING ) {
59cd0e26 2220 pv = newSVpvn_flags(SvPVX_const(pv), len, SVs_TEMP | SvUTF8(sv));
be4731d2 2221 }
79072805
LW
2222 while (s < send) {
2223 if (*s == '\\') {
a0d0e21e 2224 if (s + 1 < send && (s[1] == '\\'))
79072805
LW
2225 s++; /* all that, just for this */
2226 }
2227 *d++ = *s++;
2228 }
2229 *d = '\0';
95a20fc0 2230 SvCUR_set(sv, d - SvPVX_const(sv));
b3ac6de7 2231 finish:
3280af22 2232 if ( PL_hints & HINT_NEW_STRING )
eb0d8d16 2233 return new_constant(NULL, 0, "q", sv, pv, "q", 1);
79072805
LW
2234 return sv;
2235}
2236
ffb4593c
NT
2237/*
2238 * Now come three functions related to double-quote context,
2239 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2240 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2241 * interact with PL_lex_state, and create fake ( ... ) argument lists
2242 * to handle functions and concatenation.
2243 * They assume that whoever calls them will be setting up a fake
2244 * join call, because each subthing puts a ',' after it. This lets
2245 * "lower \luPpEr"
2246 * become
2247 * join($, , 'lower ', lcfirst( 'uPpEr', ) ,)
2248 *
2249 * (I'm not sure whether the spurious commas at the end of lcfirst's
2250 * arguments and join's arguments are created or not).
2251 */
2252
2253/*
2254 * S_sublex_start
6154021b 2255 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
ffb4593c
NT
2256 *
2257 * Pattern matching will set PL_lex_op to the pattern-matching op to
6154021b 2258 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
ffb4593c
NT
2259 *
2260 * OP_CONST and OP_READLINE are easy--just make the new op and return.
2261 *
2262 * Everything else becomes a FUNC.
2263 *
2264 * Sets PL_lex_state to LEX_INTERPPUSH unless (ival was OP_NULL or we
2265 * had an OP_CONST or OP_READLINE). This just sets us up for a
2266 * call to S_sublex_push().
2267 */
2268
76e3520e 2269STATIC I32
cea2e8a9 2270S_sublex_start(pTHX)
79072805 2271{
97aff369 2272 dVAR;
6154021b 2273 register const I32 op_type = pl_yylval.ival;
79072805
LW
2274
2275 if (op_type == OP_NULL) {
6154021b 2276 pl_yylval.opval = PL_lex_op;
5f66b61c 2277 PL_lex_op = NULL;
79072805
LW
2278 return THING;
2279 }
2280 if (op_type == OP_CONST || op_type == OP_READLINE) {
3280af22 2281 SV *sv = tokeq(PL_lex_stuff);
b3ac6de7
IZ
2282
2283 if (SvTYPE(sv) == SVt_PVIV) {
2284 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2285 STRLEN len;
96a5add6 2286 const char * const p = SvPV_const(sv, len);
740cce10 2287 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
b3ac6de7
IZ
2288 SvREFCNT_dec(sv);
2289 sv = nsv;
4e553d73 2290 }
6154021b 2291 pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
a0714e2c 2292 PL_lex_stuff = NULL;
6f33ba73
RGS
2293 /* Allow <FH> // "foo" */
2294 if (op_type == OP_READLINE)
2295 PL_expect = XTERMORDORDOR;
79072805
LW
2296 return THING;
2297 }
e3f73d4e
RGS
2298 else if (op_type == OP_BACKTICK && PL_lex_op) {
2299 /* readpipe() vas overriden */
2300 cSVOPx(cLISTOPx(cUNOPx(PL_lex_op)->op_first)->op_first->op_sibling)->op_sv = tokeq(PL_lex_stuff);
6154021b 2301 pl_yylval.opval = PL_lex_op;
9b201d7d 2302 PL_lex_op = NULL;
e3f73d4e
RGS
2303 PL_lex_stuff = NULL;
2304 return THING;
2305 }
79072805 2306
3280af22 2307 PL_sublex_info.super_state = PL_lex_state;
eac04b2e 2308 PL_sublex_info.sub_inwhat = (U16)op_type;
3280af22
NIS
2309 PL_sublex_info.sub_op = PL_lex_op;
2310 PL_lex_state = LEX_INTERPPUSH;
55497cff 2311
3280af22
NIS
2312 PL_expect = XTERM;
2313 if (PL_lex_op) {
6154021b 2314 pl_yylval.opval = PL_lex_op;
5f66b61c 2315 PL_lex_op = NULL;
55497cff 2316 return PMFUNC;
2317 }
2318 else
2319 return FUNC;
2320}
2321
ffb4593c
NT
2322/*
2323 * S_sublex_push
2324 * Create a new scope to save the lexing state. The scope will be
2325 * ended in S_sublex_done. Returns a '(', starting the function arguments
2326 * to the uc, lc, etc. found before.
2327 * Sets PL_lex_state to LEX_INTERPCONCAT.
2328 */
2329
76e3520e 2330STATIC I32
cea2e8a9 2331S_sublex_push(pTHX)
55497cff 2332{
27da23d5 2333 dVAR;
f46d017c 2334 ENTER;
55497cff 2335
3280af22 2336 PL_lex_state = PL_sublex_info.super_state;
651b5b28 2337 SAVEBOOL(PL_lex_dojoin);
3280af22 2338 SAVEI32(PL_lex_brackets);
3280af22
NIS
2339 SAVEI32(PL_lex_casemods);
2340 SAVEI32(PL_lex_starts);
651b5b28 2341 SAVEI8(PL_lex_state);
7766f137 2342 SAVEVPTR(PL_lex_inpat);
98246f1e 2343 SAVEI16(PL_lex_inwhat);
57843af0 2344 SAVECOPLINE(PL_curcop);
3280af22 2345 SAVEPPTR(PL_bufptr);
8452ff4b 2346 SAVEPPTR(PL_bufend);
3280af22
NIS
2347 SAVEPPTR(PL_oldbufptr);
2348 SAVEPPTR(PL_oldoldbufptr);
207e3d1a
JH
2349 SAVEPPTR(PL_last_lop);
2350 SAVEPPTR(PL_last_uni);
3280af22
NIS
2351 SAVEPPTR(PL_linestart);
2352 SAVESPTR(PL_linestr);
8edd5f42
RGS
2353 SAVEGENERICPV(PL_lex_brackstack);
2354 SAVEGENERICPV(PL_lex_casestack);
3280af22
NIS
2355
2356 PL_linestr = PL_lex_stuff;
a0714e2c 2357 PL_lex_stuff = NULL;
3280af22 2358
9cbb5ea2
GS
2359 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2360 = SvPVX(PL_linestr);
3280af22 2361 PL_bufend += SvCUR(PL_linestr);
bd61b366 2362 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2363 SAVEFREESV(PL_linestr);
2364
2365 PL_lex_dojoin = FALSE;
2366 PL_lex_brackets = 0;
a02a5408
JC
2367 Newx(PL_lex_brackstack, 120, char);
2368 Newx(PL_lex_casestack, 12, char);
3280af22
NIS
2369 PL_lex_casemods = 0;
2370 *PL_lex_casestack = '\0';
2371 PL_lex_starts = 0;
2372 PL_lex_state = LEX_INTERPCONCAT;
eb160463 2373 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
3280af22
NIS
2374
2375 PL_lex_inwhat = PL_sublex_info.sub_inwhat;
2376 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2377 PL_lex_inpat = PL_sublex_info.sub_op;
79072805 2378 else
5f66b61c 2379 PL_lex_inpat = NULL;
79072805 2380
55497cff 2381 return '(';
79072805
LW
2382}
2383
ffb4593c
NT
2384/*
2385 * S_sublex_done
2386 * Restores lexer state after a S_sublex_push.
2387 */
2388
76e3520e 2389STATIC I32
cea2e8a9 2390S_sublex_done(pTHX)
79072805 2391{
27da23d5 2392 dVAR;
3280af22 2393 if (!PL_lex_starts++) {
396482e1 2394 SV * const sv = newSVpvs("");
9aa983d2
JH
2395 if (SvUTF8(PL_linestr))
2396 SvUTF8_on(sv);
3280af22 2397 PL_expect = XOPERATOR;
6154021b 2398 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
79072805
LW
2399 return THING;
2400 }
2401
3280af22
NIS
2402 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2403 PL_lex_state = LEX_INTERPCASEMOD;
cea2e8a9 2404 return yylex();
79072805
LW
2405 }
2406
ffb4593c 2407 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
3280af22
NIS
2408 if (PL_lex_repl && (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS)) {
2409 PL_linestr = PL_lex_repl;
2410 PL_lex_inpat = 0;
2411 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2412 PL_bufend += SvCUR(PL_linestr);
bd61b366 2413 PL_last_lop = PL_last_uni = NULL;
3280af22
NIS
2414 SAVEFREESV(PL_linestr);
2415 PL_lex_dojoin = FALSE;
2416 PL_lex_brackets = 0;
3280af22
NIS
2417 PL_lex_casemods = 0;
2418 *PL_lex_casestack = '\0';
2419 PL_lex_starts = 0;
25da4f38 2420 if (SvEVALED(PL_lex_repl)) {
3280af22
NIS
2421 PL_lex_state = LEX_INTERPNORMAL;
2422 PL_lex_starts++;
e9fa98b2
HS
2423 /* we don't clear PL_lex_repl here, so that we can check later
2424 whether this is an evalled subst; that means we rely on the
2425 logic to ensure sublex_done() is called again only via the
2426 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
79072805 2427 }
e9fa98b2 2428 else {
3280af22 2429 PL_lex_state = LEX_INTERPCONCAT;
a0714e2c 2430 PL_lex_repl = NULL;
e9fa98b2 2431 }
79072805 2432 return ',';
ffed7fef
LW
2433 }
2434 else {
5db06880
NC
2435#ifdef PERL_MAD
2436 if (PL_madskills) {
cd81e915
NC
2437 if (PL_thiswhite) {
2438 if (!PL_endwhite)
6b29d1f5 2439 PL_endwhite = newSVpvs("");
cd81e915
NC
2440 sv_catsv(PL_endwhite, PL_thiswhite);
2441 PL_thiswhite = 0;
2442 }
2443 if (PL_thistoken)
76f68e9b 2444 sv_setpvs(PL_thistoken,"");
5db06880 2445 else
cd81e915 2446 PL_realtokenstart = -1;
5db06880
NC
2447 }
2448#endif
f46d017c 2449 LEAVE;
3280af22
NIS
2450 PL_bufend = SvPVX(PL_linestr);
2451 PL_bufend += SvCUR(PL_linestr);
2452 PL_expect = XOPERATOR;
09bef843 2453 PL_sublex_info.sub_inwhat = 0;
79072805 2454 return ')';
ffed7fef
LW
2455 }
2456}
2457
02aa26ce
NT
2458/*
2459 scan_const
2460
2461 Extracts a pattern, double-quoted string, or transliteration. This
2462 is terrifying code.
2463
94def140 2464 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3280af22 2465 processing a pattern (PL_lex_inpat is true), a transliteration
94def140 2466 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
02aa26ce 2467
94def140
TS
2468 Returns a pointer to the character scanned up to. If this is
2469 advanced from the start pointer supplied (i.e. if anything was
9b599b2a 2470 successfully parsed), will leave an OP for the substring scanned
6154021b 2471 in pl_yylval. Caller must intuit reason for not parsing further
9b599b2a
GS
2472 by looking at the next characters herself.
2473
02aa26ce
NT
2474 In patterns:
2475 backslashes:
ff3f963a 2476 constants: \N{NAME} only
02aa26ce
NT
2477 case and quoting: \U \Q \E
2478 stops on @ and $, but not for $ as tail anchor
2479
2480 In transliterations:
2481 characters are VERY literal, except for - not at the start or end
94def140
TS
2482 of the string, which indicates a range. If the range is in bytes,
2483 scan_const expands the range to the full set of intermediate
2484 characters. If the range is in utf8, the hyphen is replaced with
2485 a certain range mark which will be handled by pmtrans() in op.c.
02aa26ce
NT
2486
2487 In double-quoted strings:
2488 backslashes:
2489 double-quoted style: \r and \n
ff3f963a 2490 constants: \x31, etc.
94def140 2491 deprecated backrefs: \1 (in substitution replacements)
02aa26ce
NT
2492 case and quoting: \U \Q \E
2493 stops on @ and $
2494
2495 scan_const does *not* construct ops to handle interpolated strings.
2496 It stops processing as soon as it finds an embedded $ or @ variable
2497 and leaves it to the caller to work out what's going on.
2498
94def140
TS
2499 embedded arrays (whether in pattern or not) could be:
2500 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
2501
2502 $ in double-quoted strings must be the symbol of an embedded scalar.
02aa26ce
NT
2503
2504 $ in pattern could be $foo or could be tail anchor. Assumption:
2505 it's a tail anchor if $ is the last thing in the string, or if it's
94def140 2506 followed by one of "()| \r\n\t"
02aa26ce
NT
2507
2508 \1 (backreferences) are turned into $1
2509
2510 The structure of the code is
2511 while (there's a character to process) {
94def140
TS
2512 handle transliteration ranges
2513 skip regexp comments /(?#comment)/ and codes /(?{code})/
2514 skip #-initiated comments in //x patterns
2515 check for embedded arrays
02aa26ce
NT
2516 check for embedded scalars
2517 if (backslash) {
94def140 2518 deprecate \1 in substitution replacements
02aa26ce
NT
2519 handle string-changing backslashes \l \U \Q \E, etc.
2520 switch (what was escaped) {
94def140 2521 handle \- in a transliteration (becomes a literal -)
ff3f963a 2522 if a pattern and not \N{, go treat as regular character
94def140
TS
2523 handle \132 (octal characters)
2524 handle \x15 and \x{1234} (hex characters)
ff3f963a 2525 handle \N{name} (named characters, also \N{3,5} in a pattern)
94def140
TS
2526 handle \cV (control characters)
2527 handle printf-style backslashes (\f, \r, \n, etc)
02aa26ce 2528 } (end switch)
77a135fe 2529 continue
02aa26ce 2530 } (end if backslash)
77a135fe 2531 handle regular character
02aa26ce 2532 } (end while character to read)
4e553d73 2533
02aa26ce
NT
2534*/
2535
76e3520e 2536STATIC char *
cea2e8a9 2537S_scan_const(pTHX_ char *start)
79072805 2538{
97aff369 2539 dVAR;
3280af22 2540 register char *send = PL_bufend; /* end of the constant */
77a135fe
KW
2541 SV *sv = newSV(send - start); /* sv for the constant. See
2542 note below on sizing. */
02aa26ce
NT
2543 register char *s = start; /* start of the constant */
2544 register char *d = SvPVX(sv); /* destination for copies */
2545 bool dorange = FALSE; /* are we in a translit range? */
c2e66d9e 2546 bool didrange = FALSE; /* did we just finish a range? */
2b9d42f0 2547 I32 has_utf8 = FALSE; /* Output constant is UTF8 */
77a135fe
KW
2548 I32 this_utf8 = UTF; /* Is the source string assumed
2549 to be UTF8? But, this can
2550 show as true when the source
2551 isn't utf8, as for example
2552 when it is entirely composed
2553 of hex constants */
2554
2555 /* Note on sizing: The scanned constant is placed into sv, which is
2556 * initialized by newSV() assuming one byte of output for every byte of
2557 * input. This routine expects newSV() to allocate an extra byte for a
2558 * trailing NUL, which this routine will append if it gets to the end of
2559 * the input. There may be more bytes of input than output (eg., \N{LATIN
2560 * CAPITAL LETTER A}), or more output than input if the constant ends up
2561 * recoded to utf8, but each time a construct is found that might increase
2562 * the needed size, SvGROW() is called. Its size parameter each time is
2563 * based on the best guess estimate at the time, namely the length used so
2564 * far, plus the length the current construct will occupy, plus room for
2565 * the trailing NUL, plus one byte for every input byte still unscanned */
2566
012bcf8d 2567 UV uv;
4c3a8340
TS
2568#ifdef EBCDIC
2569 UV literal_endpoint = 0;
e294cc5d 2570 bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
4c3a8340 2571#endif
012bcf8d 2572
7918f24d
NC
2573 PERL_ARGS_ASSERT_SCAN_CONST;
2574
2b9d42f0
NIS
2575 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
2576 /* If we are doing a trans and we know we want UTF8 set expectation */
2577 has_utf8 = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
2578 this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
2579 }
2580
2581
79072805 2582 while (s < send || dorange) {
ff3f963a 2583
02aa26ce 2584 /* get transliterations out of the way (they're most literal) */
3280af22 2585 if (PL_lex_inwhat == OP_TRANS) {
02aa26ce 2586 /* expand a range A-Z to the full set of characters. AIE! */
79072805 2587 if (dorange) {
1ba5c669
JH
2588 I32 i; /* current expanded character */
2589 I32 min; /* first character in range */
2590 I32 max; /* last character in range */
02aa26ce 2591
e294cc5d
JH
2592#ifdef EBCDIC
2593 UV uvmax = 0;
2594#endif
2595
2596 if (has_utf8
2597#ifdef EBCDIC
2598 && !native_range
2599#endif
2600 ) {
9d4ba2ae 2601 char * const c = (char*)utf8_hop((U8*)d, -1);
8973db79
JH
2602 char *e = d++;
2603 while (e-- > c)
2604 *(e + 1) = *e;
25716404 2605 *c = (char)UTF_TO_NATIVE(0xff);
8973db79
JH
2606 /* mark the range as done, and continue */
2607 dorange = FALSE;
2608 didrange = TRUE;
2609 continue;
2610 }
2b9d42f0 2611
95a20fc0 2612 i = d - SvPVX_const(sv); /* remember current offset */
e294cc5d
JH
2613#ifdef EBCDIC
2614 SvGROW(sv,
2615 SvLEN(sv) + (has_utf8 ?
2616 (512 - UTF_CONTINUATION_MARK +
2617 UNISKIP(0x100))
2618 : 256));
2619 /* How many two-byte within 0..255: 128 in UTF-8,
2620 * 96 in UTF-8-mod. */
2621#else
9cbb5ea2 2622 SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
e294cc5d 2623#endif
9cbb5ea2 2624 d = SvPVX(sv) + i; /* refresh d after realloc */
e294cc5d
JH
2625#ifdef EBCDIC
2626 if (has_utf8) {
2627 int j;
2628 for (j = 0; j <= 1; j++) {
2629 char * const c = (char*)utf8_hop((U8*)d, -1);
2630 const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
2631 if (j)
2632 min = (U8)uv;
2633 else if (uv < 256)
2634 max = (U8)uv;
2635 else {
2636 max = (U8)0xff; /* only to \xff */
2637 uvmax = uv; /* \x{100} to uvmax */
2638 }
2639 d = c; /* eat endpoint chars */
2640 }
2641 }
2642 else {
2643#endif
2644 d -= 2; /* eat the first char and the - */
2645 min = (U8)*d; /* first char in range */
2646 max = (U8)d[1]; /* last char in range */
2647#ifdef EBCDIC
2648 }
2649#endif
8ada0baa 2650
c2e66d9e 2651 if (min > max) {
01ec43d0 2652 Perl_croak(aTHX_
d1573ac7 2653 "Invalid range \"%c-%c\" in transliteration operator",
1ba5c669 2654 (char)min, (char)max);
c2e66d9e
GS
2655 }
2656
c7f1f016 2657#ifdef EBCDIC
4c3a8340
TS
2658 if (literal_endpoint == 2 &&
2659 ((isLOWER(min) && isLOWER(max)) ||
2660 (isUPPER(min) && isUPPER(max)))) {
8ada0baa
JH
2661 if (isLOWER(min)) {
2662 for (i = min; i <= max; i++)
2663 if (isLOWER(i))
db42d148 2664 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2665 } else {
2666 for (i = min; i <= max; i++)
2667 if (isUPPER(i))
db42d148 2668 *d++ = NATIVE_TO_NEED(has_utf8,i);
8ada0baa
JH
2669 }
2670 }
2671 else
2672#endif
2673 for (i = min; i <= max; i++)
e294cc5d
JH
2674#ifdef EBCDIC
2675 if (has_utf8) {
2676 const U8 ch = (U8)NATIVE_TO_UTF(i);
2677 if (UNI_IS_INVARIANT(ch))
2678 *d++ = (U8)i;
2679 else {
2680 *d++ = (U8)UTF8_EIGHT_BIT_HI(ch);
2681 *d++ = (U8)UTF8_EIGHT_BIT_LO(ch);
2682 }
2683 }
2684 else
2685#endif
2686 *d++ = (char)i;
2687
2688#ifdef EBCDIC
2689 if (uvmax) {
2690 d = (char*)uvchr_to_utf8((U8*)d, 0x100);
2691 if (uvmax > 0x101)
2692 *d++ = (char)UTF_TO_NATIVE(0xff);
2693 if (uvmax > 0x100)
2694 d = (char*)uvchr_to_utf8((U8*)d, uvmax);
2695 }
2696#endif
02aa26ce
NT
2697
2698 /* mark the range as done, and continue */
79072805 2699 dorange = FALSE;
01ec43d0 2700 didrange = TRUE;
4c3a8340
TS
2701#ifdef EBCDIC
2702 literal_endpoint = 0;
2703#endif
79072805 2704 continue;
4e553d73 2705 }
02aa26ce
NT
2706
2707 /* range begins (ignore - as first or last char) */
79072805 2708 else if (*s == '-' && s+1 < send && s != start) {
4e553d73 2709 if (didrange) {
1fafa243 2710 Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
01ec43d0 2711 }
e294cc5d
JH
2712 if (has_utf8
2713#ifdef EBCDIC
2714 && !native_range
2715#endif
2716 ) {
25716404 2717 *d++ = (char)UTF_TO_NATIVE(0xff); /* use illegal utf8 byte--see pmtrans */
a0ed51b3
LW
2718 s++;
2719 continue;
2720 }
79072805
LW
2721 dorange = TRUE;
2722 s++;
01ec43d0
GS
2723 }
2724 else {
2725 didrange = FALSE;
4c3a8340
TS
2726#ifdef EBCDIC
2727 literal_endpoint = 0;
e294cc5d 2728 native_range = TRUE;
4c3a8340 2729#endif
01ec43d0 2730 }
79072805 2731 }
02aa26ce
NT
2732
2733 /* if we get here, we're not doing a transliteration */
2734
0f5d15d6
IZ
2735 /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
2736 except for the last char, which will be done separately. */
3280af22 2737 else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
cc6b7395 2738 if (s[2] == '#') {
e994fd66 2739 while (s+1 < send && *s != ')')
db42d148 2740 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
155aba94
GS
2741 }
2742 else if (s[2] == '{' /* This should match regcomp.c */
67edc0c9 2743 || (s[2] == '?' && s[3] == '{'))
155aba94 2744 {
cc6b7395 2745 I32 count = 1;
0f5d15d6 2746 char *regparse = s + (s[2] == '{' ? 3 : 4);
cc6b7395
IZ
2747 char c;
2748
d9f97599
GS
2749 while (count && (c = *regparse)) {
2750 if (c == '\\' && regparse[1])
2751 regparse++;
4e553d73 2752 else if (c == '{')
cc6b7395 2753 count++;
4e553d73 2754 else if (c == '}')
cc6b7395 2755 count--;
d9f97599 2756 regparse++;
cc6b7395 2757 }
e994fd66 2758 if (*regparse != ')')
5bdf89e7 2759 regparse--; /* Leave one char for continuation. */
0f5d15d6 2760 while (s < regparse)
db42d148 2761 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
cc6b7395 2762 }
748a9306 2763 }
02aa26ce
NT
2764
2765 /* likewise skip #-initiated comments in //x patterns */
3280af22
NIS
2766 else if (*s == '#' && PL_lex_inpat &&
2767 ((PMOP*)PL_lex_inpat)->op_pmflags & PMf_EXTENDED) {
748a9306 2768 while (s+1 < send && *s != '\n')
db42d148 2769 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
748a9306 2770 }
02aa26ce 2771
5d1d4326 2772 /* check for embedded arrays
da6eedaa 2773 (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
5d1d4326 2774 */
1749ea0d
TS
2775 else if (*s == '@' && s[1]) {
2776 if (isALNUM_lazy_if(s+1,UTF))
2777 break;
2778 if (strchr(":'{$", s[1]))
2779 break;
2780 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
2781 break; /* in regexp, neither @+ nor @- are interpolated */
2782 }
02aa26ce
NT
2783
2784 /* check for embedded scalars. only stop if we're sure it's a
2785 variable.
2786 */
79072805 2787 else if (*s == '$') {
3280af22 2788 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
79072805 2789 break;
77772344 2790 if (s + 1 < send && !strchr("()| \r\n\t", s[1])) {
a2a5de95
NC
2791 if (s[1] == '\\') {
2792 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
2793 "Possible unintended interpolation of $\\ in regex");
77772344 2794 }
79072805 2795 break; /* in regexp, $ might be tail anchor */
77772344 2796 }
79072805 2797 }
02aa26ce 2798
2b9d42f0
NIS
2799 /* End of else if chain - OP_TRANS rejoin rest */
2800
02aa26ce 2801 /* backslashes */
79072805 2802 if (*s == '\\' && s+1 < send) {
ff3f963a
KW
2803 char* e; /* Can be used for ending '}', etc. */
2804
79072805 2805 s++;
02aa26ce 2806
02aa26ce 2807 /* deprecate \1 in strings and substitution replacements */
3280af22 2808 if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat &&
a0d0e21e 2809 isDIGIT(*s) && *s != '0' && !isDIGIT(s[1]))
79072805 2810 {
a2a5de95 2811 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
79072805
LW
2812 *--s = '$';
2813 break;
2814 }
02aa26ce
NT
2815
2816 /* string-change backslash escapes */
3280af22 2817 if (PL_lex_inwhat != OP_TRANS && *s && strchr("lLuUEQ", *s)) {
79072805
LW
2818 --s;
2819 break;
2820 }
ff3f963a
KW
2821 /* In a pattern, process \N, but skip any other backslash escapes.
2822 * This is because we don't want to translate an escape sequence
2823 * into a meta symbol and have the regex compiler use the meta
2824 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
2825 * in spite of this, we do have to process \N here while the proper
2826 * charnames handler is in scope. See bugs #56444 and #62056.
2827 * There is a complication because \N in a pattern may also stand
2828 * for 'match a non-nl', and not mean a charname, in which case its
2829 * processing should be deferred to the regex compiler. To be a
2830 * charname it must be followed immediately by a '{', and not look
2831 * like \N followed by a curly quantifier, i.e., not something like
2832 * \N{3,}. regcurly returns a boolean indicating if it is a legal
2833 * quantifier */
2834 else if (PL_lex_inpat
2835 && (*s != 'N'
2836 || s[1] != '{'
2837 || regcurly(s + 1)))
2838 {
cc74c5bd
TS
2839 *d++ = NATIVE_TO_NEED(has_utf8,'\\');
2840 goto default_action;
2841 }
02aa26ce 2842
79072805 2843 switch (*s) {
02aa26ce
NT
2844
2845 /* quoted - in transliterations */
79072805 2846 case '-':
3280af22 2847 if (PL_lex_inwhat == OP_TRANS) {
79072805
LW
2848 *d++ = *s++;
2849 continue;
2850 }
2851 /* FALL THROUGH */
2852 default:
11b8faa4 2853 {
a2a5de95
NC
2854 if ((isALPHA(*s) || isDIGIT(*s)))
2855 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
2856 "Unrecognized escape \\%c passed through",
2857 *s);
11b8faa4 2858 /* default action is to copy the quoted character */
f9a63242 2859 goto default_action;
11b8faa4 2860 }
02aa26ce 2861
77a135fe 2862 /* eg. \132 indicates the octal constant 0x132 */
79072805
LW
2863 case '0': case '1': case '2': case '3':
2864 case '4': case '5': case '6': case '7':
ba210ebe 2865 {
53305cf1
NC
2866 I32 flags = 0;
2867 STRLEN len = 3;
77a135fe 2868 uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL));
ba210ebe
JH
2869 s += len;
2870 }
012bcf8d 2871 goto NUM_ESCAPE_INSERT;
02aa26ce 2872
77a135fe 2873 /* eg. \x24 indicates the hex constant 0x24 */
79072805 2874 case 'x':
a0ed51b3
LW
2875 ++s;
2876 if (*s == '{') {
9d4ba2ae 2877 char* const e = strchr(s, '}');
a4c04bdc
NC
2878 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
2879 PERL_SCAN_DISALLOW_PREFIX;
53305cf1 2880 STRLEN len;
355860ce 2881
53305cf1 2882 ++s;
adaeee49 2883 if (!e) {
a0ed51b3 2884 yyerror("Missing right brace on \\x{}");
355860ce 2885 continue;
ba210ebe 2886 }
53305cf1 2887 len = e - s;
77a135fe 2888 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe 2889 s = e + 1;
a0ed51b3
LW
2890 }
2891 else {
ba210ebe 2892 {
53305cf1 2893 STRLEN len = 2;
a4c04bdc 2894 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
77a135fe 2895 uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
ba210ebe
JH
2896 s += len;
2897 }
012bcf8d
GS
2898 }
2899
2900 NUM_ESCAPE_INSERT:
ff3f963a
KW
2901 /* Insert oct or hex escaped character. There will always be
2902 * enough room in sv since such escapes will be longer than any
2903 * UTF-8 sequence they can end up as, except if they force us
2904 * to recode the rest of the string into utf8 */
ba7cea30 2905
77a135fe 2906 /* Here uv is the ordinal of the next character being added in
ff3f963a 2907 * unicode (converted from native). */
77a135fe 2908 if (!UNI_IS_INVARIANT(uv)) {
9aa983d2 2909 if (!has_utf8 && uv > 255) {
77a135fe
KW
2910 /* Might need to recode whatever we have accumulated so
2911 * far if it contains any chars variant in utf8 or
2912 * utf-ebcdic. */
2913
2914 SvCUR_set(sv, d - SvPVX_const(sv));
2915 SvPOK_on(sv);
2916 *d = '\0';
77a135fe 2917 /* See Note on sizing above. */
7bf79863
KW
2918 sv_utf8_upgrade_flags_grow(sv,
2919 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
2920 UNISKIP(uv) + (STRLEN)(send - s) + 1);
77a135fe
KW
2921 d = SvPVX(sv) + SvCUR(sv);
2922 has_utf8 = TRUE;
012bcf8d
GS
2923 }
2924
77a135fe
KW
2925 if (has_utf8) {
2926 d = (char*)uvuni_to_utf8((U8*)d, uv);
f9a63242
JH
2927 if (PL_lex_inwhat == OP_TRANS &&
2928 PL_sublex_info.sub_op) {
2929 PL_sublex_info.sub_op->op_private |=
2930 (PL_lex_repl ? OPpTRANS_FROM_UTF
2931 : OPpTRANS_TO_UTF);
f9a63242 2932 }
e294cc5d
JH
2933#ifdef EBCDIC
2934 if (uv > 255 && !dorange)
2935 native_range = FALSE;
2936#endif
012bcf8d 2937 }
a0ed51b3 2938 else {
012bcf8d 2939 *d++ = (char)uv;
a0ed51b3 2940 }
012bcf8d
GS
2941 }
2942 else {
c4d5f83a 2943 *d++ = (char) uv;
a0ed51b3 2944 }
79072805 2945 continue;
02aa26ce 2946
4a2d328f 2947 case 'N':
ff3f963a
KW
2948 /* In a non-pattern \N must be a named character, like \N{LATIN
2949 * SMALL LETTER A} or \N{U+0041}. For patterns, it also can
2950 * mean to match a non-newline. For non-patterns, named
2951 * characters are converted to their string equivalents. In
2952 * patterns, named characters are not converted to their
2953 * ultimate forms for the same reasons that other escapes
2954 * aren't. Instead, they are converted to the \N{U+...} form
2955 * to get the value from the charnames that is in effect right
2956 * now, while preserving the fact that it was a named character
2957 * so that the regex compiler knows this */
2958
2959 /* This section of code doesn't generally use the
2960 * NATIVE_TO_NEED() macro to transform the input. I (khw) did
2961 * a close examination of this macro and determined it is a
2962 * no-op except on utfebcdic variant characters. Every
2963 * character generated by this that would normally need to be
2964 * enclosed by this macro is invariant, so the macro is not
2965 * needed, and would complicate use of copy(). There are other
2966 * parts of this file where the macro is used inconsistently,
2967 * but are saved by it being a no-op */
2968
2969 /* The structure of this section of code (besides checking for
2970 * errors and upgrading to utf8) is:
2971 * Further disambiguate between the two meanings of \N, and if
2972 * not a charname, go process it elsewhere
0a96133f
KW
2973 * If of form \N{U+...}, pass it through if a pattern;
2974 * otherwise convert to utf8
2975 * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
2976 * pattern; otherwise convert to utf8 */
ff3f963a
KW
2977
2978 /* Here, s points to the 'N'; the test below is guaranteed to
2979 * succeed if we are being called on a pattern as we already
2980 * know from a test above that the next character is a '{'.
2981 * On a non-pattern \N must mean 'named sequence, which
2982 * requires braces */
2983 s++;
2984 if (*s != '{') {
2985 yyerror("Missing braces on \\N{}");
2986 continue;
2987 }
2988 s++;
2989
0a96133f 2990 /* If there is no matching '}', it is an error. */
ff3f963a
KW
2991 if (! (e = strchr(s, '}'))) {
2992 if (! PL_lex_inpat) {
5777a3f7 2993 yyerror("Missing right brace on \\N{}");
0a96133f
KW
2994 } else {
2995 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N.");
dbc0d4f2 2996 }
0a96133f 2997 continue;
ff3f963a 2998 }
cddc7ef4 2999
ff3f963a 3000 /* Here it looks like a named character */
cddc7ef4 3001
ff3f963a
KW
3002 if (PL_lex_inpat) {
3003
3004 /* XXX This block is temporary code. \N{} implies that the
3005 * pattern is to have Unicode semantics, and therefore
3006 * currently has to be encoded in utf8. By putting it in
3007 * utf8 now, we save a whole pass in the regular expression
3008 * compiler. Once that code is changed so Unicode
3009 * semantics doesn't necessarily have to be in utf8, this
3010 * block should be removed */
3011 if (!has_utf8) {
77a135fe 3012 SvCUR_set(sv, d - SvPVX_const(sv));
f08d6ad9 3013 SvPOK_on(sv);
e4f3eed8 3014 *d = '\0';
77a135fe 3015 /* See Note on sizing above. */
7bf79863 3016 sv_utf8_upgrade_flags_grow(sv,
ff3f963a
KW
3017 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3018 /* 5 = '\N{' + cur char + NUL */
3019 (STRLEN)(send - s) + 5);
f08d6ad9 3020 d = SvPVX(sv) + SvCUR(sv);
89491803 3021 has_utf8 = TRUE;
ff3f963a
KW
3022 }
3023 }
423cee85 3024
ff3f963a
KW
3025 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3026 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3027 | PERL_SCAN_DISALLOW_PREFIX;
3028 STRLEN len;
3029
3030 /* For \N{U+...}, the '...' is a unicode value even on
3031 * EBCDIC machines */
3032 s += 2; /* Skip to next char after the 'U+' */
3033 len = e - s;
3034 uv = grok_hex(s, &len, &flags, NULL);
3035 if (len == 0 || len != (STRLEN)(e - s)) {
3036 yyerror("Invalid hexadecimal number in \\N{U+...}");
3037 s = e + 1;
3038 continue;
3039 }
3040
3041 if (PL_lex_inpat) {
3042
3043 /* Pass through to the regex compiler unchanged. The
3044 * reason we evaluated the number above is to make sure
0a96133f 3045 * there wasn't a syntax error. */
ff3f963a
KW
3046 s -= 5; /* Include the '\N{U+' */
3047 Copy(s, d, e - s + 1, char); /* 1 = include the } */
3048 d += e - s + 1;
3049 }
3050 else { /* Not a pattern: convert the hex to string */
3051
3052 /* If destination is not in utf8, unconditionally
3053 * recode it to be so. This is because \N{} implies
3054 * Unicode semantics, and scalars have to be in utf8
3055 * to guarantee those semantics */
3056 if (! has_utf8) {
3057 SvCUR_set(sv, d - SvPVX_const(sv));
3058 SvPOK_on(sv);
3059 *d = '\0';
3060 /* See Note on sizing above. */
3061 sv_utf8_upgrade_flags_grow(
3062 sv,
3063 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3064 UNISKIP(uv) + (STRLEN)(send - e) + 1);
3065 d = SvPVX(sv) + SvCUR(sv);
3066 has_utf8 = TRUE;
3067 }
3068
3069 /* Add the string to the output */
3070 if (UNI_IS_INVARIANT(uv)) {
3071 *d++ = (char) uv;
3072 }
3073 else d = (char*)uvuni_to_utf8((U8*)d, uv);
3074 }
3075 }
3076 else { /* Here is \N{NAME} but not \N{U+...}. */
3077
3078 SV *res; /* result from charnames */
3079 const char *str; /* the string in 'res' */
3080 STRLEN len; /* its length */
3081
3082 /* Get the value for NAME */
3083 res = newSVpvn(s, e - s);
3084 res = new_constant( NULL, 0, "charnames",
3085 /* includes all of: \N{...} */
3086 res, NULL, s - 3, e - s + 4 );
3087
3088 /* Most likely res will be in utf8 already since the
3089 * standard charnames uses pack U, but a custom translator
3090 * can leave it otherwise, so make sure. XXX This can be
3091 * revisited to not have charnames use utf8 for characters
3092 * that don't need it when regexes don't have to be in utf8
3093 * for Unicode semantics. If doing so, remember EBCDIC */
3094 sv_utf8_upgrade(res);
3095 str = SvPV_const(res, len);
3096
3097 /* Don't accept malformed input */
3098 if (! is_utf8_string((U8 *) str, len)) {
3099 yyerror("Malformed UTF-8 returned by \\N");
3100 }
3101 else if (PL_lex_inpat) {
3102
3103 if (! len) { /* The name resolved to an empty string */
3104 Copy("\\N{}", d, 4, char);
3105 d += 4;
3106 }
3107 else {
3108 /* In order to not lose information for the regex
3109 * compiler, pass the result in the specially made
3110 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
3111 * the code points in hex of each character
3112 * returned by charnames */
3113
3114 const char *str_end = str + len;
3115 STRLEN char_length; /* cur char's byte length */
3116 STRLEN output_length; /* and the number of bytes
3117 after this is translated
3118 into hex digits */
3119 const STRLEN off = d - SvPVX_const(sv);
3120
3121 /* 2 hex per byte; 2 chars for '\N'; 2 chars for
3122 * max('U+', '.'); and 1 for NUL */
3123 char hex_string[2 * UTF8_MAXBYTES + 5];
3124
3125 /* Get the first character of the result. */
3126 U32 uv = utf8n_to_uvuni((U8 *) str,
3127 len,
3128 &char_length,
3129 UTF8_ALLOW_ANYUV);
3130
3131 /* The call to is_utf8_string() above hopefully
3132 * guarantees that there won't be an error. But
3133 * it's easy here to make sure. The function just
3134 * above warns and returns 0 if invalid utf8, but
3135 * it can also return 0 if the input is validly a
3136 * NUL. Disambiguate */
3137 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3138 uv = UNICODE_REPLACEMENT;
3139 }
3140
3141 /* Convert first code point to hex, including the
3142 * boiler plate before it */
3143 sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
3144 output_length = strlen(hex_string);
3145
3146 /* Make sure there is enough space to hold it */
3147 d = off + SvGROW(sv, off
3148 + output_length
3149 + (STRLEN)(send - e)
3150 + 2); /* '}' + NUL */
3151 /* And output it */
3152 Copy(hex_string, d, output_length, char);
3153 d += output_length;
3154
3155 /* For each subsequent character, append dot and
3156 * its ordinal in hex */
3157 while ((str += char_length) < str_end) {
3158 const STRLEN off = d - SvPVX_const(sv);
3159 U32 uv = utf8n_to_uvuni((U8 *) str,
3160 str_end - str,
3161 &char_length,
3162 UTF8_ALLOW_ANYUV);
3163 if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
3164 uv = UNICODE_REPLACEMENT;
3165 }
3166
3167 sprintf(hex_string, ".%X", (unsigned int) uv);
3168 output_length = strlen(hex_string);
3169
3170 d = off + SvGROW(sv, off
3171 + output_length
3172 + (STRLEN)(send - e)
3173 + 2); /* '}' + NUL */
3174 Copy(hex_string, d, output_length, char);
3175 d += output_length;
3176 }
3177
3178 *d++ = '}'; /* Done. Add the trailing brace */
3179 }
3180 }
3181 else { /* Here, not in a pattern. Convert the name to a
3182 * string. */
3183
3184 /* If destination is not in utf8, unconditionally
3185 * recode it to be so. This is because \N{} implies
3186 * Unicode semantics, and scalars have to be in utf8
3187 * to guarantee those semantics */
3188 if (! has_utf8) {
3189 SvCUR_set(sv, d - SvPVX_const(sv));
3190 SvPOK_on(sv);
3191 *d = '\0';
3192 /* See Note on sizing above. */
3193 sv_utf8_upgrade_flags_grow(sv,
3194 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3195 len + (STRLEN)(send - s) + 1);
3196 d = SvPVX(sv) + SvCUR(sv);
3197 has_utf8 = TRUE;
3198 } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
3199
3200 /* See Note on sizing above. (NOTE: SvCUR() is not
3201 * set correctly here). */
3202 const STRLEN off = d - SvPVX_const(sv);
3203 d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
3204 }
3205 Copy(str, d, len, char);
3206 d += len;
423cee85 3207 }
423cee85 3208 SvREFCNT_dec(res);
423cee85 3209 }
ff3f963a
KW
3210#ifdef EBCDIC
3211 if (!dorange)
3212 native_range = FALSE; /* \N{} is defined to be Unicode */
3213#endif
3214 s = e + 1; /* Point to just after the '}' */
423cee85
JH
3215 continue;
3216
02aa26ce 3217 /* \c is a control character */
79072805
LW
3218 case 'c':
3219 s++;
961ce445 3220 if (s < send) {
ba210ebe 3221 U8 c = *s++;
c7f1f016
NIS
3222#ifdef EBCDIC
3223 if (isLOWER(c))
3224 c = toUPPER(c);
3225#endif
db42d148 3226 *d++ = NATIVE_TO_NEED(has_utf8,toCTRL(c));
ba210ebe 3227 }
961ce445
RGS
3228 else {
3229 yyerror("Missing control char name in \\c");
3230 }
79072805 3231 continue;
02aa26ce
NT
3232
3233 /* printf-style backslashes, formfeeds, newlines, etc */
79072805 3234 case 'b':
db42d148 3235 *d++ = NATIVE_TO_NEED(has_utf8,'\b');
79072805
LW
3236 break;
3237 case 'n':
db42d148 3238 *d++ = NATIVE_TO_NEED(has_utf8,'\n');
79072805
LW
3239 break;
3240 case 'r':
db42d148 3241 *d++ = NATIVE_TO_NEED(has_utf8,'\r');
79072805
LW
3242 break;
3243 case 'f':
db42d148 3244 *d++ = NATIVE_TO_NEED(has_utf8,'\f');
79072805
LW
3245 break;
3246 case 't':
db42d148 3247 *d++ = NATIVE_TO_NEED(has_utf8,'\t');
79072805 3248 break;
34a3fe2a 3249 case 'e':
db42d148 3250 *d++ = ASCII_TO_NEED(has_utf8,'\033');
34a3fe2a
PP
3251 break;
3252 case 'a':
db42d148 3253 *d++ = ASCII_TO_NEED(has_utf8,'\007');
79072805 3254 break;
02aa26ce
NT
3255 } /* end switch */
3256
79072805
LW
3257 s++;
3258 continue;
02aa26ce 3259 } /* end if (backslash) */
4c3a8340
TS
3260#ifdef EBCDIC
3261 else
3262 literal_endpoint++;
3263#endif
02aa26ce 3264
f9a63242 3265 default_action:
77a135fe
KW
3266 /* If we started with encoded form, or already know we want it,
3267 then encode the next character */
3268 if (! NATIVE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
2b9d42f0 3269 STRLEN len = 1;
77a135fe
KW
3270
3271
3272 /* One might think that it is wasted effort in the case of the
3273 * source being utf8 (this_utf8 == TRUE) to take the next character
3274 * in the source, convert it to an unsigned value, and then convert
3275 * it back again. But the source has not been validated here. The
3276 * routine that does the conversion checks for errors like
3277 * malformed utf8 */
3278
5f66b61c
AL
3279 const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
3280 const STRLEN need = UNISKIP(NATIVE_TO_UNI(nextuv));
77a135fe
KW
3281 if (!has_utf8) {
3282 SvCUR_set(sv, d - SvPVX_const(sv));
3283 SvPOK_on(sv);
3284 *d = '\0';
77a135fe 3285 /* See Note on sizing above. */
7bf79863
KW
3286 sv_utf8_upgrade_flags_grow(sv,
3287 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3288 need + (STRLEN)(send - s) + 1);
77a135fe
KW
3289 d = SvPVX(sv) + SvCUR(sv);
3290 has_utf8 = TRUE;
3291 } else if (need > len) {
3292 /* encoded value larger than old, may need extra space (NOTE:
3293 * SvCUR() is not set correctly here). See Note on sizing
3294 * above. */
9d4ba2ae 3295 const STRLEN off = d - SvPVX_const(sv);
77a135fe 3296 d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
2b9d42f0 3297 }
77a135fe
KW
3298 s += len;
3299
5f66b61c 3300 d = (char*)uvchr_to_utf8((U8*)d, nextuv);
e294cc5d
JH
3301#ifdef EBCDIC
3302 if (uv > 255 && !dorange)
3303 native_range = FALSE;
3304#endif
2b9d42f0
NIS
3305 }
3306 else {
3307 *d++ = NATIVE_TO_NEED(has_utf8,*s++);
3308 }
02aa26ce
NT
3309 } /* while loop to process each character */
3310
3311 /* terminate the string and set up the sv */
79072805 3312 *d = '\0';
95a20fc0 3313 SvCUR_set(sv, d - SvPVX_const(sv));
2b9d42f0 3314 if (SvCUR(sv) >= SvLEN(sv))
d0063567 3315 Perl_croak(aTHX_ "panic: constant overflowed allocated space");
2b9d42f0 3316
79072805 3317 SvPOK_on(sv);
9f4817db 3318 if (PL_encoding && !has_utf8) {
d0063567
DK
3319 sv_recode_to_utf8(sv, PL_encoding);
3320 if (SvUTF8(sv))
3321 has_utf8 = TRUE;
9f4817db 3322 }
2b9d42f0 3323 if (has_utf8) {
7e2040f0 3324 SvUTF8_on(sv);
2b9d42f0 3325 if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
d0063567 3326 PL_sublex_info.sub_op->op_private |=
2b9d42f0
NIS
3327 (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
3328 }
3329 }
79072805 3330
02aa26ce 3331 /* shrink the sv if we allocated more than we used */
79072805 3332 if (SvCUR(sv) + 5 < SvLEN(sv)) {
1da4ca5f 3333 SvPV_shrink_to_cur(sv);
79072805 3334 }
02aa26ce 3335
6154021b 3336 /* return the substring (via pl_yylval) only if we parsed anything */
3280af22 3337 if (s > PL_bufptr) {
eb0d8d16
NC
3338 if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) {
3339 const char *const key = PL_lex_inpat ? "qr" : "q";
3340 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
3341 const char *type;
3342 STRLEN typelen;
3343
3344 if (PL_lex_inwhat == OP_TRANS) {
3345 type = "tr";
3346 typelen = 2;
3347 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
3348 type = "s";
3349 typelen = 1;
3350 } else {
3351 type = "qq";
3352 typelen = 2;
3353 }
3354
3355 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
3356 type, typelen);
3357 }
6154021b 3358 pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
b3ac6de7 3359 } else
8990e307 3360 SvREFCNT_dec(sv);
79072805
LW
3361 return s;
3362}
3363
ffb4593c
NT
3364/* S_intuit_more
3365 * Returns TRUE if there's more to the expression (e.g., a subscript),
3366 * FALSE otherwise.
ffb4593c
NT
3367 *
3368 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
3369 *
3370 * ->[ and ->{ return TRUE
3371 * { and [ outside a pattern are always subscripts, so return TRUE
3372 * if we're outside a pattern and it's not { or [, then return FALSE
3373 * if we're in a pattern and the first char is a {
3374 * {4,5} (any digits around the comma) returns FALSE
3375 * if we're in a pattern and the first char is a [
3376 * [] returns FALSE
3377 * [SOMETHING] has a funky algorithm to decide whether it's a
3378 * character class or not. It has to deal with things like
3379 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
3380 * anything else returns TRUE
3381 */
3382
9cbb5ea2
GS
3383/* This is the one truly awful dwimmer necessary to conflate C and sed. */
3384
76e3520e 3385STATIC int
cea2e8a9 3386S_intuit_more(pTHX_ register char *s)
79072805 3387{
97aff369 3388 dVAR;
7918f24d
NC
3389
3390 PERL_ARGS_ASSERT_INTUIT_MORE;
3391
3280af22 3392 if (PL_lex_brackets)
79072805
LW
3393 return TRUE;
3394 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
3395 return TRUE;
3396 if (*s != '{' && *s != '[')
3397 return FALSE;
3280af22 3398 if (!PL_lex_inpat)
79072805
LW
3399 return TRUE;
3400
3401 /* In a pattern, so maybe we have {n,m}. */
3402 if (*s == '{') {
3403 s++;
3404 if (!isDIGIT(*s))
3405 return TRUE;
3406 while (isDIGIT(*s))
3407 s++;
3408 if (*s == ',')
3409 s++;
3410 while (isDIGIT(*s))
3411 s++;
3412 if (*s == '}')
3413 return FALSE;
3414 return TRUE;
3415
3416 }
3417
3418 /* On the other hand, maybe we have a character class */
3419
3420 s++;
3421 if (*s == ']' || *s == '^')
3422 return FALSE;
3423 else {
ffb4593c 3424 /* this is terrifying, and it works */
79072805
LW
3425 int weight = 2; /* let's weigh the evidence */
3426 char seen[256];
f27ffc4a 3427 unsigned char un_char = 255, last_un_char;
9d4ba2ae 3428 const char * const send = strchr(s,']');
3280af22 3429 char tmpbuf[sizeof PL_tokenbuf * 4];
79072805
LW
3430
3431 if (!send) /* has to be an expression */
3432 return TRUE;
3433
3434 Zero(seen,256,char);
3435 if (*s == '$')
3436 weight -= 3;
3437 else if (isDIGIT(*s)) {
3438 if (s[1] != ']') {
3439 if (isDIGIT(s[1]) && s[2] == ']')
3440 weight -= 10;
3441 }
3442 else
3443 weight -= 100;
3444 }
3445 for (; s < send; s++) {
3446 last_un_char = un_char;
3447 un_char = (unsigned char)*s;
3448 switch (*s) {
3449 case '@':
3450 case '&':
3451 case '$':
3452 weight -= seen[un_char] * 10;
7e2040f0 3453 if (isALNUM_lazy_if(s+1,UTF)) {
90e5519e 3454 int len;
8903cb82 3455 scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
90e5519e
NC
3456 len = (int)strlen(tmpbuf);
3457 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PV))
79072805
LW
3458 weight -= 100;
3459 else
3460 weight -= 10;
3461 }
3462 else if (*s == '$' && s[1] &&
93a17b20
LW
3463 strchr("[#!%*<>()-=",s[1])) {
3464 if (/*{*/ strchr("])} =",s[2]))
79072805
LW
3465 weight -= 10;
3466 else
3467 weight -= 1;
3468 }
3469 break;
3470 case '\\':
3471 un_char = 254;
3472 if (s[1]) {
93a17b20 3473 if (strchr("wds]",s[1]))
79072805 3474 weight += 100;
10edeb5d 3475 else if (seen[(U8)'\''] || seen[(U8)'"'])
79072805 3476 weight += 1;
93a17b20 3477 else if (strchr("rnftbxcav",s[1]))
79072805
LW
3478 weight += 40;
3479 else if (isDIGIT(s[1])) {
3480 weight += 40;
3481 while (s[1] && isDIGIT(s[1]))
3482 s++;
3483 }
3484 }
3485 else
3486 weight += 100;
3487 break;
3488 case '-':
3489 if (s[1] == '\\')
3490 weight += 50;
93a17b20 3491 if (strchr("aA01! ",last_un_char))
79072805 3492 weight += 30;
93a17b20 3493 if (strchr("zZ79~",s[1]))
79072805 3494 weight += 30;
f27ffc4a
GS
3495 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
3496 weight -= 5; /* cope with negative subscript */
79072805
LW
3497 break;
3498 default:
3792a11b
NC
3499 if (!isALNUM(last_un_char)
3500 && !(last_un_char == '$' || last_un_char == '@'
3501 || last_un_char == '&')
3502 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
79072805
LW
3503 char *d = tmpbuf;
3504 while (isALPHA(*s))
3505 *d++ = *s++;
3506 *d = '\0';
5458a98a 3507 if (keyword(tmpbuf, d - tmpbuf, 0))
79072805
LW
3508 weight -= 150;
3509 }
3510 if (un_char == last_un_char + 1)
3511 weight += 5;
3512 weight -= seen[un_char];
3513 break;
3514 }
3515 seen[un_char]++;
3516 }
3517 if (weight >= 0) /* probably a character class */
3518 return FALSE;
3519 }
3520
3521 return TRUE;
3522}
ffed7fef 3523
ffb4593c
NT
3524/*
3525 * S_intuit_method
3526 *
3527 * Does all the checking to disambiguate
3528 * foo bar
3529 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
3530 * FUNCMETH (bar->foo(args)) or METHOD (bar->foo args).
3531 *
3532 * First argument is the stuff after the first token, e.g. "bar".
3533 *
3534 * Not a method if bar is a filehandle.
3535 * Not a method if foo is a subroutine prototyped to take a filehandle.
3536 * Not a method if it's really "Foo $bar"
3537 * Method if it's "foo $bar"
3538 * Not a method if it's really "print foo $bar"
3539 * Method if it's really "foo package::" (interpreted as package->foo)
8f8cf39c 3540 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
3cb0bbe5 3541 * Not a method if bar is a filehandle or package, but is quoted with
ffb4593c
NT
3542 * =>
3543 */
3544
76e3520e 3545STATIC int
62d55b22 3546S_intuit_method(pTHX_ char *start, GV *gv, CV *cv)
a0d0e21e 3547{
97aff369 3548 dVAR;
a0d0e21e 3549 char *s = start + (*start == '$');
3280af22 3550 char tmpbuf[sizeof PL_tokenbuf];
a0d0e21e
LW
3551 STRLEN len;
3552 GV* indirgv;
5db06880
NC
3553#ifdef PERL_MAD
3554 int soff;
3555#endif
a0d0e21e 3556
7918f24d
NC
3557 PERL_ARGS_ASSERT_INTUIT_METHOD;
3558
a0d0e21e 3559 if (gv) {
62d55b22 3560 if (SvTYPE(gv) == SVt_PVGV && GvIO(gv))
a0d0e21e 3561 return 0;
62d55b22
NC
3562 if (cv) {
3563 if (SvPOK(cv)) {
3564 const char *proto = SvPVX_const(cv);
3565 if (proto) {
3566 if (*proto == ';')
3567 proto++;
3568 if (*proto == '*')
3569 return 0;
3570 }
b6c543e3
IZ
3571 }
3572 } else
c35e046a 3573 gv = NULL;
a0d0e21e 3574 }
8903cb82 3575 s = scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
ffb4593c
NT
3576 /* start is the beginning of the possible filehandle/object,
3577 * and s is the end of it
3578 * tmpbuf is a copy of it
3579 */
3580
a0d0e21e 3581 if (*start == '$') {
3ef1310e
RGS
3582 if (gv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY ||
3583 isUPPER(*PL_tokenbuf))
a0d0e21e 3584 return 0;
5db06880
NC
3585#ifdef PERL_MAD
3586 len = start - SvPVX(PL_linestr);
3587#endif
29595ff2 3588 s = PEEKSPACE(s);
f0092767 3589#ifdef PERL_MAD
5db06880
NC
3590 start = SvPVX(PL_linestr) + len;
3591#endif
3280af22
NIS
3592 PL_bufptr = start;
3593 PL_expect = XREF;
a0d0e21e
LW
3594 return *s == '(' ? FUNCMETH : METHOD;
3595 }
5458a98a 3596 if (!keyword(tmpbuf, len, 0)) {
c3e0f903
GS
3597 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
3598 len -= 2;
3599 tmpbuf[len] = '\0';
5db06880
NC
3600#ifdef PERL_MAD
3601 soff = s - SvPVX(PL_linestr);
3602#endif
c3e0f903
GS
3603 goto bare_package;
3604 }
90e5519e 3605 indirgv = gv_fetchpvn_flags(tmpbuf, len, 0, SVt_PVCV);
8ebc5c01 3606 if (indirgv && GvCVu(indirgv))
a0d0e21e
LW
3607 return 0;
3608 /* filehandle or package name makes it a method */
da51bb9b 3609 if (!gv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, 0)) {
5db06880
NC
3610#ifdef PERL_MAD
3611 soff = s - SvPVX(PL_linestr);
3612#endif
29595ff2 3613 s = PEEKSPACE(s);
3280af22 3614 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
55497cff 3615 return 0; /* no assumptions -- "=>" quotes bearword */
c3e0f903 3616 bare_package:
cd81e915 3617 start_force(PL_curforce);
9ded7720 3618 NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0,
64142370 3619 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
9ded7720 3620 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
5db06880
NC
3621 if (PL_madskills)
3622 curmad('X', newSVpvn(start,SvPVX(PL_linestr) + soff - start));
3280af22 3623 PL_expect = XTERM;
a0d0e21e 3624 force_next(WORD);
3280af22 3625 PL_bufptr = s;
5db06880
NC
3626#ifdef PERL_MAD
3627 PL_bufptr = SvPVX(PL_linestr) + soff; /* restart before space */
3628#endif
a0d0e21e
LW
3629 return *s == '(' ? FUNCMETH : METHOD;
3630 }
3631 }
3632 return 0;
3633}
3634
16d20bd9 3635/* Encoded script support. filter_add() effectively inserts a
4e553d73 3636 * 'pre-processing' function into the current source input stream.
16d20bd9
AD
3637 * Note that the filter function only applies to the current source file
3638 * (e.g., it will not affect files 'require'd or 'use'd by this one).
3639 *
3640 * The datasv parameter (which may be NULL) can be used to pass
3641 * private data to this instance of the filter. The filter function
3642 * can recover the SV using the FILTER_DATA macro and use it to
3643 * store private buffers and state information.
3644 *
3645 * The supplied datasv parameter is upgraded to a PVIO type
4755096e 3646 * and the IoDIRP/IoANY field is used to store the function pointer,
e0c19803 3647 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
16d20bd9
AD
3648 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
3649 * private use must be set using malloc'd pointers.
3650 */
16d20bd9
AD
3651
3652SV *
864dbfa3 3653Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
16d20bd9 3654{
97aff369 3655 dVAR;
f4c556ac 3656 if (!funcp)
a0714e2c 3657 return NULL;
f4c556ac 3658
5486870f
DM
3659 if (!PL_parser)
3660 return NULL;
3661
3280af22
NIS
3662 if (!PL_rsfp_filters)
3663 PL_rsfp_filters = newAV();
16d20bd9 3664 if (!datasv)
561b68a9 3665 datasv = newSV(0);
862a34c6 3666 SvUPGRADE(datasv, SVt_PVIO);
8141890a 3667 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
e0c19803 3668 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
f4c556ac 3669 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
55662e27
JH
3670 FPTR2DPTR(void *, IoANY(datasv)),
3671 SvPV_nolen(datasv)));
3280af22
NIS
3672 av_unshift(PL_rsfp_filters, 1);
3673 av_store(PL_rsfp_filters, 0, datasv) ;
16d20bd9
AD
3674 return(datasv);
3675}
4e553d73 3676
16d20bd9
AD
3677
3678/* Delete most recently added instance of this filter function. */
a0d0e21e 3679void
864dbfa3 3680Perl_filter_del(pTHX_ filter_t funcp)
16d20bd9 3681{
97aff369 3682 dVAR;
e0c19803 3683 SV *datasv;
24801a4b 3684
7918f24d
NC
3685 PERL_ARGS_ASSERT_FILTER_DEL;
3686
33073adb 3687#ifdef DEBUGGING
55662e27
JH
3688 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
3689 FPTR2DPTR(void*, funcp)));
33073adb 3690#endif
5486870f 3691 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
16d20bd9
AD
3692 return;
3693 /* if filter is on top of stack (usual case) just pop it off */
e0c19803 3694 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
8141890a 3695 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
e0c19803 3696 IoFLAGS(datasv) &= ~IOf_FAKE_DIRP;
4755096e 3697 IoANY(datasv) = (void *)NULL;
3280af22 3698 sv_free(av_pop(PL_rsfp_filters));
e50aee73 3699
16d20bd9
AD
3700 return;
3701 }
3702 /* we need to search for the correct entry and clear it */
cea2e8a9 3703 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
16d20bd9
AD
3704}
3705
3706
1de9afcd
RGS
3707/* Invoke the idxth filter function for the current rsfp. */
3708/* maxlen 0 = read one text line */
16d20bd9 3709I32
864dbfa3 3710Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
a0d0e21e 3711{
97aff369 3712 dVAR;
16d20bd9
AD
3713 filter_t funcp;
3714 SV *datasv = NULL;
f482118e
NC
3715 /* This API is bad. It should have been using unsigned int for maxlen.
3716 Not sure if we want to change the API, but if not we should sanity
3717 check the value here. */
39cd7a59
NC
3718 const unsigned int correct_length
3719 = maxlen < 0 ?
3720#ifdef PERL_MICRO
3721 0x7FFFFFFF
3722#else
3723 INT_MAX
3724#endif
3725 : maxlen;
e50aee73 3726
7918f24d
NC
3727 PERL_ARGS_ASSERT_FILTER_READ;
3728
5486870f 3729 if (!PL_parser || !PL_rsfp_filters)
16d20bd9 3730 return -1;
1de9afcd 3731 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
16d20bd9
AD
3732 /* Provide a default input filter to make life easy. */
3733 /* Note that we append to the line. This is handy. */
f4c556ac
GS
3734 DEBUG_P(PerlIO_printf(Perl_debug_log,
3735 "filter_read %d: from rsfp\n", idx));
f482118e 3736 if (correct_length) {
16d20bd9
AD
3737 /* Want a block */
3738 int len ;
f54cb97a 3739 const int old_len = SvCUR(buf_sv);
16d20bd9
AD
3740
3741 /* ensure buf_sv is large enough */
881d8f0a 3742 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
f482118e
NC
3743 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
3744 correct_length)) <= 0) {
3280af22 3745 if (PerlIO_error(PL_rsfp))
37120919
AD
3746 return -1; /* error */
3747 else
3748 return 0 ; /* end of file */
3749 }
16d20bd9 3750 SvCUR_set(buf_sv, old_len + len) ;
881d8f0a 3751 SvPVX(buf_sv)[old_len + len] = '\0';
16d20bd9
AD
3752 } else {
3753 /* Want a line */
3280af22
NIS
3754 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
3755 if (PerlIO_error(PL_rsfp))
37120919
AD
3756 return -1; /* error */
3757 else
3758 return 0 ; /* end of file */
3759 }
16d20bd9
AD
3760 }
3761 return SvCUR(buf_sv);
3762 }
3763 /* Skip this filter slot if filter has been deleted */
1de9afcd 3764 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
f4c556ac
GS
3765 DEBUG_P(PerlIO_printf(Perl_debug_log,
3766 "filter_read %d: skipped (filter deleted)\n",