This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Change 27506 wasn't quite correct - the copy of the AMAGIC flag should
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
40d049e4
YO
120 regnode **open_parens; /* pointers to open parens */
121 regnode **close_parens; /* pointers to close parens */
122 regnode *opend; /* END node in program */
1aa99e6b 123 I32 utf8;
6bda09f9 124 HV *charnames; /* cache of named sequences */
81714fb9 125 HV *paren_names; /* Paren names */
40d049e4
YO
126 regnode **recurse; /* Recurse regops */
127 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
128#if ADD_TO_REGEXEC
129 char *starttry; /* -Dr: where regtry was called. */
130#define RExC_starttry (pRExC_state->starttry)
131#endif
3dab1dad 132#ifdef DEBUGGING
be8e71aa 133 const char *lastparse;
3dab1dad
YO
134 I32 lastnum;
135#define RExC_lastparse (pRExC_state->lastparse)
136#define RExC_lastnum (pRExC_state->lastnum)
137#endif
830247a4
IZ
138} RExC_state_t;
139
e2509266 140#define RExC_flags (pRExC_state->flags)
830247a4
IZ
141#define RExC_precomp (pRExC_state->precomp)
142#define RExC_rx (pRExC_state->rx)
fac92740 143#define RExC_start (pRExC_state->start)
830247a4
IZ
144#define RExC_end (pRExC_state->end)
145#define RExC_parse (pRExC_state->parse)
146#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 147#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 148#define RExC_emit (pRExC_state->emit)
fac92740 149#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
150#define RExC_naughty (pRExC_state->naughty)
151#define RExC_sawback (pRExC_state->sawback)
152#define RExC_seen (pRExC_state->seen)
153#define RExC_size (pRExC_state->size)
154#define RExC_npar (pRExC_state->npar)
155#define RExC_extralen (pRExC_state->extralen)
156#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
157#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 158#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 159#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
160#define RExC_open_parens (pRExC_state->open_parens)
161#define RExC_close_parens (pRExC_state->close_parens)
162#define RExC_opend (pRExC_state->opend)
81714fb9 163#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
164#define RExC_recurse (pRExC_state->recurse)
165#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 166
a687059c
LW
167#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
168#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
169 ((*s) == '{' && regcurly(s)))
a687059c 170
35c8bce7
LW
171#ifdef SPSTART
172#undef SPSTART /* dratted cpp namespace... */
173#endif
a687059c
LW
174/*
175 * Flags to be passed up and down.
176 */
a687059c 177#define WORST 0 /* Worst case. */
821b33a5 178#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
179#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
180#define SPSTART 0x4 /* Starts with * or +. */
181#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 182
3dab1dad
YO
183#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
184
07be1b83
YO
185/* whether trie related optimizations are enabled */
186#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
187#define TRIE_STUDY_OPT
786e8c11 188#define FULL_TRIE_STUDY
07be1b83
YO
189#define TRIE_STCLASS
190#endif
1de06328
YO
191
192
40d049e4
YO
193
194#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
195#define PBITVAL(paren) (1 << ((paren) & 7))
196#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
197#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
198#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
199
200
1de06328
YO
201/* About scan_data_t.
202
203 During optimisation we recurse through the regexp program performing
204 various inplace (keyhole style) optimisations. In addition study_chunk
205 and scan_commit populate this data structure with information about
206 what strings MUST appear in the pattern. We look for the longest
207 string that must appear for at a fixed location, and we look for the
208 longest string that may appear at a floating location. So for instance
209 in the pattern:
210
211 /FOO[xX]A.*B[xX]BAR/
212
213 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
214 strings (because they follow a .* construct). study_chunk will identify
215 both FOO and BAR as being the longest fixed and floating strings respectively.
216
217 The strings can be composites, for instance
218
219 /(f)(o)(o)/
220
221 will result in a composite fixed substring 'foo'.
222
223 For each string some basic information is maintained:
224
225 - offset or min_offset
226 This is the position the string must appear at, or not before.
227 It also implicitly (when combined with minlenp) tells us how many
228 character must match before the string we are searching.
229 Likewise when combined with minlenp and the length of the string
230 tells us how many characters must appear after the string we have
231 found.
232
233 - max_offset
234 Only used for floating strings. This is the rightmost point that
235 the string can appear at. Ifset to I32 max it indicates that the
236 string can occur infinitely far to the right.
237
238 - minlenp
239 A pointer to the minimum length of the pattern that the string
240 was found inside. This is important as in the case of positive
241 lookahead or positive lookbehind we can have multiple patterns
242 involved. Consider
243
244 /(?=FOO).*F/
245
246 The minimum length of the pattern overall is 3, the minimum length
247 of the lookahead part is 3, but the minimum length of the part that
248 will actually match is 1. So 'FOO's minimum length is 3, but the
249 minimum length for the F is 1. This is important as the minimum length
250 is used to determine offsets in front of and behind the string being
251 looked for. Since strings can be composites this is the length of the
252 pattern at the time it was commited with a scan_commit. Note that
253 the length is calculated by study_chunk, so that the minimum lengths
254 are not known until the full pattern has been compiled, thus the
255 pointer to the value.
256
257 - lookbehind
258
259 In the case of lookbehind the string being searched for can be
260 offset past the start point of the final matching string.
261 If this value was just blithely removed from the min_offset it would
262 invalidate some of the calculations for how many chars must match
263 before or after (as they are derived from min_offset and minlen and
264 the length of the string being searched for).
265 When the final pattern is compiled and the data is moved from the
266 scan_data_t structure into the regexp structure the information
267 about lookbehind is factored in, with the information that would
268 have been lost precalculated in the end_shift field for the
269 associated string.
270
271 The fields pos_min and pos_delta are used to store the minimum offset
272 and the delta to the maximum offset at the current point in the pattern.
273
274*/
2c2d71f5
JH
275
276typedef struct scan_data_t {
1de06328
YO
277 /*I32 len_min; unused */
278 /*I32 len_delta; unused */
2c2d71f5
JH
279 I32 pos_min;
280 I32 pos_delta;
281 SV *last_found;
1de06328 282 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
283 I32 last_start_min;
284 I32 last_start_max;
1de06328
YO
285 SV **longest; /* Either &l_fixed, or &l_float. */
286 SV *longest_fixed; /* longest fixed string found in pattern */
287 I32 offset_fixed; /* offset where it starts */
288 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
289 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
290 SV *longest_float; /* longest floating string found in pattern */
291 I32 offset_float_min; /* earliest point in string it can appear */
292 I32 offset_float_max; /* latest point in string it can appear */
293 I32 *minlen_float; /* pointer to the minlen relevent to the string */
294 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
295 I32 flags;
296 I32 whilem_c;
cb434fcc 297 I32 *last_closep;
653099ff 298 struct regnode_charclass_class *start_class;
2c2d71f5
JH
299} scan_data_t;
300
a687059c 301/*
e50aee73 302 * Forward declarations for pregcomp()'s friends.
a687059c 303 */
a0d0e21e 304
27da23d5 305static const scan_data_t zero_scan_data =
1de06328 306 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
307
308#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
309#define SF_BEFORE_SEOL 0x0001
310#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
311#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
312#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
313
09b7f37c
CB
314#ifdef NO_UNARY_PLUS
315# define SF_FIX_SHIFT_EOL (0+2)
316# define SF_FL_SHIFT_EOL (0+4)
317#else
318# define SF_FIX_SHIFT_EOL (+2)
319# define SF_FL_SHIFT_EOL (+4)
320#endif
c277df42
IZ
321
322#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
323#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
324
325#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
326#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
327#define SF_IS_INF 0x0040
328#define SF_HAS_PAR 0x0080
329#define SF_IN_PAR 0x0100
330#define SF_HAS_EVAL 0x0200
331#define SCF_DO_SUBSTR 0x0400
653099ff
GS
332#define SCF_DO_STCLASS_AND 0x0800
333#define SCF_DO_STCLASS_OR 0x1000
334#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 335#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 336
786e8c11
YO
337#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
338
07be1b83 339
eb160463 340#define UTF (RExC_utf8 != 0)
e2509266
JH
341#define LOC ((RExC_flags & PMf_LOCALE) != 0)
342#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 343
ffc61ed2 344#define OOB_UNICODE 12345678
93733859 345#define OOB_NAMEDCLASS -1
b8c5462f 346
a0ed51b3
LW
347#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
348#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
349
8615cb43 350
b45f050a
JF
351/* length of regex to show in messages that don't mark a position within */
352#define RegexLengthToShowInErrorMessages 127
353
354/*
355 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
356 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
357 * op/pragma/warn/regcomp.
358 */
7253e4e3
RK
359#define MARKER1 "<-- HERE" /* marker as it appears in the description */
360#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 361
7253e4e3 362#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
363
364/*
365 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
366 * arg. Show regex, up to a maximum length. If it's too long, chop and add
367 * "...".
368 */
ccb2c380 369#define FAIL(msg) STMT_START { \
bfed75c6 370 const char *ellipses = ""; \
ccb2c380
MP
371 IV len = RExC_end - RExC_precomp; \
372 \
373 if (!SIZE_ONLY) \
374 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
375 if (len > RegexLengthToShowInErrorMessages) { \
376 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
377 len = RegexLengthToShowInErrorMessages - 10; \
378 ellipses = "..."; \
379 } \
380 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
381 msg, (int)len, RExC_precomp, ellipses); \
382} STMT_END
8615cb43 383
b45f050a 384/*
b45f050a
JF
385 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
386 */
ccb2c380 387#define Simple_vFAIL(m) STMT_START { \
a28509cc 388 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
389 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
b45f050a
JF
392
393/*
394 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
395 */
ccb2c380
MP
396#define vFAIL(m) STMT_START { \
397 if (!SIZE_ONLY) \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
399 Simple_vFAIL(m); \
400} STMT_END
b45f050a
JF
401
402/*
403 * Like Simple_vFAIL(), but accepts two arguments.
404 */
ccb2c380 405#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
408 (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
413 */
ccb2c380
MP
414#define vFAIL2(m,a1) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL2(m, a1); \
418} STMT_END
b45f050a
JF
419
420
421/*
422 * Like Simple_vFAIL(), but accepts three arguments.
423 */
ccb2c380 424#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
432 */
ccb2c380
MP
433#define vFAIL3(m,a1,a2) STMT_START { \
434 if (!SIZE_ONLY) \
435 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
436 Simple_vFAIL3(m, a1, a2); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Like Simple_vFAIL(), but accepts four arguments.
441 */
ccb2c380 442#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a 447
ccb2c380 448#define vWARN(loc,m) STMT_START { \
a28509cc 449 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
450 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
451 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
453
454#define vWARNdep(loc,m) STMT_START { \
a28509cc 455 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
456 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
457 "%s" REPORT_LOCATION, \
458 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
460
461
462#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 463 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
464 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
465 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
466} STMT_END
467
468#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 469 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
470 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
471 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 475 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
476 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
477 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
9d1d55b5 485
8615cb43 486
cd439c50 487/* Allow for side effects in s */
ccb2c380
MP
488#define REGC(c,s) STMT_START { \
489 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
490} STMT_END
cd439c50 491
fac92740
MJD
492/* Macros for recording node offsets. 20001227 mjd@plover.com
493 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
494 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
495 * Element 0 holds the number n.
07be1b83 496 * Position is 1 indexed.
fac92740
MJD
497 */
498
ccb2c380
MP
499#define Set_Node_Offset_To_R(node,byte) STMT_START { \
500 if (! SIZE_ONLY) { \
501 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 502 __LINE__, (node), (int)(byte))); \
ccb2c380 503 if((node) < 0) { \
551405c4 504 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
505 } else { \
506 RExC_offsets[2*(node)-1] = (byte); \
507 } \
508 } \
509} STMT_END
510
511#define Set_Node_Offset(node,byte) \
512 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
513#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
514
515#define Set_Node_Length_To_R(node,len) STMT_START { \
516 if (! SIZE_ONLY) { \
517 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 518 __LINE__, (int)(node), (int)(len))); \
ccb2c380 519 if((node) < 0) { \
551405c4 520 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
521 } else { \
522 RExC_offsets[2*(node)] = (len); \
523 } \
524 } \
525} STMT_END
526
527#define Set_Node_Length(node,len) \
528 Set_Node_Length_To_R((node)-RExC_emit_start, len)
529#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
530#define Set_Node_Cur_Length(node) \
531 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
532
533/* Get offsets and lengths */
534#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
535#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
536
07be1b83
YO
537#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
538 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
539 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
540} STMT_END
541
542
543#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
544#define EXPERIMENTAL_INPLACESCAN
545#endif
546
1de06328 547#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 548DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
549 PerlIO_printf(Perl_debug_log, \
550 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
551 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
552 (int)(depth)*2, "", \
553 (IV)((data)->pos_min), \
554 (IV)((data)->pos_delta), \
555 (IV)((data)->flags), \
556 (IV)((data)->whilem_c), \
557 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
558 ); \
559 if ((data)->last_found) \
560 PerlIO_printf(Perl_debug_log, \
561 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
562 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
563 SvPVX_const((data)->last_found), \
564 (IV)((data)->last_end), \
565 (IV)((data)->last_start_min), \
566 (IV)((data)->last_start_max), \
567 ((data)->longest && \
568 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
569 SvPVX_const((data)->longest_fixed), \
570 (IV)((data)->offset_fixed), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_float)) ? "*" : "", \
573 SvPVX_const((data)->longest_float), \
574 (IV)((data)->offset_float_min), \
575 (IV)((data)->offset_float_max) \
576 ); \
577 PerlIO_printf(Perl_debug_log,"\n"); \
578});
579
acfe0abc 580static void clear_re(pTHX_ void *r);
4327152a 581
653099ff 582/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 583 Update the longest found anchored substring and the longest found
653099ff
GS
584 floating substrings if needed. */
585
4327152a 586STATIC void
1de06328 587S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 588{
e1ec3a88
AL
589 const STRLEN l = CHR_SVLEN(data->last_found);
590 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 591 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 592
c277df42 593 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 594 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
595 if (*data->longest == data->longest_fixed) {
596 data->offset_fixed = l ? data->last_start_min : data->pos_min;
597 if (data->flags & SF_BEFORE_EOL)
b81d288d 598 data->flags
c277df42
IZ
599 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
600 else
601 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
602 data->minlen_fixed=minlenp;
603 data->lookbehind_fixed=0;
a0ed51b3
LW
604 }
605 else {
c277df42 606 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
607 data->offset_float_max = (l
608 ? data->last_start_max
c277df42 609 : data->pos_min + data->pos_delta);
9051bda5
HS
610 if ((U32)data->offset_float_max > (U32)I32_MAX)
611 data->offset_float_max = I32_MAX;
c277df42 612 if (data->flags & SF_BEFORE_EOL)
b81d288d 613 data->flags
c277df42
IZ
614 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
615 else
616 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
617 data->minlen_float=minlenp;
618 data->lookbehind_float=0;
c277df42
IZ
619 }
620 }
621 SvCUR_set(data->last_found, 0);
0eda9292 622 {
a28509cc 623 SV * const sv = data->last_found;
097eb12c
AL
624 if (SvUTF8(sv) && SvMAGICAL(sv)) {
625 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
626 if (mg)
627 mg->mg_len = 0;
628 }
0eda9292 629 }
c277df42
IZ
630 data->last_end = -1;
631 data->flags &= ~SF_BEFORE_EOL;
1de06328 632 DEBUG_STUDYDATA(data,0);
c277df42
IZ
633}
634
653099ff
GS
635/* Can match anything (initialization) */
636STATIC void
097eb12c 637S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 638{
653099ff 639 ANYOF_CLASS_ZERO(cl);
f8bef550 640 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 641 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
642 if (LOC)
643 cl->flags |= ANYOF_LOCALE;
644}
645
646/* Can match anything (initialization) */
647STATIC int
5f66b61c 648S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
649{
650 int value;
651
aaa51d5e 652 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
653 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
654 return 1;
1aa99e6b
IH
655 if (!(cl->flags & ANYOF_UNICODE_ALL))
656 return 0;
10edeb5d 657 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 658 return 0;
653099ff
GS
659 return 1;
660}
661
662/* Can match anything (initialization) */
663STATIC void
097eb12c 664S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 665{
8ecf7187 666 Zero(cl, 1, struct regnode_charclass_class);
653099ff 667 cl->type = ANYOF;
830247a4 668 cl_anything(pRExC_state, cl);
653099ff
GS
669}
670
671STATIC void
097eb12c 672S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 673{
8ecf7187 674 Zero(cl, 1, struct regnode_charclass_class);
653099ff 675 cl->type = ANYOF;
830247a4 676 cl_anything(pRExC_state, cl);
653099ff
GS
677 if (LOC)
678 cl->flags |= ANYOF_LOCALE;
679}
680
681/* 'And' a given class with another one. Can create false positives */
682/* We assume that cl is not inverted */
683STATIC void
5f66b61c 684S_cl_and(struct regnode_charclass_class *cl,
a28509cc 685 const struct regnode_charclass_class *and_with)
653099ff 686{
40d049e4
YO
687
688 assert(and_with->type == ANYOF);
653099ff
GS
689 if (!(and_with->flags & ANYOF_CLASS)
690 && !(cl->flags & ANYOF_CLASS)
691 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
692 && !(and_with->flags & ANYOF_FOLD)
693 && !(cl->flags & ANYOF_FOLD)) {
694 int i;
695
696 if (and_with->flags & ANYOF_INVERT)
697 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
698 cl->bitmap[i] &= ~and_with->bitmap[i];
699 else
700 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
701 cl->bitmap[i] &= and_with->bitmap[i];
702 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
703 if (!(and_with->flags & ANYOF_EOS))
704 cl->flags &= ~ANYOF_EOS;
1aa99e6b 705
14ebb1a2
JH
706 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
707 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
708 cl->flags &= ~ANYOF_UNICODE_ALL;
709 cl->flags |= ANYOF_UNICODE;
710 ARG_SET(cl, ARG(and_with));
711 }
14ebb1a2
JH
712 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
713 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 714 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
715 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
716 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 717 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
718}
719
720/* 'OR' a given class with another one. Can create false positives */
721/* We assume that cl is not inverted */
722STATIC void
097eb12c 723S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 724{
653099ff
GS
725 if (or_with->flags & ANYOF_INVERT) {
726 /* We do not use
727 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
728 * <= (B1 | !B2) | (CL1 | !CL2)
729 * which is wasteful if CL2 is small, but we ignore CL2:
730 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
731 * XXXX Can we handle case-fold? Unclear:
732 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
733 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
734 */
735 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
736 && !(or_with->flags & ANYOF_FOLD)
737 && !(cl->flags & ANYOF_FOLD) ) {
738 int i;
739
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] |= ~or_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise */
743 else {
830247a4 744 cl_anything(pRExC_state, cl);
653099ff
GS
745 }
746 } else {
747 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 749 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
750 || (cl->flags & ANYOF_FOLD)) ) {
751 int i;
752
753 /* OR char bitmap and class bitmap separately */
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] |= or_with->bitmap[i];
756 if (or_with->flags & ANYOF_CLASS) {
757 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
758 cl->classflags[i] |= or_with->classflags[i];
759 cl->flags |= ANYOF_CLASS;
760 }
761 }
762 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 763 cl_anything(pRExC_state, cl);
653099ff
GS
764 }
765 }
766 if (or_with->flags & ANYOF_EOS)
767 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
768
769 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
770 ARG(cl) != ARG(or_with)) {
771 cl->flags |= ANYOF_UNICODE_ALL;
772 cl->flags &= ~ANYOF_UNICODE;
773 }
774 if (or_with->flags & ANYOF_UNICODE_ALL) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
653099ff
GS
778}
779
a3621e74
YO
780#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784
3dab1dad
YO
785
786#ifdef DEBUGGING
07be1b83 787/*
3dab1dad
YO
788 dump_trie(trie)
789 dump_trie_interim_list(trie,next_alloc)
790 dump_trie_interim_table(trie,next_alloc)
791
792 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
793 The _interim_ variants are used for debugging the interim
794 tables that are used to generate the final compressed
795 representation which is what dump_trie expects.
796
3dab1dad
YO
797 Part of the reason for their existance is to provide a form
798 of documentation as to how the different representations function.
07be1b83
YO
799
800*/
3dab1dad
YO
801
802/*
803 dump_trie(trie)
804 Dumps the final compressed table form of the trie to Perl_debug_log.
805 Used for debugging make_trie().
806*/
807
808STATIC void
809S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
810{
811 U32 state;
ab3bbdeb
YO
812 SV *sv=sv_newmortal();
813 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
814 GET_RE_DEBUG_FLAGS_DECL;
815
ab3bbdeb 816
3dab1dad
YO
817 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
818 (int)depth * 2 + 2,"",
819 "Match","Base","Ofs" );
820
821 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 822 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 823 if ( tmp ) {
ab3bbdeb
YO
824 PerlIO_printf( Perl_debug_log, "%*s",
825 colwidth,
ddc5bc0f 826 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
827 PL_colors[0], PL_colors[1],
828 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
829 PERL_PV_ESCAPE_FIRSTCHAR
830 )
831 );
3dab1dad
YO
832 }
833 }
834 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
835 (int)depth * 2 + 2,"");
836
837 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 838 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
839 PerlIO_printf( Perl_debug_log, "\n");
840
1e2e3d02 841 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 842 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
843
844 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
845
846 if ( trie->states[ state ].wordnum ) {
847 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
848 } else {
849 PerlIO_printf( Perl_debug_log, "%6s", "" );
850 }
851
852 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
853
854 if ( base ) {
855 U32 ofs = 0;
856
857 while( ( base + ofs < trie->uniquecharcount ) ||
858 ( base + ofs - trie->uniquecharcount < trie->lasttrans
859 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
860 ofs++;
861
862 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
863
864 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
865 if ( ( base + ofs >= trie->uniquecharcount ) &&
866 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
867 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
868 {
ab3bbdeb
YO
869 PerlIO_printf( Perl_debug_log, "%*"UVXf,
870 colwidth,
3dab1dad
YO
871 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
872 } else {
ab3bbdeb 873 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
874 }
875 }
876
877 PerlIO_printf( Perl_debug_log, "]");
878
879 }
880 PerlIO_printf( Perl_debug_log, "\n" );
881 }
882}
883/*
884 dump_trie_interim_list(trie,next_alloc)
885 Dumps a fully constructed but uncompressed trie in list form.
886 List tries normally only are used for construction when the number of
887 possible chars (trie->uniquecharcount) is very high.
888 Used for debugging make_trie().
889*/
890STATIC void
891S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
892{
893 U32 state;
ab3bbdeb
YO
894 SV *sv=sv_newmortal();
895 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
896 GET_RE_DEBUG_FLAGS_DECL;
897 /* print out the table precompression. */
ab3bbdeb
YO
898 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
899 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
900 "------:-----+-----------------\n" );
3dab1dad
YO
901
902 for( state=1 ; state < next_alloc ; state ++ ) {
903 U16 charid;
904
ab3bbdeb 905 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
906 (int)depth * 2 + 2,"", (UV)state );
907 if ( ! trie->states[ state ].wordnum ) {
908 PerlIO_printf( Perl_debug_log, "%5s| ","");
909 } else {
910 PerlIO_printf( Perl_debug_log, "W%4x| ",
911 trie->states[ state ].wordnum
912 );
913 }
914 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 915 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
916 if ( tmp ) {
917 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
918 colwidth,
ddc5bc0f 919 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
920 PL_colors[0], PL_colors[1],
921 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
922 PERL_PV_ESCAPE_FIRSTCHAR
923 ) ,
1e2e3d02
YO
924 TRIE_LIST_ITEM(state,charid).forid,
925 (UV)TRIE_LIST_ITEM(state,charid).newstate
926 );
927 if (!(charid % 10))
664e119d
RGS
928 PerlIO_printf(Perl_debug_log, "\n%*s| ",
929 (int)((depth * 2) + 14), "");
1e2e3d02 930 }
ab3bbdeb
YO
931 }
932 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
933 }
934}
935
936/*
937 dump_trie_interim_table(trie,next_alloc)
938 Dumps a fully constructed but uncompressed trie in table form.
939 This is the normal DFA style state transition table, with a few
940 twists to facilitate compression later.
941 Used for debugging make_trie().
942*/
943STATIC void
944S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
945{
946 U32 state;
947 U16 charid;
ab3bbdeb
YO
948 SV *sv=sv_newmortal();
949 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
950 GET_RE_DEBUG_FLAGS_DECL;
951
952 /*
953 print out the table precompression so that we can do a visual check
954 that they are identical.
955 */
956
957 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
958
959 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 960 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 961 if ( tmp ) {
ab3bbdeb
YO
962 PerlIO_printf( Perl_debug_log, "%*s",
963 colwidth,
ddc5bc0f 964 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
965 PL_colors[0], PL_colors[1],
966 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
967 PERL_PV_ESCAPE_FIRSTCHAR
968 )
969 );
3dab1dad
YO
970 }
971 }
972
973 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
974
975 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 976 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
977 }
978
979 PerlIO_printf( Perl_debug_log, "\n" );
980
981 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
982
983 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
984 (int)depth * 2 + 2,"",
985 (UV)TRIE_NODENUM( state ) );
986
987 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
988 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
989 if (v)
990 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
991 else
992 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
993 }
994 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
995 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
996 } else {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
998 trie->states[ TRIE_NODENUM( state ) ].wordnum );
999 }
1000 }
07be1b83 1001}
3dab1dad
YO
1002
1003#endif
1004
786e8c11
YO
1005/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1006 startbranch: the first branch in the whole branch sequence
1007 first : start branch of sequence of branch-exact nodes.
1008 May be the same as startbranch
1009 last : Thing following the last branch.
1010 May be the same as tail.
1011 tail : item following the branch sequence
1012 count : words in the sequence
1013 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1014 depth : indent depth
3dab1dad 1015
786e8c11 1016Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1017
786e8c11
YO
1018A trie is an N'ary tree where the branches are determined by digital
1019decomposition of the key. IE, at the root node you look up the 1st character and
1020follow that branch repeat until you find the end of the branches. Nodes can be
1021marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1022
786e8c11 1023 /he|she|his|hers/
72f13be8 1024
786e8c11
YO
1025would convert into the following structure. Numbers represent states, letters
1026following numbers represent valid transitions on the letter from that state, if
1027the number is in square brackets it represents an accepting state, otherwise it
1028will be in parenthesis.
07be1b83 1029
786e8c11
YO
1030 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1031 | |
1032 | (2)
1033 | |
1034 (1) +-i->(6)-+-s->[7]
1035 |
1036 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1037
786e8c11
YO
1038 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1039
1040This shows that when matching against the string 'hers' we will begin at state 1
1041read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1042then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1043is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1044single traverse. We store a mapping from accepting to state to which word was
1045matched, and then when we have multiple possibilities we try to complete the
1046rest of the regex in the order in which they occured in the alternation.
1047
1048The only prior NFA like behaviour that would be changed by the TRIE support is
1049the silent ignoring of duplicate alternations which are of the form:
1050
1051 / (DUPE|DUPE) X? (?{ ... }) Y /x
1052
1053Thus EVAL blocks follwing a trie may be called a different number of times with
1054and without the optimisation. With the optimisations dupes will be silently
1055ignored. This inconsistant behaviour of EVAL type nodes is well established as
1056the following demonstrates:
1057
1058 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1059
1060which prints out 'word' three times, but
1061
1062 'words'=~/(word|word|word)(?{ print $1 })S/
1063
1064which doesnt print it out at all. This is due to other optimisations kicking in.
1065
1066Example of what happens on a structural level:
1067
1068The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1069
1070 1: CURLYM[1] {1,32767}(18)
1071 5: BRANCH(8)
1072 6: EXACT <ac>(16)
1073 8: BRANCH(11)
1074 9: EXACT <ad>(16)
1075 11: BRANCH(14)
1076 12: EXACT <ab>(16)
1077 16: SUCCEED(0)
1078 17: NOTHING(18)
1079 18: END(0)
1080
1081This would be optimizable with startbranch=5, first=5, last=16, tail=16
1082and should turn into:
1083
1084 1: CURLYM[1] {1,32767}(18)
1085 5: TRIE(16)
1086 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1087 <ac>
1088 <ad>
1089 <ab>
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094Cases where tail != last would be like /(?foo|bar)baz/:
1095
1096 1: BRANCH(4)
1097 2: EXACT <foo>(8)
1098 4: BRANCH(7)
1099 5: EXACT <bar>(8)
1100 7: TAIL(8)
1101 8: EXACT <baz>(10)
1102 10: END(0)
1103
1104which would be optimizable with startbranch=1, first=1, last=7, tail=8
1105and would end up looking like:
1106
1107 1: TRIE(8)
1108 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1109 <foo>
1110 <bar>
1111 7: TAIL(8)
1112 8: EXACT <baz>(10)
1113 10: END(0)
1114
1115 d = uvuni_to_utf8_flags(d, uv, 0);
1116
1117is the recommended Unicode-aware way of saying
1118
1119 *(d++) = uv;
1120*/
1121
1e2e3d02 1122#define TRIE_STORE_REVCHAR \
786e8c11 1123 STMT_START { \
1e2e3d02 1124 SV *tmp = newSVpvs(""); \
786e8c11 1125 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1126 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1127 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1128 } STMT_END
1129
1130#define TRIE_READ_CHAR STMT_START { \
1131 wordlen++; \
1132 if ( UTF ) { \
1133 if ( folder ) { \
1134 if ( foldlen > 0 ) { \
1135 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1136 foldlen -= len; \
1137 scan += len; \
1138 len = 0; \
1139 } else { \
1140 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1141 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1142 foldlen -= UNISKIP( uvc ); \
1143 scan = foldbuf + UNISKIP( uvc ); \
1144 } \
1145 } else { \
1146 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1147 } \
1148 } else { \
1149 uvc = (U32)*uc; \
1150 len = 1; \
1151 } \
1152} STMT_END
1153
1154
1155
1156#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1157 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1158 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1159 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1160 } \
1161 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1162 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1163 TRIE_LIST_CUR( state )++; \
1164} STMT_END
07be1b83 1165
786e8c11
YO
1166#define TRIE_LIST_NEW(state) STMT_START { \
1167 Newxz( trie->states[ state ].trans.list, \
1168 4, reg_trie_trans_le ); \
1169 TRIE_LIST_CUR( state ) = 1; \
1170 TRIE_LIST_LEN( state ) = 4; \
1171} STMT_END
07be1b83 1172
786e8c11
YO
1173#define TRIE_HANDLE_WORD(state) STMT_START { \
1174 U16 dupe= trie->states[ state ].wordnum; \
1175 regnode * const noper_next = regnext( noper ); \
1176 \
1177 if (trie->wordlen) \
1178 trie->wordlen[ curword ] = wordlen; \
1179 DEBUG_r({ \
1180 /* store the word for dumping */ \
1181 SV* tmp; \
1182 if (OP(noper) != NOTHING) \
1183 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1184 else \
1185 tmp = newSVpvn( "", 0 ); \
1186 if ( UTF ) SvUTF8_on( tmp ); \
1187 av_push( trie->words, tmp ); \
1188 }); \
1189 \
1190 curword++; \
1191 \
1192 if ( noper_next < tail ) { \
1193 if (!trie->jump) \
1194 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1195 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1196 if (!jumper) \
1197 jumper = noper_next; \
1198 if (!nextbranch) \
1199 nextbranch= regnext(cur); \
1200 } \
1201 \
1202 if ( dupe ) { \
1203 /* So it's a dupe. This means we need to maintain a */\
1204 /* linked-list from the first to the next. */\
1205 /* we only allocate the nextword buffer when there */\
1206 /* a dupe, so first time we have to do the allocation */\
1207 if (!trie->nextword) \
1208 Newxz( trie->nextword, word_count + 1, U16); \
1209 while ( trie->nextword[dupe] ) \
1210 dupe= trie->nextword[dupe]; \
1211 trie->nextword[dupe]= curword; \
1212 } else { \
1213 /* we haven't inserted this word yet. */ \
1214 trie->states[ state ].wordnum = curword; \
1215 } \
1216} STMT_END
07be1b83 1217
3dab1dad 1218
786e8c11
YO
1219#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1220 ( ( base + charid >= ucharcount \
1221 && base + charid < ubound \
1222 && state == trie->trans[ base - ucharcount + charid ].check \
1223 && trie->trans[ base - ucharcount + charid ].next ) \
1224 ? trie->trans[ base - ucharcount + charid ].next \
1225 : ( state==1 ? special : 0 ) \
1226 )
3dab1dad 1227
786e8c11
YO
1228#define MADE_TRIE 1
1229#define MADE_JUMP_TRIE 2
1230#define MADE_EXACT_TRIE 4
3dab1dad 1231
a3621e74 1232STATIC I32
786e8c11 1233S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1234{
27da23d5 1235 dVAR;
a3621e74
YO
1236 /* first pass, loop through and scan words */
1237 reg_trie_data *trie;
1238 regnode *cur;
9f7f3913 1239 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1240 STRLEN len = 0;
1241 UV uvc = 0;
1242 U16 curword = 0;
1243 U32 next_alloc = 0;
786e8c11
YO
1244 regnode *jumper = NULL;
1245 regnode *nextbranch = NULL;
7f69552c 1246 regnode *convert = NULL;
a3621e74 1247 /* we just use folder as a flag in utf8 */
e1ec3a88 1248 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1249 ? PL_fold
1250 : ( flags == EXACTFL
1251 ? PL_fold_locale
1252 : NULL
1253 )
1254 );
1255
e1ec3a88 1256 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1257 SV *re_trie_maxbuff;
3dab1dad
YO
1258#ifndef DEBUGGING
1259 /* these are only used during construction but are useful during
8e11feef 1260 * debugging so we store them in the struct when debugging.
8e11feef 1261 */
3dab1dad 1262 STRLEN trie_charcount=0;
3dab1dad
YO
1263 AV *trie_revcharmap;
1264#endif
a3621e74 1265 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1266#ifndef DEBUGGING
1267 PERL_UNUSED_ARG(depth);
1268#endif
a3621e74 1269
a02a5408 1270 Newxz( trie, 1, reg_trie_data );
a3621e74 1271 trie->refcount = 1;
3dab1dad 1272 trie->startstate = 1;
786e8c11 1273 trie->wordcount = word_count;
a3621e74 1274 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1275 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1276 if (!(UTF && folder))
1277 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1278 DEBUG_r({
1279 trie->words = newAV();
a3621e74 1280 });
3dab1dad 1281 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1282
0111c4fd 1283 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1284 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1285 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1286 }
3dab1dad
YO
1287 DEBUG_OPTIMISE_r({
1288 PerlIO_printf( Perl_debug_log,
786e8c11 1289 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1290 (int)depth * 2 + 2, "",
1291 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1292 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1293 (int)depth);
3dab1dad 1294 });
7f69552c
YO
1295
1296 /* Find the node we are going to overwrite */
1297 if ( first == startbranch && OP( last ) != BRANCH ) {
1298 /* whole branch chain */
1299 convert = first;
1300 } else {
1301 /* branch sub-chain */
1302 convert = NEXTOPER( first );
1303 }
1304
a3621e74
YO
1305 /* -- First loop and Setup --
1306
1307 We first traverse the branches and scan each word to determine if it
1308 contains widechars, and how many unique chars there are, this is
1309 important as we have to build a table with at least as many columns as we
1310 have unique chars.
1311
1312 We use an array of integers to represent the character codes 0..255
1313 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1314 native representation of the character value as the key and IV's for the
1315 coded index.
1316
1317 *TODO* If we keep track of how many times each character is used we can
1318 remap the columns so that the table compression later on is more
1319 efficient in terms of memory by ensuring most common value is in the
1320 middle and the least common are on the outside. IMO this would be better
1321 than a most to least common mapping as theres a decent chance the most
1322 common letter will share a node with the least common, meaning the node
1323 will not be compressable. With a middle is most common approach the worst
1324 case is when we have the least common nodes twice.
1325
1326 */
1327
a3621e74 1328 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1329 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1330 const U8 *uc = (U8*)STRING( noper );
a28509cc 1331 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1332 STRLEN foldlen = 0;
1333 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1334 const U8 *scan = (U8*)NULL;
07be1b83 1335 U32 wordlen = 0; /* required init */
3dab1dad 1336 STRLEN chars=0;
a3621e74 1337
3dab1dad
YO
1338 if (OP(noper) == NOTHING) {
1339 trie->minlen= 0;
1340 continue;
1341 }
1342 if (trie->bitmap) {
1343 TRIE_BITMAP_SET(trie,*uc);
1344 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1345 }
a3621e74 1346 for ( ; uc < e ; uc += len ) {
3dab1dad 1347 TRIE_CHARCOUNT(trie)++;
a3621e74 1348 TRIE_READ_CHAR;
3dab1dad 1349 chars++;
a3621e74
YO
1350 if ( uvc < 256 ) {
1351 if ( !trie->charmap[ uvc ] ) {
1352 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1353 if ( folder )
1354 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1355 TRIE_STORE_REVCHAR;
a3621e74
YO
1356 }
1357 } else {
1358 SV** svpp;
1359 if ( !trie->widecharmap )
1360 trie->widecharmap = newHV();
1361
1362 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1363
1364 if ( !svpp )
e4584336 1365 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1366
1367 if ( !SvTRUE( *svpp ) ) {
1368 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 }
1372 }
3dab1dad
YO
1373 if( cur == first ) {
1374 trie->minlen=chars;
1375 trie->maxlen=chars;
1376 } else if (chars < trie->minlen) {
1377 trie->minlen=chars;
1378 } else if (chars > trie->maxlen) {
1379 trie->maxlen=chars;
1380 }
1381
a3621e74
YO
1382 } /* end first pass */
1383 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1384 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1385 (int)depth * 2 + 2,"",
85c3142d 1386 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1387 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1388 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1389 );
786e8c11 1390 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1391
1392 /*
1393 We now know what we are dealing with in terms of unique chars and
1394 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1395 representation using a flat table will take. If it's over a reasonable
1396 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1397 conservative but potentially much slower representation using an array
1398 of lists.
1399
1400 At the end we convert both representations into the same compressed
1401 form that will be used in regexec.c for matching with. The latter
1402 is a form that cannot be used to construct with but has memory
1403 properties similar to the list form and access properties similar
1404 to the table form making it both suitable for fast searches and
1405 small enough that its feasable to store for the duration of a program.
1406
1407 See the comment in the code where the compressed table is produced
1408 inplace from the flat tabe representation for an explanation of how
1409 the compression works.
1410
1411 */
1412
1413
3dab1dad 1414 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1415 /*
1416 Second Pass -- Array Of Lists Representation
1417
1418 Each state will be represented by a list of charid:state records
1419 (reg_trie_trans_le) the first such element holds the CUR and LEN
1420 points of the allocated array. (See defines above).
1421
1422 We build the initial structure using the lists, and then convert
1423 it into the compressed table form which allows faster lookups
1424 (but cant be modified once converted).
a3621e74
YO
1425 */
1426
a3621e74
YO
1427 STRLEN transcount = 1;
1428
1e2e3d02
YO
1429 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1430 "%*sCompiling trie using list compiler\n",
1431 (int)depth * 2 + 2, ""));
1432
3dab1dad 1433 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1434 TRIE_LIST_NEW(1);
1435 next_alloc = 2;
1436
1437 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1438
c445ea15
AL
1439 regnode * const noper = NEXTOPER( cur );
1440 U8 *uc = (U8*)STRING( noper );
1441 const U8 * const e = uc + STR_LEN( noper );
1442 U32 state = 1; /* required init */
1443 U16 charid = 0; /* sanity init */
1444 U8 *scan = (U8*)NULL; /* sanity init */
1445 STRLEN foldlen = 0; /* required init */
07be1b83 1446 U32 wordlen = 0; /* required init */
c445ea15
AL
1447 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1448
3dab1dad 1449 if (OP(noper) != NOTHING) {
786e8c11 1450 for ( ; uc < e ; uc += len ) {
c445ea15 1451
786e8c11 1452 TRIE_READ_CHAR;
c445ea15 1453
786e8c11
YO
1454 if ( uvc < 256 ) {
1455 charid = trie->charmap[ uvc ];
c445ea15 1456 } else {
786e8c11
YO
1457 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1458 if ( !svpp ) {
1459 charid = 0;
1460 } else {
1461 charid=(U16)SvIV( *svpp );
1462 }
c445ea15 1463 }
786e8c11
YO
1464 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1465 if ( charid ) {
a3621e74 1466
786e8c11
YO
1467 U16 check;
1468 U32 newstate = 0;
a3621e74 1469
786e8c11
YO
1470 charid--;
1471 if ( !trie->states[ state ].trans.list ) {
1472 TRIE_LIST_NEW( state );
c445ea15 1473 }
786e8c11
YO
1474 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1475 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1476 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1477 break;
1478 }
1479 }
1480 if ( ! newstate ) {
1481 newstate = next_alloc++;
1482 TRIE_LIST_PUSH( state, charid, newstate );
1483 transcount++;
1484 }
1485 state = newstate;
1486 } else {
1487 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1488 }
a28509cc 1489 }
c445ea15 1490 }
3dab1dad 1491 TRIE_HANDLE_WORD(state);
a3621e74
YO
1492
1493 } /* end second pass */
1494
1e2e3d02
YO
1495 /* next alloc is the NEXT state to be allocated */
1496 trie->statecount = next_alloc;
a3621e74
YO
1497 Renew( trie->states, next_alloc, reg_trie_state );
1498
3dab1dad
YO
1499 /* and now dump it out before we compress it */
1500 DEBUG_TRIE_COMPILE_MORE_r(
1501 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1502 );
a3621e74 1503
a02a5408 1504 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1505 {
1506 U32 state;
a3621e74
YO
1507 U32 tp = 0;
1508 U32 zp = 0;
1509
1510
1511 for( state=1 ; state < next_alloc ; state ++ ) {
1512 U32 base=0;
1513
1514 /*
1515 DEBUG_TRIE_COMPILE_MORE_r(
1516 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1517 );
1518 */
1519
1520 if (trie->states[state].trans.list) {
1521 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1522 U16 maxid=minid;
a28509cc 1523 U16 idx;
a3621e74
YO
1524
1525 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1526 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1527 if ( forid < minid ) {
1528 minid=forid;
1529 } else if ( forid > maxid ) {
1530 maxid=forid;
1531 }
a3621e74
YO
1532 }
1533 if ( transcount < tp + maxid - minid + 1) {
1534 transcount *= 2;
1535 Renew( trie->trans, transcount, reg_trie_trans );
1536 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1537 }
1538 base = trie->uniquecharcount + tp - minid;
1539 if ( maxid == minid ) {
1540 U32 set = 0;
1541 for ( ; zp < tp ; zp++ ) {
1542 if ( ! trie->trans[ zp ].next ) {
1543 base = trie->uniquecharcount + zp - minid;
1544 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1545 trie->trans[ zp ].check = state;
1546 set = 1;
1547 break;
1548 }
1549 }
1550 if ( !set ) {
1551 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1552 trie->trans[ tp ].check = state;
1553 tp++;
1554 zp = tp;
1555 }
1556 } else {
1557 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1558 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1559 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1560 trie->trans[ tid ].check = state;
1561 }
1562 tp += ( maxid - minid + 1 );
1563 }
1564 Safefree(trie->states[ state ].trans.list);
1565 }
1566 /*
1567 DEBUG_TRIE_COMPILE_MORE_r(
1568 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1569 );
1570 */
1571 trie->states[ state ].trans.base=base;
1572 }
cc601c31 1573 trie->lasttrans = tp + 1;
a3621e74
YO
1574 }
1575 } else {
1576 /*
1577 Second Pass -- Flat Table Representation.
1578
1579 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1580 We know that we will need Charcount+1 trans at most to store the data
1581 (one row per char at worst case) So we preallocate both structures
1582 assuming worst case.
1583
1584 We then construct the trie using only the .next slots of the entry
1585 structs.
1586
1587 We use the .check field of the first entry of the node temporarily to
1588 make compression both faster and easier by keeping track of how many non
1589 zero fields are in the node.
1590
1591 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1592 transition.
1593
1594 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1595 number representing the first entry of the node, and state as a
1596 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1597 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1598 are 2 entrys per node. eg:
1599
1600 A B A B
1601 1. 2 4 1. 3 7
1602 2. 0 3 3. 0 5
1603 3. 0 0 5. 0 0
1604 4. 0 0 7. 0 0
1605
1606 The table is internally in the right hand, idx form. However as we also
1607 have to deal with the states array which is indexed by nodenum we have to
1608 use TRIE_NODENUM() to convert.
1609
1610 */
1e2e3d02
YO
1611 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1612 "%*sCompiling trie using table compiler\n",
1613 (int)depth * 2 + 2, ""));
3dab1dad
YO
1614
1615 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1616 reg_trie_trans );
3dab1dad 1617 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1618 next_alloc = trie->uniquecharcount + 1;
1619
3dab1dad 1620
a3621e74
YO
1621 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1622
c445ea15 1623 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1624 const U8 *uc = (U8*)STRING( noper );
1625 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1626
1627 U32 state = 1; /* required init */
1628
1629 U16 charid = 0; /* sanity init */
1630 U32 accept_state = 0; /* sanity init */
1631 U8 *scan = (U8*)NULL; /* sanity init */
1632
1633 STRLEN foldlen = 0; /* required init */
07be1b83 1634 U32 wordlen = 0; /* required init */
a3621e74
YO
1635 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1636
3dab1dad 1637 if ( OP(noper) != NOTHING ) {
786e8c11 1638 for ( ; uc < e ; uc += len ) {
a3621e74 1639
786e8c11 1640 TRIE_READ_CHAR;
a3621e74 1641
786e8c11
YO
1642 if ( uvc < 256 ) {
1643 charid = trie->charmap[ uvc ];
1644 } else {
1645 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1646 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1647 }
786e8c11
YO
1648 if ( charid ) {
1649 charid--;
1650 if ( !trie->trans[ state + charid ].next ) {
1651 trie->trans[ state + charid ].next = next_alloc;
1652 trie->trans[ state ].check++;
1653 next_alloc += trie->uniquecharcount;
1654 }
1655 state = trie->trans[ state + charid ].next;
1656 } else {
1657 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1658 }
1659 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1660 }
a3621e74 1661 }
3dab1dad
YO
1662 accept_state = TRIE_NODENUM( state );
1663 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1664
1665 } /* end second pass */
1666
3dab1dad
YO
1667 /* and now dump it out before we compress it */
1668 DEBUG_TRIE_COMPILE_MORE_r(
1669 dump_trie_interim_table(trie,next_alloc,depth+1)
1670 );
a3621e74 1671
a3621e74
YO
1672 {
1673 /*
1674 * Inplace compress the table.*
1675
1676 For sparse data sets the table constructed by the trie algorithm will
1677 be mostly 0/FAIL transitions or to put it another way mostly empty.
1678 (Note that leaf nodes will not contain any transitions.)
1679
1680 This algorithm compresses the tables by eliminating most such
1681 transitions, at the cost of a modest bit of extra work during lookup:
1682
1683 - Each states[] entry contains a .base field which indicates the
1684 index in the state[] array wheres its transition data is stored.
1685
1686 - If .base is 0 there are no valid transitions from that node.
1687
1688 - If .base is nonzero then charid is added to it to find an entry in
1689 the trans array.
1690
1691 -If trans[states[state].base+charid].check!=state then the
1692 transition is taken to be a 0/Fail transition. Thus if there are fail
1693 transitions at the front of the node then the .base offset will point
1694 somewhere inside the previous nodes data (or maybe even into a node
1695 even earlier), but the .check field determines if the transition is
1696 valid.
1697
786e8c11 1698 XXX - wrong maybe?
a3621e74
YO
1699 The following process inplace converts the table to the compressed
1700 table: We first do not compress the root node 1,and mark its all its
1701 .check pointers as 1 and set its .base pointer as 1 as well. This
1702 allows to do a DFA construction from the compressed table later, and
1703 ensures that any .base pointers we calculate later are greater than
1704 0.
1705
1706 - We set 'pos' to indicate the first entry of the second node.
1707
1708 - We then iterate over the columns of the node, finding the first and
1709 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1710 and set the .check pointers accordingly, and advance pos
1711 appropriately and repreat for the next node. Note that when we copy
1712 the next pointers we have to convert them from the original
1713 NODEIDX form to NODENUM form as the former is not valid post
1714 compression.
1715
1716 - If a node has no transitions used we mark its base as 0 and do not
1717 advance the pos pointer.
1718
1719 - If a node only has one transition we use a second pointer into the
1720 structure to fill in allocated fail transitions from other states.
1721 This pointer is independent of the main pointer and scans forward
1722 looking for null transitions that are allocated to a state. When it
1723 finds one it writes the single transition into the "hole". If the
786e8c11 1724 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1725
1726 - Once compressed we can Renew/realloc the structures to release the
1727 excess space.
1728
1729 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1730 specifically Fig 3.47 and the associated pseudocode.
1731
1732 demq
1733 */
a3b680e6 1734 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1735 U32 state, charid;
a3621e74 1736 U32 pos = 0, zp=0;
1e2e3d02 1737 trie->statecount = laststate;
a3621e74
YO
1738
1739 for ( state = 1 ; state < laststate ; state++ ) {
1740 U8 flag = 0;
a28509cc
AL
1741 const U32 stateidx = TRIE_NODEIDX( state );
1742 const U32 o_used = trie->trans[ stateidx ].check;
1743 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1744 trie->trans[ stateidx ].check = 0;
1745
1746 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1747 if ( flag || trie->trans[ stateidx + charid ].next ) {
1748 if ( trie->trans[ stateidx + charid ].next ) {
1749 if (o_used == 1) {
1750 for ( ; zp < pos ; zp++ ) {
1751 if ( ! trie->trans[ zp ].next ) {
1752 break;
1753 }
1754 }
1755 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1756 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1757 trie->trans[ zp ].check = state;
1758 if ( ++zp > pos ) pos = zp;
1759 break;
1760 }
1761 used--;
1762 }
1763 if ( !flag ) {
1764 flag = 1;
1765 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1766 }
1767 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ pos ].check = state;
1769 pos++;
1770 }
1771 }
1772 }
cc601c31 1773 trie->lasttrans = pos + 1;
1e2e3d02 1774 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1775 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1776 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1777 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1778 (int)depth * 2 + 2,"",
1779 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1780 (IV)next_alloc,
1781 (IV)pos,
a3621e74
YO
1782 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1783 );
1784
1785 } /* end table compress */
1786 }
1e2e3d02
YO
1787 DEBUG_TRIE_COMPILE_MORE_r(
1788 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1789 (int)depth * 2 + 2, "",
1790 (UV)trie->statecount,
1791 (UV)trie->lasttrans)
1792 );
cc601c31
YO
1793 /* resize the trans array to remove unused space */
1794 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1795
3dab1dad
YO
1796 /* and now dump out the compressed format */
1797 DEBUG_TRIE_COMPILE_r(
1798 dump_trie(trie,depth+1)
1799 );
07be1b83 1800
3dab1dad 1801 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1802 U8 nodetype =(U8)(flags & 0xFF);
1803 char *str=NULL;
786e8c11 1804
07be1b83 1805#ifdef DEBUGGING
e62cc96a 1806 regnode *optimize = NULL;
b57a0404
JH
1807 U32 mjd_offset = 0;
1808 U32 mjd_nodelen = 0;
07be1b83 1809#endif
a3621e74 1810 /*
3dab1dad
YO
1811 This means we convert either the first branch or the first Exact,
1812 depending on whether the thing following (in 'last') is a branch
1813 or not and whther first is the startbranch (ie is it a sub part of
1814 the alternation or is it the whole thing.)
1815 Assuming its a sub part we conver the EXACT otherwise we convert
1816 the whole branch sequence, including the first.
a3621e74 1817 */
3dab1dad 1818 /* Find the node we are going to overwrite */
7f69552c 1819 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1820 /* branch sub-chain */
3dab1dad 1821 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1822 DEBUG_r({
1823 mjd_offset= Node_Offset((convert));
1824 mjd_nodelen= Node_Length((convert));
1825 });
7f69552c
YO
1826 /* whole branch chain */
1827 } else {
1828 DEBUG_r({
1829 const regnode *nop = NEXTOPER( convert );
1830 mjd_offset= Node_Offset((nop));
1831 mjd_nodelen= Node_Length((nop));
1832 });
07be1b83 1833 }
7f69552c 1834
07be1b83
YO
1835 DEBUG_OPTIMISE_r(
1836 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1837 (int)depth * 2 + 2, "",
786e8c11 1838 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1839 );
a3621e74 1840
3dab1dad
YO
1841 /* But first we check to see if there is a common prefix we can
1842 split out as an EXACT and put in front of the TRIE node. */
1843 trie->startstate= 1;
786e8c11 1844 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1845 U32 state;
1e2e3d02 1846 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1847 U32 ofs = 0;
8e11feef
RGS
1848 I32 idx = -1;
1849 U32 count = 0;
1850 const U32 base = trie->states[ state ].trans.base;
a3621e74 1851
3dab1dad 1852 if ( trie->states[state].wordnum )
8e11feef 1853 count = 1;
a3621e74 1854
8e11feef 1855 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1856 if ( ( base + ofs >= trie->uniquecharcount ) &&
1857 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1858 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1859 {
3dab1dad 1860 if ( ++count > 1 ) {
8e11feef 1861 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1862 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1863 if ( state == 1 ) break;
3dab1dad
YO
1864 if ( count == 2 ) {
1865 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1866 DEBUG_OPTIMISE_r(
8e11feef
RGS
1867 PerlIO_printf(Perl_debug_log,
1868 "%*sNew Start State=%"UVuf" Class: [",
1869 (int)depth * 2 + 2, "",
786e8c11 1870 (UV)state));
be8e71aa
YO
1871 if (idx >= 0) {
1872 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1873 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1874
3dab1dad 1875 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1876 if ( folder )
1877 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1878 DEBUG_OPTIMISE_r(
07be1b83 1879 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1880 );
8e11feef
RGS
1881 }
1882 }
1883 TRIE_BITMAP_SET(trie,*ch);
1884 if ( folder )
1885 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1886 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1887 }
1888 idx = ofs;
1889 }
3dab1dad
YO
1890 }
1891 if ( count == 1 ) {
1892 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1893 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1894 DEBUG_OPTIMISE_r(
8e11feef
RGS
1895 PerlIO_printf( Perl_debug_log,
1896 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1897 (int)depth * 2 + 2, "",
786e8c11 1898 (UV)state, (UV)idx, ch)
3dab1dad
YO
1899 );
1900 if ( state==1 ) {
1901 OP( convert ) = nodetype;
1902 str=STRING(convert);
1903 STR_LEN(convert)=0;
1904 }
1905 *str++=*ch;
1906 STR_LEN(convert)++;
a3621e74 1907
8e11feef 1908 } else {
f9049ba1 1909#ifdef DEBUGGING
8e11feef
RGS
1910 if (state>1)
1911 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1912#endif
8e11feef
RGS
1913 break;
1914 }
1915 }
3dab1dad 1916 if (str) {
8e11feef 1917 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1918 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1919 trie->startstate = state;
07be1b83
YO
1920 trie->minlen -= (state - 1);
1921 trie->maxlen -= (state - 1);
1922 DEBUG_r({
1923 regnode *fix = convert;
1924 mjd_nodelen++;
1925 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1926 while( ++fix < n ) {
1927 Set_Node_Offset_Length(fix, 0, 0);
1928 }
1929 });
8e11feef
RGS
1930 if (trie->maxlen) {
1931 convert = n;
1932 } else {
3dab1dad 1933 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1934 DEBUG_r(optimize= n);
3dab1dad
YO
1935 }
1936 }
1937 }
a5ca303d
YO
1938 if (!jumper)
1939 jumper = last;
3dab1dad 1940 if ( trie->maxlen ) {
8e11feef
RGS
1941 NEXT_OFF( convert ) = (U16)(tail - convert);
1942 ARG_SET( convert, data_slot );
786e8c11
YO
1943 /* Store the offset to the first unabsorbed branch in
1944 jump[0], which is otherwise unused by the jump logic.
1945 We use this when dumping a trie and during optimisation. */
1946 if (trie->jump)
7f69552c 1947 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1948
786e8c11
YO
1949 /* XXXX */
1950 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1951 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1952 {
1953 OP( convert ) = TRIEC;
1954 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1955 Safefree(trie->bitmap);
1956 trie->bitmap= NULL;
1957 } else
1958 OP( convert ) = TRIE;
a3621e74 1959
3dab1dad
YO
1960 /* store the type in the flags */
1961 convert->flags = nodetype;
a5ca303d
YO
1962 DEBUG_r({
1963 optimize = convert
1964 + NODE_STEP_REGNODE
1965 + regarglen[ OP( convert ) ];
1966 });
1967 /* XXX We really should free up the resource in trie now,
1968 as we won't use them - (which resources?) dmq */
3dab1dad 1969 }
a3621e74 1970 /* needed for dumping*/
e62cc96a 1971 DEBUG_r(if (optimize) {
07be1b83 1972 regnode *opt = convert;
e62cc96a 1973 while ( ++opt < optimize) {
07be1b83
YO
1974 Set_Node_Offset_Length(opt,0,0);
1975 }
786e8c11
YO
1976 /*
1977 Try to clean up some of the debris left after the
1978 optimisation.
a3621e74 1979 */
786e8c11 1980 while( optimize < jumper ) {
07be1b83 1981 mjd_nodelen += Node_Length((optimize));
a3621e74 1982 OP( optimize ) = OPTIMIZED;
07be1b83 1983 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1984 optimize++;
1985 }
07be1b83 1986 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1987 });
1988 } /* end node insert */
07be1b83 1989#ifndef DEBUGGING
6e8b4190 1990 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1991#endif
786e8c11
YO
1992 return trie->jump
1993 ? MADE_JUMP_TRIE
1994 : trie->startstate>1
1995 ? MADE_EXACT_TRIE
1996 : MADE_TRIE;
1997}
1998
1999STATIC void
2000S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2001{
2002/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2003
2004 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2005 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2006 ISBN 0-201-10088-6
2007
2008 We find the fail state for each state in the trie, this state is the longest proper
2009 suffix of the current states 'word' that is also a proper prefix of another word in our
2010 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2011 the DFA not to have to restart after its tried and failed a word at a given point, it
2012 simply continues as though it had been matching the other word in the first place.
2013 Consider
2014 'abcdgu'=~/abcdefg|cdgu/
2015 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2016 fail, which would bring use to the state representing 'd' in the second word where we would
2017 try 'g' and succeed, prodceding to match 'cdgu'.
2018 */
2019 /* add a fail transition */
2020 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2021 U32 *q;
2022 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2023 const U32 numstates = trie->statecount;
786e8c11
YO
2024 const U32 ubound = trie->lasttrans + ucharcount;
2025 U32 q_read = 0;
2026 U32 q_write = 0;
2027 U32 charid;
2028 U32 base = trie->states[ 1 ].trans.base;
2029 U32 *fail;
2030 reg_ac_data *aho;
2031 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2032 GET_RE_DEBUG_FLAGS_DECL;
2033#ifndef DEBUGGING
2034 PERL_UNUSED_ARG(depth);
2035#endif
2036
2037
2038 ARG_SET( stclass, data_slot );
2039 Newxz( aho, 1, reg_ac_data );
2040 RExC_rx->data->data[ data_slot ] = (void*)aho;
2041 aho->trie=trie;
2042 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2043 numstates * sizeof(reg_trie_state));
786e8c11
YO
2044 Newxz( q, numstates, U32);
2045 Newxz( aho->fail, numstates, U32 );
2046 aho->refcount = 1;
2047 fail = aho->fail;
2048 /* initialize fail[0..1] to be 1 so that we always have
2049 a valid final fail state */
2050 fail[ 0 ] = fail[ 1 ] = 1;
2051
2052 for ( charid = 0; charid < ucharcount ; charid++ ) {
2053 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2054 if ( newstate ) {
2055 q[ q_write ] = newstate;
2056 /* set to point at the root */
2057 fail[ q[ q_write++ ] ]=1;
2058 }
2059 }
2060 while ( q_read < q_write) {
2061 const U32 cur = q[ q_read++ % numstates ];
2062 base = trie->states[ cur ].trans.base;
2063
2064 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2065 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2066 if (ch_state) {
2067 U32 fail_state = cur;
2068 U32 fail_base;
2069 do {
2070 fail_state = fail[ fail_state ];
2071 fail_base = aho->states[ fail_state ].trans.base;
2072 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2073
2074 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2075 fail[ ch_state ] = fail_state;
2076 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2077 {
2078 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2079 }
2080 q[ q_write++ % numstates] = ch_state;
2081 }
2082 }
2083 }
2084 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2085 when we fail in state 1, this allows us to use the
2086 charclass scan to find a valid start char. This is based on the principle
2087 that theres a good chance the string being searched contains lots of stuff
2088 that cant be a start char.
2089 */
2090 fail[ 0 ] = fail[ 1 ] = 0;
2091 DEBUG_TRIE_COMPILE_r({
1e2e3d02
YO
2092 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2093 (int)(depth * 2), "", numstates
2094 );
786e8c11
YO
2095 for( q_read=1; q_read<numstates; q_read++ ) {
2096 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2097 }
2098 PerlIO_printf(Perl_debug_log, "\n");
2099 });
2100 Safefree(q);
2101 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2102}
2103
786e8c11 2104
a3621e74 2105/*
5d1c421c
JH
2106 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2107 * These need to be revisited when a newer toolchain becomes available.
2108 */
2109#if defined(__sparc64__) && defined(__GNUC__)
2110# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2111# undef SPARC64_GCC_WORKAROUND
2112# define SPARC64_GCC_WORKAROUND 1
2113# endif
2114#endif
2115
07be1b83
YO
2116#define DEBUG_PEEP(str,scan,depth) \
2117 DEBUG_OPTIMISE_r({ \
2118 SV * const mysv=sv_newmortal(); \
2119 regnode *Next = regnext(scan); \
2120 regprop(RExC_rx, mysv, scan); \
7f69552c 2121 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2122 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2123 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2124 });
2125
1de06328
YO
2126
2127
2128
2129
07be1b83
YO
2130#define JOIN_EXACT(scan,min,flags) \
2131 if (PL_regkind[OP(scan)] == EXACT) \
2132 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2133
be8e71aa 2134STATIC U32
07be1b83
YO
2135S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2136 /* Merge several consecutive EXACTish nodes into one. */
2137 regnode *n = regnext(scan);
2138 U32 stringok = 1;
2139 regnode *next = scan + NODE_SZ_STR(scan);
2140 U32 merged = 0;
2141 U32 stopnow = 0;
2142#ifdef DEBUGGING
2143 regnode *stop = scan;
72f13be8 2144 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2145#else
d47053eb
RGS
2146 PERL_UNUSED_ARG(depth);
2147#endif
2148#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2149 PERL_UNUSED_ARG(flags);
2150 PERL_UNUSED_ARG(val);
07be1b83 2151#endif
07be1b83
YO
2152 DEBUG_PEEP("join",scan,depth);
2153
2154 /* Skip NOTHING, merge EXACT*. */
2155 while (n &&
2156 ( PL_regkind[OP(n)] == NOTHING ||
2157 (stringok && (OP(n) == OP(scan))))
2158 && NEXT_OFF(n)
2159 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2160
2161 if (OP(n) == TAIL || n > next)
2162 stringok = 0;
2163 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2164 DEBUG_PEEP("skip:",n,depth);
2165 NEXT_OFF(scan) += NEXT_OFF(n);
2166 next = n + NODE_STEP_REGNODE;
2167#ifdef DEBUGGING
2168 if (stringok)
2169 stop = n;
2170#endif
2171 n = regnext(n);
2172 }
2173 else if (stringok) {
786e8c11 2174 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2175 regnode * const nnext = regnext(n);
2176
2177 DEBUG_PEEP("merg",n,depth);
2178
2179 merged++;
2180 if (oldl + STR_LEN(n) > U8_MAX)
2181 break;
2182 NEXT_OFF(scan) += NEXT_OFF(n);
2183 STR_LEN(scan) += STR_LEN(n);
2184 next = n + NODE_SZ_STR(n);
2185 /* Now we can overwrite *n : */
2186 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2187#ifdef DEBUGGING
2188 stop = next - 1;
2189#endif
2190 n = nnext;
2191 if (stopnow) break;
2192 }
2193
d47053eb
RGS
2194#ifdef EXPERIMENTAL_INPLACESCAN
2195 if (flags && !NEXT_OFF(n)) {
2196 DEBUG_PEEP("atch", val, depth);
2197 if (reg_off_by_arg[OP(n)]) {
2198 ARG_SET(n, val - n);
2199 }
2200 else {
2201 NEXT_OFF(n) = val - n;
2202 }
2203 stopnow = 1;
2204 }
07be1b83
YO
2205#endif
2206 }
2207
2208 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2209 /*
2210 Two problematic code points in Unicode casefolding of EXACT nodes:
2211
2212 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2213 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2214
2215 which casefold to
2216
2217 Unicode UTF-8
2218
2219 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2220 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2221
2222 This means that in case-insensitive matching (or "loose matching",
2223 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2224 length of the above casefolded versions) can match a target string
2225 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2226 This would rather mess up the minimum length computation.
2227
2228 What we'll do is to look for the tail four bytes, and then peek
2229 at the preceding two bytes to see whether we need to decrease
2230 the minimum length by four (six minus two).
2231
2232 Thanks to the design of UTF-8, there cannot be false matches:
2233 A sequence of valid UTF-8 bytes cannot be a subsequence of
2234 another valid sequence of UTF-8 bytes.
2235
2236 */
2237 char * const s0 = STRING(scan), *s, *t;
2238 char * const s1 = s0 + STR_LEN(scan) - 1;
2239 char * const s2 = s1 - 4;
e294cc5d
JH
2240#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2241 const char t0[] = "\xaf\x49\xaf\x42";
2242#else
07be1b83 2243 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2244#endif
07be1b83
YO
2245 const char * const t1 = t0 + 3;
2246
2247 for (s = s0 + 2;
2248 s < s2 && (t = ninstr(s, s1, t0, t1));
2249 s = t + 4) {
e294cc5d
JH
2250#ifdef EBCDIC
2251 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2252 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2253#else
07be1b83
YO
2254 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2255 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2256#endif
07be1b83
YO
2257 *min -= 4;
2258 }
2259 }
2260
2261#ifdef DEBUGGING
2262 /* Allow dumping */
2263 n = scan + NODE_SZ_STR(scan);
2264 while (n <= stop) {
2265 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2266 OP(n) = OPTIMIZED;
2267 NEXT_OFF(n) = 0;
2268 }
2269 n++;
2270 }
2271#endif
2272 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2273 return stopnow;
2274}
2275
653099ff
GS
2276/* REx optimizer. Converts nodes into quickier variants "in place".
2277 Finds fixed substrings. */
2278
a0288114 2279/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2280 to the position after last scanned or to NULL. */
2281
40d049e4
YO
2282#define INIT_AND_WITHP \
2283 assert(!and_withp); \
2284 Newx(and_withp,1,struct regnode_charclass_class); \
2285 SAVEFREEPV(and_withp)
07be1b83 2286
76e3520e 2287STATIC I32
40d049e4 2288S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2289 I32 *minlenp, I32 *deltap,
40d049e4
YO
2290 regnode *last,
2291 scan_data_t *data,
2292 I32 stopparen,
2293 U8* recursed,
2294 struct regnode_charclass_class *and_withp,
2295 U32 flags, U32 depth)
c277df42
IZ
2296 /* scanp: Start here (read-write). */
2297 /* deltap: Write maxlen-minlen here. */
2298 /* last: Stop before this one. */
40d049e4
YO
2299 /* data: string data about the pattern */
2300 /* stopparen: treat close N as END */
2301 /* recursed: which subroutines have we recursed into */
2302 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2303{
97aff369 2304 dVAR;
c277df42
IZ
2305 I32 min = 0, pars = 0, code;
2306 regnode *scan = *scanp, *next;
2307 I32 delta = 0;
2308 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2309 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2310 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2311 scan_data_t data_fake;
a3621e74 2312 SV *re_trie_maxbuff = NULL;
786e8c11 2313 regnode *first_non_open = scan;
a3621e74 2314 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2315#ifdef DEBUGGING
40d049e4 2316 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2317#endif
40d049e4 2318
786e8c11 2319 if ( depth == 0 ) {
40d049e4 2320 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2321 first_non_open=regnext(first_non_open);
2322 }
2323
b81d288d 2324
c277df42
IZ
2325 while (scan && OP(scan) != END && scan < last) {
2326 /* Peephole optimizer: */
1de06328 2327 DEBUG_STUDYDATA(data,depth);
07be1b83 2328 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2329 JOIN_EXACT(scan,&min,0);
a3621e74 2330
653099ff
GS
2331 /* Follow the next-chain of the current node and optimize
2332 away all the NOTHINGs from it. */
c277df42 2333 if (OP(scan) != CURLYX) {
a3b680e6 2334 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2335 ? I32_MAX
2336 /* I32 may be smaller than U16 on CRAYs! */
2337 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2338 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2339 int noff;
2340 regnode *n = scan;
b81d288d 2341
c277df42
IZ
2342 /* Skip NOTHING and LONGJMP. */
2343 while ((n = regnext(n))
3dab1dad 2344 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2345 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2346 && off + noff < max)
2347 off += noff;
2348 if (reg_off_by_arg[OP(scan)])
2349 ARG(scan) = off;
b81d288d 2350 else
c277df42
IZ
2351 NEXT_OFF(scan) = off;
2352 }
a3621e74 2353
07be1b83 2354
3dab1dad 2355
653099ff
GS
2356 /* The principal pseudo-switch. Cannot be a switch, since we
2357 look into several different things. */
b81d288d 2358 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2359 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2360 next = regnext(scan);
2361 code = OP(scan);
a3621e74 2362 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2363
2364 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2365 /* NOTE - There is similar code to this block below for handling
2366 TRIE nodes on a re-study. If you change stuff here check there
2367 too. */
c277df42 2368 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2369 struct regnode_charclass_class accum;
d4c19fe8 2370 regnode * const startbranch=scan;
c277df42 2371
653099ff 2372 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2373 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2374 if (flags & SCF_DO_STCLASS)
830247a4 2375 cl_init_zero(pRExC_state, &accum);
a3621e74 2376
c277df42 2377 while (OP(scan) == code) {
830247a4 2378 I32 deltanext, minnext, f = 0, fake;
653099ff 2379 struct regnode_charclass_class this_class;
c277df42
IZ
2380
2381 num++;
2382 data_fake.flags = 0;
40d049e4 2383 if (data) {
2c2d71f5 2384 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2385 data_fake.last_closep = data->last_closep;
2386 }
2387 else
2388 data_fake.last_closep = &fake;
c277df42
IZ
2389 next = regnext(scan);
2390 scan = NEXTOPER(scan);
2391 if (code != BRANCH)
2392 scan = NEXTOPER(scan);
653099ff 2393 if (flags & SCF_DO_STCLASS) {
830247a4 2394 cl_init(pRExC_state, &this_class);
653099ff
GS
2395 data_fake.start_class = &this_class;
2396 f = SCF_DO_STCLASS_AND;
b81d288d 2397 }
e1901655
IZ
2398 if (flags & SCF_WHILEM_VISITED_POS)
2399 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2400
653099ff 2401 /* we suppose the run is continuous, last=next...*/
1de06328 2402 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2403 next, &data_fake,
2404 stopparen, recursed, NULL, f,depth+1);
b81d288d 2405 if (min1 > minnext)
c277df42
IZ
2406 min1 = minnext;
2407 if (max1 < minnext + deltanext)
2408 max1 = minnext + deltanext;
2409 if (deltanext == I32_MAX)
aca2d497 2410 is_inf = is_inf_internal = 1;
c277df42
IZ
2411 scan = next;
2412 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2413 pars++;
3dab1dad
YO
2414 if (data) {
2415 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2416 data->flags |= SF_HAS_EVAL;
2c2d71f5 2417 data->whilem_c = data_fake.whilem_c;
3dab1dad 2418 }
653099ff 2419 if (flags & SCF_DO_STCLASS)
830247a4 2420 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2421 if (code == SUSPEND)
c277df42
IZ
2422 break;
2423 }
2424 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2425 min1 = 0;
2426 if (flags & SCF_DO_SUBSTR) {
2427 data->pos_min += min1;
2428 data->pos_delta += max1 - min1;
2429 if (max1 != min1 || is_inf)
2430 data->longest = &(data->longest_float);
2431 }
2432 min += min1;
2433 delta += max1 - min1;
653099ff 2434 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2435 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2436 if (min1) {
40d049e4 2437 cl_and(data->start_class, and_withp);
653099ff
GS
2438 flags &= ~SCF_DO_STCLASS;
2439 }
2440 }
2441 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2442 if (min1) {
2443 cl_and(data->start_class, &accum);
653099ff 2444 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2445 }
2446 else {
b81d288d 2447 /* Switch to OR mode: cache the old value of
de0c8cb8 2448 * data->start_class */
40d049e4
YO
2449 INIT_AND_WITHP;
2450 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2451 struct regnode_charclass_class);
2452 flags &= ~SCF_DO_STCLASS_AND;
2453 StructCopy(&accum, data->start_class,
2454 struct regnode_charclass_class);
2455 flags |= SCF_DO_STCLASS_OR;
2456 data->start_class->flags |= ANYOF_EOS;
2457 }
653099ff 2458 }
a3621e74 2459
786e8c11 2460 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2461 /* demq.
2462
2463 Assuming this was/is a branch we are dealing with: 'scan' now
2464 points at the item that follows the branch sequence, whatever
2465 it is. We now start at the beginning of the sequence and look
2466 for subsequences of
2467
786e8c11
YO
2468 BRANCH->EXACT=>x1
2469 BRANCH->EXACT=>x2
2470 tail
a3621e74
YO
2471
2472 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2473
2474 If we can find such a subseqence we need to turn the first
2475 element into a trie and then add the subsequent branch exact
2476 strings to the trie.
2477
2478 We have two cases
2479
786e8c11 2480 1. patterns where the whole set of branch can be converted.
a3621e74 2481
786e8c11 2482 2. patterns where only a subset can be converted.
a3621e74
YO
2483
2484 In case 1 we can replace the whole set with a single regop
2485 for the trie. In case 2 we need to keep the start and end
2486 branchs so
2487
2488 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2489 becomes BRANCH TRIE; BRANCH X;
2490
786e8c11
YO
2491 There is an additional case, that being where there is a
2492 common prefix, which gets split out into an EXACT like node
2493 preceding the TRIE node.
2494
2495 If x(1..n)==tail then we can do a simple trie, if not we make
2496 a "jump" trie, such that when we match the appropriate word
2497 we "jump" to the appopriate tail node. Essentailly we turn
2498 a nested if into a case structure of sorts.
a3621e74
YO
2499
2500 */
786e8c11 2501
3dab1dad 2502 int made=0;
0111c4fd
RGS
2503 if (!re_trie_maxbuff) {
2504 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2505 if (!SvIOK(re_trie_maxbuff))
2506 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2507 }
786e8c11 2508 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2509 regnode *cur;
2510 regnode *first = (regnode *)NULL;
2511 regnode *last = (regnode *)NULL;
2512 regnode *tail = scan;
2513 U8 optype = 0;
2514 U32 count=0;
2515
2516#ifdef DEBUGGING
c445ea15 2517 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2518#endif
2519 /* var tail is used because there may be a TAIL
2520 regop in the way. Ie, the exacts will point to the
2521 thing following the TAIL, but the last branch will
2522 point at the TAIL. So we advance tail. If we
2523 have nested (?:) we may have to move through several
2524 tails.
2525 */
2526
2527 while ( OP( tail ) == TAIL ) {
2528 /* this is the TAIL generated by (?:) */
2529 tail = regnext( tail );
2530 }
2531
3dab1dad 2532
a3621e74 2533 DEBUG_OPTIMISE_r({
32fc9b6a 2534 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2535 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2536 (int)depth * 2 + 2, "",
2537 "Looking for TRIE'able sequences. Tail node is: ",
2538 SvPV_nolen_const( mysv )
a3621e74
YO
2539 );
2540 });
3dab1dad 2541
a3621e74
YO
2542 /*
2543
2544 step through the branches, cur represents each
2545 branch, noper is the first thing to be matched
2546 as part of that branch and noper_next is the
2547 regnext() of that node. if noper is an EXACT
2548 and noper_next is the same as scan (our current
2549 position in the regex) then the EXACT branch is
2550 a possible optimization target. Once we have
2551 two or more consequetive such branches we can
2552 create a trie of the EXACT's contents and stich
2553 it in place. If the sequence represents all of
2554 the branches we eliminate the whole thing and
2555 replace it with a single TRIE. If it is a
2556 subsequence then we need to stitch it in. This
2557 means the first branch has to remain, and needs
2558 to be repointed at the item on the branch chain
2559 following the last branch optimized. This could
2560 be either a BRANCH, in which case the
2561 subsequence is internal, or it could be the
2562 item following the branch sequence in which
2563 case the subsequence is at the end.
2564
2565 */
2566
2567 /* dont use tail as the end marker for this traverse */
2568 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2569 regnode * const noper = NEXTOPER( cur );
be981c67 2570#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2571 regnode * const noper_next = regnext( noper );
be981c67 2572#endif
a3621e74 2573
a3621e74 2574 DEBUG_OPTIMISE_r({
32fc9b6a 2575 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2576 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2577 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2578
32fc9b6a 2579 regprop(RExC_rx, mysv, noper);
a3621e74 2580 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2581 SvPV_nolen_const(mysv));
a3621e74
YO
2582
2583 if ( noper_next ) {
32fc9b6a 2584 regprop(RExC_rx, mysv, noper_next );
a3621e74 2585 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2586 SvPV_nolen_const(mysv));
a3621e74 2587 }
3dab1dad
YO
2588 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2589 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2590 });
3dab1dad
YO
2591 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2592 : PL_regkind[ OP( noper ) ] == EXACT )
2593 || OP(noper) == NOTHING )
786e8c11
YO
2594#ifdef NOJUMPTRIE
2595 && noper_next == tail
2596#endif
2597 && count < U16_MAX)
a3621e74
YO
2598 {
2599 count++;
3dab1dad
YO
2600 if ( !first || optype == NOTHING ) {
2601 if (!first) first = cur;
a3621e74
YO
2602 optype = OP( noper );
2603 } else {
a3621e74 2604 last = cur;
a3621e74
YO
2605 }
2606 } else {
2607 if ( last ) {
786e8c11
YO
2608 make_trie( pRExC_state,
2609 startbranch, first, cur, tail, count,
2610 optype, depth+1 );
a3621e74 2611 }
3dab1dad 2612 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2613#ifdef NOJUMPTRIE
2614 && noper_next == tail
2615#endif
2616 ){
a3621e74
YO
2617 count = 1;
2618 first = cur;
2619 optype = OP( noper );
2620 } else {
2621 count = 0;
2622 first = NULL;
2623 optype = 0;
2624 }
2625 last = NULL;
2626 }
2627 }
2628 DEBUG_OPTIMISE_r({
32fc9b6a 2629 regprop(RExC_rx, mysv, cur);
a3621e74 2630 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2631 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2632 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2633
2634 });
2635 if ( last ) {
786e8c11 2636 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2637#ifdef TRIE_STUDY_OPT
786e8c11
YO
2638 if ( ((made == MADE_EXACT_TRIE &&
2639 startbranch == first)
2640 || ( first_non_open == first )) &&
2641 depth==0 )
2642 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2643#endif
07be1b83 2644 }
a3621e74 2645 }
3dab1dad
YO
2646
2647 } /* do trie */
786e8c11 2648
a0ed51b3 2649 }
a3621e74 2650 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2651 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2652 } else /* single branch is optimized. */
c277df42
IZ
2653 scan = NEXTOPER(scan);
2654 continue;
a0ed51b3
LW
2655 }
2656 else if (OP(scan) == EXACT) {
cd439c50 2657 I32 l = STR_LEN(scan);
c445ea15 2658 UV uc;
a0ed51b3 2659 if (UTF) {
a3b680e6 2660 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2661 l = utf8_length(s, s + l);
9041c2e3 2662 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2663 } else {
2664 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2665 }
2666 min += l;
c277df42 2667 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2668 /* The code below prefers earlier match for fixed
2669 offset, later match for variable offset. */
2670 if (data->last_end == -1) { /* Update the start info. */
2671 data->last_start_min = data->pos_min;
2672 data->last_start_max = is_inf
b81d288d 2673 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2674 }
cd439c50 2675 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2676 if (UTF)
2677 SvUTF8_on(data->last_found);
0eda9292 2678 {
9a957fbc 2679 SV * const sv = data->last_found;
a28509cc 2680 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2681 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2682 if (mg && mg->mg_len >= 0)
5e43f467
JH
2683 mg->mg_len += utf8_length((U8*)STRING(scan),
2684 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2685 }
c277df42
IZ
2686 data->last_end = data->pos_min + l;
2687 data->pos_min += l; /* As in the first entry. */
2688 data->flags &= ~SF_BEFORE_EOL;
2689 }
653099ff
GS
2690 if (flags & SCF_DO_STCLASS_AND) {
2691 /* Check whether it is compatible with what we know already! */
2692 int compat = 1;
2693
1aa99e6b 2694 if (uc >= 0x100 ||
516a5887 2695 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2696 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2697 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2698 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2699 )
653099ff
GS
2700 compat = 0;
2701 ANYOF_CLASS_ZERO(data->start_class);
2702 ANYOF_BITMAP_ZERO(data->start_class);
2703 if (compat)
1aa99e6b 2704 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2705 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2706 if (uc < 0x100)
2707 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2708 }
2709 else if (flags & SCF_DO_STCLASS_OR) {
2710 /* false positive possible if the class is case-folded */
1aa99e6b 2711 if (uc < 0x100)
9b877dbb
IH
2712 ANYOF_BITMAP_SET(data->start_class, uc);
2713 else
2714 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2715 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2716 cl_and(data->start_class, and_withp);
653099ff
GS
2717 }
2718 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2719 }
3dab1dad 2720 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2721 I32 l = STR_LEN(scan);
1aa99e6b 2722 UV uc = *((U8*)STRING(scan));
653099ff
GS
2723
2724 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2725 if (flags & SCF_DO_SUBSTR) {
2726 assert(data);
1de06328 2727 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2728 }
a0ed51b3 2729 if (UTF) {
6136c704 2730 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2731 l = utf8_length(s, s + l);
9041c2e3 2732 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2733 }
2734 min += l;
ecaa9b9c 2735 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2736 data->pos_min += l;
653099ff
GS
2737 if (flags & SCF_DO_STCLASS_AND) {
2738 /* Check whether it is compatible with what we know already! */
2739 int compat = 1;
2740
1aa99e6b 2741 if (uc >= 0x100 ||
516a5887 2742 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2743 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2744 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2745 compat = 0;
2746 ANYOF_CLASS_ZERO(data->start_class);
2747 ANYOF_BITMAP_ZERO(data->start_class);
2748 if (compat) {
1aa99e6b 2749 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2750 data->start_class->flags &= ~ANYOF_EOS;
2751 data->start_class->flags |= ANYOF_FOLD;
2752 if (OP(scan) == EXACTFL)
2753 data->start_class->flags |= ANYOF_LOCALE;
2754 }
2755 }
2756 else if (flags & SCF_DO_STCLASS_OR) {
2757 if (data->start_class->flags & ANYOF_FOLD) {
2758 /* false positive possible if the class is case-folded.
2759 Assume that the locale settings are the same... */
1aa99e6b
IH
2760 if (uc < 0x100)
2761 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2762 data->start_class->flags &= ~ANYOF_EOS;
2763 }
40d049e4 2764 cl_and(data->start_class, and_withp);
653099ff
GS
2765 }
2766 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2767 }
bfed75c6 2768 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2769 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2770 I32 f = flags, pos_before = 0;
d4c19fe8 2771 regnode * const oscan = scan;
653099ff
GS
2772 struct regnode_charclass_class this_class;
2773 struct regnode_charclass_class *oclass = NULL;
727f22e3 2774 I32 next_is_eval = 0;
653099ff 2775
3dab1dad 2776 switch (PL_regkind[OP(scan)]) {
653099ff 2777 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2778 scan = NEXTOPER(scan);
2779 goto finish;
2780 case PLUS:
653099ff 2781 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2782 next = NEXTOPER(scan);
653099ff 2783 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2784 mincount = 1;
2785 maxcount = REG_INFTY;
c277df42
IZ
2786 next = regnext(scan);
2787 scan = NEXTOPER(scan);
2788 goto do_curly;
2789 }
2790 }
2791 if (flags & SCF_DO_SUBSTR)
2792 data->pos_min++;
2793 min++;
2794 /* Fall through. */
2795 case STAR:
653099ff
GS
2796 if (flags & SCF_DO_STCLASS) {
2797 mincount = 0;
b81d288d 2798 maxcount = REG_INFTY;
653099ff
GS
2799 next = regnext(scan);
2800 scan = NEXTOPER(scan);
2801 goto do_curly;
2802 }
b81d288d 2803 is_inf = is_inf_internal = 1;
c277df42
IZ
2804 scan = regnext(scan);
2805 if (flags & SCF_DO_SUBSTR) {
1de06328 2806 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2807 data->longest = &(data->longest_float);
2808 }
2809 goto optimize_curly_tail;
2810 case CURLY:
40d049e4
YO
2811 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2812 && (scan->flags == stopparen))
2813 {
2814 mincount = 1;
2815 maxcount = 1;
2816 } else {
2817 mincount = ARG1(scan);
2818 maxcount = ARG2(scan);
2819 }
c277df42 2820 next = regnext(scan);
cb434fcc
IZ
2821 if (OP(scan) == CURLYX) {
2822 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2823 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2824 }
c277df42 2825 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2826 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2827 do_curly:
2828 if (flags & SCF_DO_SUBSTR) {
1de06328 2829 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2830 pos_before = data->pos_min;
2831 }
2832 if (data) {
2833 fl = data->flags;
2834 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2835 if (is_inf)
2836 data->flags |= SF_IS_INF;
2837 }
653099ff 2838 if (flags & SCF_DO_STCLASS) {
830247a4 2839 cl_init(pRExC_state, &this_class);
653099ff
GS
2840 oclass = data->start_class;
2841 data->start_class = &this_class;
2842 f |= SCF_DO_STCLASS_AND;
2843 f &= ~SCF_DO_STCLASS_OR;
2844 }
e1901655
IZ
2845 /* These are the cases when once a subexpression
2846 fails at a particular position, it cannot succeed
2847 even after backtracking at the enclosing scope.
b81d288d 2848
e1901655
IZ
2849 XXXX what if minimal match and we are at the
2850 initial run of {n,m}? */
2851 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2852 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2853
c277df42 2854 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2855 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2856 last, data, stopparen, recursed, NULL,
a3621e74
YO
2857 (mincount == 0
2858 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2859
2860 if (flags & SCF_DO_STCLASS)
2861 data->start_class = oclass;
2862 if (mincount == 0 || minnext == 0) {
2863 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2864 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2865 }
2866 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2867 /* Switch to OR mode: cache the old value of
653099ff 2868 * data->start_class */
40d049e4
YO
2869 INIT_AND_WITHP;
2870 StructCopy(data->start_class, and_withp,
653099ff
GS
2871 struct regnode_charclass_class);
2872 flags &= ~SCF_DO_STCLASS_AND;
2873 StructCopy(&this_class, data->start_class,
2874 struct regnode_charclass_class);
2875 flags |= SCF_DO_STCLASS_OR;
2876 data->start_class->flags |= ANYOF_EOS;
2877 }
2878 } else { /* Non-zero len */
2879 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2880 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2881 cl_and(data->start_class, and_withp);
653099ff
GS
2882 }
2883 else if (flags & SCF_DO_STCLASS_AND)
2884 cl_and(data->start_class, &this_class);
2885 flags &= ~SCF_DO_STCLASS;
2886 }
c277df42
IZ
2887 if (!scan) /* It was not CURLYX, but CURLY. */
2888 scan = next;
041457d9
DM
2889 if ( /* ? quantifier ok, except for (?{ ... }) */
2890 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2891 && (minnext == 0) && (deltanext == 0)
99799961 2892 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2893 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2894 && ckWARN(WARN_REGEXP))
b45f050a 2895 {
830247a4 2896 vWARN(RExC_parse,
b45f050a
JF
2897 "Quantifier unexpected on zero-length expression");
2898 }
2899
c277df42 2900 min += minnext * mincount;
b81d288d 2901 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2902 && (minnext + deltanext) > 0)
2903 || deltanext == I32_MAX);
aca2d497 2904 is_inf |= is_inf_internal;
c277df42
IZ
2905 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2906
2907 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2908 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2909 && data->flags & SF_IN_PAR
2910 && !(data->flags & SF_HAS_EVAL)
2911 && !deltanext && minnext == 1 ) {
2912 /* Try to optimize to CURLYN. */
2913 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2914 regnode * const nxt1 = nxt;
497b47a8
JH
2915#ifdef DEBUGGING
2916 regnode *nxt2;
2917#endif
c277df42
IZ
2918
2919 /* Skip open. */
2920 nxt = regnext(nxt);
bfed75c6 2921 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2922 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2923 && STR_LEN(nxt) == 1))
c277df42 2924 goto nogo;
497b47a8 2925#ifdef DEBUGGING
c277df42 2926 nxt2 = nxt;
497b47a8 2927#endif
c277df42 2928 nxt = regnext(nxt);
b81d288d 2929 if (OP(nxt) != CLOSE)
c277df42 2930 goto nogo;
40d049e4
YO
2931 if (RExC_open_parens) {
2932 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2933 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2934 }
c277df42 2935 /* Now we know that nxt2 is the only contents: */
eb160463 2936 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2937 OP(oscan) = CURLYN;
2938 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2939
c277df42
IZ
2940#ifdef DEBUGGING
2941 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2942 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2943 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2944 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2945 OP(nxt + 1) = OPTIMIZED; /* was count. */
2946 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2947#endif
c277df42 2948 }
c277df42
IZ
2949 nogo:
2950
2951 /* Try optimization CURLYX => CURLYM. */
b81d288d 2952 if ( OP(oscan) == CURLYX && data
c277df42 2953 && !(data->flags & SF_HAS_PAR)
c277df42 2954 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2955 && !deltanext /* atom is fixed width */
2956 && minnext != 0 /* CURLYM can't handle zero width */
2957 ) {
c277df42
IZ
2958 /* XXXX How to optimize if data == 0? */
2959 /* Optimize to a simpler form. */
2960 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2961 regnode *nxt2;
2962
2963 OP(oscan) = CURLYM;
2964 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2965 && (OP(nxt2) != WHILEM))
c277df42
IZ
2966 nxt = nxt2;
2967 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2968 /* Need to optimize away parenths. */
2969 if (data->flags & SF_IN_PAR) {
2970 /* Set the parenth number. */
2971 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2972
b81d288d 2973 if (OP(nxt) != CLOSE)
b45f050a 2974 FAIL("Panic opt close");
eb160463 2975 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2976 if (RExC_open_parens) {
2977 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2978 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2979 }
c277df42
IZ
2980 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2981 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2982
c277df42
IZ
2983#ifdef DEBUGGING
2984 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2985 OP(nxt + 1) = OPTIMIZED; /* was count. */
2986 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2987 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2988#endif
c277df42
IZ
2989#if 0
2990 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2991 regnode *nnxt = regnext(nxt1);
b81d288d 2992
c277df42
IZ
2993 if (nnxt == nxt) {
2994 if (reg_off_by_arg[OP(nxt1)])
2995 ARG_SET(nxt1, nxt2 - nxt1);
2996 else if (nxt2 - nxt1 < U16_MAX)
2997 NEXT_OFF(nxt1) = nxt2 - nxt1;
2998 else
2999 OP(nxt) = NOTHING; /* Cannot beautify */
3000 }
3001 nxt1 = nnxt;
3002 }
3003#endif
3004 /* Optimize again: */
1de06328 3005 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3006 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3007 }
3008 else
c277df42 3009 oscan->flags = 0;
c277df42 3010 }
e1901655
IZ
3011 else if ((OP(oscan) == CURLYX)
3012 && (flags & SCF_WHILEM_VISITED_POS)
3013 /* See the comment on a similar expression above.
3014 However, this time it not a subexpression
3015 we care about, but the expression itself. */
3016 && (maxcount == REG_INFTY)
3017 && data && ++data->whilem_c < 16) {
3018 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3019 /* Find WHILEM (as in regexec.c) */
3020 regnode *nxt = oscan + NEXT_OFF(oscan);
3021
3022 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3023 nxt += ARG(nxt);
eb160463
GS
3024 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3025 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3026 }
b81d288d 3027 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3028 pars++;
3029 if (flags & SCF_DO_SUBSTR) {
c445ea15 3030 SV *last_str = NULL;
c277df42
IZ
3031 int counted = mincount != 0;
3032
3033 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3034#if defined(SPARC64_GCC_WORKAROUND)
3035 I32 b = 0;
3036 STRLEN l = 0;
cfd0369c 3037 const char *s = NULL;
5d1c421c
JH
3038 I32 old = 0;
3039
3040 if (pos_before >= data->last_start_min)
3041 b = pos_before;
3042 else
3043 b = data->last_start_min;
3044
3045 l = 0;
cfd0369c 3046 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3047 old = b - data->last_start_min;
3048
3049#else
b81d288d 3050 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3051 ? pos_before : data->last_start_min;
3052 STRLEN l;
d4c19fe8 3053 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3054 I32 old = b - data->last_start_min;
5d1c421c 3055#endif
a0ed51b3
LW
3056
3057 if (UTF)
3058 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3059
a0ed51b3 3060 l -= old;
c277df42 3061 /* Get the added string: */
79cb57f6 3062 last_str = newSVpvn(s + old, l);
0e933229
IH
3063 if (UTF)
3064 SvUTF8_on(last_str);
c277df42
IZ
3065 if (deltanext == 0 && pos_before == b) {
3066 /* What was added is a constant string */
3067 if (mincount > 1) {
3068 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3069 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3070 SvPVX_const(last_str), l, mincount - 1);
b162af07 3071 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3072 /* Add additional parts. */
b81d288d 3073 SvCUR_set(data->last_found,
c277df42
IZ
3074 SvCUR(data->last_found) - l);
3075 sv_catsv(data->last_found, last_str);
0eda9292
JH
3076 {
3077 SV * sv = data->last_found;
3078 MAGIC *mg =
3079 SvUTF8(sv) && SvMAGICAL(sv) ?
3080 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3081 if (mg && mg->mg_len >= 0)
3082 mg->mg_len += CHR_SVLEN(last_str);
3083 }
c277df42
IZ
3084 data->last_end += l * (mincount - 1);
3085 }
2a8d9689
HS
3086 } else {
3087 /* start offset must point into the last copy */
3088 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3089 data->last_start_max += is_inf ? I32_MAX
3090 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3091 }
3092 }
3093 /* It is counted once already... */
3094 data->pos_min += minnext * (mincount - counted);
3095 data->pos_delta += - counted * deltanext +
3096 (minnext + deltanext) * maxcount - minnext * mincount;
3097 if (mincount != maxcount) {
653099ff
GS
3098 /* Cannot extend fixed substrings found inside
3099 the group. */
1de06328 3100 scan_commit(pRExC_state,data,minlenp);
c277df42 3101 if (mincount && last_str) {
d4c19fe8
AL
3102 SV * const sv = data->last_found;
3103 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3104 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3105
3106 if (mg)
3107 mg->mg_len = -1;
3108 sv_setsv(sv, last_str);
c277df42 3109 data->last_end = data->pos_min;
b81d288d 3110 data->last_start_min =
a0ed51b3 3111 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3112 data->last_start_max = is_inf
3113 ? I32_MAX
c277df42 3114 : data->pos_min + data->pos_delta
a0ed51b3 3115 - CHR_SVLEN(last_str);
c277df42
IZ
3116 }
3117 data->longest = &(data->longest_float);
3118 }
aca2d497 3119 SvREFCNT_dec(last_str);
c277df42 3120 }
405ff068 3121 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3122 data->flags |= SF_HAS_EVAL;
3123 optimize_curly_tail:
c277df42 3124 if (OP(oscan) != CURLYX) {
3dab1dad 3125 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3126 && NEXT_OFF(next))
3127 NEXT_OFF(oscan) += NEXT_OFF(next);
3128 }
c277df42 3129 continue;
653099ff 3130 default: /* REF and CLUMP only? */
c277df42 3131 if (flags & SCF_DO_SUBSTR) {
1de06328 3132 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3133 data->longest = &(data->longest_float);
3134 }
aca2d497 3135 is_inf = is_inf_internal = 1;
653099ff 3136 if (flags & SCF_DO_STCLASS_OR)
830247a4 3137 cl_anything(pRExC_state, data->start_class);
653099ff 3138 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3139 break;
3140 }
a0ed51b3 3141 }
bfed75c6 3142 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3143 int value = 0;
653099ff 3144
c277df42 3145 if (flags & SCF_DO_SUBSTR) {
1de06328 3146 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3147 data->pos_min++;
3148 }
3149 min++;
653099ff
GS
3150 if (flags & SCF_DO_STCLASS) {
3151 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3152
3153 /* Some of the logic below assumes that switching
3154 locale on will only add false positives. */
3dab1dad 3155 switch (PL_regkind[OP(scan)]) {
653099ff 3156 case SANY:
653099ff
GS
3157 default:
3158 do_default:
3159 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3160 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3161 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3162 break;
3163 case REG_ANY:
3164 if (OP(scan) == SANY)
3165 goto do_default;
3166 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3167 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3168 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3169 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3170 }
3171 if (flags & SCF_DO_STCLASS_AND || !value)
3172 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3173 break;
3174 case ANYOF:
3175 if (flags & SCF_DO_STCLASS_AND)
3176 cl_and(data->start_class,
3177 (struct regnode_charclass_class*)scan);
3178 else
830247a4 3179 cl_or(pRExC_state, data->start_class,
653099ff
GS
3180 (struct regnode_charclass_class*)scan);
3181 break;
3182 case ALNUM:
3183 if (flags & SCF_DO_STCLASS_AND) {
3184 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3185 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3186 for (value = 0; value < 256; value++)
3187 if (!isALNUM(value))
3188 ANYOF_BITMAP_CLEAR(data->start_class, value);
3189 }
3190 }
3191 else {
3192 if (data->start_class->flags & ANYOF_LOCALE)
3193 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3194 else {
3195 for (value = 0; value < 256; value++)
3196 if (isALNUM(value))
b81d288d 3197 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3198 }
3199 }
3200 break;
3201 case ALNUML:
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 if (data->start_class->flags & ANYOF_LOCALE)
3204 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3205 }
3206 else {
3207 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3208 data->start_class->flags |= ANYOF_LOCALE;
3209 }
3210 break;
3211 case NALNUM:
3212 if (flags & SCF_DO_STCLASS_AND) {
3213 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3215 for (value = 0; value < 256; value++)
3216 if (isALNUM(value))
3217 ANYOF_BITMAP_CLEAR(data->start_class, value);
3218 }
3219 }
3220 else {
3221 if (data->start_class->flags & ANYOF_LOCALE)
3222 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3223 else {
3224 for (value = 0; value < 256; value++)
3225 if (!isALNUM(value))
b81d288d 3226 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3227 }
3228 }
3229 break;
3230 case NALNUML:
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 if (data->start_class->flags & ANYOF_LOCALE)
3233 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3234 }
3235 else {
3236 data->start_class->flags |= ANYOF_LOCALE;
3237 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3238 }
3239 break;
3240 case SPACE:
3241 if (flags & SCF_DO_STCLASS_AND) {
3242 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3243 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3244 for (value = 0; value < 256; value++)
3245 if (!isSPACE(value))
3246 ANYOF_BITMAP_CLEAR(data->start_class, value);
3247 }
3248 }
3249 else {
3250 if (data->start_class->flags & ANYOF_LOCALE)
3251 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3252 else {
3253 for (value = 0; value < 256; value++)
3254 if (isSPACE(value))
b81d288d 3255 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3256 }
3257 }
3258 break;
3259 case SPACEL:
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 if (data->start_class->flags & ANYOF_LOCALE)
3262 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3263 }
3264 else {
3265 data->start_class->flags |= ANYOF_LOCALE;
3266 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3267 }
3268 break;
3269 case NSPACE:
3270 if (flags & SCF_DO_STCLASS_AND) {
3271 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3273 for (value = 0; value < 256; value++)
3274 if (isSPACE(value))
3275 ANYOF_BITMAP_CLEAR(data->start_class, value);
3276 }
3277 }
3278 else {
3279 if (data->start_class->flags & ANYOF_LOCALE)
3280 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3281 else {
3282 for (value = 0; value < 256; value++)
3283 if (!isSPACE(value))
b81d288d 3284 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3285 }
3286 }
3287 break;
3288 case NSPACEL:
3289 if (flags & SCF_DO_STCLASS_AND) {
3290 if (data->start_class->flags & ANYOF_LOCALE) {
3291 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3292 for (value = 0; value < 256; value++)
3293 if (!isSPACE(value))
3294 ANYOF_BITMAP_CLEAR(data->start_class, value);
3295 }
3296 }
3297 else {
3298 data->start_class->flags |= ANYOF_LOCALE;
3299 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3300 }
3301 break;
3302 case DIGIT:
3303 if (flags & SCF_DO_STCLASS_AND) {
3304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3305 for (value = 0; value < 256; value++)
3306 if (!isDIGIT(value))
3307 ANYOF_BITMAP_CLEAR(data->start_class, value);
3308 }
3309 else {
3310 if (data->start_class->flags & ANYOF_LOCALE)
3311 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3312 else {
3313 for (value = 0; value < 256; value++)
3314 if (isDIGIT(value))
b81d288d 3315 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3316 }
3317 }
3318 break;
3319 case NDIGIT:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3322 for (value = 0; value < 256; value++)
3323 if (isDIGIT(value))
3324 ANYOF_BITMAP_CLEAR(data->start_class, value);
3325 }
3326 else {
3327 if (data->start_class->flags & ANYOF_LOCALE)
3328 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3329 else {
3330 for (value = 0; value < 256; value++)
3331 if (!isDIGIT(value))
b81d288d 3332 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3333 }
3334 }
3335 break;
3336 }
3337 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3338 cl_and(data->start_class, and_withp);
653099ff
GS
3339 flags &= ~SCF_DO_STCLASS;
3340 }
a0ed51b3 3341 }
3dab1dad 3342 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3343 data->flags |= (OP(scan) == MEOL
3344 ? SF_BEFORE_MEOL
3345 : SF_BEFORE_SEOL);
a0ed51b3 3346 }
3dab1dad 3347 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3348 /* Lookbehind, or need to calculate parens/evals/stclass: */
3349 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3350 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3351 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3352 || OP(scan) == UNLESSM )
3353 {
3354 /* Negative Lookahead/lookbehind
3355 In this case we can't do fixed string optimisation.
3356 */
3357
3358 I32 deltanext, minnext, fake = 0;
3359 regnode *nscan;
3360 struct regnode_charclass_class intrnl;
3361 int f = 0;
3362
3363 data_fake.flags = 0;
3364 if (data) {
3365 data_fake.whilem_c = data->whilem_c;
3366 data_fake.last_closep = data->last_closep;
a0ed51b3 3367 }
1de06328
YO
3368 else
3369 data_fake.last_closep = &fake;
3370 if ( flags & SCF_DO_STCLASS && !scan->flags
3371 && OP(scan) == IFMATCH ) { /* Lookahead */
3372 cl_init(pRExC_state, &intrnl);
3373 data_fake.start_class = &intrnl;
3374 f |= SCF_DO_STCLASS_AND;
c277df42 3375 }
1de06328
YO
3376 if (flags & SCF_WHILEM_VISITED_POS)
3377 f |= SCF_WHILEM_VISITED_POS;
3378 next = regnext(scan);
3379 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3380 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3381 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3382 if (scan->flags) {
3383 if (deltanext) {
3384 vFAIL("Variable length lookbehind not implemented");
3385 }
3386 else if (minnext > (I32)U8_MAX) {
3387 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3388 }
3389 scan->flags = (U8)minnext;
3390 }
3391 if (data) {
3392 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3393 pars++;
3394 if (data_fake.flags & SF_HAS_EVAL)
3395 data->flags |= SF_HAS_EVAL;
3396 data->whilem_c = data_fake.whilem_c;
3397 }
3398 if (f & SCF_DO_STCLASS_AND) {
3399 const int was = (data->start_class->flags & ANYOF_EOS);
3400
3401 cl_and(data->start_class, &intrnl);
3402 if (was)
3403 data->start_class->flags |= ANYOF_EOS;
3404 }
be8e71aa 3405 }
1de06328
YO
3406#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3407 else {
3408 /* Positive Lookahead/lookbehind
3409 In this case we can do fixed string optimisation,
3410 but we must be careful about it. Note in the case of
3411 lookbehind the positions will be offset by the minimum
3412 length of the pattern, something we won't know about
3413 until after the recurse.
3414 */
3415 I32 deltanext, fake = 0;
3416 regnode *nscan;
3417 struct regnode_charclass_class intrnl;
3418 int f = 0;
3419 /* We use SAVEFREEPV so that when the full compile
3420 is finished perl will clean up the allocated
3421 minlens when its all done. This was we don't
3422 have to worry about freeing them when we know
3423 they wont be used, which would be a pain.
3424 */
3425 I32 *minnextp;
3426 Newx( minnextp, 1, I32 );
3427 SAVEFREEPV(minnextp);
3428
3429 if (data) {
3430 StructCopy(data, &data_fake, scan_data_t);
3431 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3432 f |= SCF_DO_SUBSTR;
3433 if (scan->flags)
3434 scan_commit(pRExC_state, &data_fake,minlenp);
3435 data_fake.last_found=newSVsv(data->last_found);
3436 }
3437 }
3438 else
3439 data_fake.last_closep = &fake;
3440 data_fake.flags = 0;
3441 if (is_inf)
3442 data_fake.flags |= SF_IS_INF;
3443 if ( flags & SCF_DO_STCLASS && !scan->flags
3444 && OP(scan) == IFMATCH ) { /* Lookahead */
3445 cl_init(pRExC_state, &intrnl);
3446 data_fake.start_class = &intrnl;
3447 f |= SCF_DO_STCLASS_AND;
3448 }
3449 if (flags & SCF_WHILEM_VISITED_POS)
3450 f |= SCF_WHILEM_VISITED_POS;
3451 next = regnext(scan);
3452 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3453
3454 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3455 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3456 if (scan->flags) {
3457 if (deltanext) {
3458 vFAIL("Variable length lookbehind not implemented");
3459 }
3460 else if (*minnextp > (I32)U8_MAX) {
3461 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3462 }
3463 scan->flags = (U8)*minnextp;
3464 }
40d049e4 3465
1de06328 3466 *minnextp += min;
40d049e4 3467
1de06328
YO
3468 if (f & SCF_DO_STCLASS_AND) {
3469 const int was = (data->start_class->flags & ANYOF_EOS);
3470
3471 cl_and(data->start_class, &intrnl);
3472 if (was)
3473 data->start_class->flags |= ANYOF_EOS;
40d049e4 3474 }
1de06328
YO
3475 if (data) {
3476 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3477 pars++;
3478 if (data_fake.flags & SF_HAS_EVAL)
3479 data->flags |= SF_HAS_EVAL;
3480 data->whilem_c = data_fake.whilem_c;
3481 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3482 if (RExC_rx->minlen<*minnextp)
3483 RExC_rx->minlen=*minnextp;
3484 scan_commit(pRExC_state, &data_fake, minnextp);
3485 SvREFCNT_dec(data_fake.last_found);
3486
3487 if ( data_fake.minlen_fixed != minlenp )
3488 {
3489 data->offset_fixed= data_fake.offset_fixed;
3490 data->minlen_fixed= data_fake.minlen_fixed;
3491 data->lookbehind_fixed+= scan->flags;
3492 }
3493 if ( data_fake.minlen_float != minlenp )
3494 {
3495 data->minlen_float= data_fake.minlen_float;
3496 data->offset_float_min=data_fake.offset_float_min;
3497 data->offset_float_max=data_fake.offset_float_max;
3498 data->lookbehind_float+= scan->flags;
3499 }
3500 }
3501 }
3502
653099ff 3503
653099ff 3504 }
1de06328 3505#endif
a0ed51b3
LW
3506 }
3507 else if (OP(scan) == OPEN) {
40d049e4
YO
3508 if (stopparen != (I32)ARG(scan))
3509 pars++;
a0ed51b3 3510 }
cb434fcc 3511 else if (OP(scan) == CLOSE) {
40d049e4
YO
3512 if (stopparen == (I32)ARG(scan)) {
3513 break;
3514 }
eb160463 3515 if ((I32)ARG(scan) == is_par) {
cb434fcc 3516 next = regnext(scan);
c277df42 3517
cb434fcc
IZ
3518 if ( next && (OP(next) != WHILEM) && next < last)
3519 is_par = 0; /* Disable optimization */
3520 }
3521 if (data)
3522 *(data->last_closep) = ARG(scan);
a0ed51b3 3523 }
1a147d38 3524 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3525 /* set the pointer */
3526 I32 paren;
3527 regnode *start;
3528 regnode *end;
1a147d38 3529 if (OP(scan) == GOSUB) {
40d049e4
YO
3530 paren = ARG(scan);
3531 RExC_recurse[ARG2L(scan)] = scan;
3532 start = RExC_open_parens[paren-1];
3533 end = RExC_close_parens[paren-1];
3534 } else {
3535 paren = 0;
3536 start = RExC_rx->program + 1;
3537 end = RExC_opend;
3538 }
3539 assert(start);
3540 assert(end);
3541 if (!recursed) {
3542 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3543 SAVEFREEPV(recursed);
3544 }
3545 if (!PAREN_TEST(recursed,paren+1)) {
3546 I32 deltanext = 0;
3547 PAREN_SET(recursed,paren+1);
3548
3549 DEBUG_PEEP("goto",start,depth);
3550 min += study_chunk(
3551 pRExC_state,
3552 &start,
3553 minlenp,
3554 &deltanext,
3555 end+1,
3556 data,
3557 paren,
3558 recursed,
3559 and_withp,
3560 flags,depth+1);
3561 delta+=deltanext;
3562 if (deltanext == I32_MAX) {
3563 is_inf = is_inf_internal = 1;
3564 delta=deltanext;
3565 }
3566 DEBUG_PEEP("rtrn",end,depth);
3567 PAREN_UNSET(recursed,paren+1);
3568 } else {
3569 if (flags & SCF_DO_SUBSTR) {
3570 scan_commit(pRExC_state,data,minlenp);
3571 data->longest = &(data->longest_float);
3572 }
3573 is_inf = is_inf_internal = 1;
3574 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3575 cl_anything(pRExC_state, data->start_class);
3576 flags &= ~SCF_DO_STCLASS;
3577 }
3578 }
a0ed51b3 3579 else if (OP(scan) == EVAL) {
c277df42
IZ
3580 if (data)
3581 data->flags |= SF_HAS_EVAL;
3582 }
40d049e4
YO
3583 else if ( OP(scan)==OPFAIL ) {
3584 if (flags & SCF_DO_SUBSTR) {
3585 scan_commit(pRExC_state,data,minlenp);
3586 flags &= ~SCF_DO_SUBSTR;
3587 }
3588 }
3589 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3590 {
0f5d15d6 3591 if (flags & SCF_DO_SUBSTR) {
1de06328 3592 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3593 data->longest = &(data->longest_float);
3594 }
3595 is_inf = is_inf_internal = 1;
653099ff 3596 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3597 cl_anything(pRExC_state, data->start_class);
96776eda 3598 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3599 }
786e8c11 3600#ifdef TRIE_STUDY_OPT
40d049e4 3601#ifdef FULL_TRIE_STUDY
786e8c11
YO
3602 else if (PL_regkind[OP(scan)] == TRIE) {
3603 /* NOTE - There is similar code to this block above for handling
40d049e4 3604 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3605 check there too. */
7f69552c 3606 regnode *trie_node= scan;
786e8c11
YO
3607 regnode *tail= regnext(scan);
3608 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3609 I32 max1 = 0, min1 = I32_MAX;
3610 struct regnode_charclass_class accum;
3611
3612 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3613 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3614 if (flags & SCF_DO_STCLASS)
3615 cl_init_zero(pRExC_state, &accum);
3616
3617 if (!trie->jump) {
3618 min1= trie->minlen;
3619 max1= trie->maxlen;
3620 } else {
3621 const regnode *nextbranch= NULL;
3622 U32 word;
3623
3624 for ( word=1 ; word <= trie->wordcount ; word++)
3625 {
3626 I32 deltanext=0, minnext=0, f = 0, fake;
3627 struct regnode_charclass_class this_class;
3628
3629 data_fake.flags = 0;
3630 if (data) {
3631 data_fake.whilem_c = data->whilem_c;
3632 data_fake.last_closep = data->last_closep;
3633 }
3634 else
3635 data_fake.last_closep = &fake;
3636
3637 if (flags & SCF_DO_STCLASS) {
3638 cl_init(pRExC_state, &this_class);
3639 data_fake.start_class = &this_class;
3640 f = SCF_DO_STCLASS_AND;
3641 }
3642 if (flags & SCF_WHILEM_VISITED_POS)
3643 f |= SCF_WHILEM_VISITED_POS;
3644
3645 if (trie->jump[word]) {
3646 if (!nextbranch)
7f69552c
YO
3647 nextbranch = trie_node + trie->jump[0];
3648 scan= trie_node + trie->jump[word];
786e8c11
YO
3649 /* We go from the jump point to the branch that follows
3650 it. Note this means we need the vestigal unused branches
3651 even though they arent otherwise used.
3652 */
40d049e4
YO
3653 minnext = study_chunk(pRExC_state, &scan, minlenp,
3654 &deltanext, (regnode *)nextbranch, &data_fake,
3655 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3656 }
3657 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3658 nextbranch= regnext((regnode*)nextbranch);
3659
3660 if (min1 > (I32)(minnext + trie->minlen))
3661 min1 = minnext + trie->minlen;
3662 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3663 max1 = minnext + deltanext + trie->maxlen;
3664 if (deltanext == I32_MAX)
3665 is_inf = is_inf_internal = 1;
3666
3667 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3668 pars++;
3669
3670 if (data) {
3671 if (data_fake.flags & SF_HAS_EVAL)
3672 data->flags |= SF_HAS_EVAL;
3673 data->whilem_c = data_fake.whilem_c;
3674 }
3675 if (flags & SCF_DO_STCLASS)
3676 cl_or(pRExC_state, &accum, &this_class);
3677 }
3678 }
3679 if (flags & SCF_DO_SUBSTR) {
3680 data->pos_min += min1;
3681 data->pos_delta += max1 - min1;
3682 if (max1 != min1 || is_inf)
3683 data->longest = &(data->longest_float);
3684 }
3685 min += min1;
3686 delta += max1 - min1;
3687 if (flags & SCF_DO_STCLASS_OR) {
3688 cl_or(pRExC_state, data->start_class, &accum);
3689 if (min1) {
40d049e4 3690 cl_and(data->start_class, and_withp);
786e8c11
YO
3691 flags &= ~SCF_DO_STCLASS;
3692 }
3693 }
3694 else if (flags & SCF_DO_STCLASS_AND) {
3695 if (min1) {
3696 cl_and(data->start_class, &accum);
3697 flags &= ~SCF_DO_STCLASS;
3698 }
3699 else {
3700 /* Switch to OR mode: cache the old value of
3701 * data->start_class */
40d049e4
YO
3702 INIT_AND_WITHP;
3703 StructCopy(data->start_class, and_withp,
786e8c11
YO
3704 struct regnode_charclass_class);
3705 flags &= ~SCF_DO_STCLASS_AND;
3706 StructCopy(&accum, data->start_class,
3707 struct regnode_charclass_class);
3708 flags |= SCF_DO_STCLASS_OR;
3709 data->start_class->flags |= ANYOF_EOS;
3710 }
3711 }
3712 scan= tail;
3713 continue;
3714 }
3715#else
3716 else if (PL_regkind[OP(scan)] == TRIE) {
3717 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3718 U8*bang=NULL;
3719
3720 min += trie->minlen;
3721 delta += (trie->maxlen - trie->minlen);
3722 flags &= ~SCF_DO_STCLASS; /* xxx */
3723 if (flags & SCF_DO_SUBSTR) {
1de06328 3724 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3725 data->pos_min += trie->minlen;
3726 data->pos_delta += (trie->maxlen - trie->minlen);
3727 if (trie->maxlen != trie->minlen)
3728 data->longest = &(data->longest_float);
3729 }
3730 if (trie->jump) /* no more substrings -- for now /grr*/
3731 flags &= ~SCF_DO_SUBSTR;
3732 }
3733#endif /* old or new */
3734#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3735 /* Else: zero-length, ignore. */
3736 scan = regnext(scan);
3737 }
3738
3739 finish:
3740 *scanp = scan;
aca2d497 3741 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3742 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3743 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3744 if (is_par > (I32)U8_MAX)
c277df42
IZ
3745 is_par = 0;
3746 if (is_par && pars==1 && data) {
3747 data->flags |= SF_IN_PAR;
3748 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3749 }
3750 else if (pars && data) {
c277df42
IZ
3751 data->flags |= SF_HAS_PAR;
3752 data->flags &= ~SF_IN_PAR;
3753 }
653099ff 3754 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3755 cl_and(data->start_class, and_withp);
786e8c11
YO
3756 if (flags & SCF_TRIE_RESTUDY)
3757 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3758
3759 DEBUG_STUDYDATA(data,depth);
3760
c277df42
IZ
3761 return min;
3762}
3763
76e3520e 3764STATIC I32
5f66b61c 3765S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3766{
830247a4 3767 if (RExC_rx->data) {
2eb97020 3768 const U32 count = RExC_rx->data->count;
b81d288d 3769 Renewc(RExC_rx->data,
2eb97020 3770 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3771 char, struct reg_data);
2eb97020 3772 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3773 RExC_rx->data->count += n;
a0ed51b3
LW
3774 }
3775 else {
a02a5408 3776 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3777 char, struct reg_data);
a02a5408 3778 Newx(RExC_rx->data->what, n, U8);
830247a4 3779 RExC_rx->data->count = n;
c277df42 3780 }
830247a4
IZ
3781 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3782 return RExC_rx->data->count - n;
c277df42
IZ
3783}
3784
76234dfb 3785#ifndef PERL_IN_XSUB_RE
d88dccdf 3786void
864dbfa3 3787Perl_reginitcolors(pTHX)
d88dccdf 3788{
97aff369 3789 dVAR;
1df70142 3790 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3791 if (s) {
1df70142
AL
3792 char *t = savepv(s);
3793 int i = 0;
3794 PL_colors[0] = t;
d88dccdf 3795 while (++i < 6) {
1df70142
AL
3796 t = strchr(t, '\t');
3797 if (t) {
3798 *t = '\0';
3799 PL_colors[i] = ++t;
d88dccdf
IZ
3800 }
3801 else
1df70142 3802 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3803 }
3804 } else {
1df70142 3805 int i = 0;
b81d288d 3806 while (i < 6)
06b5626a 3807 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3808 }
3809 PL_colorset = 1;
3810}
76234dfb 3811#endif
8615cb43 3812
07be1b83 3813
786e8c11
YO
3814#ifdef TRIE_STUDY_OPT
3815#define CHECK_RESTUDY_GOTO \
3816 if ( \
3817 (data.flags & SCF_TRIE_RESTUDY) \
3818 && ! restudied++ \
3819 ) goto reStudy
3820#else
3821#define CHECK_RESTUDY_GOTO
3822#endif
f9f4320a 3823
a687059c 3824/*
e50aee73 3825 - pregcomp - compile a regular expression into internal code
a687059c
LW
3826 *
3827 * We can't allocate space until we know how big the compiled form will be,
3828 * but we can't compile it (and thus know how big it is) until we've got a
3829 * place to put the code. So we cheat: we compile it twice, once with code
3830 * generation turned off and size counting turned on, and once "for real".
3831 * This also means that we don't allocate space until we are sure that the
3832 * thing really will compile successfully, and we never have to move the
3833 * code and thus invalidate pointers into it. (Note that it has to be in
3834 * one piece because free() must be able to free it all.) [NB: not true in perl]
3835 *
3836 * Beware that the optimization-preparation code in here knows about some
3837 * of the structure of the compiled regexp. [I'll say.]
3838 */
b9b4dddf
YO
3839
3840
3841
f9f4320a 3842#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3843#define RE_ENGINE_PTR &PL_core_reg_engine
3844#else
f9f4320a
YO
3845extern const struct regexp_engine my_reg_engine;
3846#define RE_ENGINE_PTR &my_reg_engine
3847#endif
b9b4dddf
YO
3848/* these make a few things look better, to avoid indentation */
3849#define BEGIN_BLOCK {
f9f4320a
YO
3850#define END_BLOCK }
3851
a687059c 3852regexp *
864dbfa3 3853Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3854{
97aff369 3855 dVAR;
f9f4320a
YO
3856 GET_RE_DEBUG_FLAGS_DECL;
3857 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3858#ifndef PERL_IN_XSUB_RE
3859 BEGIN_BLOCK
f9f4320a
YO
3860 /* Dispatch a request to compile a regexp to correct
3861 regexp engine. */
3862 HV * const table = GvHV(PL_hintgv);
3863 if (table) {
3864 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3865 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3866 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3867 DEBUG_COMPILE_r({
8d8756e7 3868 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3869 SvIV(*ptr));
3870 });
f2f78491 3871 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3872 }
b9b4dddf
YO
3873 }
3874 END_BLOCK
3875#endif
3876 BEGIN_BLOCK
a0d0e21e 3877 register regexp *r;
c277df42 3878 regnode *scan;
c277df42 3879 regnode *first;
a0d0e21e 3880 I32 flags;
a0d0e21e
LW
3881 I32 minlen = 0;
3882 I32 sawplus = 0;
3883 I32 sawopen = 0;
2c2d71f5 3884 scan_data_t data;
830247a4 3885 RExC_state_t RExC_state;
be8e71aa 3886 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3887#ifdef TRIE_STUDY_OPT
3888 int restudied= 0;
3889 RExC_state_t copyRExC_state;
3890#endif
a0d0e21e 3891 if (exp == NULL)
c277df42 3892 FAIL("NULL regexp argument");
a0d0e21e 3893
a5961de5 3894 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3895
5cfc7842 3896 RExC_precomp = exp;
a3621e74 3897 DEBUG_COMPILE_r({
ab3bbdeb
YO
3898 SV *dsv= sv_newmortal();
3899 RE_PV_QUOTED_DECL(s, RExC_utf8,
3900 dsv, RExC_precomp, (xend - exp), 60);
3901 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3902 PL_colors[4],PL_colors[5],s);
a5961de5 3903 });
e2509266 3904 RExC_flags = pm->op_pmflags;
830247a4 3905 RExC_sawback = 0;
bbce6d69 3906
830247a4
IZ
3907 RExC_seen = 0;
3908 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3909 RExC_seen_evals = 0;
3910 RExC_extralen = 0;
c277df42 3911
bbce6d69 3912 /* First pass: determine size, legality. */
830247a4 3913 RExC_parse = exp;
fac92740 3914 RExC_start = exp;
830247a4
IZ
3915 RExC_end = xend;
3916 RExC_naughty = 0;
3917 RExC_npar = 1;
3918 RExC_size = 0L;
3919 RExC_emit = &PL_regdummy;
3920 RExC_whilem_seen = 0;
fc8cd66c 3921 RExC_charnames = NULL;
40d049e4
YO
3922 RExC_open_parens = NULL;
3923 RExC_close_parens = NULL;
3924 RExC_opend = NULL;
81714fb9 3925 RExC_paren_names = NULL;
40d049e4
YO
3926 RExC_recurse = NULL;
3927 RExC_recurse_count = 0;
81714fb9 3928
85ddcde9
JH
3929#if 0 /* REGC() is (currently) a NOP at the first pass.
3930 * Clever compilers notice this and complain. --jhi */
830247a4 3931 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3932#endif
3dab1dad
YO
3933 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3934 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3935 RExC_precomp = NULL;
a0d0e21e
LW
3936 return(NULL);
3937 }
07be1b83 3938 DEBUG_PARSE_r({
81714fb9
YO
3939 PerlIO_printf(Perl_debug_log,
3940 "Required size %"IVdf" nodes\n"
3941 "Starting second pass (creation)\n",
3942 (IV)RExC_size);
07be1b83
YO
3943 RExC_lastnum=0;
3944 RExC_lastparse=NULL;
3945 });
c277df42
IZ
3946 /* Small enough for pointer-storage convention?
3947 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3948 if (RExC_size >= 0x10000L && RExC_extralen)
3949 RExC_size += RExC_extralen;
c277df42 3950 else
830247a4
IZ
3951 RExC_extralen = 0;
3952 if (RExC_whilem_seen > 15)
3953 RExC_whilem_seen = 15;
a0d0e21e 3954
f9f4320a
YO
3955 /* Allocate space and zero-initialize. Note, the two step process
3956 of zeroing when in debug mode, thus anything assigned has to
3957 happen after that */
a02a5408 3958 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3959 char, regexp);
a0d0e21e 3960 if (r == NULL)
b45f050a 3961 FAIL("Regexp out of space");
0f79a09d
GS
3962#ifdef DEBUGGING
3963 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3964 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3965#endif
f9f4320a
YO
3966 /* initialization begins here */
3967 r->engine= RE_ENGINE_PTR;
c277df42 3968 r->refcnt = 1;
bbce6d69 3969 r->prelen = xend - exp;
5cfc7842 3970 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3971 r->subbeg = NULL;
f8c7b90f 3972#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3973 r->saved_copy = NULL;
ed252734 3974#endif
cf93c79d 3975 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3976 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3977 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3978
3979 r->substrs = 0; /* Useful during FAIL. */
3980 r->startp = 0; /* Useful during FAIL. */
6bda09f9 3981 r->endp = 0;
81714fb9
YO
3982 r->paren_names = 0;
3983
6bda09f9 3984 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
3985 Newxz(RExC_open_parens, RExC_npar,regnode *);
3986 SAVEFREEPV(RExC_open_parens);
3987 Newxz(RExC_close_parens,RExC_npar,regnode *);
3988 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
3989 }
3990
3991 /* Useful during FAIL. */
a02a5408 3992 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3993 if (r->offsets) {
2af232bd 3994 r->offsets[0] = RExC_size;
fac92740 3995 }
a3621e74 3996 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3997 "%s %"UVuf" bytes for offset annotations.\n",
3998 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3999 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4000
830247a4 4001 RExC_rx = r;
bbce6d69 4002
4003 /* Second pass: emit code. */
e2509266 4004 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4005 RExC_parse = exp;
4006 RExC_end = xend;
4007 RExC_naughty = 0;
4008 RExC_npar = 1;
fac92740 4009 RExC_emit_start = r->program;
830247a4 4010 RExC_emit = r->program;
2cd61cdb 4011 /* Store the count of eval-groups for security checks: */
786e8c11 4012 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4013 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4014 r->data = 0;
3dab1dad 4015 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4016 return(NULL);
6bda09f9 4017
07be1b83
YO
4018 /* XXXX To minimize changes to RE engine we always allocate
4019 3-units-long substrs field. */
4020 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4021 if (RExC_recurse_count) {
4022 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4023 SAVEFREEPV(RExC_recurse);
4024 }
a0d0e21e 4025
07be1b83 4026reStudy:
1de06328 4027 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4028 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4029
07be1b83
YO
4030#ifdef TRIE_STUDY_OPT
4031 if ( restudied ) {
4032 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4033 RExC_state=copyRExC_state;
1de06328 4034 if (data.last_found) {
07be1b83 4035 SvREFCNT_dec(data.longest_fixed);
07be1b83 4036 SvREFCNT_dec(data.longest_float);
07be1b83 4037 SvREFCNT_dec(data.last_found);
1de06328 4038 }
40d049e4 4039 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4040 } else {
40d049e4 4041 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83
YO
4042 copyRExC_state=RExC_state;
4043 }
40d049e4
YO
4044#else
4045 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4046#endif
fc8cd66c 4047
a0d0e21e 4048 /* Dig out information for optimizations. */
cf93c79d 4049 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4050 pm->op_pmflags = RExC_flags;
a0ed51b3 4051 if (UTF)
5ff6fc6d 4052 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4053 r->regstclass = NULL;
830247a4 4054 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4055 r->reganch |= ROPT_NAUGHTY;
c277df42 4056 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4057
1de06328
YO
4058 /* testing for BRANCH here tells us whether there is "must appear"
4059 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4060 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4061 I32 fake;
c5254dd6 4062 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4063 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4064 int stclass_flag;
07be1b83 4065 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4066
4067 first = scan;
c277df42 4068 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4069 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4070 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4071 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4072 /* for now we can't handle lookbehind IFMATCH*/
4073 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4074 (OP(first) == PLUS) ||
4075 (OP(first) == MINMOD) ||
653099ff 4076 /* An {n,m} with n>0 */
07be1b83
YO
4077 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4078 {
786e8c11 4079
a0d0e21e
LW
4080 if (OP(first) == PLUS)
4081 sawplus = 1;
4082 else
3dab1dad 4083 first += regarglen[OP(first)];
07be1b83
YO
4084 if (OP(first) == IFMATCH) {
4085 first = NEXTOPER(first);
4086 first += EXTRA_STEP_2ARGS;
7c167cea 4087 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4088 first = NEXTOPER(first);
a687059c
LW
4089 }
4090
a0d0e21e
LW
4091 /* Starting-point info. */
4092 again:
786e8c11 4093 DEBUG_PEEP("first:",first,0);
07be1b83 4094 /* Ignore EXACT as we deal with it later. */
3dab1dad 4095 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4096 if (OP(first) == EXACT)
6f207bd3 4097 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4098 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4099 r->regstclass = first;
4100 }
07be1b83 4101#ifdef TRIE_STCLASS
786e8c11 4102 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4103 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4104 {
786e8c11 4105 regnode *trie_op;
07be1b83 4106 /* this can happen only on restudy */
786e8c11
YO
4107 if ( OP(first) == TRIE ) {
4108 struct regnode_1 *trieop;
4109 Newxz(trieop,1,struct regnode_1);
4110 StructCopy(first,trieop,struct regnode_1);
4111 trie_op=(regnode *)trieop;
4112 } else {
4113 struct regnode_charclass *trieop;
4114 Newxz(trieop,1,struct regnode_charclass);
4115 StructCopy(first,trieop,struct regnode_charclass);
4116 trie_op=(regnode *)trieop;
4117 }
1de06328 4118 OP(trie_op)+=2;
786e8c11
YO
4119 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4120 r->regstclass = trie_op;
07be1b83
YO
4121 }
4122#endif
bfed75c6 4123 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4124 r->regstclass = first;
3dab1dad
YO
4125 else if (PL_regkind[OP(first)] == BOUND ||
4126 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4127 r->regstclass = first;
3dab1dad 4128 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4129 r->reganch |= (OP(first) == MBOL
4130 ? ROPT_ANCH_MBOL
4131 : (OP(first) == SBOL
4132 ? ROPT_ANCH_SBOL
4133 : ROPT_ANCH_BOL));
a0d0e21e 4134 first = NEXTOPER(first);
774d564b 4135 goto again;
4136 }
4137 else if (OP(first) == GPOS) {
4138 r->reganch |= ROPT_ANCH_GPOS;
4139 first = NEXTOPER(first);
4140 goto again;
a0d0e21e 4141 }
e09294f4 4142 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4143 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4144 !(r->reganch & ROPT_ANCH) )
4145 {
4146 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4147 const int type =
4148 (OP(NEXTOPER(first)) == REG_ANY)
4149 ? ROPT_ANCH_MBOL
4150 : ROPT_ANCH_SBOL;
cad2e5aa 4151 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4152 first = NEXTOPER(first);
774d564b 4153 goto again;
a0d0e21e 4154 }
b81d288d 4155 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4156 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4157 /* x+ must match at the 1st pos of run of x's */
4158 r->reganch |= ROPT_SKIP;
a0d0e21e 4159
c277df42 4160 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4161#ifdef TRIE_STUDY_OPT
81714fb9 4162 DEBUG_PARSE_r(
be8e71aa
YO
4163 if (!restudied)
4164 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4165 (IV)(first - scan + 1))
4166 );
4167#else
81714fb9 4168 DEBUG_PARSE_r(
be8e71aa
YO
4169 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4170 (IV)(first - scan + 1))
4171 );
4172#endif
4173
4174
a0d0e21e
LW
4175 /*
4176 * If there's something expensive in the r.e., find the
4177 * longest literal string that must appear and make it the
4178 * regmust. Resolve ties in favor of later strings, since
4179 * the regstart check works with the beginning of the r.e.
4180 * and avoiding duplication strengthens checking. Not a
4181 * strong reason, but sufficient in the absence of others.
4182 * [Now we resolve ties in favor of the earlier string if
c277df42 4183 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4184 * earlier string may buy us something the later one won't.]
4185 */
a0d0e21e 4186 minlen = 0;
a687059c 4187
396482e1
GA
4188 data.longest_fixed = newSVpvs("");
4189 data.longest_float = newSVpvs("");
4190 data.last_found = newSVpvs("");
c277df42
IZ
4191 data.longest = &(data.longest_fixed);
4192 first = scan;
653099ff 4193 if (!r->regstclass) {
830247a4 4194 cl_init(pRExC_state, &ch_class);
653099ff
GS
4195 data.start_class = &ch_class;
4196 stclass_flag = SCF_DO_STCLASS_AND;
4197 } else /* XXXX Check for BOUND? */
4198 stclass_flag = 0;
cb434fcc 4199 data.last_closep = &last_close;
653099ff 4200
1de06328 4201 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4202 &data, -1, NULL, NULL,
4203 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4204
07be1b83 4205
786e8c11
YO
4206 CHECK_RESTUDY_GOTO;
4207
4208
830247a4 4209 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4210 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4211 && !RExC_seen_zerolen
4212 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4213 r->reganch |= ROPT_CHECK_ALL;
1de06328 4214 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4215 SvREFCNT_dec(data.last_found);
4216
1de06328
YO
4217 /* Note that code very similar to this but for anchored string
4218 follows immediately below, changes may need to be made to both.
4219 Be careful.
4220 */
a0ed51b3 4221 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4222 if (longest_float_length
c277df42
IZ
4223 || (data.flags & SF_FL_BEFORE_EOL
4224 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4225 || (RExC_flags & PMf_MULTILINE))))
4226 {
1182767e 4227 I32 t,ml;
cf93c79d 4228
1de06328 4229 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4230 && data.offset_fixed == data.offset_float_min
4231 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4232 goto remove_float; /* As in (a)+. */
4233
1de06328
YO
4234 /* copy the information about the longest float from the reg_scan_data
4235 over to the program. */
33b8afdf
JH
4236 if (SvUTF8(data.longest_float)) {
4237 r->float_utf8 = data.longest_float;
c445ea15 4238 r->float_substr = NULL;
33b8afdf
JH
4239 } else {
4240 r->float_substr = data.longest_float;
c445ea15 4241 r->float_utf8 = NULL;
33b8afdf 4242 }
1de06328
YO
4243 /* float_end_shift is how many chars that must be matched that
4244 follow this item. We calculate it ahead of time as once the
4245 lookbehind offset is added in we lose the ability to correctly
4246 calculate it.*/
4247 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4248 : (I32)longest_float_length;
1de06328
YO
4249 r->float_end_shift = ml - data.offset_float_min
4250 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4251 + data.lookbehind_float;
4252 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4253 r->float_max_offset = data.offset_float_max;
1182767e 4254 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4255 r->float_max_offset -= data.lookbehind_float;
4256
cf93c79d
IZ
4257 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4258 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4259 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4260 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4261 }
4262 else {
aca2d497 4263 remove_float:
c445ea15 4264 r->float_substr = r->float_utf8 = NULL;
c277df42 4265 SvREFCNT_dec(data.longest_float);
c5254dd6 4266 longest_float_length = 0;
a0d0e21e 4267 }
c277df42 4268
1de06328
YO
4269 /* Note that code very similar to this but for floating string
4270 is immediately above, changes may need to be made to both.
4271 Be careful.
4272 */
a0ed51b3 4273 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4274 if (longest_fixed_length
c277df42
IZ
4275 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4276 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4277 || (RExC_flags & PMf_MULTILINE))))
4278 {
1182767e 4279 I32 t,ml;
cf93c79d 4280
1de06328
YO
4281 /* copy the information about the longest fixed
4282 from the reg_scan_data over to the program. */
33b8afdf
JH
4283 if (SvUTF8(data.longest_fixed)) {
4284 r->anchored_utf8 = data.longest_fixed;
c445ea15 4285 r->anchored_substr = NULL;
33b8afdf
JH
4286 } else {
4287 r->anchored_substr = data.longest_fixed;
c445ea15 4288 r->anchored_utf8 = NULL;
33b8afdf 4289 }
1de06328
YO
4290 /* fixed_end_shift is how many chars that must be matched that
4291 follow this item. We calculate it ahead of time as once the
4292 lookbehind offset is added in we lose the ability to correctly
4293 calculate it.*/
4294 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4295 : (I32)longest_fixed_length;
1de06328
YO
4296 r->anchored_end_shift = ml - data.offset_fixed
4297 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4298 + data.lookbehind_fixed;
4299 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4300
cf93c79d
IZ
4301 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4302 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4303 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4304 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4305 }
4306 else {
c445ea15 4307 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4308 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4309 longest_fixed_length = 0;
a0d0e21e 4310 }
b81d288d 4311 if (r->regstclass
ffc61ed2 4312 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4313 r->regstclass = NULL;
33b8afdf
JH
4314 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4315 && stclass_flag
653099ff 4316 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4317 && !cl_is_anything(data.start_class))
4318 {
1df70142 4319 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4320
a02a5408 4321 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4322 struct regnode_charclass_class);
4323 StructCopy(data.start_class,
830247a4 4324 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4325 struct regnode_charclass_class);
830247a4 4326 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4327 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4328 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4329 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4330 PerlIO_printf(Perl_debug_log,
a0288114 4331 "synthetic stclass \"%s\".\n",
3f7c398e 4332 SvPVX_const(sv));});
653099ff 4333 }
c277df42
IZ
4334
4335 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4336 if (longest_fixed_length > longest_float_length) {
1de06328 4337 r->check_end_shift = r->anchored_end_shift;
c277df42 4338 r->check_substr = r->anchored_substr;
33b8afdf 4339 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4340 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4341 if (r->reganch & ROPT_ANCH_SINGLE)
4342 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4343 }
4344 else {
1de06328 4345 r->check_end_shift = r->float_end_shift;
c277df42 4346 r->check_substr = r->float_substr;
33b8afdf 4347 r->check_utf8 = r->float_utf8;
1de06328
YO
4348 r->check_offset_min = r->float_min_offset;
4349 r->check_offset_max = r->float_max_offset;
a0d0e21e 4350 }
30382c73
IZ
4351 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4352 This should be changed ASAP! */
33b8afdf 4353 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4354 r->reganch |= RE_USE_INTUIT;
33b8afdf 4355 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4356 r->reganch |= RE_INTUIT_TAIL;
4357 }
1de06328
YO
4358 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4359 if ( (STRLEN)minlen < longest_float_length )
4360 minlen= longest_float_length;
4361 if ( (STRLEN)minlen < longest_fixed_length )
4362 minlen= longest_fixed_length;
4363 */
a0ed51b3
LW
4364 }
4365 else {
c277df42
IZ
4366 /* Several toplevels. Best we can is to set minlen. */
4367 I32 fake;
653099ff 4368 struct regnode_charclass_class ch_class;
cb434fcc 4369 I32 last_close = 0;
c277df42 4370
a3621e74 4371 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4372
c277df42 4373 scan = r->program + 1;
830247a4 4374 cl_init(pRExC_state, &ch_class);
653099ff 4375 data.start_class = &ch_class;
cb434fcc 4376 data.last_closep = &last_close;
07be1b83 4377
1de06328 4378 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4379 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
07be1b83 4380
786e8c11 4381 CHECK_RESTUDY_GOTO;
07be1b83 4382
33b8afdf 4383 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4384 = r->float_substr = r->float_utf8 = NULL;
653099ff 4385 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4386 && !cl_is_anything(data.start_class))
4387 {
1df70142 4388 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4389
a02a5408 4390 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4391 struct regnode_charclass_class);
4392 StructCopy(data.start_class,
830247a4 4393 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4394 struct regnode_charclass_class);
830247a4 4395 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4396 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4397 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4398 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4399 PerlIO_printf(Perl_debug_log,
a0288114 4400 "synthetic stclass \"%s\".\n",
3f7c398e 4401 SvPVX_const(sv));});
653099ff 4402 }
a0d0e21e
LW
4403 }
4404
1de06328
YO
4405 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4406 the "real" pattern. */
4407 if (r->minlen < minlen)
4408 r->minlen = minlen;
4409
b81d288d 4410 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4411 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4412 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4413 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4414 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4415 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4416 if (RExC_seen & REG_SEEN_CANY)
4417 r->reganch |= ROPT_CANY_SEEN;
81714fb9
YO
4418 if (RExC_paren_names)
4419 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4420 else
4421 r->paren_names = NULL;
4422
40d049e4
YO
4423 if (RExC_recurse_count) {
4424 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4425 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4426 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4427 }
4428 }
a02a5408
JC
4429 Newxz(r->startp, RExC_npar, I32);
4430 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4431
f2278c82 4432 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4433 DEBUG_DUMP_r({
4434 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4435 regdump(r);
4436 });
8e9a8a48
YO
4437 DEBUG_OFFSETS_r(if (r->offsets) {
4438 const U32 len = r->offsets[0];
4439 U32 i;
4440 GET_RE_DEBUG_FLAGS_DECL;
4441 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4442 for (i = 1; i <= len; i++) {
4443 if (r->offsets[i*2-1] || r->offsets[i*2])
4444 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4445 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4446 }
4447 PerlIO_printf(Perl_debug_log, "\n");
4448 });
a0d0e21e 4449 return(r);
f9f4320a 4450 END_BLOCK
a687059c
LW
4451}
4452
f9f4320a
YO
4453#undef CORE_ONLY_BLOCK
4454#undef END_BLOCK
4455#undef RE_ENGINE_PTR
3dab1dad 4456
9af228c6 4457#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4458SV*
4459Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4460{
4461 I32 parno = 0; /* no match */
4462 if (PL_curpm) {
4463 const REGEXP * const rx = PM_GETRE(PL_curpm);
4464 if (rx && rx->paren_names) {
4465 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4466 if (he_str) {
4467 IV i;
4468 SV* sv_dat=HeVAL(he_str);
4469 I32 *nums=(I32*)SvPVX(sv_dat);
4470 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4471 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4472 rx->endp[nums[i]] != -1)
4473 {
4474 parno = nums[i];
4475 break;
4476 }
4477 }
4478 }
4479 }
4480 }
4481 if ( !parno ) {
4482 return 0;
4483 } else {
4484 GV *gv_paren;
4485 SV *sv= sv_newmortal();
4486 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4487 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4488 return GvSVn(gv_paren);
4489 }
4490}
9af228c6 4491#endif
0a4db386 4492
894be9b7 4493/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4494 * If flags is REG_RSN_RETURN_NULL returns null.
4495 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4496 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4497 * to the parsed name as looked up in the RExC_paren_names hash.
4498 * If there is an error throws a vFAIL().. type exception.
894be9b7 4499 */
0a4db386
YO
4500
4501#define REG_RSN_RETURN_NULL 0
4502#define REG_RSN_RETURN_NAME 1
4503#define REG_RSN_RETURN_DATA 2
4504
894be9b7
YO
4505STATIC SV*
4506S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4507 char *name_start = RExC_parse;
0a4db386 4508 if ( UTF ) {
894be9b7 4509 STRLEN numlen;
0a4db386
YO
4510 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4511 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4512 {
4513 RExC_parse += numlen;
4514 }
4515 } else {
4516 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4517 RExC_parse++;
4518 }
0a4db386
YO
4519 if ( flags ) {
4520 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4521 (int)(RExC_parse - name_start)));
894be9b7 4522 if (UTF)
0a4db386
YO
4523 SvUTF8_on(sv_name);
4524 if ( flags == REG_RSN_RETURN_NAME)
4525 return sv_name;
4526 else if (flags==REG_RSN_RETURN_DATA) {
4527 HE *he_str = NULL;
4528 SV *sv_dat = NULL;
4529 if ( ! sv_name ) /* should not happen*/
4530 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4531 if (RExC_paren_names)
4532 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4533 if ( he_str )
4534 sv_dat = HeVAL(he_str);
4535 if ( ! sv_dat )
4536 vFAIL("Reference to nonexistent named group");
4537 return sv_dat;
4538 }
4539 else {
4540 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4541 }
4542 /* NOT REACHED */
894be9b7 4543 }
0a4db386 4544 return NULL;
894be9b7
YO
4545}
4546
3dab1dad
YO
4547#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4548 int rem=(int)(RExC_end - RExC_parse); \
4549 int cut; \
4550 int num; \
4551 int iscut=0; \
4552 if (rem>10) { \
4553 rem=10; \
4554 iscut=1; \
4555 } \
4556 cut=10-rem; \
4557 if (RExC_lastparse!=RExC_parse) \
4558 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4559 rem, RExC_parse, \
4560 cut + 4, \
4561 iscut ? "..." : "<" \
4562 ); \
4563 else \
4564 PerlIO_printf(Perl_debug_log,"%16s",""); \
4565 \
4566 if (SIZE_ONLY) \
4567 num=RExC_size; \
4568 else \
4569 num=REG_NODE_NUM(RExC_emit); \
4570 if (RExC_lastnum!=num) \
0a4db386 4571 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4572 else \
0a4db386 4573 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4574 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4575 (int)((depth*2)), "", \
3dab1dad
YO
4576 (funcname) \
4577 ); \
4578 RExC_lastnum=num; \
4579 RExC_lastparse=RExC_parse; \
4580})
4581
07be1b83
YO
4582
4583
3dab1dad
YO
4584#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4585 DEBUG_PARSE_MSG((funcname)); \
4586 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4587})
6bda09f9
YO
4588#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4589 DEBUG_PARSE_MSG((funcname)); \
4590 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4591})
a687059c
LW
4592/*
4593 - reg - regular expression, i.e. main body or parenthesized thing
4594 *
4595 * Caller must absorb opening parenthesis.
4596 *
4597 * Combining parenthesis handling with the base level of regular expression
4598 * is a trifle forced, but the need to tie the tails of the branches to what
4599 * follows makes it hard to avoid.
4600 */
07be1b83
YO
4601#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4602#ifdef DEBUGGING
4603#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4604#else
4605#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4606#endif
3dab1dad 4607
76e3520e 4608STATIC regnode *
3dab1dad 4609S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4610 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4611{
27da23d5 4612 dVAR;
c277df42
IZ
4613 register regnode *ret; /* Will be the head of the group. */
4614 register regnode *br;
4615 register regnode *lastbr;
cbbf8932 4616 register regnode *ender = NULL;
a0d0e21e 4617 register I32 parno = 0;
cbbf8932
AL
4618 I32 flags;
4619 const I32 oregflags = RExC_flags;
6136c704
AL
4620 bool have_branch = 0;
4621 bool is_open = 0;
9d1d55b5
JP
4622
4623 /* for (?g), (?gc), and (?o) warnings; warning
4624 about (?c) will warn about (?g) -- japhy */
4625
6136c704
AL
4626#define WASTED_O 0x01
4627#define WASTED_G 0x02
4628#define WASTED_C 0x04
4629#define WASTED_GC (0x02|0x04)
cbbf8932 4630 I32 wastedflags = 0x00;
9d1d55b5 4631
fac92740 4632 char * parse_start = RExC_parse; /* MJD */
a28509cc 4633 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4634
3dab1dad
YO
4635 GET_RE_DEBUG_FLAGS_DECL;
4636 DEBUG_PARSE("reg ");
4637
4638
821b33a5 4639 *flagp = 0; /* Tentatively. */
a0d0e21e 4640
9d1d55b5 4641
a0d0e21e
LW
4642 /* Make an OPEN node, if parenthesized. */
4643 if (paren) {
fac92740 4644 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4645 U32 posflags = 0, negflags = 0;
4646 U32 *flagsp = &posflags;
6136c704 4647 bool is_logical = 0;
a28509cc 4648 const char * const seqstart = RExC_parse;
ca9dfc88 4649
830247a4
IZ
4650 RExC_parse++;
4651 paren = *RExC_parse++;
c277df42 4652 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4653 switch (paren) {
894be9b7 4654
fac92740 4655 case '<': /* (?<...) */
b81d288d 4656 if (*RExC_parse == '!')
c277df42 4657 paren = ',';
0a4db386
YO
4658 else if (*RExC_parse != '=')
4659 { /* (?<...>) */
81714fb9 4660 char *name_start;
894be9b7 4661 SV *svname;
81714fb9
YO
4662 paren= '>';
4663 case '\'': /* (?'...') */
4664 name_start= RExC_parse;
0a4db386
YO
4665 svname = reg_scan_name(pRExC_state,
4666 SIZE_ONLY ? /* reverse test from the others */
4667 REG_RSN_RETURN_NAME :
4668 REG_RSN_RETURN_NULL);
81714fb9
YO
4669 if (RExC_parse == name_start)
4670 goto unknown;
4671 if (*RExC_parse != paren)
4672 vFAIL2("Sequence (?%c... not terminated",
4673 paren=='>' ? '<' : paren);
4674 if (SIZE_ONLY) {
e62cc96a
YO
4675 HE *he_str;
4676 SV *sv_dat = NULL;
894be9b7
YO
4677 if (!svname) /* shouldnt happen */
4678 Perl_croak(aTHX_
4679 "panic: reg_scan_name returned NULL");
81714fb9
YO
4680 if (!RExC_paren_names) {
4681 RExC_paren_names= newHV();
4682 sv_2mortal((SV*)RExC_paren_names);
4683 }
4684 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4685 if ( he_str )
81714fb9 4686 sv_dat = HeVAL(he_str);
e62cc96a 4687 if ( ! sv_dat ) {
81714fb9 4688 /* croak baby croak */
e62cc96a
YO
4689 Perl_croak(aTHX_
4690 "panic: paren_name hash element allocation failed");
4691 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4692 IV count=SvIV(sv_dat);
4693 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4694 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4695 pv[count]=RExC_npar;
4696 SvIVX(sv_dat)++;
4697 } else {
4698 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4699 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4700 SvIOK_on(sv_dat);
4701 SvIVX(sv_dat)= 1;
e62cc96a
YO
4702 }
4703
81714fb9
YO
4704 /*sv_dump(sv_dat);*/
4705 }
4706 nextchar(pRExC_state);
4707 paren = 1;
4708 goto capturing_parens;
4709 }
4710 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4711 RExC_parse++;
fac92740
MJD
4712 case '=': /* (?=...) */
4713 case '!': /* (?!...) */
7f69552c
YO
4714 if (*RExC_parse == ')')
4715 goto do_op_fail;
830247a4 4716 RExC_seen_zerolen++;
fac92740
MJD
4717 case ':': /* (?:...) */
4718 case '>': /* (?>...) */
a0d0e21e 4719 break;
24b23f37 4720 case 'C': /* (?CUT) and (?COMMIT) */
54612592
YO
4721 if (RExC_parse[0] == 'O' &&
4722 RExC_parse[1] == 'M' &&
4723 RExC_parse[2] == 'M' &&
4724 RExC_parse[3] == 'I' &&
4725 RExC_parse[4] == 'T' &&
4726 RExC_parse[5] == ')')
4727 {
4728 RExC_parse+=5;
4729 ret = reg_node(pRExC_state, COMMIT);
24b23f37
YO
4730 } else if (
4731 RExC_parse[0] == 'U' &&
4732 RExC_parse[1] == 'T' &&
4733 RExC_parse[2] == ')')
4734 {
4735 RExC_parse+=2;
4736 ret = reg_node(pRExC_state, CUT);
54612592
YO
4737 } else {
4738 vFAIL("Sequence (?C... not terminated");
4739 }
4740 nextchar(pRExC_state);
4741 return ret;
4742 break;
24b23f37
YO
4743 case 'E': /* (?ERROR) */
4744 if (RExC_parse[0] == 'R' &&
4745 RExC_parse[1] == 'R' &&
4746 RExC_parse[2] == 'O' &&
4747 RExC_parse[3] == 'R' &&
4748 RExC_parse[4] == ')')
4749 {
4750 RExC_parse+=4;
4751 ret = reg_node(pRExC_state, OPERROR);
4752 } else {
4753 vFAIL("Sequence (?E... not terminated");
4754 }
4755 nextchar(pRExC_state);
4756 return ret;
4757 break;
7f69552c
YO
4758 case 'F':
4759 if (RExC_parse[0] == 'A' &&
4760 RExC_parse[1] == 'I' &&
4761 RExC_parse[2] == 'L')
4762 RExC_parse+=3;
4763 if (*RExC_parse != ')')
4764 vFAIL("Sequence (?FAIL) or (?F) not terminated");
4765 do_op_fail:
4766 ret = reg_node(pRExC_state, OPFAIL);
4767 nextchar(pRExC_state);
4768 return ret;
4769 break;
fac92740
MJD
4770 case '$': /* (?$...) */
4771 case '@': /* (?@...) */
8615cb43 4772 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4773 break;
fac92740 4774 case '#': /* (?#...) */
830247a4
IZ
4775 while (*RExC_parse && *RExC_parse != ')')
4776 RExC_parse++;
4777 if (*RExC_parse != ')')
c277df42 4778 FAIL("Sequence (?#... not terminated");
830247a4 4779 nextchar(pRExC_state);
a0d0e21e
LW
4780 *flagp = TRYAGAIN;
4781 return NULL;
894be9b7
YO
4782 case '0' : /* (?0) */
4783 case 'R' : /* (?R) */
4784 if (*RExC_parse != ')')
6bda09f9 4785 FAIL("Sequence (?R) not terminated");
1a147d38 4786 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4787 nextchar(pRExC_state);
4788 return ret;
4789 /*notreached*/
894be9b7
YO
4790 { /* named and numeric backreferences */
4791 I32 num;
4792 char * parse_start;
4793 case '&': /* (?&NAME) */
4794 parse_start = RExC_parse - 1;
4795 {
0a4db386
YO
4796 SV *sv_dat = reg_scan_name(pRExC_state,
4797 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4798 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4799 }
4800 goto gen_recurse_regop;
4801 /* NOT REACHED */
6bda09f9
YO
4802 case '1': case '2': case '3': case '4': /* (?1) */
4803 case '5': case '6': case '7': case '8': case '9':
4804 RExC_parse--;
894be9b7
YO
4805 num = atoi(RExC_parse);
4806 parse_start = RExC_parse - 1; /* MJD */
6bda09f9
YO
4807 while (isDIGIT(*RExC_parse))
4808 RExC_parse++;
4809 if (*RExC_parse!=')')
4810 vFAIL("Expecting close bracket");
894be9b7
YO
4811
4812 gen_recurse_regop:
1a147d38 4813 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4814 if (!SIZE_ONLY) {
4815 if (num > (I32)RExC_rx->nparens) {
4816 RExC_parse++;
4817 vFAIL("Reference to nonexistent group");
4818 }
40d049e4 4819 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4820 RExC_emit++;
226de585 4821 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4822 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4823 } else {
6bda09f9 4824 RExC_size++;
6bda09f9 4825 }
0a4db386 4826 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4827 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4828 Set_Node_Offset(ret, parse_start); /* MJD */
4829
6bda09f9
YO
4830 nextchar(pRExC_state);
4831 return ret;
894be9b7
YO
4832 } /* named and numeric backreferences */
4833 /* NOT REACHED */
4834
fac92740 4835 case 'p': /* (?p...) */
9014280d 4836 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4837 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4838 /* FALL THROUGH*/
fac92740 4839 case '?': /* (??...) */
6136c704 4840 is_logical = 1;
438a3801
YST
4841 if (*RExC_parse != '{')
4842 goto unknown;
830247a4 4843 paren = *RExC_parse++;
0f5d15d6 4844 /* FALL THROUGH */
fac92740 4845 case '{': /* (?{...}) */
c277df42 4846 {
c277df42
IZ
4847 I32 count = 1, n = 0;
4848 char c;
830247a4 4849 char *s = RExC_parse;
c277df42 4850
830247a4
IZ
4851 RExC_seen_zerolen++;
4852 RExC_seen |= REG_SEEN_EVAL;
4853 while (count && (c = *RExC_parse)) {
6136c704
AL
4854 if (c == '\\') {
4855 if (RExC_parse[1])
4856 RExC_parse++;
4857 }
b81d288d 4858 else if (c == '{')
c277df42 4859 count++;
b81d288d 4860 else if (c == '}')
c277df42 4861 count--;
830247a4 4862 RExC_parse++;
c277df42 4863 }
6136c704 4864 if (*RExC_parse != ')') {
b81d288d 4865 RExC_parse = s;
b45f050a
JF
4866 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4867 }
c277df42 4868 if (!SIZE_ONLY) {
f3548bdc 4869 PAD *pad;
6136c704
AL
4870 OP_4tree *sop, *rop;
4871 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4872
569233ed
SB
4873 ENTER;
4874 Perl_save_re_context(aTHX);
f3548bdc 4875 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4876 sop->op_private |= OPpREFCOUNTED;
4877 /* re_dup will OpREFCNT_inc */
4878 OpREFCNT_set(sop, 1);
569233ed 4879 LEAVE;
c277df42 4880
830247a4
IZ
4881 n = add_data(pRExC_state, 3, "nop");
4882 RExC_rx->data->data[n] = (void*)rop;
4883 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4884 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4885 SvREFCNT_dec(sv);
a0ed51b3 4886 }
e24b16f9 4887 else { /* First pass */
830247a4 4888 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4889 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4890 /* No compiled RE interpolated, has runtime
4891 components ===> unsafe. */
4892 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4893 if (PL_tainting && PL_tainted)
cc6b7395 4894 FAIL("Eval-group in insecure regular expression");
54df2634 4895#if PERL_VERSION > 8
923e4eb5 4896 if (IN_PERL_COMPILETIME)
b5c19bd7 4897 PL_cv_has_eval = 1;
54df2634 4898#endif
c277df42 4899 }
b5c19bd7 4900
830247a4 4901 nextchar(pRExC_state);
6136c704 4902 if (is_logical) {
830247a4 4903 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4904 if (!SIZE_ONLY)
4905 ret->flags = 2;
3dab1dad 4906 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4907 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4908 return ret;
4909 }
ccb2c380
MP
4910 ret = reganode(pRExC_state, EVAL, n);
4911 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4912 Set_Node_Offset(ret, parse_start);
4913 return ret;
c277df42 4914 }
fac92740 4915 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4916 {
0a4db386 4917 int is_define= 0;
fac92740 4918 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4919 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4920 || RExC_parse[1] == '<'
830247a4 4921 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4922 I32 flag;
4923
830247a4 4924 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4925 if (!SIZE_ONLY)
4926 ret->flags = 1;
3dab1dad 4927 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4928 goto insert_if;
b81d288d 4929 }
a0ed51b3 4930 }
0a4db386
YO
4931 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
4932 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
4933 {
4934 char ch = RExC_parse[0] == '<' ? '>' : '\'';
4935 char *name_start= RExC_parse++;
4936 I32 num = 0;
4937 SV *sv_dat=reg_scan_name(pRExC_state,
4938 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4939 if (RExC_parse == name_start || *RExC_parse != ch)
4940 vFAIL2("Sequence (?(%c... not terminated",
4941 (ch == '>' ? '<' : ch));
4942 RExC_parse++;
4943 if (!SIZE_ONLY) {
4944 num = add_data( pRExC_state, 1, "S" );
4945 RExC_rx->data->data[num]=(void*)sv_dat;
4946 SvREFCNT_inc(sv_dat);
4947 }
4948 ret = reganode(pRExC_state,NGROUPP,num);
4949 goto insert_if_check_paren;
4950 }
4951 else if (RExC_parse[0] == 'D' &&
4952 RExC_parse[1] == 'E' &&
4953 RExC_parse[2] == 'F' &&
4954 RExC_parse[3] == 'I' &&
4955 RExC_parse[4] == 'N' &&
4956 RExC_parse[5] == 'E')
4957 {
4958 ret = reganode(pRExC_state,DEFINEP,0);
4959 RExC_parse +=6 ;
4960 is_define = 1;
4961 goto insert_if_check_paren;
4962 }
4963 else if (RExC_parse[0] == 'R') {
4964 RExC_parse++;
4965 parno = 0;
4966 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
4967 parno = atoi(RExC_parse++);
4968 while (isDIGIT(*RExC_parse))
4969 RExC_parse++;
4970 } else if (RExC_parse[0] == '&') {
4971 SV *sv_dat;
4972 RExC_parse++;
4973 sv_dat = reg_scan_name(pRExC_state,
4974 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4975 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4976 }
1a147d38 4977 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
4978 goto insert_if_check_paren;
4979 }
830247a4 4980 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4981 /* (?(1)...) */
6136c704 4982 char c;
830247a4 4983 parno = atoi(RExC_parse++);
c277df42 4984
830247a4
IZ
4985 while (isDIGIT(*RExC_parse))
4986 RExC_parse++;
fac92740 4987 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4988
0a4db386 4989 insert_if_check_paren:
830247a4 4990 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4991 vFAIL("Switch condition not recognized");
c277df42 4992 insert_if:
3dab1dad
YO
4993 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4994 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4995 if (br == NULL)
830247a4 4996 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4997 else
3dab1dad 4998 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4999 c = *nextchar(pRExC_state);
d1b80229
IZ
5000 if (flags&HASWIDTH)
5001 *flagp |= HASWIDTH;
c277df42 5002 if (c == '|') {
0a4db386
YO
5003 if (is_define)
5004 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5005 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5006 regbranch(pRExC_state, &flags, 1,depth+1);
5007 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5008 if (flags&HASWIDTH)
5009 *flagp |= HASWIDTH;
830247a4 5010 c = *nextchar(pRExC_state);
a0ed51b3
LW
5011 }
5012 else
c277df42
IZ
5013 lastbr = NULL;
5014 if (c != ')')
8615cb43 5015 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5016 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5017 REGTAIL(pRExC_state, br, ender);
c277df42 5018 if (lastbr) {
3dab1dad
YO
5019 REGTAIL(pRExC_state, lastbr, ender);
5020 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5021 }
5022 else
3dab1dad 5023 REGTAIL(pRExC_state, ret, ender);
c277df42 5024 return ret;
a0ed51b3
LW
5025 }
5026 else {
830247a4 5027 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5028 }
5029 }
1b1626e4 5030 case 0:
830247a4 5031 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5032 vFAIL("Sequence (? incomplete");
1b1626e4 5033 break;
a0d0e21e 5034 default:
830247a4 5035 --RExC_parse;
fac92740 5036 parse_flags: /* (?i) */
830247a4 5037 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5038 /* (?g), (?gc) and (?o) are useless here
5039 and must be globally applied -- japhy */
5040
5041 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5042 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5043 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5044 if (! (wastedflags & wflagbit) ) {
5045 wastedflags |= wflagbit;
5046 vWARN5(
5047 RExC_parse + 1,
5048 "Useless (%s%c) - %suse /%c modifier",
5049 flagsp == &negflags ? "?-" : "?",
5050 *RExC_parse,
5051 flagsp == &negflags ? "don't " : "",
5052 *RExC_parse
5053 );
5054 }
5055 }
5056 }
5057 else if (*RExC_parse == 'c') {
5058 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5059 if (! (wastedflags & WASTED_C) ) {
5060 wastedflags |= WASTED_GC;
9d1d55b5
JP
5061 vWARN3(
5062 RExC_parse + 1,
5063 "Useless (%sc) - %suse /gc modifier",
5064 flagsp == &negflags ? "?-" : "?",
5065 flagsp == &negflags ? "don't " : ""
5066 );
5067 }
5068 }
5069 }
5070 else { pmflag(flagsp, *RExC_parse); }
5071
830247a4 5072 ++RExC_parse;
ca9dfc88 5073 }
830247a4 5074 if (*RExC_parse == '-') {
ca9dfc88 5075 flagsp = &negflags;
9d1d55b5 5076 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5077 ++RExC_parse;
ca9dfc88 5078 goto parse_flags;
48c036b1 5079 }
e2509266
JH
5080 RExC_flags |= posflags;
5081 RExC_flags &= ~negflags;
830247a4
IZ
5082 if (*RExC_parse == ':') {
5083 RExC_parse++;
ca9dfc88
IZ
5084 paren = ':';
5085 break;
5086 }
c277df42 5087 unknown:
830247a4
IZ
5088 if (*RExC_parse != ')') {
5089 RExC_parse++;
5090 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5091 }
830247a4 5092 nextchar(pRExC_state);
a0d0e21e
LW
5093 *flagp = TRYAGAIN;
5094 return NULL;
5095 }
5096 }
fac92740 5097 else { /* (...) */
81714fb9 5098 capturing_parens:
830247a4
IZ
5099 parno = RExC_npar;
5100 RExC_npar++;
5101 ret = reganode(pRExC_state, OPEN, parno);
6bda09f9 5102 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
226de585 5103 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5104 "Setting open paren #%"IVdf" to %d\n",
5105 (IV)parno, REG_NODE_NUM(ret)));
5106 RExC_open_parens[parno-1]= ret;
6bda09f9 5107 }
fac92740
MJD
5108 Set_Node_Length(ret, 1); /* MJD */
5109 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5110 is_open = 1;
a0d0e21e 5111 }
a0ed51b3 5112 }
fac92740 5113 else /* ! paren */
a0d0e21e
LW
5114 ret = NULL;
5115
5116 /* Pick up the branches, linking them together. */
fac92740 5117 parse_start = RExC_parse; /* MJD */
3dab1dad 5118 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5119 /* branch_len = (paren != 0); */
2af232bd 5120
a0d0e21e
LW
5121 if (br == NULL)
5122 return(NULL);
830247a4
IZ
5123 if (*RExC_parse == '|') {
5124 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5125 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5126 }
fac92740 5127 else { /* MJD */
6bda09f9 5128 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5129 Set_Node_Length(br, paren != 0);
5130 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5131 }
c277df42
IZ
5132 have_branch = 1;
5133 if (SIZE_ONLY)
830247a4 5134 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5135 }
5136 else if (paren == ':') {
c277df42
IZ
5137 *flagp |= flags&SIMPLE;
5138 }
6136c704 5139 if (is_open) { /* Starts with OPEN. */
3dab1dad 5140 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5141 }
5142 else if (paren != '?') /* Not Conditional */
a0d0e21e 5143 ret = br;
32a0ca98 5144 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5145 lastbr = br;
830247a4
IZ
5146 while (*RExC_parse == '|') {
5147 if (!SIZE_ONLY && RExC_extralen) {
5148 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5149 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5150 }
5151 if (SIZE_ONLY)
830247a4
IZ
5152 RExC_extralen += 2; /* Account for LONGJMP. */
5153 nextchar(pRExC_state);
3dab1dad 5154 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5155
a687059c 5156 if (br == NULL)
a0d0e21e 5157 return(NULL);
3dab1dad 5158 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5159 lastbr = br;
821b33a5
IZ
5160 if (flags&HASWIDTH)
5161 *flagp |= HASWIDTH;
a687059c 5162 *flagp |= flags&SPSTART;
a0d0e21e
LW
5163 }
5164
c277df42
IZ
5165 if (have_branch || paren != ':') {
5166 /* Make a closing node, and hook it on the end. */
5167 switch (paren) {
5168 case ':':
830247a4 5169 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5170 break;
5171 case 1:
830247a4 5172 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5173 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5174 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5175 "Setting close paren #%"IVdf" to %d\n",
5176 (IV)parno, REG_NODE_NUM(ender)));
5177 RExC_close_parens[parno-1]= ender;
5178 }
fac92740
MJD
5179 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5180 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5181 break;
5182 case '<':
c277df42
IZ
5183 case ',':
5184 case '=':
5185 case '!':
c277df42 5186 *flagp &= ~HASWIDTH;
821b33a5
IZ
5187 /* FALL THROUGH */
5188 case '>':
830247a4 5189 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5190 break;
5191 case 0:
830247a4 5192 ender = reg_node(pRExC_state, END);
40d049e4
YO
5193 if (!SIZE_ONLY) {
5194 assert(!RExC_opend); /* there can only be one! */
5195 RExC_opend = ender;
5196 }
c277df42
IZ
5197 break;
5198 }
eaf3ca90 5199 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5200
9674d46a 5201 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5202 if (depth==1)
5203 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5204
c277df42 5205 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5206 for (br = ret; br; br = regnext(br)) {
5207 const U8 op = PL_regkind[OP(br)];
5208 if (op == BRANCH) {
07be1b83 5209 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5210 }
5211 else if (op == BRANCHJ) {
07be1b83 5212 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5213 }
c277df42
IZ
5214 }
5215 }
a0d0e21e 5216 }
c277df42
IZ
5217
5218 {
e1ec3a88
AL
5219 const char *p;
5220 static const char parens[] = "=!<,>";
c277df42
IZ
5221
5222 if (paren && (p = strchr(parens, paren))) {
eb160463 5223 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5224 int flag = (p - parens) > 1;
5225
5226 if (paren == '>')
5227 node = SUSPEND, flag = 0;
6bda09f9 5228 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5229 Set_Node_Cur_Length(ret);
5230 Set_Node_Offset(ret, parse_start + 1);
c277df42 5231 ret->flags = flag;
07be1b83 5232 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5233 }
a0d0e21e
LW
5234 }
5235
5236 /* Check for proper termination. */
ce3e6498 5237 if (paren) {
e2509266 5238 RExC_flags = oregflags;
830247a4
IZ
5239 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5240 RExC_parse = oregcomp_parse;
380a0633 5241 vFAIL("Unmatched (");
ce3e6498 5242 }
a0ed51b3 5243 }
830247a4
IZ
5244 else if (!paren && RExC_parse < RExC_end) {
5245 if (*RExC_parse == ')') {
5246 RExC_parse++;
380a0633 5247 vFAIL("Unmatched )");
a0ed51b3
LW
5248 }
5249 else
b45f050a 5250 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5251 /* NOTREACHED */
5252 }
a687059c 5253
a0d0e21e 5254 return(ret);
a687059c
LW
5255}
5256
5257/*
5258 - regbranch - one alternative of an | operator
5259 *
5260 * Implements the concatenation operator.
5261 */
76e3520e 5262STATIC regnode *
3dab1dad 5263S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5264{
97aff369 5265 dVAR;
c277df42
IZ
5266 register regnode *ret;
5267 register regnode *chain = NULL;
5268 register regnode *latest;
5269 I32 flags = 0, c = 0;
3dab1dad
YO
5270 GET_RE_DEBUG_FLAGS_DECL;
5271 DEBUG_PARSE("brnc");
b81d288d 5272 if (first)
c277df42
IZ
5273 ret = NULL;
5274 else {
b81d288d 5275 if (!SIZE_ONLY && RExC_extralen)
830247a4 5276 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5277 else {
830247a4 5278 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5279 Set_Node_Length(ret, 1);
5280 }
c277df42
IZ
5281 }
5282
b81d288d 5283 if (!first && SIZE_ONLY)
830247a4 5284 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5285
c277df42 5286 *flagp = WORST; /* Tentatively. */
a0d0e21e 5287
830247a4
IZ
5288 RExC_parse--;
5289 nextchar(pRExC_state);
5290 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5291 flags &= ~TRYAGAIN;
3dab1dad 5292 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5293 if (latest == NULL) {
5294 if (flags & TRYAGAIN)
5295 continue;
5296 return(NULL);
a0ed51b3
LW
5297 }
5298 else if (ret == NULL)
c277df42 5299 ret = latest;
a0d0e21e 5300 *flagp |= flags&HASWIDTH;
c277df42 5301 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5302 *flagp |= flags&SPSTART;
5303 else {
830247a4 5304 RExC_naughty++;
3dab1dad 5305 REGTAIL(pRExC_state, chain, latest);
a687059c 5306 }
a0d0e21e 5307 chain = latest;
c277df42
IZ
5308 c++;
5309 }
5310 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5311 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5312 if (ret == NULL)
5313 ret = chain;
5314 }
5315 if (c == 1) {
5316 *flagp |= flags&SIMPLE;
a0d0e21e 5317 }
a687059c 5318
d4c19fe8 5319 return ret;
a687059c
LW
5320}
5321
5322/*
5323 - regpiece - something followed by possible [*+?]
5324 *
5325 * Note that the branching code sequences used for ? and the general cases
5326 * of * and + are somewhat optimized: they use the same NOTHING node as
5327 * both the endmarker for their branch list and the body of the last branch.
5328 * It might seem that this node could be dispensed with entirely, but the
5329 * endmarker role is not redundant.
5330 */
76e3520e 5331STATIC regnode *
3dab1dad 5332S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5333{
97aff369 5334 dVAR;
c277df42 5335 register regnode *ret;
a0d0e21e
LW
5336 register char op;
5337 register char *next;
5338 I32 flags;
1df70142 5339 const char * const origparse = RExC_parse;
a0d0e21e 5340 I32 min;
c277df42 5341 I32 max = REG_INFTY;
fac92740 5342 char *parse_start;
10edeb5d 5343 const char *maxpos = NULL;
3dab1dad
YO
5344 GET_RE_DEBUG_FLAGS_DECL;
5345 DEBUG_PARSE("piec");
a0d0e21e 5346
3dab1dad 5347 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5348 if (ret == NULL) {
5349 if (flags & TRYAGAIN)
5350 *flagp |= TRYAGAIN;
5351 return(NULL);
5352 }
5353
830247a4 5354 op = *RExC_parse;
a0d0e21e 5355
830247a4 5356 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5357 maxpos = NULL;
fac92740 5358 parse_start = RExC_parse; /* MJD */
830247a4 5359 next = RExC_parse + 1;
a0d0e21e
LW
5360 while (isDIGIT(*next) || *next == ',') {
5361 if (*next == ',') {
5362 if (maxpos)
5363 break;
5364 else
5365 maxpos = next;
a687059c 5366 }
a0d0e21e
LW
5367 next++;
5368 }
5369 if (*next == '}') { /* got one */
5370 if (!maxpos)
5371 maxpos = next;
830247a4
IZ
5372 RExC_parse++;
5373 min = atoi(RExC_parse);
a0d0e21e
LW
5374 if (*maxpos == ',')
5375 maxpos++;
5376 else
830247a4 5377 maxpos = RExC_parse;
a0d0e21e
LW
5378 max = atoi(maxpos);
5379 if (!max && *maxpos != '0')
c277df42
IZ
5380 max = REG_INFTY; /* meaning "infinity" */
5381 else if (max >= REG_INFTY)
8615cb43 5382 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5383 RExC_parse = next;
5384 nextchar(pRExC_state);
a0d0e21e
LW
5385
5386 do_curly:
5387 if ((flags&SIMPLE)) {
830247a4 5388 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5389 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5390 Set_Node_Offset(ret, parse_start+1); /* MJD */
5391 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5392 }
5393 else {
3dab1dad 5394 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5395
5396 w->flags = 0;
3dab1dad 5397 REGTAIL(pRExC_state, ret, w);
830247a4 5398 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5399 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5400 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5401 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5402 }
6bda09f9 5403 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5404 /* MJD hk */
5405 Set_Node_Offset(ret, parse_start+1);
2af232bd 5406 Set_Node_Length(ret,
fac92740 5407 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5408
830247a4 5409 if (!SIZE_ONLY && RExC_extralen)
c277df42 5410 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5411 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5412 if (SIZE_ONLY)
830247a4
IZ
5413 RExC_whilem_seen++, RExC_extralen += 3;
5414 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5415 }
c277df42 5416 ret->flags = 0;
a0d0e21e
LW
5417
5418 if (min > 0)
821b33a5
IZ
5419 *flagp = WORST;
5420 if (max > 0)
5421 *flagp |= HASWIDTH;
a0d0e21e 5422 if (max && max < min)
8615cb43 5423 vFAIL("Can't do {n,m} with n > m");
c277df42 5424 if (!SIZE_ONLY) {
eb160463
GS
5425 ARG1_SET(ret, (U16)min);
5426 ARG2_SET(ret, (U16)max);
a687059c 5427 }
a687059c 5428
a0d0e21e 5429 goto nest_check;
a687059c 5430 }
a0d0e21e 5431 }
a687059c 5432
a0d0e21e
LW
5433 if (!ISMULT1(op)) {
5434 *flagp = flags;
a687059c 5435 return(ret);
a0d0e21e 5436 }
bb20fd44 5437
c277df42 5438#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5439
5440 /* if this is reinstated, don't forget to put this back into perldiag:
5441
5442 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5443
5444 (F) The part of the regexp subject to either the * or + quantifier
5445 could match an empty string. The {#} shows in the regular
5446 expression about where the problem was discovered.
5447
5448 */
5449
bb20fd44 5450 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5451 vFAIL("Regexp *+ operand could be empty");
b81d288d 5452#endif
bb20fd44 5453
fac92740 5454 parse_start = RExC_parse;
830247a4 5455 nextchar(pRExC_state);
a0d0e21e 5456
821b33a5 5457 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5458
5459 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5460 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5461 ret->flags = 0;
830247a4 5462 RExC_naughty += 4;
a0d0e21e
LW
5463 }
5464 else if (op == '*') {
5465 min = 0;
5466 goto do_curly;
a0ed51b3
LW
5467 }
5468 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5469 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5470 ret->flags = 0;
830247a4 5471 RExC_naughty += 3;
a0d0e21e
LW
5472 }
5473 else if (op == '+') {
5474 min = 1;
5475 goto do_curly;
a0ed51b3
LW
5476 }
5477 else if (op == '?') {
a0d0e21e
LW
5478 min = 0; max = 1;
5479 goto do_curly;
5480 }
5481 nest_check:
041457d9 5482 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5483 vWARN3(RExC_parse,
b45f050a 5484 "%.*s matches null string many times",
afd78fd5 5485 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5486 origparse);
a0d0e21e
LW
5487 }
5488
b9b4dddf 5489 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5490 nextchar(pRExC_state);
6bda09f9 5491 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5492 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5493 }
b9b4dddf
YO
5494#ifndef REG_ALLOW_MINMOD_SUSPEND
5495 else
5496#endif
5497 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5498 regnode *ender;
5499 nextchar(pRExC_state);
5500 ender = reg_node(pRExC_state, SUCCEED);
5501 REGTAIL(pRExC_state, ret, ender);
5502 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5503 ret->flags = 0;
5504 ender = reg_node(pRExC_state, TAIL);
5505 REGTAIL(pRExC_state, ret, ender);
5506 /*ret= ender;*/
5507 }
5508
5509 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5510 RExC_parse++;
b45f050a
JF
5511 vFAIL("Nested quantifiers");
5512 }
a0d0e21e
LW
5513
5514 return(ret);
a687059c
LW
5515}
5516
fc8cd66c
YO
5517
5518/* reg_namedseq(pRExC_state,UVp)
5519
5520 This is expected to be called by a parser routine that has
5521 recognized'\N' and needs to handle the rest. RExC_parse is
5522 expected to point at the first char following the N at the time
5523 of the call.
5524
5525 If valuep is non-null then it is assumed that we are parsing inside
5526 of a charclass definition and the first codepoint in the resolved
5527 string is returned via *valuep and the routine will return NULL.
5528 In this mode if a multichar string is returned from the charnames
5529 handler a warning will be issued, and only the first char in the
5530 sequence will be examined. If the string returned is zero length
5531 then the value of *valuep is undefined and NON-NULL will
5532 be returned to indicate failure. (This will NOT be a valid pointer
5533 to a regnode.)
5534
5535 If value is null then it is assumed that we are parsing normal text
5536 and inserts a new EXACT node into the program containing the resolved
5537 string and returns a pointer to the new node. If the string is
5538 zerolength a NOTHING node is emitted.
5539
5540 On success RExC_parse is set to the char following the endbrace.
5541 Parsing failures will generate a fatal errorvia vFAIL(...)
5542
5543 NOTE: We cache all results from the charnames handler locally in
5544 the RExC_charnames hash (created on first use) to prevent a charnames
5545 handler from playing silly-buggers and returning a short string and
5546 then a long string for a given pattern. Since the regexp program
5547 size is calculated during an initial parse this would result
5548 in a buffer overrun so we cache to prevent the charname result from
5549 changing during the course of the parse.
5550
5551 */
5552STATIC regnode *
5553S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5554{
5555 char * name; /* start of the content of the name */
5556 char * endbrace; /* endbrace following the name */
5557 SV *sv_str = NULL;
5558 SV *sv_name = NULL;
5559 STRLEN len; /* this has various purposes throughout the code */
5560 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5561 regnode *ret = NULL;
5562
5563 if (*RExC_parse != '{') {
5564 vFAIL("Missing braces on \\N{}");
5565 }
5566 name = RExC_parse+1;
5567 endbrace = strchr(RExC_parse, '}');
5568 if ( ! endbrace ) {
5569 RExC_parse++;
5570 vFAIL("Missing right brace on \\N{}");
5571 }
5572 RExC_parse = endbrace + 1;
5573
5574
5575 /* RExC_parse points at the beginning brace,
5576 endbrace points at the last */
5577 if ( name[0]=='U' && name[1]=='+' ) {
5578 /* its a "unicode hex" notation {U+89AB} */
5579 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5580 | PERL_SCAN_DISALLOW_PREFIX
5581 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5582 UV cp;
196f1508 5583 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5584 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5585 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5586 cp = 0xFFFD;
5587 }
5588 if (cp > 0xff)
5589 RExC_utf8 = 1;
5590 if ( valuep ) {
5591 *valuep = cp;
5592 return NULL;
5593 }
5594 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5595 } else {
5596 /* fetch the charnames handler for this scope */
5597 HV * const table = GvHV(PL_hintgv);
5598 SV **cvp= table ?
5599 hv_fetchs(table, "charnames", FALSE) :
5600 NULL;
5601 SV *cv= cvp ? *cvp : NULL;
5602 HE *he_str;
5603 int count;
5604 /* create an SV with the name as argument */
5605 sv_name = newSVpvn(name, endbrace - name);
5606
5607 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5608 vFAIL2("Constant(\\N{%s}) unknown: "
5609 "(possibly a missing \"use charnames ...\")",
5610 SvPVX(sv_name));
5611 }
5612 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5613 vFAIL2("Constant(\\N{%s}): "
5614 "$^H{charnames} is not defined",SvPVX(sv_name));
5615 }
5616
5617
5618
5619 if (!RExC_charnames) {
5620 /* make sure our cache is allocated */
5621 RExC_charnames = newHV();
6bda09f9 5622 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5623 }
5624 /* see if we have looked this one up before */
5625 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5626 if ( he_str ) {
5627 sv_str = HeVAL(he_str);
5628 cached = 1;
5629 } else {
5630 dSP ;
5631
5632 ENTER ;
5633 SAVETMPS ;
5634 PUSHMARK(SP) ;
5635
5636 XPUSHs(sv_name);
5637
5638 PUTBACK ;
5639
5640 count= call_sv(cv, G_SCALAR);
5641
5642 if (count == 1) { /* XXXX is this right? dmq */
5643 sv_str = POPs;
5644 SvREFCNT_inc_simple_void(sv_str);
5645 }
5646
5647 SPAGAIN ;
5648 PUTBACK ;
5649 FREETMPS ;
5650 LEAVE ;
5651
5652 if ( !sv_str || !SvOK(sv_str) ) {
5653 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5654 "did not return a defined value",SvPVX(sv_name));
5655 }
5656 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5657 cached = 1;
5658 }
5659 }
5660 if (valuep) {
5661 char *p = SvPV(sv_str, len);
5662 if (len) {
5663 STRLEN numlen = 1;
5664 if ( SvUTF8(sv_str) ) {
196f1508 5665 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5666 if (*valuep > 0x7F)
5667 RExC_utf8 = 1;
5668 /* XXXX
5669 We have to turn on utf8 for high bit chars otherwise
5670 we get failures with
5671
5672 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5673 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5674
5675 This is different from what \x{} would do with the same
5676 codepoint, where the condition is > 0xFF.
5677 - dmq
5678 */
5679
5680
5681 } else {
5682 *valuep = (UV)*p;
5683 /* warn if we havent used the whole string? */
5684 }
5685 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5686 vWARN2(RExC_parse,
5687 "Ignoring excess chars from \\N{%s} in character class",
5688 SvPVX(sv_name)
5689 );
5690 }
5691 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5692 vWARN2(RExC_parse,
5693 "Ignoring zero length \\N{%s} in character class",
5694 SvPVX(sv_name)
5695 );
5696 }
5697 if (sv_name)
5698 SvREFCNT_dec(sv_name);
5699 if (!cached)
5700 SvREFCNT_dec(sv_str);
5701 return len ? NULL : (regnode *)&len;
5702 } else if(SvCUR(sv_str)) {
5703
5704 char *s;
5705 char *p, *pend;
5706 STRLEN charlen = 1;
5707 char * parse_start = name-3; /* needed for the offsets */
5708 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5709
5710 ret = reg_node(pRExC_state,
5711 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5712 s= STRING(ret);
5713
5714 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5715 sv_utf8_upgrade(sv_str);
5716 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5717 RExC_utf8= 1;
5718 }
5719
5720 p = SvPV(sv_str, len);
5721 pend = p + len;
5722 /* len is the length written, charlen is the size the char read */
5723 for ( len = 0; p < pend; p += charlen ) {
5724 if (UTF) {
196f1508 5725 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5726 if (FOLD) {
5727 STRLEN foldlen,numlen;
5728 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5729 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5730 /* Emit all the Unicode characters. */
5731
5732 for (foldbuf = tmpbuf;
5733 foldlen;
5734 foldlen -= numlen)
5735 {
5736 uvc = utf8_to_uvchr(foldbuf, &numlen);
5737 if (numlen > 0) {
5738 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5739 s += unilen;
5740 len += unilen;
5741 /* In EBCDIC the numlen
5742 * and unilen can differ. */
5743 foldbuf += numlen;
5744 if (numlen >= foldlen)
5745 break;
5746 }
5747 else
5748 break; /* "Can't happen." */
5749 }
5750 } else {
5751 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5752 if (unilen > 0) {
5753 s += unilen;
5754 len += unilen;
5755 }
5756 }
5757 } else {
5758 len++;
5759 REGC(*p, s++);
5760 }
5761 }
5762 if (SIZE_ONLY) {
5763 RExC_size += STR_SZ(len);
5764 } else {
5765 STR_LEN(ret) = len;
5766 RExC_emit += STR_SZ(len);
5767 }
5768 Set_Node_Cur_Length(ret); /* MJD */
5769 RExC_parse--;
5770 nextchar(pRExC_state);
5771 } else {
5772 ret = reg_node(pRExC_state,NOTHING);
5773 }
5774 if (!cached) {
5775 SvREFCNT_dec(sv_str);
5776 }
5777 if (sv_name) {
5778 SvREFCNT_dec(sv_name);
5779 }
5780 return ret;
5781
5782}
5783
5784
5785
a687059c
LW
5786/*
5787 - regatom - the lowest level
5788 *
5789 * Optimization: gobbles an entire sequence of ordinary characters so that
5790 * it can turn them into a single node, which is smaller to store and
5791 * faster to run. Backslashed characters are exceptions, each becoming a
5792 * separate node; the code is simpler that way and it's not worth fixing.
5793 *
7f6f358c
YO
5794 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5795 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5796 */
76e3520e 5797STATIC regnode *
3dab1dad 5798S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5799{
97aff369 5800 dVAR;
cbbf8932 5801 register regnode *ret = NULL;
a0d0e21e 5802 I32 flags;
45948336 5803 char *parse_start = RExC_parse;
3dab1dad
YO
5804 GET_RE_DEBUG_FLAGS_DECL;
5805 DEBUG_PARSE("atom");
a0d0e21e
LW
5806 *flagp = WORST; /* Tentatively. */
5807
5808tryagain:
830247a4 5809 switch (*RExC_parse) {
a0d0e21e 5810 case '^':
830247a4
IZ
5811 RExC_seen_zerolen++;
5812 nextchar(pRExC_state);
e2509266 5813 if (RExC_flags & PMf_MULTILINE)
830247a4 5814 ret = reg_node(pRExC_state, MBOL);
e2509266 5815 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5816 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5817 else
830247a4 5818 ret = reg_node(pRExC_state, BOL);
fac92740 5819 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5820 break;
5821 case '$':
830247a4 5822 nextchar(pRExC_state);
b81d288d 5823 if (*RExC_parse)
830247a4 5824 RExC_seen_zerolen++;
e2509266 5825 if (RExC_flags & PMf_MULTILINE)
830247a4 5826 ret = reg_node(pRExC_state, MEOL);
e2509266 5827 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5828 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5829 else
830247a4 5830 ret = reg_node(pRExC_state, EOL);
fac92740 5831 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5832 break;
5833 case '.':
830247a4 5834 nextchar(pRExC_state);
e2509266 5835 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5836 ret = reg_node(pRExC_state, SANY);
5837 else
5838 ret = reg_node(pRExC_state, REG_ANY);
5839 *flagp |= HASWIDTH|SIMPLE;
830247a4 5840 RExC_naughty++;
fac92740 5841 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5842 break;
5843 case '[':
b45f050a 5844 {
3dab1dad
YO
5845 char * const oregcomp_parse = ++RExC_parse;
5846 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5847 if (*RExC_parse != ']') {
5848 RExC_parse = oregcomp_parse;
b45f050a
JF
5849 vFAIL("Unmatched [");
5850 }
830247a4 5851 nextchar(pRExC_state);
a0d0e21e 5852 *flagp |= HASWIDTH|SIMPLE;
fac92740 5853 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5854 break;
b45f050a 5855 }
a0d0e21e 5856 case '(':
830247a4 5857 nextchar(pRExC_state);
3dab1dad 5858 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5859 if (ret == NULL) {
bf93d4cc 5860 if (flags & TRYAGAIN) {
830247a4 5861 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5862 /* Make parent create an empty node if needed. */
5863 *flagp |= TRYAGAIN;
5864 return(NULL);
5865 }
a0d0e21e 5866 goto tryagain;
bf93d4cc 5867 }
a0d0e21e
LW
5868 return(NULL);
5869 }
c277df42 5870 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5871 break;
5872 case '|':
5873 case ')':
5874 if (flags & TRYAGAIN) {
5875 *flagp |= TRYAGAIN;
5876 return NULL;
5877 }
b45f050a 5878 vFAIL("Internal urp");
a0d0e21e
LW
5879 /* Supposed to be caught earlier. */
5880 break;
85afd4ae 5881 case '{':
830247a4
IZ
5882 if (!regcurly(RExC_parse)) {
5883 RExC_parse++;
85afd4ae
CS
5884 goto defchar;
5885 }
5886 /* FALL THROUGH */
a0d0e21e
LW
5887 case '?':
5888 case '+':
5889 case '*':
830247a4 5890 RExC_parse++;
b45f050a 5891 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5892 break;
5893 case '\\':
830247a4 5894 switch (*++RExC_parse) {
a0d0e21e 5895 case 'A':
830247a4
IZ
5896 RExC_seen_zerolen++;
5897 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5898 *flagp |= SIMPLE;
830247a4 5899 nextchar(pRExC_state);
fac92740 5900 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5901 break;
5902 case 'G':
830247a4
IZ
5903 ret = reg_node(pRExC_state, GPOS);
5904 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5905 *flagp |= SIMPLE;
830247a4 5906 nextchar(pRExC_state);
fac92740 5907 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5908 break;
5909 case 'Z':
830247a4 5910 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5911 *flagp |= SIMPLE;
a1917ab9 5912 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5913 nextchar(pRExC_state);
a0d0e21e 5914 break;
b85d18e9 5915 case 'z':
830247a4 5916 ret = reg_node(pRExC_state, EOS);
b85d18e9 5917 *flagp |= SIMPLE;
830247a4
IZ
5918 RExC_seen_zerolen++; /* Do not optimize RE away */
5919 nextchar(pRExC_state);
fac92740 5920 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5921 break;
4a2d328f 5922 case 'C':
f33976b4
DB
5923 ret = reg_node(pRExC_state, CANY);
5924 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5925 *flagp |= HASWIDTH|SIMPLE;
830247a4 5926 nextchar(pRExC_state);
fac92740 5927 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5928 break;
5929 case 'X':
830247a4 5930 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5931 *flagp |= HASWIDTH;
830247a4 5932 nextchar(pRExC_state);
fac92740 5933 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5934 break;
a0d0e21e 5935 case 'w':
eb160463 5936 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5937 *flagp |= HASWIDTH|SIMPLE;
830247a4 5938 nextchar(pRExC_state);
fac92740 5939 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5940 break;
5941 case 'W':
eb160463 5942 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5943 *flagp |= HASWIDTH|SIMPLE;
830247a4 5944 nextchar(pRExC_state);
fac92740 5945 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5946 break;
5947 case 'b':
830247a4
IZ
5948 RExC_seen_zerolen++;
5949 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5950 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5951 *flagp |= SIMPLE;
830247a4 5952 nextchar(pRExC_state);
fac92740 5953 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5954 break;
5955 case 'B':
830247a4
IZ
5956 RExC_seen_zerolen++;
5957 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5958 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5959 *flagp |= SIMPLE;
830247a4 5960 nextchar(pRExC_state);
fac92740 5961 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5962 break;
5963 case 's':
eb160463 5964 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5965 *flagp |= HASWIDTH|SIMPLE;
830247a4 5966 nextchar(pRExC_state);
fac92740 5967 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5968 break;
5969 case 'S':
eb160463 5970 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5971 *flagp |= HASWIDTH|SIMPLE;
830247a4 5972 nextchar(pRExC_state);
fac92740 5973 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5974 break;
5975 case 'd':
ffc61ed2 5976 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5977 *flagp |= HASWIDTH|SIMPLE;
830247a4 5978 nextchar(pRExC_state);
fac92740 5979 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5980 break;
5981 case 'D':
ffc61ed2 5982 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5983 *flagp |= HASWIDTH|SIMPLE;
830247a4 5984 nextchar(pRExC_state);
fac92740 5985 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5986 break;
a14b48bc
LW
5987 case 'p':
5988 case 'P':
3568d838 5989 {
3dab1dad 5990 char* const oldregxend = RExC_end;
ccb2c380 5991 char* parse_start = RExC_parse - 2;
a14b48bc 5992
830247a4 5993 if (RExC_parse[1] == '{') {
3568d838 5994 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5995 RExC_end = strchr(RExC_parse, '}');
5996 if (!RExC_end) {
3dab1dad 5997 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5998 RExC_parse += 2;
5999 RExC_end = oldregxend;
0da60cf5 6000 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6001 }
830247a4 6002 RExC_end++;
a14b48bc 6003 }
af6f566e 6004 else {
830247a4 6005 RExC_end = RExC_parse + 2;
af6f566e
HS
6006 if (RExC_end > oldregxend)
6007 RExC_end = oldregxend;
6008 }
830247a4 6009 RExC_parse--;
a14b48bc 6010
3dab1dad 6011 ret = regclass(pRExC_state,depth+1);
a14b48bc 6012
830247a4
IZ
6013 RExC_end = oldregxend;
6014 RExC_parse--;
ccb2c380
MP
6015
6016 Set_Node_Offset(ret, parse_start + 2);
6017 Set_Node_Cur_Length(ret);
830247a4 6018 nextchar(pRExC_state);
a14b48bc
LW
6019 *flagp |= HASWIDTH|SIMPLE;
6020 }
6021 break;
fc8cd66c
YO
6022 case 'N':
6023 /* Handle \N{NAME} here and not below because it can be
6024 multicharacter. join_exact() will join them up later on.
6025 Also this makes sure that things like /\N{BLAH}+/ and
6026 \N{BLAH} being multi char Just Happen. dmq*/
6027 ++RExC_parse;
6028 ret= reg_namedseq(pRExC_state, NULL);
6029 break;
0a4db386 6030 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6031 {
6032 char ch= RExC_parse[1];
6033 if (ch != '<' && ch != '\'') {
6034 if (SIZE_ONLY)
6035 vWARN( RExC_parse + 1,
6036 "Possible broken named back reference treated as literal k");
6037 parse_start--;
6038 goto defchar;
6039 } else {
6040 char* name_start = (RExC_parse += 2);
6041 I32 num = 0;
0a4db386
YO
6042 SV *sv_dat = reg_scan_name(pRExC_state,
6043 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6044 ch= (ch == '<') ? '>' : '\'';
6045
81714fb9
YO
6046 if (RExC_parse == name_start || *RExC_parse != ch)
6047 vFAIL2("Sequence \\k%c... not terminated",
6048 (ch == '>' ? '<' : ch));
6049
6050 RExC_sawback = 1;
6051 ret = reganode(pRExC_state,
6052 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6053 num);
6054 *flagp |= HASWIDTH;
6055
6056
6057 if (!SIZE_ONLY) {
81714fb9
YO
6058 num = add_data( pRExC_state, 1, "S" );
6059 ARG_SET(ret,num);
6060 RExC_rx->data->data[num]=(void*)sv_dat;
6061 SvREFCNT_inc(sv_dat);
6062 }
6063 /* override incorrect value set in reganode MJD */
6064 Set_Node_Offset(ret, parse_start+1);
6065 Set_Node_Cur_Length(ret); /* MJD */
6066 nextchar(pRExC_state);
6067
6068 }
6069 break;
6070 }
a0d0e21e
LW
6071 case 'n':
6072 case 'r':
6073 case 't':
6074 case 'f':
6075 case 'e':
6076 case 'a':
6077 case 'x':
6078 case 'c':
6079 case '0':
6080 goto defchar;
6081 case '1': case '2': case '3': case '4':
6082 case '5': case '6': case '7': case '8': case '9':
6083 {
1df70142 6084 const I32 num = atoi(RExC_parse);
a0d0e21e 6085
830247a4 6086 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6087 goto defchar;
6088 else {
3dab1dad 6089 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6090 while (isDIGIT(*RExC_parse))
6091 RExC_parse++;
b45f050a 6092
eb160463 6093 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 6094 vFAIL("Reference to nonexistent group");
830247a4 6095 RExC_sawback = 1;
eb160463
GS
6096 ret = reganode(pRExC_state,
6097 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6098 num);
a0d0e21e 6099 *flagp |= HASWIDTH;
2af232bd 6100
fac92740 6101 /* override incorrect value set in reganode MJD */
2af232bd 6102 Set_Node_Offset(ret, parse_start+1);
fac92740 6103 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6104 RExC_parse--;
6105 nextchar(pRExC_state);
a0d0e21e
LW
6106 }
6107 }
6108 break;
6109 case '\0':
830247a4 6110 if (RExC_parse >= RExC_end)
b45f050a 6111 FAIL("Trailing \\");
a0d0e21e
LW
6112 /* FALL THROUGH */
6113 default:
a0288114 6114 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6115 back into the quick-grab loop below */
45948336 6116 parse_start--;
a0d0e21e
LW
6117 goto defchar;
6118 }
6119 break;
4633a7c4
LW
6120
6121 case '#':
e2509266 6122 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6123 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6124 RExC_parse++;
830247a4 6125 if (RExC_parse < RExC_end)
4633a7c4
LW
6126 goto tryagain;
6127 }
6128 /* FALL THROUGH */
6129
a0d0e21e 6130 default: {
ba210ebe 6131 register STRLEN len;
58ae7d3f 6132 register UV ender;
a0d0e21e 6133 register char *p;
3dab1dad 6134 char *s;
80aecb99 6135 STRLEN foldlen;
89ebb4a3 6136 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6137
6138 parse_start = RExC_parse - 1;
a0d0e21e 6139
830247a4 6140 RExC_parse++;
a0d0e21e
LW
6141
6142 defchar:
58ae7d3f 6143 ender = 0;
eb160463
GS
6144 ret = reg_node(pRExC_state,
6145 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6146 s = STRING(ret);
830247a4
IZ
6147 for (len = 0, p = RExC_parse - 1;
6148 len < 127 && p < RExC_end;
a0d0e21e
LW
6149 len++)
6150 {
3dab1dad 6151 char * const oldp = p;
5b5a24f7 6152
e2509266 6153 if (RExC_flags & PMf_EXTENDED)
830247a4 6154 p = regwhite(p, RExC_end);
a0d0e21e
LW
6155 switch (*p) {
6156 case '^':
6157 case '$':
6158 case '.':
6159 case '[':
6160 case '(':
6161 case ')':
6162 case '|':
6163 goto loopdone;
6164 case '\\':
6165 switch (*++p) {
6166 case 'A':
1ed8eac0
JF
6167 case 'C':
6168 case 'X':
a0d0e21e
LW
6169 case 'G':
6170 case 'Z':
b85d18e9 6171 case 'z':
a0d0e21e
LW
6172 case 'w':
6173 case 'W':
6174 case 'b':
6175 case 'B':
6176 case 's':
6177 case 'S':
6178 case 'd':
6179 case 'D':
a14b48bc
LW
6180 case 'p':
6181 case 'P':
fc8cd66c 6182 case 'N':
a0d0e21e
LW
6183 --p;
6184 goto loopdone;
6185 case 'n':
6186 ender = '\n';
6187 p++;
a687059c 6188 break;
a0d0e21e
LW
6189 case 'r':
6190 ender = '\r';
6191 p++;
a687059c 6192 break;
a0d0e21e
LW
6193 case 't':
6194 ender = '\t';
6195 p++;
a687059c 6196 break;
a0d0e21e
LW
6197 case 'f':
6198 ender = '\f';
6199 p++;
a687059c 6200 break;
a0d0e21e 6201 case 'e':
c7f1f016 6202 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6203 p++;
a687059c 6204 break;
a0d0e21e 6205 case 'a':
c7f1f016 6206 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6207 p++;
a687059c 6208 break;
a0d0e21e 6209 case 'x':
a0ed51b3 6210 if (*++p == '{') {
1df70142 6211 char* const e = strchr(p, '}');
b81d288d 6212
b45f050a 6213 if (!e) {
830247a4 6214 RExC_parse = p + 1;
b45f050a
JF
6215 vFAIL("Missing right brace on \\x{}");
6216 }
de5f0749 6217 else {
a4c04bdc
NC
6218 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6219 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6220 STRLEN numlen = e - p - 1;
53305cf1 6221 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6222 if (ender > 0xff)
6223 RExC_utf8 = 1;
a0ed51b3
LW
6224 p = e + 1;
6225 }
a0ed51b3
LW
6226 }
6227 else {
a4c04bdc 6228 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6229 STRLEN numlen = 2;
53305cf1 6230 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6231 p += numlen;
6232 }
a687059c 6233 break;
a0d0e21e
LW
6234 case 'c':
6235 p++;
bbce6d69 6236 ender = UCHARAT(p++);
6237 ender = toCTRL(ender);
a687059c 6238 break;
a0d0e21e
LW
6239 case '0': case '1': case '2': case '3':case '4':
6240 case '5': case '6': case '7': case '8':case '9':
6241 if (*p == '0' ||
830247a4 6242 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6243 I32 flags = 0;
1df70142 6244 STRLEN numlen = 3;
53305cf1 6245 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6246 p += numlen;
6247 }
6248 else {
6249 --p;
6250 goto loopdone;
a687059c
LW
6251 }
6252 break;
a0d0e21e 6253 case '\0':
830247a4 6254 if (p >= RExC_end)
b45f050a 6255 FAIL("Trailing \\");
a687059c 6256 /* FALL THROUGH */
a0d0e21e 6257 default:
041457d9 6258 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6259 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6260 goto normal_default;
a0d0e21e
LW
6261 }
6262 break;
a687059c 6263 default:
a0ed51b3 6264 normal_default:
fd400ab9 6265 if (UTF8_IS_START(*p) && UTF) {
1df70142 6266 STRLEN numlen;
5e12f4fb 6267 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6268 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6269 p += numlen;
6270 }
6271 else
6272 ender = *p++;
a0d0e21e 6273 break;
a687059c 6274 }
e2509266 6275 if (RExC_flags & PMf_EXTENDED)
830247a4 6276 p = regwhite(p, RExC_end);
60a8b682
JH
6277 if (UTF && FOLD) {
6278 /* Prime the casefolded buffer. */
ac7e0132 6279 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6280 }
a0d0e21e
LW
6281 if (ISMULT2(p)) { /* Back off on ?+*. */
6282 if (len)
6283 p = oldp;
16ea2a2e 6284 else if (UTF) {
80aecb99 6285 if (FOLD) {
60a8b682 6286 /* Emit all the Unicode characters. */
1df70142 6287 STRLEN numlen;
80aecb99
JH
6288 for (foldbuf = tmpbuf;
6289 foldlen;
6290 foldlen -= numlen) {
6291 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6292 if (numlen > 0) {
71207a34 6293 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6294 s += unilen;
6295 len += unilen;
6296 /* In EBCDIC the numlen
6297 * and unilen can differ. */
9dc45d57 6298 foldbuf += numlen;
47654450
JH
6299 if (numlen >= foldlen)
6300 break;
9dc45d57
JH
6301 }
6302 else
6303 break; /* "Can't happen." */
80aecb99
JH
6304 }
6305 }
6306 else {
71207a34 6307 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6308 if (unilen > 0) {
0ebc6274
JH
6309 s += unilen;
6310 len += unilen;
9dc45d57 6311 }
80aecb99 6312 }
a0ed51b3 6313 }
a0d0e21e
LW
6314 else {
6315 len++;
eb160463 6316 REGC((char)ender, s++);
a0d0e21e
LW
6317 }
6318 break;
a687059c 6319 }
16ea2a2e 6320 if (UTF) {
80aecb99 6321 if (FOLD) {
60a8b682 6322 /* Emit all the Unicode characters. */
1df70142 6323 STRLEN numlen;
80aecb99
JH
6324 for (foldbuf = tmpbuf;
6325 foldlen;
6326 foldlen -= numlen) {
6327 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6328 if (numlen > 0) {
71207a34 6329 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6330 len += unilen;
6331 s += unilen;
6332 /* In EBCDIC the numlen
6333 * and unilen can differ. */
9dc45d57 6334 foldbuf += numlen;
47654450
JH
6335 if (numlen >= foldlen)
6336 break;
9dc45d57
JH
6337 }
6338 else
6339 break;
80aecb99
JH
6340 }
6341 }
6342 else {
71207a34 6343 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6344 if (unilen > 0) {
0ebc6274
JH
6345 s += unilen;
6346 len += unilen;
9dc45d57 6347 }
80aecb99
JH
6348 }
6349 len--;
a0ed51b3
LW
6350 }
6351 else
eb160463 6352 REGC((char)ender, s++);
a0d0e21e
LW
6353 }
6354 loopdone:
830247a4 6355 RExC_parse = p - 1;
fac92740 6356 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6357 nextchar(pRExC_state);
793db0cb
JH
6358 {
6359 /* len is STRLEN which is unsigned, need to copy to signed */
6360 IV iv = len;
6361 if (iv < 0)
6362 vFAIL("Internal disaster");
6363 }
a0d0e21e
LW
6364 if (len > 0)
6365 *flagp |= HASWIDTH;
090f7165 6366 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6367 *flagp |= SIMPLE;
3dab1dad 6368
cd439c50 6369 if (SIZE_ONLY)
830247a4 6370 RExC_size += STR_SZ(len);
3dab1dad
YO
6371 else {
6372 STR_LEN(ret) = len;
830247a4 6373 RExC_emit += STR_SZ(len);
07be1b83 6374 }
3dab1dad 6375 }
a0d0e21e
LW
6376 break;
6377 }
a687059c 6378
60a8b682
JH
6379 /* If the encoding pragma is in effect recode the text of
6380 * any EXACT-kind nodes. */
fc8cd66c 6381 if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) {
3dab1dad
YO
6382 const STRLEN oldlen = STR_LEN(ret);
6383 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
6384
6385 if (RExC_utf8)
6386 SvUTF8_on(sv);
6387 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
6388 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
6389 const STRLEN newlen = SvCUR(sv);
d0063567
DK
6390
6391 if (SvUTF8(sv))
6392 RExC_utf8 = 1;
6393 if (!SIZE_ONLY) {
a3621e74
YO
6394 GET_RE_DEBUG_FLAGS_DECL;
6395 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
6396 (int)oldlen, STRING(ret),
6397 (int)newlen, s));
6398 Copy(s, STRING(ret), newlen, char);
6399 STR_LEN(ret) += newlen - oldlen;
6400 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
6401 } else
6402 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
6403 }
a72c7584
JH
6404 }
6405
a0d0e21e 6406 return(ret);
a687059c
LW
6407}
6408
873ef191 6409STATIC char *
5f66b61c 6410S_regwhite(char *p, const char *e)
5b5a24f7
CS
6411{
6412 while (p < e) {
6413 if (isSPACE(*p))
6414 ++p;
6415 else if (*p == '#') {
6416 do {
6417 p++;
6418 } while (p < e && *p != '\n');
6419 }
6420 else
6421 break;
6422 }
6423 return p;
6424}
6425
b8c5462f
JH
6426/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6427 Character classes ([:foo:]) can also be negated ([:^foo:]).
6428 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6429 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6430 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6431
6432#define POSIXCC_DONE(c) ((c) == ':')
6433#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6434#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6435
b8c5462f 6436STATIC I32
830247a4 6437S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6438{
97aff369 6439 dVAR;
936ed897 6440 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6441
830247a4 6442 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6443 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6444 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6445 const char c = UCHARAT(RExC_parse);
097eb12c 6446 char* const s = RExC_parse++;
b81d288d 6447
9a86a77b 6448 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6449 RExC_parse++;
6450 if (RExC_parse == RExC_end)
620e46c5 6451 /* Grandfather lone [:, [=, [. */
830247a4 6452 RExC_parse = s;
620e46c5 6453 else {
3dab1dad 6454 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6455 assert(*t == c);
6456
9a86a77b 6457 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6458 const char *posixcc = s + 1;
830247a4 6459 RExC_parse++; /* skip over the ending ] */
3dab1dad 6460
b8c5462f 6461 if (*s == ':') {
1df70142
AL
6462 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6463 const I32 skip = t - posixcc;
80916619
NC
6464
6465 /* Initially switch on the length of the name. */
6466 switch (skip) {
6467 case 4:
3dab1dad
YO
6468 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6469 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6470 break;
80916619
NC
6471 case 5:
6472 /* Names all of length 5. */
6473 /* alnum alpha ascii blank cntrl digit graph lower
6474 print punct space upper */
6475 /* Offset 4 gives the best switch position. */
6476 switch (posixcc[4]) {
6477 case 'a':
3dab1dad
YO
6478 if (memEQ(posixcc, "alph", 4)) /* alpha */
6479 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6480 break;
6481 case 'e':
3dab1dad
YO
6482 if (memEQ(posixcc, "spac", 4)) /* space */
6483 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6484 break;
6485 case 'h':
3dab1dad
YO
6486 if (memEQ(posixcc, "grap", 4)) /* graph */
6487 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6488 break;
6489 case 'i':
3dab1dad
YO
6490 if (memEQ(posixcc, "asci", 4)) /* ascii */
6491 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6492 break;
6493 case 'k':
3dab1dad
YO
6494 if (memEQ(posixcc, "blan", 4)) /* blank */
6495 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6496 break;
6497 case 'l':
3dab1dad
YO
6498 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6499 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6500 break;
6501 case 'm':
3dab1dad
YO
6502 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6503 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6504 break;
6505 case 'r':
3dab1dad
YO
6506 if (memEQ(posixcc, "lowe", 4)) /* lower */
6507 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6508 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6509 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6510 break;
6511 case 't':
3dab1dad
YO
6512 if (memEQ(posixcc, "digi", 4)) /* digit */
6513 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6514 else if (memEQ(posixcc, "prin", 4)) /* print */
6515 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6516 else if (memEQ(posixcc, "punc", 4)) /* punct */
6517 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6518 break;
b8c5462f
JH
6519 }
6520 break;
80916619 6521 case 6:
3dab1dad
YO
6522 if (memEQ(posixcc, "xdigit", 6))
6523 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6524 break;
6525 }
80916619
NC
6526
6527 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6528 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6529 t - s - 1, s + 1);
80916619
NC
6530 assert (posixcc[skip] == ':');
6531 assert (posixcc[skip+1] == ']');
b45f050a 6532 } else if (!SIZE_ONLY) {
b8c5462f 6533 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6534
830247a4 6535 /* adjust RExC_parse so the warning shows after
b45f050a 6536 the class closes */
9a86a77b 6537 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6538 RExC_parse++;
b45f050a
JF
6539 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6540 }
b8c5462f
JH
6541 } else {
6542 /* Maternal grandfather:
6543 * "[:" ending in ":" but not in ":]" */
830247a4 6544 RExC_parse = s;
767d463e 6545 }
620e46c5
JH
6546 }
6547 }
6548
b8c5462f
JH
6549 return namedclass;
6550}
6551
6552STATIC void
830247a4 6553S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6554{
97aff369 6555 dVAR;
3dab1dad 6556 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6557 const char *s = RExC_parse;
6558 const char c = *s++;
b8c5462f 6559
3dab1dad 6560 while (isALNUM(*s))
b8c5462f
JH
6561 s++;
6562 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6563 if (ckWARN(WARN_REGEXP))
6564 vWARN3(s+2,
6565 "POSIX syntax [%c %c] belongs inside character classes",
6566 c, c);
b45f050a
JF
6567
6568 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6569 if (POSIXCC_NOTYET(c)) {
830247a4 6570 /* adjust RExC_parse so the error shows after
b45f050a 6571 the class closes */
9a86a77b 6572 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6573 NOOP;
b45f050a
JF
6574 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6575 }
b8c5462f
JH
6576 }
6577 }
620e46c5
JH
6578}
6579
7f6f358c
YO
6580
6581/*
6582 parse a class specification and produce either an ANYOF node that
6583 matches the pattern. If the pattern matches a single char only and
6584 that char is < 256 then we produce an EXACT node instead.
6585*/
76e3520e 6586STATIC regnode *
3dab1dad 6587S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6588{
97aff369 6589 dVAR;
9ef43ace 6590 register UV value = 0;
9a86a77b 6591 register UV nextvalue;
3568d838 6592 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6593 register IV range = 0;
c277df42 6594 register regnode *ret;
ba210ebe 6595 STRLEN numlen;
ffc61ed2 6596 IV namedclass;
cbbf8932 6597 char *rangebegin = NULL;
936ed897 6598 bool need_class = 0;
c445ea15 6599 SV *listsv = NULL;
ffc61ed2 6600 UV n;
9e55ce06 6601 bool optimize_invert = TRUE;
cbbf8932 6602 AV* unicode_alternate = NULL;
1b2d223b
JH
6603#ifdef EBCDIC
6604 UV literal_endpoint = 0;
6605#endif
7f6f358c 6606 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6607
3dab1dad 6608 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6609 case we need to change the emitted regop to an EXACT. */
07be1b83 6610 const char * orig_parse = RExC_parse;
72f13be8 6611 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6612#ifndef DEBUGGING
6613 PERL_UNUSED_ARG(depth);
6614#endif
72f13be8 6615
3dab1dad 6616 DEBUG_PARSE("clas");
7f6f358c
YO
6617
6618 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6619 ret = reganode(pRExC_state, ANYOF, 0);
6620
6621 if (!SIZE_ONLY)
6622 ANYOF_FLAGS(ret) = 0;
6623
9a86a77b 6624 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6625 RExC_naughty++;
6626 RExC_parse++;
6627 if (!SIZE_ONLY)
6628 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6629 }
a0d0e21e 6630
73060fc4 6631 if (SIZE_ONLY) {
830247a4 6632 RExC_size += ANYOF_SKIP;
73060fc4
JH
6633 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6634 }
936ed897 6635 else {
830247a4 6636 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6637 if (FOLD)
6638 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6639 if (LOC)
6640 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6641 ANYOF_BITMAP_ZERO(ret);
396482e1 6642 listsv = newSVpvs("# comment\n");
a0d0e21e 6643 }
b8c5462f 6644
9a86a77b
JH
6645 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6646
b938889d 6647 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6648 checkposixcc(pRExC_state);
b8c5462f 6649
f064b6ad
HS
6650 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6651 if (UCHARAT(RExC_parse) == ']')
6652 goto charclassloop;
ffc61ed2 6653
fc8cd66c 6654parseit:
9a86a77b 6655 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6656
6657 charclassloop:
6658
6659 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6660
73b437c8 6661 if (!range)
830247a4 6662 rangebegin = RExC_parse;
ffc61ed2 6663 if (UTF) {
5e12f4fb 6664 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6665 RExC_end - RExC_parse,
9f7f3913 6666 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6667 RExC_parse += numlen;
6668 }
6669 else
6670 value = UCHARAT(RExC_parse++);
7f6f358c 6671
9a86a77b
JH
6672 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6673 if (value == '[' && POSIXCC(nextvalue))
830247a4 6674 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6675 else if (value == '\\') {
ffc61ed2 6676 if (UTF) {
5e12f4fb 6677 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6678 RExC_end - RExC_parse,
9f7f3913 6679 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6680 RExC_parse += numlen;
6681 }
6682 else
6683 value = UCHARAT(RExC_parse++);
470c3474 6684 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6685 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6686 * be a problem later if we want switch on Unicode.
6687 * A similar issue a little bit later when switching on
6688 * namedclass. --jhi */
ffc61ed2 6689 switch ((I32)value) {
b8c5462f
JH
6690 case 'w': namedclass = ANYOF_ALNUM; break;
6691 case 'W': namedclass = ANYOF_NALNUM; break;
6692 case 's': namedclass = ANYOF_SPACE; break;
6693 case 'S': namedclass = ANYOF_NSPACE; break;
6694 case 'd': namedclass = ANYOF_DIGIT; break;
6695 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6696 case 'N': /* Handle \N{NAME} in class */
6697 {
6698 /* We only pay attention to the first char of
6699 multichar strings being returned. I kinda wonder
6700 if this makes sense as it does change the behaviour
6701 from earlier versions, OTOH that behaviour was broken
6702 as well. */
6703 UV v; /* value is register so we cant & it /grrr */
6704 if (reg_namedseq(pRExC_state, &v)) {
6705 goto parseit;
6706 }
6707 value= v;
6708 }
6709 break;
ffc61ed2
JH
6710 case 'p':
6711 case 'P':
3dab1dad
YO
6712 {
6713 char *e;
af6f566e 6714 if (RExC_parse >= RExC_end)
2a4859cd 6715 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6716 if (*RExC_parse == '{') {
1df70142 6717 const U8 c = (U8)value;
ffc61ed2
JH
6718 e = strchr(RExC_parse++, '}');
6719 if (!e)
0da60cf5 6720 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6721 while (isSPACE(UCHARAT(RExC_parse)))
6722 RExC_parse++;
6723 if (e == RExC_parse)
0da60cf5 6724 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6725 n = e - RExC_parse;
ab13f0c7
JH
6726 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6727 n--;
ffc61ed2
JH
6728 }
6729 else {
6730 e = RExC_parse;
6731 n = 1;
6732 }
6733 if (!SIZE_ONLY) {
ab13f0c7
JH
6734 if (UCHARAT(RExC_parse) == '^') {
6735 RExC_parse++;
6736 n--;
6737 value = value == 'p' ? 'P' : 'p'; /* toggle */
6738 while (isSPACE(UCHARAT(RExC_parse))) {
6739 RExC_parse++;
6740 n--;
6741 }
6742 }
097eb12c
AL
6743 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6744 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6745 }
6746 RExC_parse = e + 1;
6747 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6748 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6749 }
f81125e2 6750 break;
b8c5462f
JH
6751 case 'n': value = '\n'; break;
6752 case 'r': value = '\r'; break;
6753 case 't': value = '\t'; break;
6754 case 'f': value = '\f'; break;
6755 case 'b': value = '\b'; break;
c7f1f016
NIS
6756 case 'e': value = ASCII_TO_NATIVE('\033');break;
6757 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6758 case 'x':
ffc61ed2 6759 if (*RExC_parse == '{') {
a4c04bdc
NC
6760 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6761 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6762 char * const e = strchr(RExC_parse++, '}');
b81d288d 6763 if (!e)
ffc61ed2 6764 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6765
6766 numlen = e - RExC_parse;
6767 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6768 RExC_parse = e + 1;
6769 }
6770 else {
a4c04bdc 6771 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6772 numlen = 2;
6773 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6774 RExC_parse += numlen;
6775 }
b8c5462f
JH
6776 break;
6777 case 'c':
830247a4 6778 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6779 value = toCTRL(value);
6780 break;
6781 case '0': case '1': case '2': case '3': case '4':
6782 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
6783 {
6784 I32 flags = 0;
6785 numlen = 3;
6786 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 6787 RExC_parse += numlen;
b8c5462f 6788 break;
53305cf1 6789 }
1028017a 6790 default:
041457d9 6791 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6792 vWARN2(RExC_parse,
6793 "Unrecognized escape \\%c in character class passed through",
6794 (int)value);
1028017a 6795 break;
b8c5462f 6796 }
ffc61ed2 6797 } /* end of \blah */
1b2d223b
JH
6798#ifdef EBCDIC
6799 else
6800 literal_endpoint++;
6801#endif
ffc61ed2
JH
6802
6803 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6804
6805 if (!SIZE_ONLY && !need_class)
936ed897 6806 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6807
936ed897 6808 need_class = 1;
ffc61ed2
JH
6809
6810 /* a bad range like a-\d, a-[:digit:] ? */
6811 if (range) {
73b437c8 6812 if (!SIZE_ONLY) {
afd78fd5 6813 if (ckWARN(WARN_REGEXP)) {
097eb12c 6814 const int w =
afd78fd5
JH
6815 RExC_parse >= rangebegin ?
6816 RExC_parse - rangebegin : 0;
830247a4 6817 vWARN4(RExC_parse,
b45f050a 6818 "False [] range \"%*.*s\"",
097eb12c 6819 w, w, rangebegin);
afd78fd5 6820 }
3568d838
JH
6821 if (prevvalue < 256) {
6822 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6823 ANYOF_BITMAP_SET(ret, '-');
6824 }
6825 else {
6826 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6827 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6828 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6829 }
b8c5462f 6830 }
ffc61ed2
JH
6831
6832 range = 0; /* this was not a true range */
73b437c8 6833 }
ffc61ed2 6834
73b437c8 6835 if (!SIZE_ONLY) {
c49a72a9
NC
6836 const char *what = NULL;
6837 char yesno = 0;
6838
3568d838
JH
6839 if (namedclass > OOB_NAMEDCLASS)
6840 optimize_invert = FALSE;
e2962f66
JH
6841 /* Possible truncation here but in some 64-bit environments
6842 * the compiler gets heartburn about switch on 64-bit values.
6843 * A similar issue a little earlier when switching on value.
98f323fa 6844 * --jhi */
e2962f66 6845 switch ((I32)namedclass) {
73b437c8
JH
6846 case ANYOF_ALNUM:
6847 if (LOC)
936ed897 6848 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6849 else {
6850 for (value = 0; value < 256; value++)
6851 if (isALNUM(value))
936ed897 6852 ANYOF_BITMAP_SET(ret, value);
73b437c8 6853 }
c49a72a9
NC
6854 yesno = '+';
6855 what = "Word";
73b437c8
JH
6856 break;
6857 case ANYOF_NALNUM:
6858 if (LOC)
936ed897 6859 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6860 else {
6861 for (value = 0; value < 256; value++)
6862 if (!isALNUM(value))
936ed897 6863 ANYOF_BITMAP_SET(ret, value);
73b437c8 6864 }
c49a72a9
NC
6865 yesno = '!';
6866 what = "Word";
73b437c8 6867 break;
ffc61ed2 6868 case ANYOF_ALNUMC:
73b437c8 6869 if (LOC)
ffc61ed2 6870 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6871 else {
6872 for (value = 0; value < 256; value++)
ffc61ed2 6873 if (isALNUMC(value))
936ed897 6874 ANYOF_BITMAP_SET(ret, value);
73b437c8 6875 }
c49a72a9
NC
6876 yesno = '+';
6877 what = "Alnum";
73b437c8
JH
6878 break;
6879 case ANYOF_NALNUMC:
6880 if (LOC)
936ed897 6881 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
6882 else {
6883 for (value = 0; value < 256; value++)
6884 if (!isALNUMC(value))
936ed897 6885 ANYOF_BITMAP_SET(ret, value);
73b437c8 6886 }
c49a72a9
NC
6887 yesno = '!';
6888 what = "Alnum";
73b437c8
JH
6889 break;
6890 case ANYOF_ALPHA:
6891 if (LOC)
936ed897 6892 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
6893 else {
6894 for (value = 0; value < 256; value++)
6895 if (isALPHA(value))
936ed897 6896 ANYOF_BITMAP_SET(ret, value);
73b437c8 6897 }
c49a72a9
NC
6898 yesno = '+';
6899 what = "Alpha";
73b437c8
JH
6900 break;
6901 case ANYOF_NALPHA:
6902 if (LOC)
936ed897 6903 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6904 else {
6905 for (value = 0; value < 256; value++)
6906 if (!isALPHA(value))
936ed897 6907 ANYOF_BITMAP_SET(ret, value);
73b437c8 6908 }
c49a72a9
NC
6909 yesno = '!';
6910 what = "Alpha";
73b437c8
JH
6911 break;
6912 case ANYOF_ASCII:
6913 if (LOC)
936ed897 6914 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6915 else {
c7f1f016 6916#ifndef EBCDIC
1ba5c669
JH
6917 for (value = 0; value < 128; value++)
6918 ANYOF_BITMAP_SET(ret, value);
6919#else /* EBCDIC */
ffbc6a93 6920 for (value = 0; value < 256; value++) {
3a3c4447
JH
6921 if (isASCII(value))
6922 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6923 }
1ba5c669 6924#endif /* EBCDIC */
73b437c8 6925 }
c49a72a9
NC
6926 yesno = '+';
6927 what = "ASCII";
73b437c8
JH
6928 break;
6929 case ANYOF_NASCII:
6930 if (LOC)
936ed897 6931 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6932 else {
c7f1f016 6933#ifndef EBCDIC
1ba5c669
JH
6934 for (value = 128; value < 256; value++)
6935 ANYOF_BITMAP_SET(ret, value);
6936#else /* EBCDIC */
ffbc6a93 6937 for (value = 0; value < 256; value++) {
3a3c4447
JH
6938 if (!isASCII(value))
6939 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6940 }
1ba5c669 6941#endif /* EBCDIC */
73b437c8 6942 }
c49a72a9
NC
6943 yesno = '!';
6944 what = "ASCII";
73b437c8 6945 break;
aaa51d5e
JF
6946 case ANYOF_BLANK:
6947 if (LOC)
6948 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6949 else {
6950 for (value = 0; value < 256; value++)
6951 if (isBLANK(value))
6952 ANYOF_BITMAP_SET(ret, value);
6953 }
c49a72a9
NC
6954 yesno = '+';
6955 what = "Blank";
aaa51d5e
JF
6956 break;
6957 case ANYOF_NBLANK:
6958 if (LOC)
6959 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6960 else {
6961 for (value = 0; value < 256; value++)
6962 if (!isBLANK(value))
6963 ANYOF_BITMAP_SET(ret, value);
6964 }
c49a72a9
NC
6965 yesno = '!';
6966 what = "Blank";
aaa51d5e 6967 break;
73b437c8
JH
6968 case ANYOF_CNTRL:
6969 if (LOC)
936ed897 6970 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6971 else {
6972 for (value = 0; value < 256; value++)
6973 if (isCNTRL(value))
936ed897 6974 ANYOF_BITMAP_SET(ret, value);
73b437c8 6975 }
c49a72a9
NC
6976 yesno = '+';
6977 what = "Cntrl";
73b437c8
JH
6978 break;
6979 case ANYOF_NCNTRL:
6980 if (LOC)
936ed897 6981 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6982 else {
6983 for (value = 0; value < 256; value++)
6984 if (!isCNTRL(value))
936ed897 6985 ANYOF_BITMAP_SET(ret, value);
73b437c8 6986 }
c49a72a9
NC
6987 yesno = '!';
6988 what = "Cntrl";
ffc61ed2
JH
6989 break;
6990 case ANYOF_DIGIT:
6991 if (LOC)
6992 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6993 else {
6994 /* consecutive digits assumed */
6995 for (value = '0'; value <= '9'; value++)
6996 ANYOF_BITMAP_SET(ret, value);
6997 }
c49a72a9
NC
6998 yesno = '+';
6999 what = "Digit";
ffc61ed2
JH
7000 break;
7001 case ANYOF_NDIGIT:
7002 if (LOC)
7003 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7004 else {
7005 /* consecutive digits assumed */
7006 for (value = 0; value < '0'; value++)
7007 ANYOF_BITMAP_SET(ret, value);
7008 for (value = '9' + 1; value < 256; value++)
7009 ANYOF_BITMAP_SET(ret, value);
7010 }
c49a72a9
NC
7011 yesno = '!';
7012 what = "Digit";
73b437c8
JH
7013 break;
7014 case ANYOF_GRAPH:
7015 if (LOC)
936ed897 7016 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7017 else {
7018 for (value = 0; value < 256; value++)
7019 if (isGRAPH(value))
936ed897 7020 ANYOF_BITMAP_SET(ret, value);
73b437c8 7021 }
c49a72a9
NC
7022 yesno = '+';
7023 what = "Graph";
73b437c8
JH
7024 break;
7025 case ANYOF_NGRAPH:
7026 if (LOC)
936ed897 7027 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7028 else {
7029 for (value = 0; value < 256; value++)
7030 if (!isGRAPH(value))
936ed897 7031 ANYOF_BITMAP_SET(ret, value);
73b437c8 7032 }
c49a72a9
NC
7033 yesno = '!';
7034 what = "Graph";
73b437c8
JH
7035 break;
7036 case ANYOF_LOWER:
7037 if (LOC)
936ed897 7038 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7039 else {
7040 for (value = 0; value < 256; value++)
7041 if (isLOWER(value))
936ed897 7042 ANYOF_BITMAP_SET(ret, value);
73b437c8 7043 }
c49a72a9
NC
7044 yesno = '+';
7045 what = "Lower";
73b437c8
JH
7046 break;
7047 case ANYOF_NLOWER:
7048 if (LOC)
936ed897 7049 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7050 else {
7051 for (value = 0; value < 256; value++)
7052 if (!isLOWER(value))
936ed897 7053 ANYOF_BITMAP_SET(ret, value);
73b437c8 7054 }
c49a72a9
NC
7055 yesno = '!';
7056 what = "Lower";
73b437c8
JH
7057 break;
7058 case ANYOF_PRINT:
7059 if (LOC)
936ed897 7060 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7061 else {
7062 for (value = 0; value < 256; value++)
7063 if (isPRINT(value))
936ed897 7064 ANYOF_BITMAP_SET(ret, value);
73b437c8 7065 }
c49a72a9
NC
7066 yesno = '+';
7067 what = "Print";
73b437c8
JH
7068 break;
7069 case ANYOF_NPRINT:
7070 if (LOC)
936ed897 7071 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7072 else {
7073 for (value = 0; value < 256; value++)
7074 if (!isPRINT(value))
936ed897 7075 ANYOF_BITMAP_SET(ret, value);
73b437c8 7076 }
c49a72a9
NC
7077 yesno = '!';
7078 what = "Print";
73b437c8 7079 break;
aaa51d5e
JF
7080 case ANYOF_PSXSPC:
7081 if (LOC)
7082 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7083 else {
7084 for (value = 0; value < 256; value++)
7085 if (isPSXSPC(value))
7086 ANYOF_BITMAP_SET(ret, value);
7087 }
c49a72a9
NC
7088 yesno = '+';
7089 what = "Space";
aaa51d5e
JF
7090 break;
7091 case ANYOF_NPSXSPC:
7092 if (LOC)
7093 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7094 else {
7095 for (value = 0; value < 256; value++)
7096 if (!isPSXSPC(value))
7097 ANYOF_BITMAP_SET(ret, value);
7098 }
c49a72a9
NC
7099 yesno = '!';
7100 what = "Space";
aaa51d5e 7101 break;
73b437c8
JH
7102 case ANYOF_PUNCT:
7103 if (LOC)
936ed897 7104 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7105 else {
7106 for (value = 0; value < 256; value++)
7107 if (isPUNCT(value))
936ed897 7108 ANYOF_BITMAP_SET(ret, value);
73b437c8 7109 }
c49a72a9
NC
7110 yesno = '+';
7111 what = "Punct";
73b437c8
JH
7112 break;
7113 case ANYOF_NPUNCT:
7114 if (LOC)
936ed897 7115 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7116 else {
7117 for (value = 0; value < 256; value++)
7118 if (!isPUNCT(value))
936ed897 7119 ANYOF_BITMAP_SET(ret, value);
73b437c8 7120 }
c49a72a9
NC
7121 yesno = '!';
7122 what = "Punct";
ffc61ed2
JH
7123 break;
7124 case ANYOF_SPACE:
7125 if (LOC)
7126 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7127 else {
7128 for (value = 0; value < 256; value++)
7129 if (isSPACE(value))
7130 ANYOF_BITMAP_SET(ret, value);
7131 }
c49a72a9
NC
7132 yesno = '+';
7133 what = "SpacePerl";
ffc61ed2
JH
7134 break;
7135 case ANYOF_NSPACE:
7136 if (LOC)
7137 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7138 else {
7139 for (value = 0; value < 256; value++)
7140 if (!isSPACE(value))
7141 ANYOF_BITMAP_SET(ret, value);
7142 }
c49a72a9
NC
7143 yesno = '!';
7144 what = "SpacePerl";
73b437c8
JH
7145 break;
7146 case ANYOF_UPPER:
7147 if (LOC)
936ed897 7148 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7149 else {
7150 for (value = 0; value < 256; value++)
7151 if (isUPPER(value))
936ed897 7152 ANYOF_BITMAP_SET(ret, value);
73b437c8 7153 }
c49a72a9
NC
7154 yesno = '+';
7155 what = "Upper";
73b437c8
JH
7156 break;
7157 case ANYOF_NUPPER:
7158 if (LOC)
936ed897 7159 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7160 else {
7161 for (value = 0; value < 256; value++)
7162 if (!isUPPER(value))
936ed897 7163 ANYOF_BITMAP_SET(ret, value);
73b437c8 7164 }
c49a72a9
NC
7165 yesno = '!';
7166 what = "Upper";
73b437c8
JH
7167 break;
7168 case ANYOF_XDIGIT:
7169 if (LOC)
936ed897 7170 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7171 else {
7172 for (value = 0; value < 256; value++)
7173 if (isXDIGIT(value))
936ed897 7174 ANYOF_BITMAP_SET(ret, value);
73b437c8 7175 }
c49a72a9
NC
7176 yesno = '+';
7177 what = "XDigit";
73b437c8
JH
7178 break;
7179 case ANYOF_NXDIGIT:
7180 if (LOC)
936ed897 7181 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7182 else {
7183 for (value = 0; value < 256; value++)
7184 if (!isXDIGIT(value))
936ed897 7185 ANYOF_BITMAP_SET(ret, value);
73b437c8 7186 }
c49a72a9
NC
7187 yesno = '!';
7188 what = "XDigit";
73b437c8 7189 break;
f81125e2
JP
7190 case ANYOF_MAX:
7191 /* this is to handle \p and \P */
7192 break;
73b437c8 7193 default:
b45f050a 7194 vFAIL("Invalid [::] class");
73b437c8 7195 break;
b8c5462f 7196 }
c49a72a9
NC
7197 if (what) {
7198 /* Strings such as "+utf8::isWord\n" */
7199 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7200 }
b8c5462f 7201 if (LOC)
936ed897 7202 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7203 continue;
a0d0e21e 7204 }
ffc61ed2
JH
7205 } /* end of namedclass \blah */
7206
a0d0e21e 7207 if (range) {
eb160463 7208 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7209 const int w = RExC_parse - rangebegin;
7210 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7211 range = 0; /* not a valid range */
73b437c8 7212 }
a0d0e21e
LW
7213 }
7214 else {
3568d838 7215 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7216 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7217 RExC_parse[1] != ']') {
7218 RExC_parse++;
ffc61ed2
JH
7219
7220 /* a bad range like \w-, [:word:]- ? */
7221 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7222 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7223 const int w =
afd78fd5
JH
7224 RExC_parse >= rangebegin ?
7225 RExC_parse - rangebegin : 0;
830247a4 7226 vWARN4(RExC_parse,
b45f050a 7227 "False [] range \"%*.*s\"",
097eb12c 7228 w, w, rangebegin);
afd78fd5 7229 }
73b437c8 7230 if (!SIZE_ONLY)
936ed897 7231 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7232 } else
ffc61ed2
JH
7233 range = 1; /* yeah, it's a range! */
7234 continue; /* but do it the next time */
a0d0e21e 7235 }
a687059c 7236 }
ffc61ed2 7237
93733859 7238 /* now is the next time */
07be1b83 7239 /*stored += (value - prevvalue + 1);*/
ae5c130c 7240 if (!SIZE_ONLY) {
3568d838 7241 if (prevvalue < 256) {
1df70142 7242 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7243 IV i;
3568d838 7244#ifdef EBCDIC
1b2d223b
JH
7245 /* In EBCDIC [\x89-\x91] should include
7246 * the \x8e but [i-j] should not. */
7247 if (literal_endpoint == 2 &&
7248 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7249 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7250 {
3568d838
JH
7251 if (isLOWER(prevvalue)) {
7252 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7253 if (isLOWER(i))
7254 ANYOF_BITMAP_SET(ret, i);
7255 } else {
3568d838 7256 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7257 if (isUPPER(i))
7258 ANYOF_BITMAP_SET(ret, i);
7259 }
8ada0baa 7260 }
ffc61ed2 7261 else
8ada0baa 7262#endif
07be1b83
YO
7263 for (i = prevvalue; i <= ceilvalue; i++) {
7264 if (!ANYOF_BITMAP_TEST(ret,i)) {
7265 stored++;
7266 ANYOF_BITMAP_SET(ret, i);
7267 }
7268 }
3568d838 7269 }
a5961de5 7270 if (value > 255 || UTF) {
1df70142
AL
7271 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7272 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7273 stored+=2; /* can't optimize this class */
ffc61ed2 7274 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7275 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7276 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7277 prevnatvalue, natvalue);
7278 }
7279 else if (prevnatvalue == natvalue) {
7280 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7281 if (FOLD) {
89ebb4a3 7282 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7283 STRLEN foldlen;
1df70142 7284 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7285
e294cc5d
JH
7286#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7287 if (RExC_precomp[0] == ':' &&
7288 RExC_precomp[1] == '[' &&
7289 (f == 0xDF || f == 0x92)) {
7290 f = NATIVE_TO_UNI(f);
7291 }
7292#endif
c840d2a2
JH
7293 /* If folding and foldable and a single
7294 * character, insert also the folded version
7295 * to the charclass. */
9e55ce06 7296 if (f != value) {
e294cc5d
JH
7297#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7298 if ((RExC_precomp[0] == ':' &&
7299 RExC_precomp[1] == '[' &&
7300 (f == 0xA2 &&
7301 (value == 0xFB05 || value == 0xFB06))) ?
7302 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7303 foldlen == (STRLEN)UNISKIP(f) )
7304#else
eb160463 7305 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7306#endif
9e55ce06
JH
7307 Perl_sv_catpvf(aTHX_ listsv,
7308 "%04"UVxf"\n", f);
7309 else {
7310 /* Any multicharacter foldings
7311 * require the following transform:
7312 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7313 * where E folds into "pq" and F folds
7314 * into "rst", all other characters
7315 * fold to single characters. We save
7316 * away these multicharacter foldings,
7317 * to be later saved as part of the
7318 * additional "s" data. */
7319 SV *sv;
7320
7321 if (!unicode_alternate)
7322 unicode_alternate = newAV();
7323 sv = newSVpvn((char*)foldbuf, foldlen);
7324 SvUTF8_on(sv);
7325 av_push(unicode_alternate, sv);
7326 }
7327 }
254ba52a 7328
60a8b682
JH
7329 /* If folding and the value is one of the Greek
7330 * sigmas insert a few more sigmas to make the
7331 * folding rules of the sigmas to work right.
7332 * Note that not all the possible combinations
7333 * are handled here: some of them are handled
9e55ce06
JH
7334 * by the standard folding rules, and some of
7335 * them (literal or EXACTF cases) are handled
7336 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7337 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7338 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7339 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7340 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7341 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7342 }
7343 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7344 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7345 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7346 }
7347 }
ffc61ed2 7348 }
1b2d223b
JH
7349#ifdef EBCDIC
7350 literal_endpoint = 0;
7351#endif
8ada0baa 7352 }
ffc61ed2
JH
7353
7354 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7355 }
ffc61ed2 7356
936ed897 7357 if (need_class) {
4f66b38d 7358 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7359 if (SIZE_ONLY)
830247a4 7360 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7361 else
830247a4 7362 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7363 }
ffc61ed2 7364
7f6f358c
YO
7365
7366 if (SIZE_ONLY)
7367 return ret;
7368 /****** !SIZE_ONLY AFTER HERE *********/
7369
7370 if( stored == 1 && value < 256
7371 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7372 ) {
7373 /* optimize single char class to an EXACT node
7374 but *only* when its not a UTF/high char */
07be1b83
YO
7375 const char * cur_parse= RExC_parse;
7376 RExC_emit = (regnode *)orig_emit;
7377 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7378 ret = reg_node(pRExC_state,
7379 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7380 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7381 *STRING(ret)= (char)value;
7382 STR_LEN(ret)= 1;
7383 RExC_emit += STR_SZ(1);
7384 return ret;
7385 }
ae5c130c 7386 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7387 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7388 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7389 ) {
a0ed51b3 7390 for (value = 0; value < 256; ++value) {
936ed897 7391 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7392 UV fold = PL_fold[value];
ffc61ed2
JH
7393
7394 if (fold != value)
7395 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7396 }
7397 }
936ed897 7398 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7399 }
ffc61ed2 7400
ae5c130c 7401 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7402 if (optimize_invert &&
ffc61ed2
JH
7403 /* If the only flag is inversion. */
7404 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7405 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7406 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7407 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7408 }
7f6f358c 7409 {
097eb12c 7410 AV * const av = newAV();
ffc61ed2 7411 SV *rv;
9e55ce06 7412 /* The 0th element stores the character class description
6a0407ee 7413 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7414 * to initialize the appropriate swash (which gets stored in
7415 * the 1st element), and also useful for dumping the regnode.
7416 * The 2nd element stores the multicharacter foldings,
6a0407ee 7417 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7418 av_store(av, 0, listsv);
7419 av_store(av, 1, NULL);
9e55ce06 7420 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7421 rv = newRV_noinc((SV*)av);
19860706 7422 n = add_data(pRExC_state, 1, "s");
830247a4 7423 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7424 ARG_SET(ret, n);
a0ed51b3 7425 }
a0ed51b3
LW
7426 return ret;
7427}
7428
76e3520e 7429STATIC char*
830247a4 7430S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7431{
097eb12c 7432 char* const retval = RExC_parse++;
a0d0e21e 7433
4633a7c4 7434 for (;;) {
830247a4
IZ
7435 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7436 RExC_parse[2] == '#') {
e994fd66
AE
7437 while (*RExC_parse != ')') {
7438 if (RExC_parse == RExC_end)
7439 FAIL("Sequence (?#... not terminated");
830247a4 7440 RExC_parse++;
e994fd66 7441 }
830247a4 7442 RExC_parse++;
4633a7c4
LW
7443 continue;
7444 }
e2509266 7445 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7446 if (isSPACE(*RExC_parse)) {
7447 RExC_parse++;
748a9306
LW
7448 continue;
7449 }
830247a4 7450 else if (*RExC_parse == '#') {
e994fd66
AE
7451 while (RExC_parse < RExC_end)
7452 if (*RExC_parse++ == '\n') break;
748a9306
LW
7453 continue;
7454 }
748a9306 7455 }
4633a7c4 7456 return retval;
a0d0e21e 7457 }
a687059c
LW
7458}
7459
7460/*
c277df42 7461- reg_node - emit a node
a0d0e21e 7462*/
76e3520e 7463STATIC regnode * /* Location. */
830247a4 7464S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7465{
97aff369 7466 dVAR;
c277df42 7467 register regnode *ptr;
504618e9 7468 regnode * const ret = RExC_emit;
07be1b83 7469 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7470
c277df42 7471 if (SIZE_ONLY) {
830247a4
IZ
7472 SIZE_ALIGN(RExC_size);
7473 RExC_size += 1;
a0d0e21e
LW
7474 return(ret);
7475 }
c277df42 7476 NODE_ALIGN_FILL(ret);
a0d0e21e 7477 ptr = ret;
c277df42 7478 FILL_ADVANCE_NODE(ptr, op);
fac92740 7479 if (RExC_offsets) { /* MJD */
07be1b83 7480 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7481 "reg_node", __LINE__,
7482 reg_name[op],
07be1b83
YO
7483 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7484 ? "Overwriting end of array!\n" : "OK",
7485 (UV)(RExC_emit - RExC_emit_start),
7486 (UV)(RExC_parse - RExC_start),
7487 (UV)RExC_offsets[0]));
ccb2c380 7488 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7489 }
07be1b83 7490
830247a4 7491 RExC_emit = ptr;
a687059c 7492
a0d0e21e 7493 return(ret);
a687059c
LW
7494}
7495
7496/*
a0d0e21e
LW
7497- reganode - emit a node with an argument
7498*/
76e3520e 7499STATIC regnode * /* Location. */
830247a4 7500S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7501{
97aff369 7502 dVAR;
c277df42 7503 register regnode *ptr;
504618e9 7504 regnode * const ret = RExC_emit;
07be1b83 7505 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7506
c277df42 7507 if (SIZE_ONLY) {
830247a4
IZ
7508 SIZE_ALIGN(RExC_size);
7509 RExC_size += 2;
6bda09f9
YO
7510 /*
7511 We can't do this:
7512
7513 assert(2==regarglen[op]+1);
7514
7515 Anything larger than this has to allocate the extra amount.
7516 If we changed this to be:
7517
7518 RExC_size += (1 + regarglen[op]);
7519
7520 then it wouldn't matter. Its not clear what side effect
7521 might come from that so its not done so far.
7522 -- dmq
7523 */
a0d0e21e
LW
7524 return(ret);
7525 }
fe14fcc3 7526
c277df42 7527 NODE_ALIGN_FILL(ret);
a0d0e21e 7528 ptr = ret;
c277df42 7529 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7530 if (RExC_offsets) { /* MJD */
07be1b83 7531 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7532 "reganode",
ccb2c380
MP
7533 __LINE__,
7534 reg_name[op],
07be1b83 7535 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7536 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7537 (UV)(RExC_emit - RExC_emit_start),
7538 (UV)(RExC_parse - RExC_start),
7539 (UV)RExC_offsets[0]));
ccb2c380 7540 Set_Cur_Node_Offset;
fac92740
MJD
7541 }
7542
830247a4 7543 RExC_emit = ptr;
fe14fcc3 7544
a0d0e21e 7545 return(ret);
fe14fcc3
LW
7546}
7547
7548/*
cd439c50 7549- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7550*/
71207a34
AL
7551STATIC STRLEN
7552S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7553{
97aff369 7554 dVAR;
71207a34 7555 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7556}
7557
7558/*
a0d0e21e
LW
7559- reginsert - insert an operator in front of already-emitted operand
7560*
7561* Means relocating the operand.
7562*/
76e3520e 7563STATIC void
6bda09f9 7564S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7565{
97aff369 7566 dVAR;
c277df42
IZ
7567 register regnode *src;
7568 register regnode *dst;
7569 register regnode *place;
504618e9 7570 const int offset = regarglen[(U8)op];
6bda09f9 7571 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7572 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7573/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7574 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7575 if (SIZE_ONLY) {
6bda09f9 7576 RExC_size += size;
a0d0e21e
LW
7577 return;
7578 }
a687059c 7579
830247a4 7580 src = RExC_emit;
6bda09f9 7581 RExC_emit += size;
830247a4 7582 dst = RExC_emit;
40d049e4 7583 if (RExC_open_parens) {
6bda09f9 7584 int paren;
40d049e4 7585 DEBUG_PARSE_FMT("inst"," - %d",RExC_npar);
6bda09f9 7586 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7587 if ( RExC_open_parens[paren] >= opnd ) {
7588 DEBUG_PARSE_FMT("open"," - %d",size);
7589 RExC_open_parens[paren] += size;
7590 } else {
7591 DEBUG_PARSE_FMT("open"," - %s","ok");
7592 }
7593 if ( RExC_close_parens[paren] >= opnd ) {
7594 DEBUG_PARSE_FMT("close"," - %d",size);
7595 RExC_close_parens[paren] += size;
7596 } else {
7597 DEBUG_PARSE_FMT("close"," - %s","ok");
7598 }
7599 }
6bda09f9 7600 }
40d049e4 7601
fac92740 7602 while (src > opnd) {
c277df42 7603 StructCopy(--src, --dst, regnode);
fac92740 7604 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7605 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7606 "reg_insert",
ccb2c380
MP
7607 __LINE__,
7608 reg_name[op],
07be1b83
YO
7609 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7610 ? "Overwriting end of array!\n" : "OK",
7611 (UV)(src - RExC_emit_start),
7612 (UV)(dst - RExC_emit_start),
7613 (UV)RExC_offsets[0]));
ccb2c380
MP
7614 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7615 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7616 }
7617 }
7618
a0d0e21e
LW
7619
7620 place = opnd; /* Op node, where operand used to be. */
fac92740 7621 if (RExC_offsets) { /* MJD */
07be1b83 7622 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7623 "reginsert",
ccb2c380
MP
7624 __LINE__,
7625 reg_name[op],
07be1b83 7626 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7627 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7628 (UV)(place - RExC_emit_start),
7629 (UV)(RExC_parse - RExC_start),
786e8c11 7630 (UV)RExC_offsets[0]));
ccb2c380 7631 Set_Node_Offset(place, RExC_parse);
45948336 7632 Set_Node_Length(place, 1);
fac92740 7633 }
c277df42
IZ
7634 src = NEXTOPER(place);
7635 FILL_ADVANCE_NODE(place, op);
7636 Zero(src, offset, regnode);
a687059c
LW
7637}
7638
7639/*
c277df42 7640- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7641- SEE ALSO: regtail_study
a0d0e21e 7642*/
097eb12c 7643/* TODO: All three parms should be const */
76e3520e 7644STATIC void
3dab1dad 7645S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7646{
97aff369 7647 dVAR;
c277df42 7648 register regnode *scan;
72f13be8 7649 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7650#ifndef DEBUGGING
7651 PERL_UNUSED_ARG(depth);
7652#endif
a0d0e21e 7653
c277df42 7654 if (SIZE_ONLY)
a0d0e21e
LW
7655 return;
7656
7657 /* Find last node. */
7658 scan = p;
7659 for (;;) {
504618e9 7660 regnode * const temp = regnext(scan);
3dab1dad
YO
7661 DEBUG_PARSE_r({
7662 SV * const mysv=sv_newmortal();
7663 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7664 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7665 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7666 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7667 (temp == NULL ? "->" : ""),
7668 (temp == NULL ? reg_name[OP(val)] : "")
7669 );
3dab1dad
YO
7670 });
7671 if (temp == NULL)
7672 break;
7673 scan = temp;
7674 }
7675
7676 if (reg_off_by_arg[OP(scan)]) {
7677 ARG_SET(scan, val - scan);
7678 }
7679 else {
7680 NEXT_OFF(scan) = val - scan;
7681 }
7682}
7683
07be1b83 7684#ifdef DEBUGGING
3dab1dad
YO
7685/*
7686- regtail_study - set the next-pointer at the end of a node chain of p to val.
7687- Look for optimizable sequences at the same time.
7688- currently only looks for EXACT chains.
07be1b83
YO
7689
7690This is expermental code. The idea is to use this routine to perform
7691in place optimizations on branches and groups as they are constructed,
7692with the long term intention of removing optimization from study_chunk so
7693that it is purely analytical.
7694
7695Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7696to control which is which.
7697
3dab1dad
YO
7698*/
7699/* TODO: All four parms should be const */
07be1b83 7700
3dab1dad
YO
7701STATIC U8
7702S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7703{
7704 dVAR;
7705 register regnode *scan;
07be1b83
YO
7706 U8 exact = PSEUDO;
7707#ifdef EXPERIMENTAL_INPLACESCAN
7708 I32 min = 0;
7709#endif
7710
3dab1dad
YO
7711 GET_RE_DEBUG_FLAGS_DECL;
7712
07be1b83 7713
3dab1dad
YO
7714 if (SIZE_ONLY)
7715 return exact;
7716
7717 /* Find last node. */
7718
7719 scan = p;
7720 for (;;) {
7721 regnode * const temp = regnext(scan);
07be1b83
YO
7722#ifdef EXPERIMENTAL_INPLACESCAN
7723 if (PL_regkind[OP(scan)] == EXACT)
7724 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7725 return EXACT;
7726#endif
3dab1dad
YO
7727 if ( exact ) {
7728 switch (OP(scan)) {
7729 case EXACT:
7730 case EXACTF:
7731 case EXACTFL:
7732 if( exact == PSEUDO )
7733 exact= OP(scan);
07be1b83
YO
7734 else if ( exact != OP(scan) )
7735 exact= 0;
3dab1dad
YO
7736 case NOTHING:
7737 break;
7738 default:
7739 exact= 0;
7740 }
7741 }
7742 DEBUG_PARSE_r({
7743 SV * const mysv=sv_newmortal();
7744 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7745 regprop(RExC_rx, mysv, scan);
eaf3ca90 7746 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7747 SvPV_nolen_const(mysv),
eaf3ca90
YO
7748 REG_NODE_NUM(scan),
7749 reg_name[exact]);
3dab1dad 7750 });
a0d0e21e
LW
7751 if (temp == NULL)
7752 break;
7753 scan = temp;
7754 }
07be1b83
YO
7755 DEBUG_PARSE_r({
7756 SV * const mysv_val=sv_newmortal();
7757 DEBUG_PARSE_MSG("");
7758 regprop(RExC_rx, mysv_val, val);
7759 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7760 SvPV_nolen_const(mysv_val),
7761 REG_NODE_NUM(val),
7762 val - scan
7763 );
7764 });
c277df42
IZ
7765 if (reg_off_by_arg[OP(scan)]) {
7766 ARG_SET(scan, val - scan);
a0ed51b3
LW
7767 }
7768 else {
c277df42
IZ
7769 NEXT_OFF(scan) = val - scan;
7770 }
3dab1dad
YO
7771
7772 return exact;
a687059c 7773}
07be1b83 7774#endif
a687059c
LW
7775
7776/*
a687059c
LW
7777 - regcurly - a little FSA that accepts {\d+,?\d*}
7778 */
79072805 7779STATIC I32
5f66b61c 7780S_regcurly(register const char *s)
a687059c
LW
7781{
7782 if (*s++ != '{')
7783 return FALSE;
f0fcb552 7784 if (!isDIGIT(*s))
a687059c 7785 return FALSE;
f0fcb552 7786 while (isDIGIT(*s))
a687059c
LW
7787 s++;
7788 if (*s == ',')
7789 s++;
f0fcb552 7790 while (isDIGIT(*s))
a687059c
LW
7791 s++;
7792 if (*s != '}')
7793 return FALSE;
7794 return TRUE;
7795}
7796
a687059c
LW
7797
7798/*
fd181c75 7799 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7800 */
7801void
097eb12c 7802Perl_regdump(pTHX_ const regexp *r)
a687059c 7803{
35ff7856 7804#ifdef DEBUGGING
97aff369 7805 dVAR;
c445ea15 7806 SV * const sv = sv_newmortal();
ab3bbdeb 7807 SV *dsv= sv_newmortal();
a687059c 7808
786e8c11 7809 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7810
7811 /* Header fields of interest. */
ab3bbdeb
YO
7812 if (r->anchored_substr) {
7813 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7814 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7815 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7816 "anchored %s%s at %"IVdf" ",
7817 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7818 (IV)r->anchored_offset);
ab3bbdeb
YO
7819 } else if (r->anchored_utf8) {
7820 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7821 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7822 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7823 "anchored utf8 %s%s at %"IVdf" ",
7824 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7825 (IV)r->anchored_offset);
ab3bbdeb
YO
7826 }
7827 if (r->float_substr) {
7828 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7829 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7830 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7831 "floating %s%s at %"IVdf"..%"UVuf" ",
7832 s, RE_SV_TAIL(r->float_substr),
7b0972df 7833 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7834 } else if (r->float_utf8) {
7835 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7836 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7837 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7838 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7839 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7840 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7841 }
33b8afdf 7842 if (r->check_substr || r->check_utf8)
b81d288d 7843 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7844 (const char *)
7845 (r->check_substr == r->float_substr
7846 && r->check_utf8 == r->float_utf8
7847 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7848 if (r->reganch & ROPT_NOSCAN)
7849 PerlIO_printf(Perl_debug_log, " noscan");
7850 if (r->reganch & ROPT_CHECK_ALL)
7851 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7852 if (r->check_substr || r->check_utf8)
c277df42
IZ
7853 PerlIO_printf(Perl_debug_log, ") ");
7854
46fc3d4c 7855 if (r->regstclass) {
32fc9b6a 7856 regprop(r, sv, r->regstclass);
1de06328 7857 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7858 }
774d564b 7859 if (r->reganch & ROPT_ANCH) {
7860 PerlIO_printf(Perl_debug_log, "anchored");
7861 if (r->reganch & ROPT_ANCH_BOL)
7862 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7863 if (r->reganch & ROPT_ANCH_MBOL)
7864 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7865 if (r->reganch & ROPT_ANCH_SBOL)
7866 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7867 if (r->reganch & ROPT_ANCH_GPOS)
7868 PerlIO_printf(Perl_debug_log, "(GPOS)");
7869 PerlIO_putc(Perl_debug_log, ' ');
7870 }
c277df42
IZ
7871 if (r->reganch & ROPT_GPOS_SEEN)
7872 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 7873 if (r->reganch & ROPT_SKIP)
760ac839 7874 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 7875 if (r->reganch & ROPT_IMPLICIT)
760ac839 7876 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 7877 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
7878 if (r->reganch & ROPT_EVAL_SEEN)
7879 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 7880 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 7881#else
96a5add6 7882 PERL_UNUSED_CONTEXT;
65e66c80 7883 PERL_UNUSED_ARG(r);
17c3b450 7884#endif /* DEBUGGING */
a687059c
LW
7885}
7886
7887/*
a0d0e21e
LW
7888- regprop - printable representation of opcode
7889*/
46fc3d4c 7890void
32fc9b6a 7891Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 7892{
35ff7856 7893#ifdef DEBUGGING
97aff369 7894 dVAR;
9b155405 7895 register int k;
1de06328 7896 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 7897
54dc92de 7898 sv_setpvn(sv, "", 0);
03363afd 7899 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
7900 /* It would be nice to FAIL() here, but this may be called from
7901 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 7902 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 7903 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 7904
3dab1dad 7905 k = PL_regkind[OP(o)];
9b155405 7906
2a782b5b 7907 if (k == EXACT) {
396482e1 7908 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
7909 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
7910 * is a crude hack but it may be the best for now since
7911 * we have no flag "this EXACTish node was UTF-8"
7912 * --jhi */
7913 const char * const s =
ddc5bc0f 7914 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
7915 PL_colors[0], PL_colors[1],
7916 PERL_PV_ESCAPE_UNI_DETECT |
7917 PERL_PV_PRETTY_ELIPSES |
7918 PERL_PV_PRETTY_LTGT
7919 );
7920 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 7921 } else if (k == TRIE) {
3dab1dad 7922 /* print the details of the trie in dumpuntil instead, as
4f639d21 7923 * prog->data isn't available here */
1de06328
YO
7924 const char op = OP(o);
7925 const I32 n = ARG(o);
7926 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
7927 (reg_ac_data *)prog->data->data[n] :
7928 NULL;
7929 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
7930 (reg_trie_data*)prog->data->data[n] :
7931 ac->trie;
7932
7933 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
7934 DEBUG_TRIE_COMPILE_r(
7935 Perl_sv_catpvf(aTHX_ sv,
7936 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7937 (UV)trie->startstate,
1e2e3d02 7938 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
7939 (UV)trie->wordcount,
7940 (UV)trie->minlen,
7941 (UV)trie->maxlen,
7942 (UV)TRIE_CHARCOUNT(trie),
7943 (UV)trie->uniquecharcount
7944 )
7945 );
7946 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7947 int i;
7948 int rangestart = -1;
f46cb337 7949 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7950 Perl_sv_catpvf(aTHX_ sv, "[");
7951 for (i = 0; i <= 256; i++) {
7952 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7953 if (rangestart == -1)
7954 rangestart = i;
7955 } else if (rangestart != -1) {
7956 if (i <= rangestart + 3)
7957 for (; rangestart < i; rangestart++)
7958 put_byte(sv, rangestart);
7959 else {
7960 put_byte(sv, rangestart);
7961 sv_catpvs(sv, "-");
7962 put_byte(sv, i - 1);
7963 }
7964 rangestart = -1;
7965 }
7966 }
7967 Perl_sv_catpvf(aTHX_ sv, "]");
7968 }
7969
a3621e74 7970 } else if (k == CURLY) {
cb434fcc 7971 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7972 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7973 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7974 }
2c2d71f5
JH
7975 else if (k == WHILEM && o->flags) /* Ordinal/of */
7976 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6bda09f9 7977 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP)
894356b3 7978 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 7979 else if (k == GOSUB)
6bda09f9 7980 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9b155405 7981 else if (k == LOGICAL)
04ebc1ab 7982 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7983 else if (k == ANYOF) {
7984 int i, rangestart = -1;
2d03de9c 7985 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7986
7987 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7988 static const char * const anyofs[] = {
653099ff
GS
7989 "\\w",
7990 "\\W",
7991 "\\s",
7992 "\\S",
7993 "\\d",
7994 "\\D",
7995 "[:alnum:]",
7996 "[:^alnum:]",
7997 "[:alpha:]",
7998 "[:^alpha:]",
7999 "[:ascii:]",
8000 "[:^ascii:]",
8001 "[:ctrl:]",
8002 "[:^ctrl:]",
8003 "[:graph:]",
8004 "[:^graph:]",
8005 "[:lower:]",
8006 "[:^lower:]",
8007 "[:print:]",
8008 "[:^print:]",
8009 "[:punct:]",
8010 "[:^punct:]",
8011 "[:upper:]",
aaa51d5e 8012 "[:^upper:]",
653099ff 8013 "[:xdigit:]",
aaa51d5e
JF
8014 "[:^xdigit:]",
8015 "[:space:]",
8016 "[:^space:]",
8017 "[:blank:]",
8018 "[:^blank:]"
653099ff
GS
8019 };
8020
19860706 8021 if (flags & ANYOF_LOCALE)
396482e1 8022 sv_catpvs(sv, "{loc}");
19860706 8023 if (flags & ANYOF_FOLD)
396482e1 8024 sv_catpvs(sv, "{i}");
653099ff 8025 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8026 if (flags & ANYOF_INVERT)
396482e1 8027 sv_catpvs(sv, "^");
ffc61ed2
JH
8028 for (i = 0; i <= 256; i++) {
8029 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8030 if (rangestart == -1)
8031 rangestart = i;
8032 } else if (rangestart != -1) {
8033 if (i <= rangestart + 3)
8034 for (; rangestart < i; rangestart++)
653099ff 8035 put_byte(sv, rangestart);
ffc61ed2
JH
8036 else {
8037 put_byte(sv, rangestart);
396482e1 8038 sv_catpvs(sv, "-");
ffc61ed2 8039 put_byte(sv, i - 1);
653099ff 8040 }
ffc61ed2 8041 rangestart = -1;
653099ff 8042 }
847a199f 8043 }
ffc61ed2
JH
8044
8045 if (o->flags & ANYOF_CLASS)
bb7a0f54 8046 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8047 if (ANYOF_CLASS_TEST(o,i))
8048 sv_catpv(sv, anyofs[i]);
8049
8050 if (flags & ANYOF_UNICODE)
396482e1 8051 sv_catpvs(sv, "{unicode}");
1aa99e6b 8052 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8053 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8054
8055 {
8056 SV *lv;
32fc9b6a 8057 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8058
ffc61ed2
JH
8059 if (lv) {
8060 if (sw) {
89ebb4a3 8061 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8062
ffc61ed2 8063 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8064 uvchr_to_utf8(s, i);
ffc61ed2 8065
3568d838 8066 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8067 if (rangestart == -1)
8068 rangestart = i;
8069 } else if (rangestart != -1) {
ffc61ed2
JH
8070 if (i <= rangestart + 3)
8071 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8072 const U8 * const e = uvchr_to_utf8(s,rangestart);
8073 U8 *p;
8074 for(p = s; p < e; p++)
ffc61ed2
JH
8075 put_byte(sv, *p);
8076 }
8077 else {
2d03de9c
AL
8078 const U8 *e = uvchr_to_utf8(s,rangestart);
8079 U8 *p;
8080 for (p = s; p < e; p++)
ffc61ed2 8081 put_byte(sv, *p);
396482e1 8082 sv_catpvs(sv, "-");
2d03de9c
AL
8083 e = uvchr_to_utf8(s, i-1);
8084 for (p = s; p < e; p++)
1df70142 8085 put_byte(sv, *p);
ffc61ed2
JH
8086 }
8087 rangestart = -1;
8088 }
19860706 8089 }
ffc61ed2 8090
396482e1 8091 sv_catpvs(sv, "..."); /* et cetera */
19860706 8092 }
fde631ed 8093
ffc61ed2 8094 {
2e0de35c 8095 char *s = savesvpv(lv);
c445ea15 8096 char * const origs = s;
b81d288d 8097
3dab1dad
YO
8098 while (*s && *s != '\n')
8099 s++;
b81d288d 8100
ffc61ed2 8101 if (*s == '\n') {
2d03de9c 8102 const char * const t = ++s;
ffc61ed2
JH
8103
8104 while (*s) {
8105 if (*s == '\n')
8106 *s = ' ';
8107 s++;
8108 }
8109 if (s[-1] == ' ')
8110 s[-1] = 0;
8111
8112 sv_catpv(sv, t);
fde631ed 8113 }
b81d288d 8114
ffc61ed2 8115 Safefree(origs);
fde631ed
JH
8116 }
8117 }
653099ff 8118 }
ffc61ed2 8119
653099ff
GS
8120 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8121 }
9b155405 8122 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8123 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8124#else
96a5add6 8125 PERL_UNUSED_CONTEXT;
65e66c80
SP
8126 PERL_UNUSED_ARG(sv);
8127 PERL_UNUSED_ARG(o);
f9049ba1 8128 PERL_UNUSED_ARG(prog);
17c3b450 8129#endif /* DEBUGGING */
35ff7856 8130}
a687059c 8131
cad2e5aa
JH
8132SV *
8133Perl_re_intuit_string(pTHX_ regexp *prog)
8134{ /* Assume that RE_INTUIT is set */
97aff369 8135 dVAR;
a3621e74 8136 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8137 PERL_UNUSED_CONTEXT;
8138
a3621e74 8139 DEBUG_COMPILE_r(
cfd0369c 8140 {
2d03de9c 8141 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8142 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8143
8144 if (!PL_colorset) reginitcolors();
8145 PerlIO_printf(Perl_debug_log,
a0288114 8146 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8147 PL_colors[4],
8148 prog->check_substr ? "" : "utf8 ",
8149 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8150 s,
8151 PL_colors[1],
8152 (strlen(s) > 60 ? "..." : ""));
8153 } );
8154
33b8afdf 8155 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8156}
8157
84da74a7
YO
8158/*
8159 pregfree - free a regexp
8160
8161 See regdupe below if you change anything here.
8162*/
8163
2b69d0c2 8164void
864dbfa3 8165Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8166{
27da23d5 8167 dVAR;
0df25f3d 8168
fc32ee4a 8169 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8170
7821416a
IZ
8171 if (!r || (--r->refcnt > 0))
8172 return;
ab3bbdeb 8173 DEBUG_COMPILE_r({
0df25f3d
YO
8174 if (!PL_colorset)
8175 reginitcolors();
ab3bbdeb
YO
8176 if (RX_DEBUG(r)){
8177 SV *dsv= sv_newmortal();
8178 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8179 dsv, r->precomp, r->prelen, 60);
8180 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8181 PL_colors[4],PL_colors[5],s);
8182 }
9e55ce06 8183 });
cad2e5aa 8184
43c5f42d
NC
8185 /* gcov results gave these as non-null 100% of the time, so there's no
8186 optimisation in checking them before calling Safefree */
8187 Safefree(r->precomp);
8188 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8189 RX_MATCH_COPY_FREE(r);
f8c7b90f 8190#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8191 if (r->saved_copy)
8192 SvREFCNT_dec(r->saved_copy);
8193#endif
a193d654
GS
8194 if (r->substrs) {
8195 if (r->anchored_substr)
8196 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8197 if (r->anchored_utf8)
8198 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8199 if (r->float_substr)
8200 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8201 if (r->float_utf8)
8202 SvREFCNT_dec(r->float_utf8);
2779dcf1 8203 Safefree(r->substrs);
a193d654 8204 }
81714fb9
YO
8205 if (r->paren_names)
8206 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8207 if (r->data) {
8208 int n = r->data->count;
f3548bdc
DM
8209 PAD* new_comppad = NULL;
8210 PAD* old_comppad;
4026c95a 8211 PADOFFSET refcnt;
dfad63ad 8212
c277df42 8213 while (--n >= 0) {
261faec3 8214 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8215 switch (r->data->what[n]) {
8216 case 's':
81714fb9 8217 case 'S':
c277df42
IZ
8218 SvREFCNT_dec((SV*)r->data->data[n]);
8219 break;
653099ff
GS
8220 case 'f':
8221 Safefree(r->data->data[n]);
8222 break;
dfad63ad
HS
8223 case 'p':
8224 new_comppad = (AV*)r->data->data[n];
8225 break;
c277df42 8226 case 'o':
dfad63ad 8227 if (new_comppad == NULL)
cea2e8a9 8228 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8229 PAD_SAVE_LOCAL(old_comppad,
8230 /* Watch out for global destruction's random ordering. */
c445ea15 8231 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8232 );
b34c0dd4 8233 OP_REFCNT_LOCK;
4026c95a
SH
8234 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8235 OP_REFCNT_UNLOCK;
8236 if (!refcnt)
9b978d73 8237 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8238
f3548bdc 8239 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8240 SvREFCNT_dec((SV*)new_comppad);
8241 new_comppad = NULL;
c277df42
IZ
8242 break;
8243 case 'n':
9e55ce06 8244 break;
07be1b83 8245 case 'T':
be8e71aa
YO
8246 { /* Aho Corasick add-on structure for a trie node.
8247 Used in stclass optimization only */
07be1b83
YO
8248 U32 refcount;
8249 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8250 OP_REFCNT_LOCK;
8251 refcount = --aho->refcount;
8252 OP_REFCNT_UNLOCK;
8253 if ( !refcount ) {
8254 Safefree(aho->states);
8255 Safefree(aho->fail);
8256 aho->trie=NULL; /* not necessary to free this as it is
8257 handled by the 't' case */
8258 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8259 Safefree(r->regstclass);
07be1b83
YO
8260 }
8261 }
8262 break;
a3621e74 8263 case 't':
07be1b83 8264 {
be8e71aa 8265 /* trie structure. */
07be1b83
YO
8266 U32 refcount;
8267 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8268 OP_REFCNT_LOCK;
8269 refcount = --trie->refcount;
8270 OP_REFCNT_UNLOCK;
8271 if ( !refcount ) {
8272 Safefree(trie->charmap);
8273 if (trie->widecharmap)
8274 SvREFCNT_dec((SV*)trie->widecharmap);
8275 Safefree(trie->states);
8276 Safefree(trie->trans);
8277 if (trie->bitmap)
8278 Safefree(trie->bitmap);
8279 if (trie->wordlen)
8280 Safefree(trie->wordlen);
786e8c11
YO
8281 if (trie->jump)
8282 Safefree(trie->jump);
8283 if (trie->nextword)
8284 Safefree(trie->nextword);
a3621e74 8285#ifdef DEBUGGING
be8e71aa
YO
8286 if (RX_DEBUG(r)) {
8287 if (trie->words)
8288 SvREFCNT_dec((SV*)trie->words);
8289 if (trie->revcharmap)
8290 SvREFCNT_dec((SV*)trie->revcharmap);
8291 }
a3621e74 8292#endif
07be1b83 8293 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8294 }
07be1b83
YO
8295 }
8296 break;
c277df42 8297 default:
830247a4 8298 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8299 }
8300 }
8301 Safefree(r->data->what);
8302 Safefree(r->data);
a0d0e21e
LW
8303 }
8304 Safefree(r->startp);
8305 Safefree(r->endp);
8306 Safefree(r);
a687059c 8307}
c277df42 8308
84da74a7
YO
8309#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8310#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8311#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8312#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8313
8314/*
8315 regdupe - duplicate a regexp.
8316
8317 This routine is called by sv.c's re_dup and is expected to clone a
8318 given regexp structure. It is a no-op when not under USE_ITHREADS.
8319 (Originally this *was* re_dup() for change history see sv.c)
8320
8321 See pregfree() above if you change anything here.
8322*/
a3c0e9ca 8323#if defined(USE_ITHREADS)
84da74a7
YO
8324regexp *
8325Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8326{
84da74a7
YO
8327 dVAR;
8328 REGEXP *ret;
8329 int i, len, npar;
8330 struct reg_substr_datum *s;
8331
8332 if (!r)
8333 return (REGEXP *)NULL;
8334
8335 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8336 return ret;
8337
8338 len = r->offsets[0];
8339 npar = r->nparens+1;
8340
8341 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8342 Copy(r->program, ret->program, len+1, regnode);
8343
8344 Newx(ret->startp, npar, I32);
8345 Copy(r->startp, ret->startp, npar, I32);
8346 Newx(ret->endp, npar, I32);
8347 Copy(r->startp, ret->startp, npar, I32);
8348
8349 Newx(ret->substrs, 1, struct reg_substr_data);
8350 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8351 s->min_offset = r->substrs->data[i].min_offset;
8352 s->max_offset = r->substrs->data[i].max_offset;
8353 s->end_shift = r->substrs->data[i].end_shift;
8354 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8355 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8356 }
8357
8358 ret->regstclass = NULL;
8359 if (r->data) {
8360 struct reg_data *d;
8361 const int count = r->data->count;
8362 int i;
8363
8364 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8365 char, struct reg_data);
8366 Newx(d->what, count, U8);
8367
8368 d->count = count;
8369 for (i = 0; i < count; i++) {
8370 d->what[i] = r->data->what[i];
8371 switch (d->what[i]) {
8372 /* legal options are one of: sfpont
8373 see also regcomp.h and pregfree() */
8374 case 's':
81714fb9 8375 case 'S':
84da74a7
YO
8376 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8377 break;
8378 case 'p':
8379 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8380 break;
8381 case 'f':
8382 /* This is cheating. */
8383 Newx(d->data[i], 1, struct regnode_charclass_class);
8384 StructCopy(r->data->data[i], d->data[i],
8385 struct regnode_charclass_class);
8386 ret->regstclass = (regnode*)d->data[i];
8387 break;
8388 case 'o':
8389 /* Compiled op trees are readonly, and can thus be
8390 shared without duplication. */
8391 OP_REFCNT_LOCK;
8392 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8393 OP_REFCNT_UNLOCK;
8394 break;
8395 case 'n':
8396 d->data[i] = r->data->data[i];
8397 break;
8398 case 't':
8399 d->data[i] = r->data->data[i];
8400 OP_REFCNT_LOCK;
8401 ((reg_trie_data*)d->data[i])->refcount++;
8402 OP_REFCNT_UNLOCK;
8403 break;
8404 case 'T':
8405 d->data[i] = r->data->data[i];
8406 OP_REFCNT_LOCK;
8407 ((reg_ac_data*)d->data[i])->refcount++;
8408 OP_REFCNT_UNLOCK;
8409 /* Trie stclasses are readonly and can thus be shared
8410 * without duplication. We free the stclass in pregfree
8411 * when the corresponding reg_ac_data struct is freed.
8412 */
8413 ret->regstclass= r->regstclass;
8414 break;
8415 default:
8416 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8417 }
8418 }
8419
8420 ret->data = d;
8421 }
8422 else
8423 ret->data = NULL;
8424
8425 Newx(ret->offsets, 2*len+1, U32);
8426 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8427
8428 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8429 ret->refcnt = r->refcnt;
8430 ret->minlen = r->minlen;
8431 ret->prelen = r->prelen;
8432 ret->nparens = r->nparens;
8433 ret->lastparen = r->lastparen;
8434 ret->lastcloseparen = r->lastcloseparen;
8435 ret->reganch = r->reganch;
8436
8437 ret->sublen = r->sublen;
8438
f9f4320a 8439 ret->engine = r->engine;
81714fb9
YO
8440
8441 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8442
84da74a7
YO
8443 if (RX_MATCH_COPIED(ret))
8444 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8445 else
8446 ret->subbeg = NULL;
8447#ifdef PERL_OLD_COPY_ON_WRITE
8448 ret->saved_copy = NULL;
8449#endif
8450
8451 ptr_table_store(PL_ptr_table, r, ret);
8452 return ret;
84da74a7 8453}
a3c0e9ca 8454#endif
84da74a7 8455
76234dfb 8456#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8457/*
8458 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8459 */
8460regnode *
864dbfa3 8461Perl_regnext(pTHX_ register regnode *p)
c277df42 8462{
97aff369 8463 dVAR;
c277df42
IZ
8464 register I32 offset;
8465
3280af22 8466 if (p == &PL_regdummy)
c277df42
IZ
8467 return(NULL);
8468
8469 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8470 if (offset == 0)
8471 return(NULL);
8472
c277df42 8473 return(p+offset);
c277df42 8474}
76234dfb 8475#endif
c277df42 8476
01f988be 8477STATIC void
cea2e8a9 8478S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8479{
8480 va_list args;
8481 STRLEN l1 = strlen(pat1);
8482 STRLEN l2 = strlen(pat2);
8483 char buf[512];
06bf62c7 8484 SV *msv;
73d840c0 8485 const char *message;
c277df42
IZ
8486
8487 if (l1 > 510)
8488 l1 = 510;
8489 if (l1 + l2 > 510)
8490 l2 = 510 - l1;
8491 Copy(pat1, buf, l1 , char);
8492 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8493 buf[l1 + l2] = '\n';
8494 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8495#ifdef I_STDARG
8496 /* ANSI variant takes additional second argument */
c277df42 8497 va_start(args, pat2);
8736538c
AS
8498#else
8499 va_start(args);
8500#endif
5a844595 8501 msv = vmess(buf, &args);
c277df42 8502 va_end(args);
cfd0369c 8503 message = SvPV_const(msv,l1);
c277df42
IZ
8504 if (l1 > 512)
8505 l1 = 512;
8506 Copy(message, buf, l1 , char);
197cf9b9 8507 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8508 Perl_croak(aTHX_ "%s", buf);
c277df42 8509}
a0ed51b3
LW
8510
8511/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8512
76234dfb 8513#ifndef PERL_IN_XSUB_RE
a0ed51b3 8514void
864dbfa3 8515Perl_save_re_context(pTHX)
b81d288d 8516{
97aff369 8517 dVAR;
1ade1aa1
NC
8518
8519 struct re_save_state *state;
8520
8521 SAVEVPTR(PL_curcop);
8522 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8523
8524 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8525 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8526 SSPUSHINT(SAVEt_RE_STATE);
8527
46ab3289 8528 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8529
a0ed51b3 8530 PL_reg_start_tmp = 0;
a0ed51b3 8531 PL_reg_start_tmpl = 0;
c445ea15 8532 PL_reg_oldsaved = NULL;
a5db57d6 8533 PL_reg_oldsavedlen = 0;
a5db57d6 8534 PL_reg_maxiter = 0;
a5db57d6 8535 PL_reg_leftiter = 0;
c445ea15 8536 PL_reg_poscache = NULL;
a5db57d6 8537 PL_reg_poscache_size = 0;
1ade1aa1
NC
8538#ifdef PERL_OLD_COPY_ON_WRITE
8539 PL_nrs = NULL;
8540#endif
ada6e8a9 8541
c445ea15
AL
8542 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8543 if (PL_curpm) {
8544 const REGEXP * const rx = PM_GETRE(PL_curpm);
8545 if (rx) {
1df70142 8546 U32 i;
ada6e8a9 8547 for (i = 1; i <= rx->nparens; i++) {
1df70142 8548 char digits[TYPE_CHARS(long)];
d9fad198 8549 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8550 GV *const *const gvp
8551 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8552
b37c2d43
AL
8553 if (gvp) {
8554 GV * const gv = *gvp;
8555 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8556 save_scalar(gv);
49f27e4b 8557 }
ada6e8a9
AMS
8558 }
8559 }
8560 }
a0ed51b3 8561}
76234dfb 8562#endif
51371543 8563
51371543 8564static void
acfe0abc 8565clear_re(pTHX_ void *r)
51371543 8566{
97aff369 8567 dVAR;
51371543
GS
8568 ReREFCNT_dec((regexp *)r);
8569}
ffbc6a93 8570
a28509cc
AL
8571#ifdef DEBUGGING
8572
8573STATIC void
8574S_put_byte(pTHX_ SV *sv, int c)
8575{
8576 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8577 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8578 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8579 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8580 else
8581 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8582}
8583
786e8c11 8584
3dab1dad
YO
8585#define CLEAR_OPTSTART \
8586 if (optstart) STMT_START { \
07be1b83 8587 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8588 optstart=NULL; \
8589 } STMT_END
8590
786e8c11 8591#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8592
b5a2f8d8
NC
8593STATIC const regnode *
8594S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8595 const regnode *last, const regnode *plast,
8596 SV* sv, I32 indent, U32 depth)
a28509cc 8597{
97aff369 8598 dVAR;
786e8c11 8599 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8600 register const regnode *next;
3dab1dad
YO
8601 const regnode *optstart= NULL;
8602 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8603
786e8c11
YO
8604#ifdef DEBUG_DUMPUNTIL
8605 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8606 last ? last-start : 0,plast ? plast-start : 0);
8607#endif
8608
8609 if (plast && plast < last)
8610 last= plast;
8611
8612 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8613 /* While that wasn't END last time... */
8614
8615 NODE_ALIGN(node);
8616 op = OP(node);
8617 if (op == CLOSE)
786e8c11 8618 indent--;
b5a2f8d8 8619 next = regnext((regnode *)node);
07be1b83 8620
a28509cc 8621 /* Where, what. */
8e11feef 8622 if (OP(node) == OPTIMIZED) {
e68ec53f 8623 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8624 optstart = node;
3dab1dad 8625 else
8e11feef 8626 goto after_print;
3dab1dad
YO
8627 } else
8628 CLEAR_OPTSTART;
07be1b83 8629
32fc9b6a 8630 regprop(r, sv, node);
a28509cc 8631 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8632 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8633
8634 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8635 if (next == NULL) /* Next ptr. */
8636 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8637 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8638 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8639 else
8640 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8641
1de06328 8642 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8643 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8644 }
8645
a28509cc
AL
8646 after_print:
8647 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8648 assert(next);
8649 {
8650 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8651 ? regnext((regnode *)next)
8652 : next);
be8e71aa
YO
8653 if (last && nnode > last)
8654 nnode = last;
786e8c11 8655 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8656 }
a28509cc
AL
8657 }
8658 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8659 assert(next);
786e8c11 8660 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8661 }
8662 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 8663 const regnode *this_trie = node;
1de06328 8664 const char op = OP(node);
a28509cc 8665 const I32 n = ARG(node);
1de06328
YO
8666 const reg_ac_data * const ac = op>=AHOCORASICK ?
8667 (reg_ac_data *)r->data->data[n] :
8668 NULL;
8669 const reg_trie_data * const trie = op<AHOCORASICK ?
8670 (reg_trie_data*)r->data->data[n] :
8671 ac->trie;
786e8c11 8672 const regnode *nextbranch= NULL;
a28509cc 8673 I32 word_idx;
1de06328 8674 sv_setpvn(sv, "", 0);
786e8c11 8675 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8676 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8677
8678 PerlIO_printf(Perl_debug_log, "%*s%s ",
8679 (int)(2*(indent+3)), "",
8680 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8681 PL_colors[0], PL_colors[1],
8682 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8683 PERL_PV_PRETTY_ELIPSES |
7f69552c 8684 PERL_PV_PRETTY_LTGT
786e8c11
YO
8685 )
8686 : "???"
8687 );
8688 if (trie->jump) {
40d049e4 8689 U16 dist= trie->jump[word_idx+1];
7f69552c 8690 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 8691 (dist ? this_trie + dist : next) - start);
786e8c11
YO
8692 if (dist) {
8693 if (!nextbranch)
24b23f37 8694 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
8695 DUMPUNTIL(this_trie + dist, nextbranch);
8696 }
786e8c11
YO
8697 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8698 nextbranch= regnext((regnode *)nextbranch);
8699 } else {
8700 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8701 }
786e8c11
YO
8702 }
8703 if (last && next > last)
8704 node= last;
8705 else
8706 node= next;
a28509cc 8707 }
786e8c11
YO
8708 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8709 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8710 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8711 }
8712 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8713 assert(next);
786e8c11 8714 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8715 }
8716 else if ( op == PLUS || op == STAR) {
786e8c11 8717 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8718 }
8719 else if (op == ANYOF) {
8720 /* arglen 1 + class block */
8721 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8722 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8723 node = NEXTOPER(node);
8724 }
8725 else if (PL_regkind[(U8)op] == EXACT) {
8726 /* Literal string, where present. */
8727 node += NODE_SZ_STR(node) - 1;
8728 node = NEXTOPER(node);
8729 }
8730 else {
8731 node = NEXTOPER(node);
8732 node += regarglen[(U8)op];
8733 }
8734 if (op == CURLYX || op == OPEN)
786e8c11 8735 indent++;
a28509cc 8736 else if (op == WHILEM)
786e8c11 8737 indent--;
a28509cc 8738 }
3dab1dad 8739 CLEAR_OPTSTART;
786e8c11
YO
8740#ifdef DEBUG_DUMPUNTIL
8741 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8742#endif
1de06328 8743 return node;
a28509cc
AL
8744}
8745
8746#endif /* DEBUGGING */
8747
241d1a3b
NC
8748/*
8749 * Local variables:
8750 * c-indentation-style: bsd
8751 * c-basic-offset: 4
8752 * indent-tabs-mode: t
8753 * End:
8754 *
37442d52
RGS
8755 * ex: set ts=8 sts=4 sw=4 noet:
8756 */