This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Typo fix, by Robin Barker
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
f8fc2ecf 154#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
830247a4 155#define RExC_emit (pRExC_state->emit)
fac92740 156#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
157#define RExC_naughty (pRExC_state->naughty)
158#define RExC_sawback (pRExC_state->sawback)
159#define RExC_seen (pRExC_state->seen)
160#define RExC_size (pRExC_state->size)
161#define RExC_npar (pRExC_state->npar)
e2e6a0f1 162#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
163#define RExC_extralen (pRExC_state->extralen)
164#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
165#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 166#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 167#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
168#define RExC_open_parens (pRExC_state->open_parens)
169#define RExC_close_parens (pRExC_state->close_parens)
170#define RExC_opend (pRExC_state->opend)
81714fb9 171#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
172#define RExC_recurse (pRExC_state->recurse)
173#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 174
cde0cee5 175
a687059c
LW
176#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
177#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
178 ((*s) == '{' && regcurly(s)))
a687059c 179
35c8bce7
LW
180#ifdef SPSTART
181#undef SPSTART /* dratted cpp namespace... */
182#endif
a687059c
LW
183/*
184 * Flags to be passed up and down.
185 */
a687059c 186#define WORST 0 /* Worst case. */
821b33a5 187#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
188#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
189#define SPSTART 0x4 /* Starts with * or +. */
190#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 191
3dab1dad
YO
192#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
193
07be1b83
YO
194/* whether trie related optimizations are enabled */
195#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
196#define TRIE_STUDY_OPT
786e8c11 197#define FULL_TRIE_STUDY
07be1b83
YO
198#define TRIE_STCLASS
199#endif
1de06328
YO
200
201
40d049e4
YO
202
203#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
204#define PBITVAL(paren) (1 << ((paren) & 7))
205#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
206#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
207#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
208
209
1de06328
YO
210/* About scan_data_t.
211
212 During optimisation we recurse through the regexp program performing
213 various inplace (keyhole style) optimisations. In addition study_chunk
214 and scan_commit populate this data structure with information about
215 what strings MUST appear in the pattern. We look for the longest
216 string that must appear for at a fixed location, and we look for the
217 longest string that may appear at a floating location. So for instance
218 in the pattern:
219
220 /FOO[xX]A.*B[xX]BAR/
221
222 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
223 strings (because they follow a .* construct). study_chunk will identify
224 both FOO and BAR as being the longest fixed and floating strings respectively.
225
226 The strings can be composites, for instance
227
228 /(f)(o)(o)/
229
230 will result in a composite fixed substring 'foo'.
231
232 For each string some basic information is maintained:
233
234 - offset or min_offset
235 This is the position the string must appear at, or not before.
236 It also implicitly (when combined with minlenp) tells us how many
237 character must match before the string we are searching.
238 Likewise when combined with minlenp and the length of the string
239 tells us how many characters must appear after the string we have
240 found.
241
242 - max_offset
243 Only used for floating strings. This is the rightmost point that
244 the string can appear at. Ifset to I32 max it indicates that the
245 string can occur infinitely far to the right.
246
247 - minlenp
248 A pointer to the minimum length of the pattern that the string
249 was found inside. This is important as in the case of positive
250 lookahead or positive lookbehind we can have multiple patterns
251 involved. Consider
252
253 /(?=FOO).*F/
254
255 The minimum length of the pattern overall is 3, the minimum length
256 of the lookahead part is 3, but the minimum length of the part that
257 will actually match is 1. So 'FOO's minimum length is 3, but the
258 minimum length for the F is 1. This is important as the minimum length
259 is used to determine offsets in front of and behind the string being
260 looked for. Since strings can be composites this is the length of the
261 pattern at the time it was commited with a scan_commit. Note that
262 the length is calculated by study_chunk, so that the minimum lengths
263 are not known until the full pattern has been compiled, thus the
264 pointer to the value.
265
266 - lookbehind
267
268 In the case of lookbehind the string being searched for can be
269 offset past the start point of the final matching string.
270 If this value was just blithely removed from the min_offset it would
271 invalidate some of the calculations for how many chars must match
272 before or after (as they are derived from min_offset and minlen and
273 the length of the string being searched for).
274 When the final pattern is compiled and the data is moved from the
275 scan_data_t structure into the regexp structure the information
276 about lookbehind is factored in, with the information that would
277 have been lost precalculated in the end_shift field for the
278 associated string.
279
280 The fields pos_min and pos_delta are used to store the minimum offset
281 and the delta to the maximum offset at the current point in the pattern.
282
283*/
2c2d71f5
JH
284
285typedef struct scan_data_t {
1de06328
YO
286 /*I32 len_min; unused */
287 /*I32 len_delta; unused */
2c2d71f5
JH
288 I32 pos_min;
289 I32 pos_delta;
290 SV *last_found;
1de06328 291 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
292 I32 last_start_min;
293 I32 last_start_max;
1de06328
YO
294 SV **longest; /* Either &l_fixed, or &l_float. */
295 SV *longest_fixed; /* longest fixed string found in pattern */
296 I32 offset_fixed; /* offset where it starts */
297 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
299 SV *longest_float; /* longest floating string found in pattern */
300 I32 offset_float_min; /* earliest point in string it can appear */
301 I32 offset_float_max; /* latest point in string it can appear */
302 I32 *minlen_float; /* pointer to the minlen relevent to the string */
303 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
304 I32 flags;
305 I32 whilem_c;
cb434fcc 306 I32 *last_closep;
653099ff 307 struct regnode_charclass_class *start_class;
2c2d71f5
JH
308} scan_data_t;
309
a687059c 310/*
e50aee73 311 * Forward declarations for pregcomp()'s friends.
a687059c 312 */
a0d0e21e 313
27da23d5 314static const scan_data_t zero_scan_data =
1de06328 315 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
316
317#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
318#define SF_BEFORE_SEOL 0x0001
319#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
320#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
321#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
322
09b7f37c
CB
323#ifdef NO_UNARY_PLUS
324# define SF_FIX_SHIFT_EOL (0+2)
325# define SF_FL_SHIFT_EOL (0+4)
326#else
327# define SF_FIX_SHIFT_EOL (+2)
328# define SF_FL_SHIFT_EOL (+4)
329#endif
c277df42
IZ
330
331#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
332#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
333
334#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
335#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
336#define SF_IS_INF 0x0040
337#define SF_HAS_PAR 0x0080
338#define SF_IN_PAR 0x0100
339#define SF_HAS_EVAL 0x0200
340#define SCF_DO_SUBSTR 0x0400
653099ff
GS
341#define SCF_DO_STCLASS_AND 0x0800
342#define SCF_DO_STCLASS_OR 0x1000
343#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 344#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 345
786e8c11 346#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 347#define SCF_SEEN_ACCEPT 0x8000
07be1b83 348
eb160463 349#define UTF (RExC_utf8 != 0)
bbe252da
YO
350#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
351#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 352
ffc61ed2 353#define OOB_UNICODE 12345678
93733859 354#define OOB_NAMEDCLASS -1
b8c5462f 355
a0ed51b3
LW
356#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
357#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
358
8615cb43 359
b45f050a
JF
360/* length of regex to show in messages that don't mark a position within */
361#define RegexLengthToShowInErrorMessages 127
362
363/*
364 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
365 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
366 * op/pragma/warn/regcomp.
367 */
7253e4e3
RK
368#define MARKER1 "<-- HERE" /* marker as it appears in the description */
369#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 370
7253e4e3 371#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
372
373/*
374 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
375 * arg. Show regex, up to a maximum length. If it's too long, chop and add
376 * "...".
377 */
58e23c8d 378#define _FAIL(code) STMT_START { \
bfed75c6 379 const char *ellipses = ""; \
ccb2c380
MP
380 IV len = RExC_end - RExC_precomp; \
381 \
382 if (!SIZE_ONLY) \
383 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
384 if (len > RegexLengthToShowInErrorMessages) { \
385 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
386 len = RegexLengthToShowInErrorMessages - 10; \
387 ellipses = "..."; \
388 } \
58e23c8d 389 code; \
ccb2c380 390} STMT_END
8615cb43 391
58e23c8d
YO
392#define FAIL(msg) _FAIL( \
393 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
394 msg, (int)len, RExC_precomp, ellipses))
395
396#define FAIL2(msg,arg) _FAIL( \
397 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
398 arg, (int)len, RExC_precomp, ellipses))
399
b45f050a 400/*
b45f050a
JF
401 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
402 */
ccb2c380 403#define Simple_vFAIL(m) STMT_START { \
a28509cc 404 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
405 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
406 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
407} STMT_END
b45f050a
JF
408
409/*
410 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
411 */
ccb2c380
MP
412#define vFAIL(m) STMT_START { \
413 if (!SIZE_ONLY) \
414 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
415 Simple_vFAIL(m); \
416} STMT_END
b45f050a
JF
417
418/*
419 * Like Simple_vFAIL(), but accepts two arguments.
420 */
ccb2c380 421#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 422 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
423 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
424 (int)offset, RExC_precomp, RExC_precomp + offset); \
425} STMT_END
b45f050a
JF
426
427/*
428 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
429 */
ccb2c380
MP
430#define vFAIL2(m,a1) STMT_START { \
431 if (!SIZE_ONLY) \
432 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
433 Simple_vFAIL2(m, a1); \
434} STMT_END
b45f050a
JF
435
436
437/*
438 * Like Simple_vFAIL(), but accepts three arguments.
439 */
ccb2c380 440#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 441 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
442 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
443 (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
b45f050a
JF
445
446/*
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
448 */
ccb2c380
MP
449#define vFAIL3(m,a1,a2) STMT_START { \
450 if (!SIZE_ONLY) \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
452 Simple_vFAIL3(m, a1, a2); \
453} STMT_END
b45f050a
JF
454
455/*
456 * Like Simple_vFAIL(), but accepts four arguments.
457 */
ccb2c380 458#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 459 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
b45f050a 463
ccb2c380 464#define vWARN(loc,m) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
467 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARNdep(loc,m) STMT_START { \
a28509cc 471 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
472 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
473 "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
475} STMT_END
476
477
478#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
489
490#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 491 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
494} STMT_END
495
496#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 497 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
500} STMT_END
9d1d55b5 501
8615cb43 502
cd439c50 503/* Allow for side effects in s */
ccb2c380
MP
504#define REGC(c,s) STMT_START { \
505 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
506} STMT_END
cd439c50 507
fac92740
MJD
508/* Macros for recording node offsets. 20001227 mjd@plover.com
509 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
510 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
511 * Element 0 holds the number n.
07be1b83 512 * Position is 1 indexed.
fac92740
MJD
513 */
514
ccb2c380
MP
515#define Set_Node_Offset_To_R(node,byte) STMT_START { \
516 if (! SIZE_ONLY) { \
517 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 518 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 519 if((node) < 0) { \
551405c4 520 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
521 } else { \
522 RExC_offsets[2*(node)-1] = (byte); \
523 } \
524 } \
525} STMT_END
526
527#define Set_Node_Offset(node,byte) \
528 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
529#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
530
531#define Set_Node_Length_To_R(node,len) STMT_START { \
532 if (! SIZE_ONLY) { \
533 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 534 __LINE__, (int)(node), (int)(len))); \
ccb2c380 535 if((node) < 0) { \
551405c4 536 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
537 } else { \
538 RExC_offsets[2*(node)] = (len); \
539 } \
540 } \
541} STMT_END
542
543#define Set_Node_Length(node,len) \
544 Set_Node_Length_To_R((node)-RExC_emit_start, len)
545#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
546#define Set_Node_Cur_Length(node) \
547 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
548
549/* Get offsets and lengths */
550#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
551#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
552
07be1b83
YO
553#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
555 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
556} STMT_END
557
558
559#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
560#define EXPERIMENTAL_INPLACESCAN
561#endif
562
304ee84b
YO
563#define DEBUG_STUDYDATA(str,data,depth) \
564DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 565 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
566 "%*s" str "Pos:%"IVdf"/%"IVdf \
567 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
568 (int)(depth)*2, "", \
569 (IV)((data)->pos_min), \
570 (IV)((data)->pos_delta), \
304ee84b 571 (UV)((data)->flags), \
1de06328 572 (IV)((data)->whilem_c), \
304ee84b
YO
573 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
574 is_inf ? "INF " : "" \
1de06328
YO
575 ); \
576 if ((data)->last_found) \
577 PerlIO_printf(Perl_debug_log, \
578 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
579 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
580 SvPVX_const((data)->last_found), \
581 (IV)((data)->last_end), \
582 (IV)((data)->last_start_min), \
583 (IV)((data)->last_start_max), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
586 SvPVX_const((data)->longest_fixed), \
587 (IV)((data)->offset_fixed), \
588 ((data)->longest && \
589 (data)->longest==&((data)->longest_float)) ? "*" : "", \
590 SvPVX_const((data)->longest_float), \
591 (IV)((data)->offset_float_min), \
592 (IV)((data)->offset_float_max) \
593 ); \
594 PerlIO_printf(Perl_debug_log,"\n"); \
595});
596
acfe0abc 597static void clear_re(pTHX_ void *r);
4327152a 598
653099ff 599/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 600 Update the longest found anchored substring and the longest found
653099ff
GS
601 floating substrings if needed. */
602
4327152a 603STATIC void
304ee84b 604S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 605{
e1ec3a88
AL
606 const STRLEN l = CHR_SVLEN(data->last_found);
607 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 608 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 609
c277df42 610 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 611 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
612 if (*data->longest == data->longest_fixed) {
613 data->offset_fixed = l ? data->last_start_min : data->pos_min;
614 if (data->flags & SF_BEFORE_EOL)
b81d288d 615 data->flags
c277df42
IZ
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
619 data->minlen_fixed=minlenp;
620 data->lookbehind_fixed=0;
a0ed51b3 621 }
304ee84b 622 else { /* *data->longest == data->longest_float */
c277df42 623 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
624 data->offset_float_max = (l
625 ? data->last_start_max
c277df42 626 : data->pos_min + data->pos_delta);
304ee84b 627 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 628 data->offset_float_max = I32_MAX;
c277df42 629 if (data->flags & SF_BEFORE_EOL)
b81d288d 630 data->flags
c277df42
IZ
631 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
632 else
633 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
634 data->minlen_float=minlenp;
635 data->lookbehind_float=0;
c277df42
IZ
636 }
637 }
638 SvCUR_set(data->last_found, 0);
0eda9292 639 {
a28509cc 640 SV * const sv = data->last_found;
097eb12c
AL
641 if (SvUTF8(sv) && SvMAGICAL(sv)) {
642 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
643 if (mg)
644 mg->mg_len = 0;
645 }
0eda9292 646 }
c277df42
IZ
647 data->last_end = -1;
648 data->flags &= ~SF_BEFORE_EOL;
304ee84b 649 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
650}
651
653099ff
GS
652/* Can match anything (initialization) */
653STATIC void
097eb12c 654S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 655{
653099ff 656 ANYOF_CLASS_ZERO(cl);
f8bef550 657 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 658 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
659 if (LOC)
660 cl->flags |= ANYOF_LOCALE;
661}
662
663/* Can match anything (initialization) */
664STATIC int
5f66b61c 665S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
666{
667 int value;
668
aaa51d5e 669 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
670 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
671 return 1;
1aa99e6b
IH
672 if (!(cl->flags & ANYOF_UNICODE_ALL))
673 return 0;
10edeb5d 674 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 675 return 0;
653099ff
GS
676 return 1;
677}
678
679/* Can match anything (initialization) */
680STATIC void
097eb12c 681S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 682{
8ecf7187 683 Zero(cl, 1, struct regnode_charclass_class);
653099ff 684 cl->type = ANYOF;
830247a4 685 cl_anything(pRExC_state, cl);
653099ff
GS
686}
687
688STATIC void
097eb12c 689S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 690{
8ecf7187 691 Zero(cl, 1, struct regnode_charclass_class);
653099ff 692 cl->type = ANYOF;
830247a4 693 cl_anything(pRExC_state, cl);
653099ff
GS
694 if (LOC)
695 cl->flags |= ANYOF_LOCALE;
696}
697
698/* 'And' a given class with another one. Can create false positives */
699/* We assume that cl is not inverted */
700STATIC void
5f66b61c 701S_cl_and(struct regnode_charclass_class *cl,
a28509cc 702 const struct regnode_charclass_class *and_with)
653099ff 703{
40d049e4
YO
704
705 assert(and_with->type == ANYOF);
653099ff
GS
706 if (!(and_with->flags & ANYOF_CLASS)
707 && !(cl->flags & ANYOF_CLASS)
708 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
709 && !(and_with->flags & ANYOF_FOLD)
710 && !(cl->flags & ANYOF_FOLD)) {
711 int i;
712
713 if (and_with->flags & ANYOF_INVERT)
714 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
715 cl->bitmap[i] &= ~and_with->bitmap[i];
716 else
717 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
718 cl->bitmap[i] &= and_with->bitmap[i];
719 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
720 if (!(and_with->flags & ANYOF_EOS))
721 cl->flags &= ~ANYOF_EOS;
1aa99e6b 722
14ebb1a2
JH
723 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
724 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
725 cl->flags &= ~ANYOF_UNICODE_ALL;
726 cl->flags |= ANYOF_UNICODE;
727 ARG_SET(cl, ARG(and_with));
728 }
14ebb1a2
JH
729 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
730 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 731 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
732 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
733 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 734 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
735}
736
737/* 'OR' a given class with another one. Can create false positives */
738/* We assume that cl is not inverted */
739STATIC void
097eb12c 740S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 741{
653099ff
GS
742 if (or_with->flags & ANYOF_INVERT) {
743 /* We do not use
744 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
745 * <= (B1 | !B2) | (CL1 | !CL2)
746 * which is wasteful if CL2 is small, but we ignore CL2:
747 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
748 * XXXX Can we handle case-fold? Unclear:
749 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
750 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
751 */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && !(or_with->flags & ANYOF_FOLD)
754 && !(cl->flags & ANYOF_FOLD) ) {
755 int i;
756
757 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
758 cl->bitmap[i] |= ~or_with->bitmap[i];
759 } /* XXXX: logic is complicated otherwise */
760 else {
830247a4 761 cl_anything(pRExC_state, cl);
653099ff
GS
762 }
763 } else {
764 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
765 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 766 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
767 || (cl->flags & ANYOF_FOLD)) ) {
768 int i;
769
770 /* OR char bitmap and class bitmap separately */
771 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
772 cl->bitmap[i] |= or_with->bitmap[i];
773 if (or_with->flags & ANYOF_CLASS) {
774 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
775 cl->classflags[i] |= or_with->classflags[i];
776 cl->flags |= ANYOF_CLASS;
777 }
778 }
779 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 780 cl_anything(pRExC_state, cl);
653099ff
GS
781 }
782 }
783 if (or_with->flags & ANYOF_EOS)
784 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
785
786 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
787 ARG(cl) != ARG(or_with)) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
791 if (or_with->flags & ANYOF_UNICODE_ALL) {
792 cl->flags |= ANYOF_UNICODE_ALL;
793 cl->flags &= ~ANYOF_UNICODE;
794 }
653099ff
GS
795}
796
a3621e74
YO
797#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
798#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
799#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
800#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
801
3dab1dad
YO
802
803#ifdef DEBUGGING
07be1b83 804/*
2b8b4781
NC
805 dump_trie(trie,widecharmap,revcharmap)
806 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
807 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
808
809 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
810 The _interim_ variants are used for debugging the interim
811 tables that are used to generate the final compressed
812 representation which is what dump_trie expects.
813
3dab1dad
YO
814 Part of the reason for their existance is to provide a form
815 of documentation as to how the different representations function.
07be1b83
YO
816
817*/
3dab1dad
YO
818
819/*
3dab1dad
YO
820 Dumps the final compressed table form of the trie to Perl_debug_log.
821 Used for debugging make_trie().
822*/
823
824STATIC void
2b8b4781
NC
825S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
826 AV *revcharmap, U32 depth)
3dab1dad
YO
827{
828 U32 state;
ab3bbdeb 829 SV *sv=sv_newmortal();
55eed653 830 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
831 GET_RE_DEBUG_FLAGS_DECL;
832
ab3bbdeb 833
3dab1dad
YO
834 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
835 (int)depth * 2 + 2,"",
836 "Match","Base","Ofs" );
837
838 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 839 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 840 if ( tmp ) {
ab3bbdeb
YO
841 PerlIO_printf( Perl_debug_log, "%*s",
842 colwidth,
ddc5bc0f 843 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
844 PL_colors[0], PL_colors[1],
845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
846 PERL_PV_ESCAPE_FIRSTCHAR
847 )
848 );
3dab1dad
YO
849 }
850 }
851 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
852 (int)depth * 2 + 2,"");
853
854 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 855 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
856 PerlIO_printf( Perl_debug_log, "\n");
857
1e2e3d02 858 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 859 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
860
861 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
862
863 if ( trie->states[ state ].wordnum ) {
864 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
865 } else {
866 PerlIO_printf( Perl_debug_log, "%6s", "" );
867 }
868
869 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
870
871 if ( base ) {
872 U32 ofs = 0;
873
874 while( ( base + ofs < trie->uniquecharcount ) ||
875 ( base + ofs - trie->uniquecharcount < trie->lasttrans
876 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
877 ofs++;
878
879 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
880
881 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
882 if ( ( base + ofs >= trie->uniquecharcount ) &&
883 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
884 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
885 {
ab3bbdeb
YO
886 PerlIO_printf( Perl_debug_log, "%*"UVXf,
887 colwidth,
3dab1dad
YO
888 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
889 } else {
ab3bbdeb 890 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
891 }
892 }
893
894 PerlIO_printf( Perl_debug_log, "]");
895
896 }
897 PerlIO_printf( Perl_debug_log, "\n" );
898 }
899}
900/*
3dab1dad
YO
901 Dumps a fully constructed but uncompressed trie in list form.
902 List tries normally only are used for construction when the number of
903 possible chars (trie->uniquecharcount) is very high.
904 Used for debugging make_trie().
905*/
906STATIC void
55eed653 907S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
908 HV *widecharmap, AV *revcharmap, U32 next_alloc,
909 U32 depth)
3dab1dad
YO
910{
911 U32 state;
ab3bbdeb 912 SV *sv=sv_newmortal();
55eed653 913 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
914 GET_RE_DEBUG_FLAGS_DECL;
915 /* print out the table precompression. */
ab3bbdeb
YO
916 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
917 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
918 "------:-----+-----------------\n" );
3dab1dad
YO
919
920 for( state=1 ; state < next_alloc ; state ++ ) {
921 U16 charid;
922
ab3bbdeb 923 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
924 (int)depth * 2 + 2,"", (UV)state );
925 if ( ! trie->states[ state ].wordnum ) {
926 PerlIO_printf( Perl_debug_log, "%5s| ","");
927 } else {
928 PerlIO_printf( Perl_debug_log, "W%4x| ",
929 trie->states[ state ].wordnum
930 );
931 }
932 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 933 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
934 if ( tmp ) {
935 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
936 colwidth,
ddc5bc0f 937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
941 ) ,
1e2e3d02
YO
942 TRIE_LIST_ITEM(state,charid).forid,
943 (UV)TRIE_LIST_ITEM(state,charid).newstate
944 );
945 if (!(charid % 10))
664e119d
RGS
946 PerlIO_printf(Perl_debug_log, "\n%*s| ",
947 (int)((depth * 2) + 14), "");
1e2e3d02 948 }
ab3bbdeb
YO
949 }
950 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
951 }
952}
953
954/*
3dab1dad
YO
955 Dumps a fully constructed but uncompressed trie in table form.
956 This is the normal DFA style state transition table, with a few
957 twists to facilitate compression later.
958 Used for debugging make_trie().
959*/
960STATIC void
55eed653 961S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
962 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 U32 depth)
3dab1dad
YO
964{
965 U32 state;
966 U16 charid;
ab3bbdeb 967 SV *sv=sv_newmortal();
55eed653 968 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
969 GET_RE_DEBUG_FLAGS_DECL;
970
971 /*
972 print out the table precompression so that we can do a visual check
973 that they are identical.
974 */
975
976 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977
978 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 979 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 980 if ( tmp ) {
ab3bbdeb
YO
981 PerlIO_printf( Perl_debug_log, "%*s",
982 colwidth,
ddc5bc0f 983 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
984 PL_colors[0], PL_colors[1],
985 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
986 PERL_PV_ESCAPE_FIRSTCHAR
987 )
988 );
3dab1dad
YO
989 }
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
993
994 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 995 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
996 }
997
998 PerlIO_printf( Perl_debug_log, "\n" );
999
1000 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1001
1002 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1003 (int)depth * 2 + 2,"",
1004 (UV)TRIE_NODENUM( state ) );
1005
1006 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1007 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1008 if (v)
1009 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1010 else
1011 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1012 }
1013 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1014 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1015 } else {
1016 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1017 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1018 }
1019 }
07be1b83 1020}
3dab1dad
YO
1021
1022#endif
1023
786e8c11
YO
1024/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1025 startbranch: the first branch in the whole branch sequence
1026 first : start branch of sequence of branch-exact nodes.
1027 May be the same as startbranch
1028 last : Thing following the last branch.
1029 May be the same as tail.
1030 tail : item following the branch sequence
1031 count : words in the sequence
1032 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1033 depth : indent depth
3dab1dad 1034
786e8c11 1035Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1036
786e8c11
YO
1037A trie is an N'ary tree where the branches are determined by digital
1038decomposition of the key. IE, at the root node you look up the 1st character and
1039follow that branch repeat until you find the end of the branches. Nodes can be
1040marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1041
786e8c11 1042 /he|she|his|hers/
72f13be8 1043
786e8c11
YO
1044would convert into the following structure. Numbers represent states, letters
1045following numbers represent valid transitions on the letter from that state, if
1046the number is in square brackets it represents an accepting state, otherwise it
1047will be in parenthesis.
07be1b83 1048
786e8c11
YO
1049 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1050 | |
1051 | (2)
1052 | |
1053 (1) +-i->(6)-+-s->[7]
1054 |
1055 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1056
786e8c11
YO
1057 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1058
1059This shows that when matching against the string 'hers' we will begin at state 1
1060read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1061then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1062is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1063single traverse. We store a mapping from accepting to state to which word was
1064matched, and then when we have multiple possibilities we try to complete the
1065rest of the regex in the order in which they occured in the alternation.
1066
1067The only prior NFA like behaviour that would be changed by the TRIE support is
1068the silent ignoring of duplicate alternations which are of the form:
1069
1070 / (DUPE|DUPE) X? (?{ ... }) Y /x
1071
1072Thus EVAL blocks follwing a trie may be called a different number of times with
1073and without the optimisation. With the optimisations dupes will be silently
1074ignored. This inconsistant behaviour of EVAL type nodes is well established as
1075the following demonstrates:
1076
1077 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1078
1079which prints out 'word' three times, but
1080
1081 'words'=~/(word|word|word)(?{ print $1 })S/
1082
1083which doesnt print it out at all. This is due to other optimisations kicking in.
1084
1085Example of what happens on a structural level:
1086
1087The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1088
1089 1: CURLYM[1] {1,32767}(18)
1090 5: BRANCH(8)
1091 6: EXACT <ac>(16)
1092 8: BRANCH(11)
1093 9: EXACT <ad>(16)
1094 11: BRANCH(14)
1095 12: EXACT <ab>(16)
1096 16: SUCCEED(0)
1097 17: NOTHING(18)
1098 18: END(0)
1099
1100This would be optimizable with startbranch=5, first=5, last=16, tail=16
1101and should turn into:
1102
1103 1: CURLYM[1] {1,32767}(18)
1104 5: TRIE(16)
1105 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1106 <ac>
1107 <ad>
1108 <ab>
1109 16: SUCCEED(0)
1110 17: NOTHING(18)
1111 18: END(0)
1112
1113Cases where tail != last would be like /(?foo|bar)baz/:
1114
1115 1: BRANCH(4)
1116 2: EXACT <foo>(8)
1117 4: BRANCH(7)
1118 5: EXACT <bar>(8)
1119 7: TAIL(8)
1120 8: EXACT <baz>(10)
1121 10: END(0)
1122
1123which would be optimizable with startbranch=1, first=1, last=7, tail=8
1124and would end up looking like:
1125
1126 1: TRIE(8)
1127 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1128 <foo>
1129 <bar>
1130 7: TAIL(8)
1131 8: EXACT <baz>(10)
1132 10: END(0)
1133
1134 d = uvuni_to_utf8_flags(d, uv, 0);
1135
1136is the recommended Unicode-aware way of saying
1137
1138 *(d++) = uv;
1139*/
1140
1e2e3d02 1141#define TRIE_STORE_REVCHAR \
786e8c11 1142 STMT_START { \
1e2e3d02 1143 SV *tmp = newSVpvs(""); \
786e8c11 1144 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1145 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1146 av_push( revcharmap, tmp ); \
786e8c11
YO
1147 } STMT_END
1148
1149#define TRIE_READ_CHAR STMT_START { \
1150 wordlen++; \
1151 if ( UTF ) { \
1152 if ( folder ) { \
1153 if ( foldlen > 0 ) { \
1154 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1155 foldlen -= len; \
1156 scan += len; \
1157 len = 0; \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1161 foldlen -= UNISKIP( uvc ); \
1162 scan = foldbuf + UNISKIP( uvc ); \
1163 } \
1164 } else { \
1165 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1166 } \
1167 } else { \
1168 uvc = (U32)*uc; \
1169 len = 1; \
1170 } \
1171} STMT_END
1172
1173
1174
1175#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1176 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1177 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1178 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1179 } \
1180 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1181 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1182 TRIE_LIST_CUR( state )++; \
1183} STMT_END
07be1b83 1184
786e8c11
YO
1185#define TRIE_LIST_NEW(state) STMT_START { \
1186 Newxz( trie->states[ state ].trans.list, \
1187 4, reg_trie_trans_le ); \
1188 TRIE_LIST_CUR( state ) = 1; \
1189 TRIE_LIST_LEN( state ) = 4; \
1190} STMT_END
07be1b83 1191
786e8c11
YO
1192#define TRIE_HANDLE_WORD(state) STMT_START { \
1193 U16 dupe= trie->states[ state ].wordnum; \
1194 regnode * const noper_next = regnext( noper ); \
1195 \
1196 if (trie->wordlen) \
1197 trie->wordlen[ curword ] = wordlen; \
1198 DEBUG_r({ \
1199 /* store the word for dumping */ \
1200 SV* tmp; \
1201 if (OP(noper) != NOTHING) \
1202 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1203 else \
1204 tmp = newSVpvn( "", 0 ); \
1205 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1206 av_push( trie_words, tmp ); \
786e8c11
YO
1207 }); \
1208 \
1209 curword++; \
1210 \
1211 if ( noper_next < tail ) { \
1212 if (!trie->jump) \
c944940b 1213 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1214 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1215 if (!jumper) \
1216 jumper = noper_next; \
1217 if (!nextbranch) \
1218 nextbranch= regnext(cur); \
1219 } \
1220 \
1221 if ( dupe ) { \
1222 /* So it's a dupe. This means we need to maintain a */\
1223 /* linked-list from the first to the next. */\
1224 /* we only allocate the nextword buffer when there */\
1225 /* a dupe, so first time we have to do the allocation */\
1226 if (!trie->nextword) \
c944940b 1227 trie->nextword = (U16 *) \
446bd890 1228 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1229 while ( trie->nextword[dupe] ) \
1230 dupe= trie->nextword[dupe]; \
1231 trie->nextword[dupe]= curword; \
1232 } else { \
1233 /* we haven't inserted this word yet. */ \
1234 trie->states[ state ].wordnum = curword; \
1235 } \
1236} STMT_END
07be1b83 1237
3dab1dad 1238
786e8c11
YO
1239#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1240 ( ( base + charid >= ucharcount \
1241 && base + charid < ubound \
1242 && state == trie->trans[ base - ucharcount + charid ].check \
1243 && trie->trans[ base - ucharcount + charid ].next ) \
1244 ? trie->trans[ base - ucharcount + charid ].next \
1245 : ( state==1 ? special : 0 ) \
1246 )
3dab1dad 1247
786e8c11
YO
1248#define MADE_TRIE 1
1249#define MADE_JUMP_TRIE 2
1250#define MADE_EXACT_TRIE 4
3dab1dad 1251
a3621e74 1252STATIC I32
786e8c11 1253S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1254{
27da23d5 1255 dVAR;
a3621e74
YO
1256 /* first pass, loop through and scan words */
1257 reg_trie_data *trie;
55eed653 1258 HV *widecharmap = NULL;
2b8b4781 1259 AV *revcharmap = newAV();
a3621e74 1260 regnode *cur;
9f7f3913 1261 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1262 STRLEN len = 0;
1263 UV uvc = 0;
1264 U16 curword = 0;
1265 U32 next_alloc = 0;
786e8c11
YO
1266 regnode *jumper = NULL;
1267 regnode *nextbranch = NULL;
7f69552c 1268 regnode *convert = NULL;
a3621e74 1269 /* we just use folder as a flag in utf8 */
e1ec3a88 1270 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1271 ? PL_fold
1272 : ( flags == EXACTFL
1273 ? PL_fold_locale
1274 : NULL
1275 )
1276 );
1277
2b8b4781
NC
1278#ifdef DEBUGGING
1279 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1280 AV *trie_words = NULL;
1281 /* along with revcharmap, this only used during construction but both are
1282 * useful during debugging so we store them in the struct when debugging.
8e11feef 1283 */
2b8b4781
NC
1284#else
1285 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1286 STRLEN trie_charcount=0;
3dab1dad 1287#endif
2b8b4781 1288 SV *re_trie_maxbuff;
a3621e74 1289 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1290#ifndef DEBUGGING
1291 PERL_UNUSED_ARG(depth);
1292#endif
a3621e74 1293
c944940b 1294 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1295 trie->refcount = 1;
3dab1dad 1296 trie->startstate = 1;
786e8c11 1297 trie->wordcount = word_count;
f8fc2ecf 1298 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1299 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1300 if (!(UTF && folder))
c944940b 1301 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1302 DEBUG_r({
2b8b4781 1303 trie_words = newAV();
a3621e74 1304 });
a3621e74 1305
0111c4fd 1306 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1307 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1308 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1309 }
3dab1dad
YO
1310 DEBUG_OPTIMISE_r({
1311 PerlIO_printf( Perl_debug_log,
786e8c11 1312 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1313 (int)depth * 2 + 2, "",
1314 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1315 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1316 (int)depth);
3dab1dad 1317 });
7f69552c
YO
1318
1319 /* Find the node we are going to overwrite */
1320 if ( first == startbranch && OP( last ) != BRANCH ) {
1321 /* whole branch chain */
1322 convert = first;
1323 } else {
1324 /* branch sub-chain */
1325 convert = NEXTOPER( first );
1326 }
1327
a3621e74
YO
1328 /* -- First loop and Setup --
1329
1330 We first traverse the branches and scan each word to determine if it
1331 contains widechars, and how many unique chars there are, this is
1332 important as we have to build a table with at least as many columns as we
1333 have unique chars.
1334
1335 We use an array of integers to represent the character codes 0..255
1336 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1337 native representation of the character value as the key and IV's for the
1338 coded index.
1339
1340 *TODO* If we keep track of how many times each character is used we can
1341 remap the columns so that the table compression later on is more
1342 efficient in terms of memory by ensuring most common value is in the
1343 middle and the least common are on the outside. IMO this would be better
1344 than a most to least common mapping as theres a decent chance the most
1345 common letter will share a node with the least common, meaning the node
1346 will not be compressable. With a middle is most common approach the worst
1347 case is when we have the least common nodes twice.
1348
1349 */
1350
a3621e74 1351 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1352 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1353 const U8 *uc = (U8*)STRING( noper );
a28509cc 1354 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1355 STRLEN foldlen = 0;
1356 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1357 const U8 *scan = (U8*)NULL;
07be1b83 1358 U32 wordlen = 0; /* required init */
3dab1dad 1359 STRLEN chars=0;
a3621e74 1360
3dab1dad
YO
1361 if (OP(noper) == NOTHING) {
1362 trie->minlen= 0;
1363 continue;
1364 }
1365 if (trie->bitmap) {
1366 TRIE_BITMAP_SET(trie,*uc);
1367 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1368 }
a3621e74 1369 for ( ; uc < e ; uc += len ) {
3dab1dad 1370 TRIE_CHARCOUNT(trie)++;
a3621e74 1371 TRIE_READ_CHAR;
3dab1dad 1372 chars++;
a3621e74
YO
1373 if ( uvc < 256 ) {
1374 if ( !trie->charmap[ uvc ] ) {
1375 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1376 if ( folder )
1377 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1378 TRIE_STORE_REVCHAR;
a3621e74
YO
1379 }
1380 } else {
1381 SV** svpp;
55eed653
NC
1382 if ( !widecharmap )
1383 widecharmap = newHV();
a3621e74 1384
55eed653 1385 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1386
1387 if ( !svpp )
e4584336 1388 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1389
1390 if ( !SvTRUE( *svpp ) ) {
1391 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1392 TRIE_STORE_REVCHAR;
a3621e74
YO
1393 }
1394 }
1395 }
3dab1dad
YO
1396 if( cur == first ) {
1397 trie->minlen=chars;
1398 trie->maxlen=chars;
1399 } else if (chars < trie->minlen) {
1400 trie->minlen=chars;
1401 } else if (chars > trie->maxlen) {
1402 trie->maxlen=chars;
1403 }
1404
a3621e74
YO
1405 } /* end first pass */
1406 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1407 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1408 (int)depth * 2 + 2,"",
55eed653 1409 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1410 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1411 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1412 );
c944940b 1413 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1414
1415 /*
1416 We now know what we are dealing with in terms of unique chars and
1417 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1418 representation using a flat table will take. If it's over a reasonable
1419 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1420 conservative but potentially much slower representation using an array
1421 of lists.
1422
1423 At the end we convert both representations into the same compressed
1424 form that will be used in regexec.c for matching with. The latter
1425 is a form that cannot be used to construct with but has memory
1426 properties similar to the list form and access properties similar
1427 to the table form making it both suitable for fast searches and
1428 small enough that its feasable to store for the duration of a program.
1429
1430 See the comment in the code where the compressed table is produced
1431 inplace from the flat tabe representation for an explanation of how
1432 the compression works.
1433
1434 */
1435
1436
3dab1dad 1437 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1438 /*
1439 Second Pass -- Array Of Lists Representation
1440
1441 Each state will be represented by a list of charid:state records
1442 (reg_trie_trans_le) the first such element holds the CUR and LEN
1443 points of the allocated array. (See defines above).
1444
1445 We build the initial structure using the lists, and then convert
1446 it into the compressed table form which allows faster lookups
1447 (but cant be modified once converted).
a3621e74
YO
1448 */
1449
a3621e74
YO
1450 STRLEN transcount = 1;
1451
1e2e3d02
YO
1452 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1453 "%*sCompiling trie using list compiler\n",
1454 (int)depth * 2 + 2, ""));
446bd890 1455
c944940b
JH
1456 trie->states = (reg_trie_state *)
1457 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1458 sizeof(reg_trie_state) );
a3621e74
YO
1459 TRIE_LIST_NEW(1);
1460 next_alloc = 2;
1461
1462 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1463
c445ea15
AL
1464 regnode * const noper = NEXTOPER( cur );
1465 U8 *uc = (U8*)STRING( noper );
1466 const U8 * const e = uc + STR_LEN( noper );
1467 U32 state = 1; /* required init */
1468 U16 charid = 0; /* sanity init */
1469 U8 *scan = (U8*)NULL; /* sanity init */
1470 STRLEN foldlen = 0; /* required init */
07be1b83 1471 U32 wordlen = 0; /* required init */
c445ea15
AL
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1473
3dab1dad 1474 if (OP(noper) != NOTHING) {
786e8c11 1475 for ( ; uc < e ; uc += len ) {
c445ea15 1476
786e8c11 1477 TRIE_READ_CHAR;
c445ea15 1478
786e8c11
YO
1479 if ( uvc < 256 ) {
1480 charid = trie->charmap[ uvc ];
c445ea15 1481 } else {
55eed653 1482 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1483 if ( !svpp ) {
1484 charid = 0;
1485 } else {
1486 charid=(U16)SvIV( *svpp );
1487 }
c445ea15 1488 }
786e8c11
YO
1489 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1490 if ( charid ) {
a3621e74 1491
786e8c11
YO
1492 U16 check;
1493 U32 newstate = 0;
a3621e74 1494
786e8c11
YO
1495 charid--;
1496 if ( !trie->states[ state ].trans.list ) {
1497 TRIE_LIST_NEW( state );
c445ea15 1498 }
786e8c11
YO
1499 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1500 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1501 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1502 break;
1503 }
1504 }
1505 if ( ! newstate ) {
1506 newstate = next_alloc++;
1507 TRIE_LIST_PUSH( state, charid, newstate );
1508 transcount++;
1509 }
1510 state = newstate;
1511 } else {
1512 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1513 }
a28509cc 1514 }
c445ea15 1515 }
3dab1dad 1516 TRIE_HANDLE_WORD(state);
a3621e74
YO
1517
1518 } /* end second pass */
1519
1e2e3d02
YO
1520 /* next alloc is the NEXT state to be allocated */
1521 trie->statecount = next_alloc;
c944940b
JH
1522 trie->states = (reg_trie_state *)
1523 PerlMemShared_realloc( trie->states,
1524 next_alloc
1525 * sizeof(reg_trie_state) );
a3621e74 1526
3dab1dad 1527 /* and now dump it out before we compress it */
2b8b4781
NC
1528 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1529 revcharmap, next_alloc,
1530 depth+1)
1e2e3d02 1531 );
a3621e74 1532
c944940b
JH
1533 trie->trans = (reg_trie_trans *)
1534 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1535 {
1536 U32 state;
a3621e74
YO
1537 U32 tp = 0;
1538 U32 zp = 0;
1539
1540
1541 for( state=1 ; state < next_alloc ; state ++ ) {
1542 U32 base=0;
1543
1544 /*
1545 DEBUG_TRIE_COMPILE_MORE_r(
1546 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1547 );
1548 */
1549
1550 if (trie->states[state].trans.list) {
1551 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1552 U16 maxid=minid;
a28509cc 1553 U16 idx;
a3621e74
YO
1554
1555 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1556 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1557 if ( forid < minid ) {
1558 minid=forid;
1559 } else if ( forid > maxid ) {
1560 maxid=forid;
1561 }
a3621e74
YO
1562 }
1563 if ( transcount < tp + maxid - minid + 1) {
1564 transcount *= 2;
c944940b
JH
1565 trie->trans = (reg_trie_trans *)
1566 PerlMemShared_realloc( trie->trans,
446bd890
NC
1567 transcount
1568 * sizeof(reg_trie_trans) );
a3621e74
YO
1569 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1570 }
1571 base = trie->uniquecharcount + tp - minid;
1572 if ( maxid == minid ) {
1573 U32 set = 0;
1574 for ( ; zp < tp ; zp++ ) {
1575 if ( ! trie->trans[ zp ].next ) {
1576 base = trie->uniquecharcount + zp - minid;
1577 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ zp ].check = state;
1579 set = 1;
1580 break;
1581 }
1582 }
1583 if ( !set ) {
1584 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1585 trie->trans[ tp ].check = state;
1586 tp++;
1587 zp = tp;
1588 }
1589 } else {
1590 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1591 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1592 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1593 trie->trans[ tid ].check = state;
1594 }
1595 tp += ( maxid - minid + 1 );
1596 }
1597 Safefree(trie->states[ state ].trans.list);
1598 }
1599 /*
1600 DEBUG_TRIE_COMPILE_MORE_r(
1601 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1602 );
1603 */
1604 trie->states[ state ].trans.base=base;
1605 }
cc601c31 1606 trie->lasttrans = tp + 1;
a3621e74
YO
1607 }
1608 } else {
1609 /*
1610 Second Pass -- Flat Table Representation.
1611
1612 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1613 We know that we will need Charcount+1 trans at most to store the data
1614 (one row per char at worst case) So we preallocate both structures
1615 assuming worst case.
1616
1617 We then construct the trie using only the .next slots of the entry
1618 structs.
1619
1620 We use the .check field of the first entry of the node temporarily to
1621 make compression both faster and easier by keeping track of how many non
1622 zero fields are in the node.
1623
1624 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1625 transition.
1626
1627 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1628 number representing the first entry of the node, and state as a
1629 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1630 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1631 are 2 entrys per node. eg:
1632
1633 A B A B
1634 1. 2 4 1. 3 7
1635 2. 0 3 3. 0 5
1636 3. 0 0 5. 0 0
1637 4. 0 0 7. 0 0
1638
1639 The table is internally in the right hand, idx form. However as we also
1640 have to deal with the states array which is indexed by nodenum we have to
1641 use TRIE_NODENUM() to convert.
1642
1643 */
1e2e3d02
YO
1644 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1645 "%*sCompiling trie using table compiler\n",
1646 (int)depth * 2 + 2, ""));
3dab1dad 1647
c944940b
JH
1648 trie->trans = (reg_trie_trans *)
1649 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1650 * trie->uniquecharcount + 1,
1651 sizeof(reg_trie_trans) );
1652 trie->states = (reg_trie_state *)
1653 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1654 sizeof(reg_trie_state) );
a3621e74
YO
1655 next_alloc = trie->uniquecharcount + 1;
1656
3dab1dad 1657
a3621e74
YO
1658 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1659
c445ea15 1660 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1661 const U8 *uc = (U8*)STRING( noper );
1662 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1663
1664 U32 state = 1; /* required init */
1665
1666 U16 charid = 0; /* sanity init */
1667 U32 accept_state = 0; /* sanity init */
1668 U8 *scan = (U8*)NULL; /* sanity init */
1669
1670 STRLEN foldlen = 0; /* required init */
07be1b83 1671 U32 wordlen = 0; /* required init */
a3621e74
YO
1672 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1673
3dab1dad 1674 if ( OP(noper) != NOTHING ) {
786e8c11 1675 for ( ; uc < e ; uc += len ) {
a3621e74 1676
786e8c11 1677 TRIE_READ_CHAR;
a3621e74 1678
786e8c11
YO
1679 if ( uvc < 256 ) {
1680 charid = trie->charmap[ uvc ];
1681 } else {
55eed653 1682 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1683 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1684 }
786e8c11
YO
1685 if ( charid ) {
1686 charid--;
1687 if ( !trie->trans[ state + charid ].next ) {
1688 trie->trans[ state + charid ].next = next_alloc;
1689 trie->trans[ state ].check++;
1690 next_alloc += trie->uniquecharcount;
1691 }
1692 state = trie->trans[ state + charid ].next;
1693 } else {
1694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1695 }
1696 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1697 }
a3621e74 1698 }
3dab1dad
YO
1699 accept_state = TRIE_NODENUM( state );
1700 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1701
1702 } /* end second pass */
1703
3dab1dad 1704 /* and now dump it out before we compress it */
2b8b4781
NC
1705 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1706 revcharmap,
1707 next_alloc, depth+1));
a3621e74 1708
a3621e74
YO
1709 {
1710 /*
1711 * Inplace compress the table.*
1712
1713 For sparse data sets the table constructed by the trie algorithm will
1714 be mostly 0/FAIL transitions or to put it another way mostly empty.
1715 (Note that leaf nodes will not contain any transitions.)
1716
1717 This algorithm compresses the tables by eliminating most such
1718 transitions, at the cost of a modest bit of extra work during lookup:
1719
1720 - Each states[] entry contains a .base field which indicates the
1721 index in the state[] array wheres its transition data is stored.
1722
1723 - If .base is 0 there are no valid transitions from that node.
1724
1725 - If .base is nonzero then charid is added to it to find an entry in
1726 the trans array.
1727
1728 -If trans[states[state].base+charid].check!=state then the
1729 transition is taken to be a 0/Fail transition. Thus if there are fail
1730 transitions at the front of the node then the .base offset will point
1731 somewhere inside the previous nodes data (or maybe even into a node
1732 even earlier), but the .check field determines if the transition is
1733 valid.
1734
786e8c11 1735 XXX - wrong maybe?
a3621e74
YO
1736 The following process inplace converts the table to the compressed
1737 table: We first do not compress the root node 1,and mark its all its
1738 .check pointers as 1 and set its .base pointer as 1 as well. This
1739 allows to do a DFA construction from the compressed table later, and
1740 ensures that any .base pointers we calculate later are greater than
1741 0.
1742
1743 - We set 'pos' to indicate the first entry of the second node.
1744
1745 - We then iterate over the columns of the node, finding the first and
1746 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1747 and set the .check pointers accordingly, and advance pos
1748 appropriately and repreat for the next node. Note that when we copy
1749 the next pointers we have to convert them from the original
1750 NODEIDX form to NODENUM form as the former is not valid post
1751 compression.
1752
1753 - If a node has no transitions used we mark its base as 0 and do not
1754 advance the pos pointer.
1755
1756 - If a node only has one transition we use a second pointer into the
1757 structure to fill in allocated fail transitions from other states.
1758 This pointer is independent of the main pointer and scans forward
1759 looking for null transitions that are allocated to a state. When it
1760 finds one it writes the single transition into the "hole". If the
786e8c11 1761 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1762
1763 - Once compressed we can Renew/realloc the structures to release the
1764 excess space.
1765
1766 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1767 specifically Fig 3.47 and the associated pseudocode.
1768
1769 demq
1770 */
a3b680e6 1771 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1772 U32 state, charid;
a3621e74 1773 U32 pos = 0, zp=0;
1e2e3d02 1774 trie->statecount = laststate;
a3621e74
YO
1775
1776 for ( state = 1 ; state < laststate ; state++ ) {
1777 U8 flag = 0;
a28509cc
AL
1778 const U32 stateidx = TRIE_NODEIDX( state );
1779 const U32 o_used = trie->trans[ stateidx ].check;
1780 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1781 trie->trans[ stateidx ].check = 0;
1782
1783 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1784 if ( flag || trie->trans[ stateidx + charid ].next ) {
1785 if ( trie->trans[ stateidx + charid ].next ) {
1786 if (o_used == 1) {
1787 for ( ; zp < pos ; zp++ ) {
1788 if ( ! trie->trans[ zp ].next ) {
1789 break;
1790 }
1791 }
1792 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1793 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1794 trie->trans[ zp ].check = state;
1795 if ( ++zp > pos ) pos = zp;
1796 break;
1797 }
1798 used--;
1799 }
1800 if ( !flag ) {
1801 flag = 1;
1802 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1803 }
1804 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1805 trie->trans[ pos ].check = state;
1806 pos++;
1807 }
1808 }
1809 }
cc601c31 1810 trie->lasttrans = pos + 1;
c944940b
JH
1811 trie->states = (reg_trie_state *)
1812 PerlMemShared_realloc( trie->states, laststate
1813 * sizeof(reg_trie_state) );
a3621e74 1814 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1815 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1816 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1817 (int)depth * 2 + 2,"",
1818 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1819 (IV)next_alloc,
1820 (IV)pos,
a3621e74
YO
1821 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1822 );
1823
1824 } /* end table compress */
1825 }
1e2e3d02
YO
1826 DEBUG_TRIE_COMPILE_MORE_r(
1827 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1828 (int)depth * 2 + 2, "",
1829 (UV)trie->statecount,
1830 (UV)trie->lasttrans)
1831 );
cc601c31 1832 /* resize the trans array to remove unused space */
c944940b
JH
1833 trie->trans = (reg_trie_trans *)
1834 PerlMemShared_realloc( trie->trans, trie->lasttrans
1835 * sizeof(reg_trie_trans) );
a3621e74 1836
3dab1dad 1837 /* and now dump out the compressed format */
2b8b4781 1838 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1839
3dab1dad 1840 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1841 U8 nodetype =(U8)(flags & 0xFF);
1842 char *str=NULL;
786e8c11 1843
07be1b83 1844#ifdef DEBUGGING
e62cc96a 1845 regnode *optimize = NULL;
b57a0404
JH
1846 U32 mjd_offset = 0;
1847 U32 mjd_nodelen = 0;
07be1b83 1848#endif
a3621e74 1849 /*
3dab1dad
YO
1850 This means we convert either the first branch or the first Exact,
1851 depending on whether the thing following (in 'last') is a branch
1852 or not and whther first is the startbranch (ie is it a sub part of
1853 the alternation or is it the whole thing.)
1854 Assuming its a sub part we conver the EXACT otherwise we convert
1855 the whole branch sequence, including the first.
a3621e74 1856 */
3dab1dad 1857 /* Find the node we are going to overwrite */
7f69552c 1858 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1859 /* branch sub-chain */
3dab1dad 1860 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1861 DEBUG_r({
1862 mjd_offset= Node_Offset((convert));
1863 mjd_nodelen= Node_Length((convert));
1864 });
7f69552c
YO
1865 /* whole branch chain */
1866 } else {
1867 DEBUG_r({
1868 const regnode *nop = NEXTOPER( convert );
1869 mjd_offset= Node_Offset((nop));
1870 mjd_nodelen= Node_Length((nop));
1871 });
07be1b83 1872 }
7f69552c 1873
07be1b83
YO
1874 DEBUG_OPTIMISE_r(
1875 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1876 (int)depth * 2 + 2, "",
786e8c11 1877 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1878 );
a3621e74 1879
3dab1dad
YO
1880 /* But first we check to see if there is a common prefix we can
1881 split out as an EXACT and put in front of the TRIE node. */
1882 trie->startstate= 1;
55eed653 1883 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1884 U32 state;
1e2e3d02 1885 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1886 U32 ofs = 0;
8e11feef
RGS
1887 I32 idx = -1;
1888 U32 count = 0;
1889 const U32 base = trie->states[ state ].trans.base;
a3621e74 1890
3dab1dad 1891 if ( trie->states[state].wordnum )
8e11feef 1892 count = 1;
a3621e74 1893
8e11feef 1894 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1895 if ( ( base + ofs >= trie->uniquecharcount ) &&
1896 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1897 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1898 {
3dab1dad 1899 if ( ++count > 1 ) {
2b8b4781 1900 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1901 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1902 if ( state == 1 ) break;
3dab1dad
YO
1903 if ( count == 2 ) {
1904 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1905 DEBUG_OPTIMISE_r(
8e11feef
RGS
1906 PerlIO_printf(Perl_debug_log,
1907 "%*sNew Start State=%"UVuf" Class: [",
1908 (int)depth * 2 + 2, "",
786e8c11 1909 (UV)state));
be8e71aa 1910 if (idx >= 0) {
2b8b4781 1911 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1912 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1913
3dab1dad 1914 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1915 if ( folder )
1916 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1917 DEBUG_OPTIMISE_r(
07be1b83 1918 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1919 );
8e11feef
RGS
1920 }
1921 }
1922 TRIE_BITMAP_SET(trie,*ch);
1923 if ( folder )
1924 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1925 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1926 }
1927 idx = ofs;
1928 }
3dab1dad
YO
1929 }
1930 if ( count == 1 ) {
2b8b4781 1931 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1932 char *ch = SvPV_nolen( *tmp );
1933 DEBUG_OPTIMISE_r({
1934 SV *sv=sv_newmortal();
8e11feef
RGS
1935 PerlIO_printf( Perl_debug_log,
1936 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1937 (int)depth * 2 + 2, "",
de734bd5
A
1938 (UV)state, (UV)idx,
1939 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1940 PL_colors[0], PL_colors[1],
1941 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1942 PERL_PV_ESCAPE_FIRSTCHAR
1943 )
1944 );
1945 });
3dab1dad
YO
1946 if ( state==1 ) {
1947 OP( convert ) = nodetype;
1948 str=STRING(convert);
1949 STR_LEN(convert)=0;
1950 }
de734bd5
A
1951 while (*ch) {
1952 *str++ = *ch++;
1953 STR_LEN(convert)++;
1954 }
1955
8e11feef 1956 } else {
f9049ba1 1957#ifdef DEBUGGING
8e11feef
RGS
1958 if (state>1)
1959 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1960#endif
8e11feef
RGS
1961 break;
1962 }
1963 }
3dab1dad 1964 if (str) {
8e11feef 1965 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1966 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1967 trie->startstate = state;
07be1b83
YO
1968 trie->minlen -= (state - 1);
1969 trie->maxlen -= (state - 1);
1970 DEBUG_r({
1971 regnode *fix = convert;
de734bd5 1972 U32 word = trie->wordcount;
07be1b83
YO
1973 mjd_nodelen++;
1974 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1975 while( ++fix < n ) {
1976 Set_Node_Offset_Length(fix, 0, 0);
1977 }
de734bd5 1978 while (word--) {
2b8b4781 1979 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
1980 if (tmp) {
1981 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1982 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1983 else
1984 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1985 }
1986 }
07be1b83 1987 });
8e11feef
RGS
1988 if (trie->maxlen) {
1989 convert = n;
1990 } else {
3dab1dad 1991 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1992 DEBUG_r(optimize= n);
3dab1dad
YO
1993 }
1994 }
1995 }
a5ca303d
YO
1996 if (!jumper)
1997 jumper = last;
3dab1dad 1998 if ( trie->maxlen ) {
8e11feef
RGS
1999 NEXT_OFF( convert ) = (U16)(tail - convert);
2000 ARG_SET( convert, data_slot );
786e8c11
YO
2001 /* Store the offset to the first unabsorbed branch in
2002 jump[0], which is otherwise unused by the jump logic.
2003 We use this when dumping a trie and during optimisation. */
2004 if (trie->jump)
7f69552c 2005 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2006
786e8c11
YO
2007 /* XXXX */
2008 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2009 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2010 {
2011 OP( convert ) = TRIEC;
2012 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2013 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2014 trie->bitmap= NULL;
2015 } else
2016 OP( convert ) = TRIE;
a3621e74 2017
3dab1dad
YO
2018 /* store the type in the flags */
2019 convert->flags = nodetype;
a5ca303d
YO
2020 DEBUG_r({
2021 optimize = convert
2022 + NODE_STEP_REGNODE
2023 + regarglen[ OP( convert ) ];
2024 });
2025 /* XXX We really should free up the resource in trie now,
2026 as we won't use them - (which resources?) dmq */
3dab1dad 2027 }
a3621e74 2028 /* needed for dumping*/
e62cc96a 2029 DEBUG_r(if (optimize) {
07be1b83 2030 regnode *opt = convert;
e62cc96a 2031 while ( ++opt < optimize) {
07be1b83
YO
2032 Set_Node_Offset_Length(opt,0,0);
2033 }
786e8c11
YO
2034 /*
2035 Try to clean up some of the debris left after the
2036 optimisation.
a3621e74 2037 */
786e8c11 2038 while( optimize < jumper ) {
07be1b83 2039 mjd_nodelen += Node_Length((optimize));
a3621e74 2040 OP( optimize ) = OPTIMIZED;
07be1b83 2041 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2042 optimize++;
2043 }
07be1b83 2044 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2045 });
2046 } /* end node insert */
55eed653 2047 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2048#ifdef DEBUGGING
2049 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2050 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2051#else
2052 SvREFCNT_dec(revcharmap);
07be1b83 2053#endif
786e8c11
YO
2054 return trie->jump
2055 ? MADE_JUMP_TRIE
2056 : trie->startstate>1
2057 ? MADE_EXACT_TRIE
2058 : MADE_TRIE;
2059}
2060
2061STATIC void
2062S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2063{
2064/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2065
2066 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2067 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2068 ISBN 0-201-10088-6
2069
2070 We find the fail state for each state in the trie, this state is the longest proper
2071 suffix of the current states 'word' that is also a proper prefix of another word in our
2072 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2073 the DFA not to have to restart after its tried and failed a word at a given point, it
2074 simply continues as though it had been matching the other word in the first place.
2075 Consider
2076 'abcdgu'=~/abcdefg|cdgu/
2077 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2078 fail, which would bring use to the state representing 'd' in the second word where we would
2079 try 'g' and succeed, prodceding to match 'cdgu'.
2080 */
2081 /* add a fail transition */
3251b653
NC
2082 const U32 trie_offset = ARG(source);
2083 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2084 U32 *q;
2085 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2086 const U32 numstates = trie->statecount;
786e8c11
YO
2087 const U32 ubound = trie->lasttrans + ucharcount;
2088 U32 q_read = 0;
2089 U32 q_write = 0;
2090 U32 charid;
2091 U32 base = trie->states[ 1 ].trans.base;
2092 U32 *fail;
2093 reg_ac_data *aho;
2094 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2095 GET_RE_DEBUG_FLAGS_DECL;
2096#ifndef DEBUGGING
2097 PERL_UNUSED_ARG(depth);
2098#endif
2099
2100
2101 ARG_SET( stclass, data_slot );
c944940b 2102 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2103 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2104 aho->trie=trie_offset;
446bd890
NC
2105 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2106 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2107 Newxz( q, numstates, U32);
c944940b 2108 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2109 aho->refcount = 1;
2110 fail = aho->fail;
2111 /* initialize fail[0..1] to be 1 so that we always have
2112 a valid final fail state */
2113 fail[ 0 ] = fail[ 1 ] = 1;
2114
2115 for ( charid = 0; charid < ucharcount ; charid++ ) {
2116 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2117 if ( newstate ) {
2118 q[ q_write ] = newstate;
2119 /* set to point at the root */
2120 fail[ q[ q_write++ ] ]=1;
2121 }
2122 }
2123 while ( q_read < q_write) {
2124 const U32 cur = q[ q_read++ % numstates ];
2125 base = trie->states[ cur ].trans.base;
2126
2127 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2128 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2129 if (ch_state) {
2130 U32 fail_state = cur;
2131 U32 fail_base;
2132 do {
2133 fail_state = fail[ fail_state ];
2134 fail_base = aho->states[ fail_state ].trans.base;
2135 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2136
2137 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2138 fail[ ch_state ] = fail_state;
2139 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2140 {
2141 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2142 }
2143 q[ q_write++ % numstates] = ch_state;
2144 }
2145 }
2146 }
2147 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2148 when we fail in state 1, this allows us to use the
2149 charclass scan to find a valid start char. This is based on the principle
2150 that theres a good chance the string being searched contains lots of stuff
2151 that cant be a start char.
2152 */
2153 fail[ 0 ] = fail[ 1 ] = 0;
2154 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2155 PerlIO_printf(Perl_debug_log,
2156 "%*sStclass Failtable (%"UVuf" states): 0",
2157 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2158 );
786e8c11
YO
2159 for( q_read=1; q_read<numstates; q_read++ ) {
2160 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2161 }
2162 PerlIO_printf(Perl_debug_log, "\n");
2163 });
2164 Safefree(q);
2165 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2166}
2167
786e8c11 2168
a3621e74 2169/*
5d1c421c
JH
2170 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2171 * These need to be revisited when a newer toolchain becomes available.
2172 */
2173#if defined(__sparc64__) && defined(__GNUC__)
2174# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2175# undef SPARC64_GCC_WORKAROUND
2176# define SPARC64_GCC_WORKAROUND 1
2177# endif
2178#endif
2179
07be1b83 2180#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2181 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2182 SV * const mysv=sv_newmortal(); \
2183 regnode *Next = regnext(scan); \
2184 regprop(RExC_rx, mysv, scan); \
7f69552c 2185 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2186 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2187 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2188 }});
07be1b83 2189
1de06328
YO
2190
2191
2192
2193
07be1b83
YO
2194#define JOIN_EXACT(scan,min,flags) \
2195 if (PL_regkind[OP(scan)] == EXACT) \
2196 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2197
be8e71aa 2198STATIC U32
07be1b83
YO
2199S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2200 /* Merge several consecutive EXACTish nodes into one. */
2201 regnode *n = regnext(scan);
2202 U32 stringok = 1;
2203 regnode *next = scan + NODE_SZ_STR(scan);
2204 U32 merged = 0;
2205 U32 stopnow = 0;
2206#ifdef DEBUGGING
2207 regnode *stop = scan;
72f13be8 2208 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2209#else
d47053eb
RGS
2210 PERL_UNUSED_ARG(depth);
2211#endif
2212#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2213 PERL_UNUSED_ARG(flags);
2214 PERL_UNUSED_ARG(val);
07be1b83 2215#endif
07be1b83
YO
2216 DEBUG_PEEP("join",scan,depth);
2217
2218 /* Skip NOTHING, merge EXACT*. */
2219 while (n &&
2220 ( PL_regkind[OP(n)] == NOTHING ||
2221 (stringok && (OP(n) == OP(scan))))
2222 && NEXT_OFF(n)
2223 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2224
2225 if (OP(n) == TAIL || n > next)
2226 stringok = 0;
2227 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2228 DEBUG_PEEP("skip:",n,depth);
2229 NEXT_OFF(scan) += NEXT_OFF(n);
2230 next = n + NODE_STEP_REGNODE;
2231#ifdef DEBUGGING
2232 if (stringok)
2233 stop = n;
2234#endif
2235 n = regnext(n);
2236 }
2237 else if (stringok) {
786e8c11 2238 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2239 regnode * const nnext = regnext(n);
2240
2241 DEBUG_PEEP("merg",n,depth);
2242
2243 merged++;
2244 if (oldl + STR_LEN(n) > U8_MAX)
2245 break;
2246 NEXT_OFF(scan) += NEXT_OFF(n);
2247 STR_LEN(scan) += STR_LEN(n);
2248 next = n + NODE_SZ_STR(n);
2249 /* Now we can overwrite *n : */
2250 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2251#ifdef DEBUGGING
2252 stop = next - 1;
2253#endif
2254 n = nnext;
2255 if (stopnow) break;
2256 }
2257
d47053eb
RGS
2258#ifdef EXPERIMENTAL_INPLACESCAN
2259 if (flags && !NEXT_OFF(n)) {
2260 DEBUG_PEEP("atch", val, depth);
2261 if (reg_off_by_arg[OP(n)]) {
2262 ARG_SET(n, val - n);
2263 }
2264 else {
2265 NEXT_OFF(n) = val - n;
2266 }
2267 stopnow = 1;
2268 }
07be1b83
YO
2269#endif
2270 }
2271
2272 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2273 /*
2274 Two problematic code points in Unicode casefolding of EXACT nodes:
2275
2276 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2277 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2278
2279 which casefold to
2280
2281 Unicode UTF-8
2282
2283 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2284 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2285
2286 This means that in case-insensitive matching (or "loose matching",
2287 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2288 length of the above casefolded versions) can match a target string
2289 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2290 This would rather mess up the minimum length computation.
2291
2292 What we'll do is to look for the tail four bytes, and then peek
2293 at the preceding two bytes to see whether we need to decrease
2294 the minimum length by four (six minus two).
2295
2296 Thanks to the design of UTF-8, there cannot be false matches:
2297 A sequence of valid UTF-8 bytes cannot be a subsequence of
2298 another valid sequence of UTF-8 bytes.
2299
2300 */
2301 char * const s0 = STRING(scan), *s, *t;
2302 char * const s1 = s0 + STR_LEN(scan) - 1;
2303 char * const s2 = s1 - 4;
e294cc5d
JH
2304#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2305 const char t0[] = "\xaf\x49\xaf\x42";
2306#else
07be1b83 2307 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2308#endif
07be1b83
YO
2309 const char * const t1 = t0 + 3;
2310
2311 for (s = s0 + 2;
2312 s < s2 && (t = ninstr(s, s1, t0, t1));
2313 s = t + 4) {
e294cc5d
JH
2314#ifdef EBCDIC
2315 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2316 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2317#else
07be1b83
YO
2318 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2319 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2320#endif
07be1b83
YO
2321 *min -= 4;
2322 }
2323 }
2324
2325#ifdef DEBUGGING
2326 /* Allow dumping */
2327 n = scan + NODE_SZ_STR(scan);
2328 while (n <= stop) {
2329 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2330 OP(n) = OPTIMIZED;
2331 NEXT_OFF(n) = 0;
2332 }
2333 n++;
2334 }
2335#endif
2336 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2337 return stopnow;
2338}
2339
653099ff
GS
2340/* REx optimizer. Converts nodes into quickier variants "in place".
2341 Finds fixed substrings. */
2342
a0288114 2343/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2344 to the position after last scanned or to NULL. */
2345
40d049e4
YO
2346#define INIT_AND_WITHP \
2347 assert(!and_withp); \
2348 Newx(and_withp,1,struct regnode_charclass_class); \
2349 SAVEFREEPV(and_withp)
07be1b83 2350
b515a41d
YO
2351/* this is a chain of data about sub patterns we are processing that
2352 need to be handled seperately/specially in study_chunk. Its so
2353 we can simulate recursion without losing state. */
2354struct scan_frame;
2355typedef struct scan_frame {
2356 regnode *last; /* last node to process in this frame */
2357 regnode *next; /* next node to process when last is reached */
2358 struct scan_frame *prev; /*previous frame*/
2359 I32 stop; /* what stopparen do we use */
2360} scan_frame;
2361
304ee84b
YO
2362
2363#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2364
76e3520e 2365STATIC I32
40d049e4 2366S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2367 I32 *minlenp, I32 *deltap,
40d049e4
YO
2368 regnode *last,
2369 scan_data_t *data,
2370 I32 stopparen,
2371 U8* recursed,
2372 struct regnode_charclass_class *and_withp,
2373 U32 flags, U32 depth)
c277df42
IZ
2374 /* scanp: Start here (read-write). */
2375 /* deltap: Write maxlen-minlen here. */
2376 /* last: Stop before this one. */
40d049e4
YO
2377 /* data: string data about the pattern */
2378 /* stopparen: treat close N as END */
2379 /* recursed: which subroutines have we recursed into */
2380 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2381{
97aff369 2382 dVAR;
c277df42
IZ
2383 I32 min = 0, pars = 0, code;
2384 regnode *scan = *scanp, *next;
2385 I32 delta = 0;
2386 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2387 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2388 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2389 scan_data_t data_fake;
a3621e74 2390 SV *re_trie_maxbuff = NULL;
786e8c11 2391 regnode *first_non_open = scan;
e2e6a0f1 2392 I32 stopmin = I32_MAX;
8aa23a47
YO
2393 scan_frame *frame = NULL;
2394
a3621e74 2395 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2396
13a24bad 2397#ifdef DEBUGGING
40d049e4 2398 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2399#endif
40d049e4 2400
786e8c11 2401 if ( depth == 0 ) {
40d049e4 2402 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2403 first_non_open=regnext(first_non_open);
2404 }
2405
b81d288d 2406
8aa23a47
YO
2407 fake_study_recurse:
2408 while ( scan && OP(scan) != END && scan < last ){
2409 /* Peephole optimizer: */
304ee84b 2410 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2411 DEBUG_PEEP("Peep",scan,depth);
2412 JOIN_EXACT(scan,&min,0);
2413
2414 /* Follow the next-chain of the current node and optimize
2415 away all the NOTHINGs from it. */
2416 if (OP(scan) != CURLYX) {
2417 const int max = (reg_off_by_arg[OP(scan)]
2418 ? I32_MAX
2419 /* I32 may be smaller than U16 on CRAYs! */
2420 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2421 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2422 int noff;
2423 regnode *n = scan;
2424
2425 /* Skip NOTHING and LONGJMP. */
2426 while ((n = regnext(n))
2427 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2428 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2429 && off + noff < max)
2430 off += noff;
2431 if (reg_off_by_arg[OP(scan)])
2432 ARG(scan) = off;
2433 else
2434 NEXT_OFF(scan) = off;
2435 }
a3621e74 2436
c277df42 2437
8aa23a47
YO
2438
2439 /* The principal pseudo-switch. Cannot be a switch, since we
2440 look into several different things. */
2441 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2442 || OP(scan) == IFTHEN) {
2443 next = regnext(scan);
2444 code = OP(scan);
2445 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2446
2447 if (OP(next) == code || code == IFTHEN) {
2448 /* NOTE - There is similar code to this block below for handling
2449 TRIE nodes on a re-study. If you change stuff here check there
2450 too. */
2451 I32 max1 = 0, min1 = I32_MAX, num = 0;
2452 struct regnode_charclass_class accum;
2453 regnode * const startbranch=scan;
2454
2455 if (flags & SCF_DO_SUBSTR)
304ee84b 2456 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2457 if (flags & SCF_DO_STCLASS)
2458 cl_init_zero(pRExC_state, &accum);
2459
2460 while (OP(scan) == code) {
2461 I32 deltanext, minnext, f = 0, fake;
2462 struct regnode_charclass_class this_class;
2463
2464 num++;
2465 data_fake.flags = 0;
2466 if (data) {
2467 data_fake.whilem_c = data->whilem_c;
2468 data_fake.last_closep = data->last_closep;
2469 }
2470 else
2471 data_fake.last_closep = &fake;
58e23c8d
YO
2472
2473 data_fake.pos_delta = delta;
8aa23a47
YO
2474 next = regnext(scan);
2475 scan = NEXTOPER(scan);
2476 if (code != BRANCH)
c277df42 2477 scan = NEXTOPER(scan);
8aa23a47
YO
2478 if (flags & SCF_DO_STCLASS) {
2479 cl_init(pRExC_state, &this_class);
2480 data_fake.start_class = &this_class;
2481 f = SCF_DO_STCLASS_AND;
58e23c8d 2482 }
8aa23a47
YO
2483 if (flags & SCF_WHILEM_VISITED_POS)
2484 f |= SCF_WHILEM_VISITED_POS;
2485
2486 /* we suppose the run is continuous, last=next...*/
2487 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2488 next, &data_fake,
2489 stopparen, recursed, NULL, f,depth+1);
2490 if (min1 > minnext)
2491 min1 = minnext;
2492 if (max1 < minnext + deltanext)
2493 max1 = minnext + deltanext;
2494 if (deltanext == I32_MAX)
2495 is_inf = is_inf_internal = 1;
2496 scan = next;
2497 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2498 pars++;
2499 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2500 if ( stopmin > minnext)
2501 stopmin = min + min1;
2502 flags &= ~SCF_DO_SUBSTR;
2503 if (data)
2504 data->flags |= SCF_SEEN_ACCEPT;
2505 }
2506 if (data) {
2507 if (data_fake.flags & SF_HAS_EVAL)
2508 data->flags |= SF_HAS_EVAL;
2509 data->whilem_c = data_fake.whilem_c;
3dab1dad 2510 }
8aa23a47
YO
2511 if (flags & SCF_DO_STCLASS)
2512 cl_or(pRExC_state, &accum, &this_class);
2513 }
2514 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2515 min1 = 0;
2516 if (flags & SCF_DO_SUBSTR) {
2517 data->pos_min += min1;
2518 data->pos_delta += max1 - min1;
2519 if (max1 != min1 || is_inf)
2520 data->longest = &(data->longest_float);
2521 }
2522 min += min1;
2523 delta += max1 - min1;
2524 if (flags & SCF_DO_STCLASS_OR) {
2525 cl_or(pRExC_state, data->start_class, &accum);
2526 if (min1) {
2527 cl_and(data->start_class, and_withp);
2528 flags &= ~SCF_DO_STCLASS;
653099ff 2529 }
8aa23a47
YO
2530 }
2531 else if (flags & SCF_DO_STCLASS_AND) {
2532 if (min1) {
2533 cl_and(data->start_class, &accum);
2534 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2535 }
8aa23a47
YO
2536 else {
2537 /* Switch to OR mode: cache the old value of
2538 * data->start_class */
2539 INIT_AND_WITHP;
2540 StructCopy(data->start_class, and_withp,
2541 struct regnode_charclass_class);
2542 flags &= ~SCF_DO_STCLASS_AND;
2543 StructCopy(&accum, data->start_class,
2544 struct regnode_charclass_class);
2545 flags |= SCF_DO_STCLASS_OR;
2546 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2547 }
8aa23a47 2548 }
a3621e74 2549
8aa23a47
YO
2550 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2551 /* demq.
a3621e74 2552
8aa23a47
YO
2553 Assuming this was/is a branch we are dealing with: 'scan' now
2554 points at the item that follows the branch sequence, whatever
2555 it is. We now start at the beginning of the sequence and look
2556 for subsequences of
a3621e74 2557
8aa23a47
YO
2558 BRANCH->EXACT=>x1
2559 BRANCH->EXACT=>x2
2560 tail
a3621e74 2561
8aa23a47 2562 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2563
8aa23a47
YO
2564 If we can find such a subseqence we need to turn the first
2565 element into a trie and then add the subsequent branch exact
2566 strings to the trie.
a3621e74 2567
8aa23a47 2568 We have two cases
a3621e74 2569
8aa23a47 2570 1. patterns where the whole set of branch can be converted.
a3621e74 2571
8aa23a47 2572 2. patterns where only a subset can be converted.
a3621e74 2573
8aa23a47
YO
2574 In case 1 we can replace the whole set with a single regop
2575 for the trie. In case 2 we need to keep the start and end
2576 branchs so
a3621e74 2577
8aa23a47
YO
2578 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2579 becomes BRANCH TRIE; BRANCH X;
786e8c11 2580
8aa23a47
YO
2581 There is an additional case, that being where there is a
2582 common prefix, which gets split out into an EXACT like node
2583 preceding the TRIE node.
a3621e74 2584
8aa23a47
YO
2585 If x(1..n)==tail then we can do a simple trie, if not we make
2586 a "jump" trie, such that when we match the appropriate word
2587 we "jump" to the appopriate tail node. Essentailly we turn
2588 a nested if into a case structure of sorts.
b515a41d 2589
8aa23a47
YO
2590 */
2591
2592 int made=0;
2593 if (!re_trie_maxbuff) {
2594 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2595 if (!SvIOK(re_trie_maxbuff))
2596 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2597 }
2598 if ( SvIV(re_trie_maxbuff)>=0 ) {
2599 regnode *cur;
2600 regnode *first = (regnode *)NULL;
2601 regnode *last = (regnode *)NULL;
2602 regnode *tail = scan;
2603 U8 optype = 0;
2604 U32 count=0;
a3621e74
YO
2605
2606#ifdef DEBUGGING
8aa23a47 2607 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2608#endif
8aa23a47
YO
2609 /* var tail is used because there may be a TAIL
2610 regop in the way. Ie, the exacts will point to the
2611 thing following the TAIL, but the last branch will
2612 point at the TAIL. So we advance tail. If we
2613 have nested (?:) we may have to move through several
2614 tails.
2615 */
2616
2617 while ( OP( tail ) == TAIL ) {
2618 /* this is the TAIL generated by (?:) */
2619 tail = regnext( tail );
2620 }
a3621e74 2621
8aa23a47
YO
2622
2623 DEBUG_OPTIMISE_r({
2624 regprop(RExC_rx, mysv, tail );
2625 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2626 (int)depth * 2 + 2, "",
2627 "Looking for TRIE'able sequences. Tail node is: ",
2628 SvPV_nolen_const( mysv )
2629 );
2630 });
2631
2632 /*
2633
2634 step through the branches, cur represents each
2635 branch, noper is the first thing to be matched
2636 as part of that branch and noper_next is the
2637 regnext() of that node. if noper is an EXACT
2638 and noper_next is the same as scan (our current
2639 position in the regex) then the EXACT branch is
2640 a possible optimization target. Once we have
2641 two or more consequetive such branches we can
2642 create a trie of the EXACT's contents and stich
2643 it in place. If the sequence represents all of
2644 the branches we eliminate the whole thing and
2645 replace it with a single TRIE. If it is a
2646 subsequence then we need to stitch it in. This
2647 means the first branch has to remain, and needs
2648 to be repointed at the item on the branch chain
2649 following the last branch optimized. This could
2650 be either a BRANCH, in which case the
2651 subsequence is internal, or it could be the
2652 item following the branch sequence in which
2653 case the subsequence is at the end.
2654
2655 */
2656
2657 /* dont use tail as the end marker for this traverse */
2658 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2659 regnode * const noper = NEXTOPER( cur );
b515a41d 2660#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2661 regnode * const noper_next = regnext( noper );
b515a41d
YO
2662#endif
2663
8aa23a47
YO
2664 DEBUG_OPTIMISE_r({
2665 regprop(RExC_rx, mysv, cur);
2666 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2667 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2668
2669 regprop(RExC_rx, mysv, noper);
2670 PerlIO_printf( Perl_debug_log, " -> %s",
2671 SvPV_nolen_const(mysv));
2672
2673 if ( noper_next ) {
2674 regprop(RExC_rx, mysv, noper_next );
2675 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2676 SvPV_nolen_const(mysv));
2677 }
2678 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2679 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2680 });
2681 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2682 : PL_regkind[ OP( noper ) ] == EXACT )
2683 || OP(noper) == NOTHING )
786e8c11 2684#ifdef NOJUMPTRIE
8aa23a47 2685 && noper_next == tail
786e8c11 2686#endif
8aa23a47
YO
2687 && count < U16_MAX)
2688 {
2689 count++;
2690 if ( !first || optype == NOTHING ) {
2691 if (!first) first = cur;
2692 optype = OP( noper );
2693 } else {
2694 last = cur;
2695 }
2696 } else {
2697 if ( last ) {
2698 make_trie( pRExC_state,
2699 startbranch, first, cur, tail, count,
2700 optype, depth+1 );
2701 }
2702 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2703#ifdef NOJUMPTRIE
8aa23a47 2704 && noper_next == tail
786e8c11 2705#endif
8aa23a47
YO
2706 ){
2707 count = 1;
2708 first = cur;
2709 optype = OP( noper );
2710 } else {
2711 count = 0;
2712 first = NULL;
2713 optype = 0;
2714 }
2715 last = NULL;
2716 }
2717 }
2718 DEBUG_OPTIMISE_r({
2719 regprop(RExC_rx, mysv, cur);
2720 PerlIO_printf( Perl_debug_log,
2721 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2722 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2723
2724 });
2725 if ( last ) {
2726 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2727#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2728 if ( ((made == MADE_EXACT_TRIE &&
2729 startbranch == first)
2730 || ( first_non_open == first )) &&
2731 depth==0 ) {
2732 flags |= SCF_TRIE_RESTUDY;
2733 if ( startbranch == first
2734 && scan == tail )
2735 {
2736 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2737 }
2738 }
3dab1dad 2739#endif
8aa23a47
YO
2740 }
2741 }
2742
2743 } /* do trie */
2744
653099ff 2745 }
8aa23a47
YO
2746 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2747 scan = NEXTOPER(NEXTOPER(scan));
2748 } else /* single branch is optimized. */
2749 scan = NEXTOPER(scan);
2750 continue;
2751 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2752 scan_frame *newframe = NULL;
2753 I32 paren;
2754 regnode *start;
2755 regnode *end;
2756
2757 if (OP(scan) != SUSPEND) {
2758 /* set the pointer */
2759 if (OP(scan) == GOSUB) {
2760 paren = ARG(scan);
2761 RExC_recurse[ARG2L(scan)] = scan;
2762 start = RExC_open_parens[paren-1];
2763 end = RExC_close_parens[paren-1];
2764 } else {
2765 paren = 0;
f8fc2ecf 2766 start = RExC_rxi->program + 1;
8aa23a47
YO
2767 end = RExC_opend;
2768 }
2769 if (!recursed) {
2770 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2771 SAVEFREEPV(recursed);
2772 }
2773 if (!PAREN_TEST(recursed,paren+1)) {
2774 PAREN_SET(recursed,paren+1);
2775 Newx(newframe,1,scan_frame);
2776 } else {
2777 if (flags & SCF_DO_SUBSTR) {
304ee84b 2778 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2779 data->longest = &(data->longest_float);
2780 }
2781 is_inf = is_inf_internal = 1;
2782 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2783 cl_anything(pRExC_state, data->start_class);
2784 flags &= ~SCF_DO_STCLASS;
2785 }
2786 } else {
2787 Newx(newframe,1,scan_frame);
2788 paren = stopparen;
2789 start = scan+2;
2790 end = regnext(scan);
2791 }
2792 if (newframe) {
2793 assert(start);
2794 assert(end);
2795 SAVEFREEPV(newframe);
2796 newframe->next = regnext(scan);
2797 newframe->last = last;
2798 newframe->stop = stopparen;
2799 newframe->prev = frame;
2800
2801 frame = newframe;
2802 scan = start;
2803 stopparen = paren;
2804 last = end;
2805
2806 continue;
2807 }
2808 }
2809 else if (OP(scan) == EXACT) {
2810 I32 l = STR_LEN(scan);
2811 UV uc;
2812 if (UTF) {
2813 const U8 * const s = (U8*)STRING(scan);
2814 l = utf8_length(s, s + l);
2815 uc = utf8_to_uvchr(s, NULL);
2816 } else {
2817 uc = *((U8*)STRING(scan));
2818 }
2819 min += l;
2820 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2821 /* The code below prefers earlier match for fixed
2822 offset, later match for variable offset. */
2823 if (data->last_end == -1) { /* Update the start info. */
2824 data->last_start_min = data->pos_min;
2825 data->last_start_max = is_inf
2826 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2827 }
8aa23a47
YO
2828 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2829 if (UTF)
2830 SvUTF8_on(data->last_found);
2831 {
2832 SV * const sv = data->last_found;
2833 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2834 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2835 if (mg && mg->mg_len >= 0)
2836 mg->mg_len += utf8_length((U8*)STRING(scan),
2837 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2838 }
8aa23a47
YO
2839 data->last_end = data->pos_min + l;
2840 data->pos_min += l; /* As in the first entry. */
2841 data->flags &= ~SF_BEFORE_EOL;
2842 }
2843 if (flags & SCF_DO_STCLASS_AND) {
2844 /* Check whether it is compatible with what we know already! */
2845 int compat = 1;
2846
2847 if (uc >= 0x100 ||
2848 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2849 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2850 && (!(data->start_class->flags & ANYOF_FOLD)
2851 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2852 )
2853 compat = 0;
2854 ANYOF_CLASS_ZERO(data->start_class);
2855 ANYOF_BITMAP_ZERO(data->start_class);
2856 if (compat)
2857 ANYOF_BITMAP_SET(data->start_class, uc);
2858 data->start_class->flags &= ~ANYOF_EOS;
2859 if (uc < 0x100)
2860 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2861 }
2862 else if (flags & SCF_DO_STCLASS_OR) {
2863 /* false positive possible if the class is case-folded */
2864 if (uc < 0x100)
2865 ANYOF_BITMAP_SET(data->start_class, uc);
2866 else
2867 data->start_class->flags |= ANYOF_UNICODE_ALL;
2868 data->start_class->flags &= ~ANYOF_EOS;
2869 cl_and(data->start_class, and_withp);
2870 }
2871 flags &= ~SCF_DO_STCLASS;
2872 }
2873 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2874 I32 l = STR_LEN(scan);
2875 UV uc = *((U8*)STRING(scan));
2876
2877 /* Search for fixed substrings supports EXACT only. */
2878 if (flags & SCF_DO_SUBSTR) {
2879 assert(data);
304ee84b 2880 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2881 }
2882 if (UTF) {
2883 const U8 * const s = (U8 *)STRING(scan);
2884 l = utf8_length(s, s + l);
2885 uc = utf8_to_uvchr(s, NULL);
2886 }
2887 min += l;
2888 if (flags & SCF_DO_SUBSTR)
2889 data->pos_min += l;
2890 if (flags & SCF_DO_STCLASS_AND) {
2891 /* Check whether it is compatible with what we know already! */
2892 int compat = 1;
2893
2894 if (uc >= 0x100 ||
2895 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2896 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2897 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2898 compat = 0;
2899 ANYOF_CLASS_ZERO(data->start_class);
2900 ANYOF_BITMAP_ZERO(data->start_class);
2901 if (compat) {
2902 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2903 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2904 data->start_class->flags |= ANYOF_FOLD;
2905 if (OP(scan) == EXACTFL)
2906 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2907 }
8aa23a47
YO
2908 }
2909 else if (flags & SCF_DO_STCLASS_OR) {
2910 if (data->start_class->flags & ANYOF_FOLD) {
2911 /* false positive possible if the class is case-folded.
2912 Assume that the locale settings are the same... */
1aa99e6b
IH
2913 if (uc < 0x100)
2914 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2915 data->start_class->flags &= ~ANYOF_EOS;
2916 }
8aa23a47 2917 cl_and(data->start_class, and_withp);
653099ff 2918 }
8aa23a47
YO
2919 flags &= ~SCF_DO_STCLASS;
2920 }
2921 else if (strchr((const char*)PL_varies,OP(scan))) {
2922 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2923 I32 f = flags, pos_before = 0;
2924 regnode * const oscan = scan;
2925 struct regnode_charclass_class this_class;
2926 struct regnode_charclass_class *oclass = NULL;
2927 I32 next_is_eval = 0;
2928
2929 switch (PL_regkind[OP(scan)]) {
2930 case WHILEM: /* End of (?:...)* . */
2931 scan = NEXTOPER(scan);
2932 goto finish;
2933 case PLUS:
2934 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2935 next = NEXTOPER(scan);
2936 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2937 mincount = 1;
2938 maxcount = REG_INFTY;
2939 next = regnext(scan);
2940 scan = NEXTOPER(scan);
2941 goto do_curly;
2942 }
2943 }
2944 if (flags & SCF_DO_SUBSTR)
2945 data->pos_min++;
2946 min++;
2947 /* Fall through. */
2948 case STAR:
2949 if (flags & SCF_DO_STCLASS) {
2950 mincount = 0;
2951 maxcount = REG_INFTY;
2952 next = regnext(scan);
2953 scan = NEXTOPER(scan);
2954 goto do_curly;
2955 }
2956 is_inf = is_inf_internal = 1;
2957 scan = regnext(scan);
c277df42 2958 if (flags & SCF_DO_SUBSTR) {
304ee84b 2959 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2960 data->longest = &(data->longest_float);
c277df42 2961 }
8aa23a47
YO
2962 goto optimize_curly_tail;
2963 case CURLY:
2964 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2965 && (scan->flags == stopparen))
2966 {
2967 mincount = 1;
2968 maxcount = 1;
2969 } else {
2970 mincount = ARG1(scan);
2971 maxcount = ARG2(scan);
653099ff 2972 }
8aa23a47
YO
2973 next = regnext(scan);
2974 if (OP(scan) == CURLYX) {
2975 I32 lp = (data ? *(data->last_closep) : 0);
2976 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2977 }
8aa23a47
YO
2978 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2979 next_is_eval = (OP(scan) == EVAL);
2980 do_curly:
2981 if (flags & SCF_DO_SUBSTR) {
304ee84b 2982 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 2983 pos_before = data->pos_min;
b45f050a 2984 }
8aa23a47
YO
2985 if (data) {
2986 fl = data->flags;
2987 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2988 if (is_inf)
2989 data->flags |= SF_IS_INF;
2990 }
2991 if (flags & SCF_DO_STCLASS) {
2992 cl_init(pRExC_state, &this_class);
2993 oclass = data->start_class;
2994 data->start_class = &this_class;
2995 f |= SCF_DO_STCLASS_AND;
2996 f &= ~SCF_DO_STCLASS_OR;
2997 }
2998 /* These are the cases when once a subexpression
2999 fails at a particular position, it cannot succeed
3000 even after backtracking at the enclosing scope.
3001
3002 XXXX what if minimal match and we are at the
3003 initial run of {n,m}? */
3004 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3005 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3006
8aa23a47
YO
3007 /* This will finish on WHILEM, setting scan, or on NULL: */
3008 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3009 last, data, stopparen, recursed, NULL,
3010 (mincount == 0
3011 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3012
8aa23a47
YO
3013 if (flags & SCF_DO_STCLASS)
3014 data->start_class = oclass;
3015 if (mincount == 0 || minnext == 0) {
3016 if (flags & SCF_DO_STCLASS_OR) {
3017 cl_or(pRExC_state, data->start_class, &this_class);
3018 }
3019 else if (flags & SCF_DO_STCLASS_AND) {
3020 /* Switch to OR mode: cache the old value of
3021 * data->start_class */
3022 INIT_AND_WITHP;
3023 StructCopy(data->start_class, and_withp,
3024 struct regnode_charclass_class);
3025 flags &= ~SCF_DO_STCLASS_AND;
3026 StructCopy(&this_class, data->start_class,
3027 struct regnode_charclass_class);
3028 flags |= SCF_DO_STCLASS_OR;
3029 data->start_class->flags |= ANYOF_EOS;
3030 }
3031 } else { /* Non-zero len */
3032 if (flags & SCF_DO_STCLASS_OR) {
3033 cl_or(pRExC_state, data->start_class, &this_class);
3034 cl_and(data->start_class, and_withp);
3035 }
3036 else if (flags & SCF_DO_STCLASS_AND)
3037 cl_and(data->start_class, &this_class);
3038 flags &= ~SCF_DO_STCLASS;
3039 }
3040 if (!scan) /* It was not CURLYX, but CURLY. */
3041 scan = next;
3042 if ( /* ? quantifier ok, except for (?{ ... }) */
3043 (next_is_eval || !(mincount == 0 && maxcount == 1))
3044 && (minnext == 0) && (deltanext == 0)
3045 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3046 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3047 && ckWARN(WARN_REGEXP))
3048 {
3049 vWARN(RExC_parse,
3050 "Quantifier unexpected on zero-length expression");
3051 }
3052
3053 min += minnext * mincount;
3054 is_inf_internal |= ((maxcount == REG_INFTY
3055 && (minnext + deltanext) > 0)
3056 || deltanext == I32_MAX);
3057 is_inf |= is_inf_internal;
3058 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3059
3060 /* Try powerful optimization CURLYX => CURLYN. */
3061 if ( OP(oscan) == CURLYX && data
3062 && data->flags & SF_IN_PAR
3063 && !(data->flags & SF_HAS_EVAL)
3064 && !deltanext && minnext == 1 ) {
3065 /* Try to optimize to CURLYN. */
3066 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3067 regnode * const nxt1 = nxt;
497b47a8 3068#ifdef DEBUGGING
8aa23a47 3069 regnode *nxt2;
497b47a8 3070#endif
c277df42 3071
8aa23a47
YO
3072 /* Skip open. */
3073 nxt = regnext(nxt);
3074 if (!strchr((const char*)PL_simple,OP(nxt))
3075 && !(PL_regkind[OP(nxt)] == EXACT
3076 && STR_LEN(nxt) == 1))
3077 goto nogo;
497b47a8 3078#ifdef DEBUGGING
8aa23a47 3079 nxt2 = nxt;
497b47a8 3080#endif
8aa23a47
YO
3081 nxt = regnext(nxt);
3082 if (OP(nxt) != CLOSE)
3083 goto nogo;
3084 if (RExC_open_parens) {
3085 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3086 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3087 }
3088 /* Now we know that nxt2 is the only contents: */
3089 oscan->flags = (U8)ARG(nxt);
3090 OP(oscan) = CURLYN;
3091 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3092
c277df42 3093#ifdef DEBUGGING
8aa23a47
YO
3094 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3095 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3096 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3097 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3098 OP(nxt + 1) = OPTIMIZED; /* was count. */
3099 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3100#endif
8aa23a47
YO
3101 }
3102 nogo:
3103
3104 /* Try optimization CURLYX => CURLYM. */
3105 if ( OP(oscan) == CURLYX && data
3106 && !(data->flags & SF_HAS_PAR)
3107 && !(data->flags & SF_HAS_EVAL)
3108 && !deltanext /* atom is fixed width */
3109 && minnext != 0 /* CURLYM can't handle zero width */
3110 ) {
3111 /* XXXX How to optimize if data == 0? */
3112 /* Optimize to a simpler form. */
3113 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3114 regnode *nxt2;
3115
3116 OP(oscan) = CURLYM;
3117 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3118 && (OP(nxt2) != WHILEM))
3119 nxt = nxt2;
3120 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3121 /* Need to optimize away parenths. */
3122 if (data->flags & SF_IN_PAR) {
3123 /* Set the parenth number. */
3124 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3125
3126 if (OP(nxt) != CLOSE)
3127 FAIL("Panic opt close");
3128 oscan->flags = (U8)ARG(nxt);
3129 if (RExC_open_parens) {
3130 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3131 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3132 }
8aa23a47
YO
3133 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3135
c277df42 3136#ifdef DEBUGGING
8aa23a47
YO
3137 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3138 OP(nxt + 1) = OPTIMIZED; /* was count. */
3139 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3140 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3141#endif
c277df42 3142#if 0
8aa23a47
YO
3143 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3144 regnode *nnxt = regnext(nxt1);
3145
3146 if (nnxt == nxt) {
3147 if (reg_off_by_arg[OP(nxt1)])
3148 ARG_SET(nxt1, nxt2 - nxt1);
3149 else if (nxt2 - nxt1 < U16_MAX)
3150 NEXT_OFF(nxt1) = nxt2 - nxt1;
3151 else
3152 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3153 }
8aa23a47 3154 nxt1 = nnxt;
c277df42 3155 }
5d1c421c 3156#endif
8aa23a47
YO
3157 /* Optimize again: */
3158 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3159 NULL, stopparen, recursed, NULL, 0,depth+1);
3160 }
3161 else
3162 oscan->flags = 0;
3163 }
3164 else if ((OP(oscan) == CURLYX)
3165 && (flags & SCF_WHILEM_VISITED_POS)
3166 /* See the comment on a similar expression above.
3167 However, this time it not a subexpression
3168 we care about, but the expression itself. */
3169 && (maxcount == REG_INFTY)
3170 && data && ++data->whilem_c < 16) {
3171 /* This stays as CURLYX, we can put the count/of pair. */
3172 /* Find WHILEM (as in regexec.c) */
3173 regnode *nxt = oscan + NEXT_OFF(oscan);
3174
3175 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3176 nxt += ARG(nxt);
3177 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3178 | (RExC_whilem_seen << 4)); /* On WHILEM */
3179 }
3180 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3181 pars++;
3182 if (flags & SCF_DO_SUBSTR) {
3183 SV *last_str = NULL;
3184 int counted = mincount != 0;
a0ed51b3 3185
8aa23a47
YO
3186 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3187#if defined(SPARC64_GCC_WORKAROUND)
3188 I32 b = 0;
3189 STRLEN l = 0;
3190 const char *s = NULL;
3191 I32 old = 0;
b515a41d 3192
8aa23a47
YO
3193 if (pos_before >= data->last_start_min)
3194 b = pos_before;
3195 else
3196 b = data->last_start_min;
b515a41d 3197
8aa23a47
YO
3198 l = 0;
3199 s = SvPV_const(data->last_found, l);
3200 old = b - data->last_start_min;
3201
3202#else
3203 I32 b = pos_before >= data->last_start_min
3204 ? pos_before : data->last_start_min;
3205 STRLEN l;
3206 const char * const s = SvPV_const(data->last_found, l);
3207 I32 old = b - data->last_start_min;
3208#endif
3209
3210 if (UTF)
3211 old = utf8_hop((U8*)s, old) - (U8*)s;
3212
3213 l -= old;
3214 /* Get the added string: */
3215 last_str = newSVpvn(s + old, l);
3216 if (UTF)
3217 SvUTF8_on(last_str);
3218 if (deltanext == 0 && pos_before == b) {
3219 /* What was added is a constant string */
3220 if (mincount > 1) {
3221 SvGROW(last_str, (mincount * l) + 1);
3222 repeatcpy(SvPVX(last_str) + l,
3223 SvPVX_const(last_str), l, mincount - 1);
3224 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3225 /* Add additional parts. */
3226 SvCUR_set(data->last_found,
3227 SvCUR(data->last_found) - l);
3228 sv_catsv(data->last_found, last_str);
3229 {
3230 SV * sv = data->last_found;
3231 MAGIC *mg =
3232 SvUTF8(sv) && SvMAGICAL(sv) ?
3233 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3234 if (mg && mg->mg_len >= 0)
3235 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3236 }
8aa23a47 3237 data->last_end += l * (mincount - 1);
b515a41d 3238 }
8aa23a47
YO
3239 } else {
3240 /* start offset must point into the last copy */
3241 data->last_start_min += minnext * (mincount - 1);
3242 data->last_start_max += is_inf ? I32_MAX
3243 : (maxcount - 1) * (minnext + data->pos_delta);
3244 }
c277df42 3245 }
8aa23a47
YO
3246 /* It is counted once already... */
3247 data->pos_min += minnext * (mincount - counted);
3248 data->pos_delta += - counted * deltanext +
3249 (minnext + deltanext) * maxcount - minnext * mincount;
3250 if (mincount != maxcount) {
3251 /* Cannot extend fixed substrings found inside
3252 the group. */
304ee84b 3253 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3254 if (mincount && last_str) {
3255 SV * const sv = data->last_found;
3256 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3258
3259 if (mg)
3260 mg->mg_len = -1;
3261 sv_setsv(sv, last_str);
3262 data->last_end = data->pos_min;
3263 data->last_start_min =
3264 data->pos_min - CHR_SVLEN(last_str);
3265 data->last_start_max = is_inf
3266 ? I32_MAX
3267 : data->pos_min + data->pos_delta
3268 - CHR_SVLEN(last_str);
3269 }
3270 data->longest = &(data->longest_float);
3271 }
3272 SvREFCNT_dec(last_str);
c277df42 3273 }
8aa23a47
YO
3274 if (data && (fl & SF_HAS_EVAL))
3275 data->flags |= SF_HAS_EVAL;
3276 optimize_curly_tail:
3277 if (OP(oscan) != CURLYX) {
3278 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3279 && NEXT_OFF(next))
3280 NEXT_OFF(oscan) += NEXT_OFF(next);
3281 }
3282 continue;
3283 default: /* REF and CLUMP only? */
3284 if (flags & SCF_DO_SUBSTR) {
304ee84b 3285 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3286 data->longest = &(data->longest_float);
3287 }
3288 is_inf = is_inf_internal = 1;
3289 if (flags & SCF_DO_STCLASS_OR)
3290 cl_anything(pRExC_state, data->start_class);
3291 flags &= ~SCF_DO_STCLASS;
3292 break;
c277df42 3293 }
8aa23a47
YO
3294 }
3295 else if (strchr((const char*)PL_simple,OP(scan))) {
3296 int value = 0;
653099ff 3297
8aa23a47 3298 if (flags & SCF_DO_SUBSTR) {
304ee84b 3299 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3300 data->pos_min++;
3301 }
3302 min++;
3303 if (flags & SCF_DO_STCLASS) {
3304 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3305
8aa23a47
YO
3306 /* Some of the logic below assumes that switching
3307 locale on will only add false positives. */
3308 switch (PL_regkind[OP(scan)]) {
3309 case SANY:
3310 default:
3311 do_default:
3312 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3313 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3314 cl_anything(pRExC_state, data->start_class);
3315 break;
3316 case REG_ANY:
3317 if (OP(scan) == SANY)
3318 goto do_default;
3319 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3320 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3321 || (data->start_class->flags & ANYOF_CLASS));
3322 cl_anything(pRExC_state, data->start_class);
653099ff 3323 }
8aa23a47
YO
3324 if (flags & SCF_DO_STCLASS_AND || !value)
3325 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3326 break;
3327 case ANYOF:
3328 if (flags & SCF_DO_STCLASS_AND)
3329 cl_and(data->start_class,
3330 (struct regnode_charclass_class*)scan);
653099ff 3331 else
8aa23a47
YO
3332 cl_or(pRExC_state, data->start_class,
3333 (struct regnode_charclass_class*)scan);
3334 break;
3335 case ALNUM:
3336 if (flags & SCF_DO_STCLASS_AND) {
3337 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3338 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3339 for (value = 0; value < 256; value++)
3340 if (!isALNUM(value))
3341 ANYOF_BITMAP_CLEAR(data->start_class, value);
3342 }
653099ff 3343 }
8aa23a47
YO
3344 else {
3345 if (data->start_class->flags & ANYOF_LOCALE)
3346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3347 else {
3348 for (value = 0; value < 256; value++)
3349 if (isALNUM(value))
3350 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3351 }
8aa23a47
YO
3352 }
3353 break;
3354 case ALNUML:
3355 if (flags & SCF_DO_STCLASS_AND) {
3356 if (data->start_class->flags & ANYOF_LOCALE)
3357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3358 }
3359 else {
3360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3361 data->start_class->flags |= ANYOF_LOCALE;
3362 }
3363 break;
3364 case NALNUM:
3365 if (flags & SCF_DO_STCLASS_AND) {
3366 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3367 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3368 for (value = 0; value < 256; value++)
3369 if (isALNUM(value))
3370 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3371 }
3372 }
8aa23a47
YO
3373 else {
3374 if (data->start_class->flags & ANYOF_LOCALE)
3375 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3376 else {
3377 for (value = 0; value < 256; value++)
3378 if (!isALNUM(value))
3379 ANYOF_BITMAP_SET(data->start_class, value);
3380 }
653099ff 3381 }
8aa23a47
YO
3382 break;
3383 case NALNUML:
3384 if (flags & SCF_DO_STCLASS_AND) {
3385 if (data->start_class->flags & ANYOF_LOCALE)
3386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3387 }
8aa23a47
YO
3388 else {
3389 data->start_class->flags |= ANYOF_LOCALE;
3390 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3391 }
3392 break;
3393 case SPACE:
3394 if (flags & SCF_DO_STCLASS_AND) {
3395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3397 for (value = 0; value < 256; value++)
3398 if (!isSPACE(value))
3399 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3400 }
3401 }
8aa23a47
YO
3402 else {
3403 if (data->start_class->flags & ANYOF_LOCALE)
3404 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3405 else {
3406 for (value = 0; value < 256; value++)
3407 if (isSPACE(value))
3408 ANYOF_BITMAP_SET(data->start_class, value);
3409 }
653099ff 3410 }
8aa23a47
YO
3411 break;
3412 case SPACEL:
3413 if (flags & SCF_DO_STCLASS_AND) {
3414 if (data->start_class->flags & ANYOF_LOCALE)
3415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3416 }
3417 else {
3418 data->start_class->flags |= ANYOF_LOCALE;
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3420 }
3421 break;
3422 case NSPACE:
3423 if (flags & SCF_DO_STCLASS_AND) {
3424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3426 for (value = 0; value < 256; value++)
3427 if (isSPACE(value))
3428 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3429 }
8aa23a47
YO
3430 }
3431 else {
3432 if (data->start_class->flags & ANYOF_LOCALE)
3433 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3434 else {
3435 for (value = 0; value < 256; value++)
3436 if (!isSPACE(value))
3437 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3438 }
3439 }
8aa23a47
YO
3440 break;
3441 case NSPACEL:
3442 if (flags & SCF_DO_STCLASS_AND) {
3443 if (data->start_class->flags & ANYOF_LOCALE) {
3444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3445 for (value = 0; value < 256; value++)
3446 if (!isSPACE(value))
3447 ANYOF_BITMAP_CLEAR(data->start_class, value);
3448 }
653099ff 3449 }
8aa23a47
YO
3450 else {
3451 data->start_class->flags |= ANYOF_LOCALE;
3452 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3453 }
3454 break;
3455 case DIGIT:
3456 if (flags & SCF_DO_STCLASS_AND) {
3457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3458 for (value = 0; value < 256; value++)
3459 if (!isDIGIT(value))
3460 ANYOF_BITMAP_CLEAR(data->start_class, value);
3461 }
3462 else {
3463 if (data->start_class->flags & ANYOF_LOCALE)
3464 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3465 else {
3466 for (value = 0; value < 256; value++)
3467 if (isDIGIT(value))
3468 ANYOF_BITMAP_SET(data->start_class, value);
3469 }
3470 }
3471 break;
3472 case NDIGIT:
3473 if (flags & SCF_DO_STCLASS_AND) {
3474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3475 for (value = 0; value < 256; value++)
3476 if (isDIGIT(value))
3477 ANYOF_BITMAP_CLEAR(data->start_class, value);
3478 }
3479 else {
3480 if (data->start_class->flags & ANYOF_LOCALE)
3481 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3482 else {
3483 for (value = 0; value < 256; value++)
3484 if (!isDIGIT(value))
3485 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3486 }
3487 }
8aa23a47
YO
3488 break;
3489 }
3490 if (flags & SCF_DO_STCLASS_OR)
3491 cl_and(data->start_class, and_withp);
3492 flags &= ~SCF_DO_STCLASS;
3493 }
3494 }
3495 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3496 data->flags |= (OP(scan) == MEOL
3497 ? SF_BEFORE_MEOL
3498 : SF_BEFORE_SEOL);
3499 }
3500 else if ( PL_regkind[OP(scan)] == BRANCHJ
3501 /* Lookbehind, or need to calculate parens/evals/stclass: */
3502 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3503 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3504 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3505 || OP(scan) == UNLESSM )
3506 {
3507 /* Negative Lookahead/lookbehind
3508 In this case we can't do fixed string optimisation.
3509 */
1de06328 3510
8aa23a47
YO
3511 I32 deltanext, minnext, fake = 0;
3512 regnode *nscan;
3513 struct regnode_charclass_class intrnl;
3514 int f = 0;
1de06328 3515
8aa23a47
YO
3516 data_fake.flags = 0;
3517 if (data) {
3518 data_fake.whilem_c = data->whilem_c;
3519 data_fake.last_closep = data->last_closep;
c277df42 3520 }
8aa23a47
YO
3521 else
3522 data_fake.last_closep = &fake;
58e23c8d 3523 data_fake.pos_delta = delta;
8aa23a47
YO
3524 if ( flags & SCF_DO_STCLASS && !scan->flags
3525 && OP(scan) == IFMATCH ) { /* Lookahead */
3526 cl_init(pRExC_state, &intrnl);
3527 data_fake.start_class = &intrnl;
3528 f |= SCF_DO_STCLASS_AND;
3529 }
3530 if (flags & SCF_WHILEM_VISITED_POS)
3531 f |= SCF_WHILEM_VISITED_POS;
3532 next = regnext(scan);
3533 nscan = NEXTOPER(NEXTOPER(scan));
3534 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3535 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3536 if (scan->flags) {
3537 if (deltanext) {
58e23c8d 3538 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3539 }
3540 else if (minnext > (I32)U8_MAX) {
58e23c8d 3541 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3542 }
3543 scan->flags = (U8)minnext;
3544 }
3545 if (data) {
3546 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3547 pars++;
3548 if (data_fake.flags & SF_HAS_EVAL)
3549 data->flags |= SF_HAS_EVAL;
3550 data->whilem_c = data_fake.whilem_c;
3551 }
3552 if (f & SCF_DO_STCLASS_AND) {
3553 const int was = (data->start_class->flags & ANYOF_EOS);
3554
3555 cl_and(data->start_class, &intrnl);
3556 if (was)
3557 data->start_class->flags |= ANYOF_EOS;
3558 }
cb434fcc 3559 }
8aa23a47
YO
3560#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3561 else {
3562 /* Positive Lookahead/lookbehind
3563 In this case we can do fixed string optimisation,
3564 but we must be careful about it. Note in the case of
3565 lookbehind the positions will be offset by the minimum
3566 length of the pattern, something we won't know about
3567 until after the recurse.
3568 */
3569 I32 deltanext, fake = 0;
3570 regnode *nscan;
3571 struct regnode_charclass_class intrnl;
3572 int f = 0;
3573 /* We use SAVEFREEPV so that when the full compile
3574 is finished perl will clean up the allocated
3575 minlens when its all done. This was we don't
3576 have to worry about freeing them when we know
3577 they wont be used, which would be a pain.
3578 */
3579 I32 *minnextp;
3580 Newx( minnextp, 1, I32 );
3581 SAVEFREEPV(minnextp);
3582
3583 if (data) {
3584 StructCopy(data, &data_fake, scan_data_t);
3585 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3586 f |= SCF_DO_SUBSTR;
3587 if (scan->flags)
304ee84b 3588 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3589 data_fake.last_found=newSVsv(data->last_found);
3590 }
3591 }
3592 else
3593 data_fake.last_closep = &fake;
3594 data_fake.flags = 0;
58e23c8d 3595 data_fake.pos_delta = delta;
8aa23a47
YO
3596 if (is_inf)
3597 data_fake.flags |= SF_IS_INF;
3598 if ( flags & SCF_DO_STCLASS && !scan->flags
3599 && OP(scan) == IFMATCH ) { /* Lookahead */
3600 cl_init(pRExC_state, &intrnl);
3601 data_fake.start_class = &intrnl;
3602 f |= SCF_DO_STCLASS_AND;
3603 }
3604 if (flags & SCF_WHILEM_VISITED_POS)
3605 f |= SCF_WHILEM_VISITED_POS;
3606 next = regnext(scan);
3607 nscan = NEXTOPER(NEXTOPER(scan));
3608
3609 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3610 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3611 if (scan->flags) {
3612 if (deltanext) {
58e23c8d 3613 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3614 }
3615 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3616 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3617 }
3618 scan->flags = (U8)*minnextp;
3619 }
3620
3621 *minnextp += min;
3622
3623 if (f & SCF_DO_STCLASS_AND) {
3624 const int was = (data->start_class->flags & ANYOF_EOS);
3625
3626 cl_and(data->start_class, &intrnl);
3627 if (was)
3628 data->start_class->flags |= ANYOF_EOS;
3629 }
3630 if (data) {
3631 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3632 pars++;
3633 if (data_fake.flags & SF_HAS_EVAL)
3634 data->flags |= SF_HAS_EVAL;
3635 data->whilem_c = data_fake.whilem_c;
3636 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3637 if (RExC_rx->minlen<*minnextp)
3638 RExC_rx->minlen=*minnextp;
304ee84b 3639 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3640 SvREFCNT_dec(data_fake.last_found);
3641
3642 if ( data_fake.minlen_fixed != minlenp )
3643 {
3644 data->offset_fixed= data_fake.offset_fixed;
3645 data->minlen_fixed= data_fake.minlen_fixed;
3646 data->lookbehind_fixed+= scan->flags;
3647 }
3648 if ( data_fake.minlen_float != minlenp )
3649 {
3650 data->minlen_float= data_fake.minlen_float;
3651 data->offset_float_min=data_fake.offset_float_min;
3652 data->offset_float_max=data_fake.offset_float_max;
3653 data->lookbehind_float+= scan->flags;
3654 }
3655 }
3656 }
3657
3658
40d049e4 3659 }
8aa23a47
YO
3660#endif
3661 }
3662 else if (OP(scan) == OPEN) {
3663 if (stopparen != (I32)ARG(scan))
3664 pars++;
3665 }
3666 else if (OP(scan) == CLOSE) {
3667 if (stopparen == (I32)ARG(scan)) {
3668 break;
3669 }
3670 if ((I32)ARG(scan) == is_par) {
3671 next = regnext(scan);
b515a41d 3672
8aa23a47
YO
3673 if ( next && (OP(next) != WHILEM) && next < last)
3674 is_par = 0; /* Disable optimization */
40d049e4 3675 }
8aa23a47
YO
3676 if (data)
3677 *(data->last_closep) = ARG(scan);
3678 }
3679 else if (OP(scan) == EVAL) {
c277df42
IZ
3680 if (data)
3681 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3682 }
3683 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3684 if (flags & SCF_DO_SUBSTR) {
304ee84b 3685 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3686 flags &= ~SCF_DO_SUBSTR;
40d049e4 3687 }
8aa23a47
YO
3688 if (data && OP(scan)==ACCEPT) {
3689 data->flags |= SCF_SEEN_ACCEPT;
3690 if (stopmin > min)
3691 stopmin = min;
e2e6a0f1 3692 }
8aa23a47
YO
3693 }
3694 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3695 {
0f5d15d6 3696 if (flags & SCF_DO_SUBSTR) {
304ee84b 3697 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3698 data->longest = &(data->longest_float);
3699 }
3700 is_inf = is_inf_internal = 1;
653099ff 3701 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3702 cl_anything(pRExC_state, data->start_class);
96776eda 3703 flags &= ~SCF_DO_STCLASS;
8aa23a47 3704 }
58e23c8d 3705 else if (OP(scan) == GPOS) {
bbe252da 3706 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3707 !(delta || is_inf || (data && data->pos_delta)))
3708 {
bbe252da
YO
3709 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3710 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3711 if (RExC_rx->gofs < (U32)min)
3712 RExC_rx->gofs = min;
3713 } else {
bbe252da 3714 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3715 RExC_rx->gofs = 0;
3716 }
3717 }
786e8c11 3718#ifdef TRIE_STUDY_OPT
40d049e4 3719#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3720 else if (PL_regkind[OP(scan)] == TRIE) {
3721 /* NOTE - There is similar code to this block above for handling
3722 BRANCH nodes on the initial study. If you change stuff here
3723 check there too. */
3724 regnode *trie_node= scan;
3725 regnode *tail= regnext(scan);
f8fc2ecf 3726 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3727 I32 max1 = 0, min1 = I32_MAX;
3728 struct regnode_charclass_class accum;
3729
3730 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3731 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3732 if (flags & SCF_DO_STCLASS)
3733 cl_init_zero(pRExC_state, &accum);
3734
3735 if (!trie->jump) {
3736 min1= trie->minlen;
3737 max1= trie->maxlen;
3738 } else {
3739 const regnode *nextbranch= NULL;
3740 U32 word;
3741
3742 for ( word=1 ; word <= trie->wordcount ; word++)
3743 {
3744 I32 deltanext=0, minnext=0, f = 0, fake;
3745 struct regnode_charclass_class this_class;
3746
3747 data_fake.flags = 0;
3748 if (data) {
3749 data_fake.whilem_c = data->whilem_c;
3750 data_fake.last_closep = data->last_closep;
3751 }
3752 else
3753 data_fake.last_closep = &fake;
58e23c8d 3754 data_fake.pos_delta = delta;
8aa23a47
YO
3755 if (flags & SCF_DO_STCLASS) {
3756 cl_init(pRExC_state, &this_class);
3757 data_fake.start_class = &this_class;
3758 f = SCF_DO_STCLASS_AND;
3759 }
3760 if (flags & SCF_WHILEM_VISITED_POS)
3761 f |= SCF_WHILEM_VISITED_POS;
3762
3763 if (trie->jump[word]) {
3764 if (!nextbranch)
3765 nextbranch = trie_node + trie->jump[0];
3766 scan= trie_node + trie->jump[word];
3767 /* We go from the jump point to the branch that follows
3768 it. Note this means we need the vestigal unused branches
3769 even though they arent otherwise used.
3770 */
3771 minnext = study_chunk(pRExC_state, &scan, minlenp,
3772 &deltanext, (regnode *)nextbranch, &data_fake,
3773 stopparen, recursed, NULL, f,depth+1);
3774 }
3775 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3776 nextbranch= regnext((regnode*)nextbranch);
3777
3778 if (min1 > (I32)(minnext + trie->minlen))
3779 min1 = minnext + trie->minlen;
3780 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3781 max1 = minnext + deltanext + trie->maxlen;
3782 if (deltanext == I32_MAX)
3783 is_inf = is_inf_internal = 1;
3784
3785 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3786 pars++;
3787 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3788 if ( stopmin > min + min1)
3789 stopmin = min + min1;
3790 flags &= ~SCF_DO_SUBSTR;
3791 if (data)
3792 data->flags |= SCF_SEEN_ACCEPT;
3793 }
3794 if (data) {
3795 if (data_fake.flags & SF_HAS_EVAL)
3796 data->flags |= SF_HAS_EVAL;
3797 data->whilem_c = data_fake.whilem_c;
3798 }
3799 if (flags & SCF_DO_STCLASS)
3800 cl_or(pRExC_state, &accum, &this_class);
3801 }
3802 }
3803 if (flags & SCF_DO_SUBSTR) {
3804 data->pos_min += min1;
3805 data->pos_delta += max1 - min1;
3806 if (max1 != min1 || is_inf)
3807 data->longest = &(data->longest_float);
3808 }
3809 min += min1;
3810 delta += max1 - min1;
3811 if (flags & SCF_DO_STCLASS_OR) {
3812 cl_or(pRExC_state, data->start_class, &accum);
3813 if (min1) {
3814 cl_and(data->start_class, and_withp);
3815 flags &= ~SCF_DO_STCLASS;
3816 }
3817 }
3818 else if (flags & SCF_DO_STCLASS_AND) {
3819 if (min1) {
3820 cl_and(data->start_class, &accum);
3821 flags &= ~SCF_DO_STCLASS;
3822 }
3823 else {
3824 /* Switch to OR mode: cache the old value of
3825 * data->start_class */
3826 INIT_AND_WITHP;
3827 StructCopy(data->start_class, and_withp,
3828 struct regnode_charclass_class);
3829 flags &= ~SCF_DO_STCLASS_AND;
3830 StructCopy(&accum, data->start_class,
3831 struct regnode_charclass_class);
3832 flags |= SCF_DO_STCLASS_OR;
3833 data->start_class->flags |= ANYOF_EOS;
3834 }
3835 }
3836 scan= tail;
3837 continue;
3838 }
786e8c11 3839#else
8aa23a47 3840 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3841 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3842 U8*bang=NULL;
3843
3844 min += trie->minlen;
3845 delta += (trie->maxlen - trie->minlen);
3846 flags &= ~SCF_DO_STCLASS; /* xxx */
3847 if (flags & SCF_DO_SUBSTR) {
304ee84b 3848 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3849 data->pos_min += trie->minlen;
3850 data->pos_delta += (trie->maxlen - trie->minlen);
3851 if (trie->maxlen != trie->minlen)
3852 data->longest = &(data->longest_float);
3853 }
3854 if (trie->jump) /* no more substrings -- for now /grr*/
3855 flags &= ~SCF_DO_SUBSTR;
b515a41d 3856 }
8aa23a47
YO
3857#endif /* old or new */
3858#endif /* TRIE_STUDY_OPT */
3859 /* Else: zero-length, ignore. */
3860 scan = regnext(scan);
3861 }
3862 if (frame) {
3863 last = frame->last;
3864 scan = frame->next;
3865 stopparen = frame->stop;
3866 frame = frame->prev;
3867 goto fake_study_recurse;
c277df42
IZ
3868 }
3869
3870 finish:
8aa23a47 3871 assert(!frame);
304ee84b 3872 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3873
c277df42 3874 *scanp = scan;
aca2d497 3875 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3876 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3877 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3878 if (is_par > (I32)U8_MAX)
c277df42
IZ
3879 is_par = 0;
3880 if (is_par && pars==1 && data) {
3881 data->flags |= SF_IN_PAR;
3882 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3883 }
3884 else if (pars && data) {
c277df42
IZ
3885 data->flags |= SF_HAS_PAR;
3886 data->flags &= ~SF_IN_PAR;
3887 }
653099ff 3888 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3889 cl_and(data->start_class, and_withp);
786e8c11
YO
3890 if (flags & SCF_TRIE_RESTUDY)
3891 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3892
304ee84b 3893 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3894
e2e6a0f1 3895 return min < stopmin ? min : stopmin;
c277df42
IZ
3896}
3897
2eccd3b2
NC
3898STATIC U32
3899S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3900{
4a4e7719
NC
3901 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3902
3903 Renewc(RExC_rxi->data,
3904 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3905 char, struct reg_data);
3906 if(count)
f8fc2ecf 3907 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3908 else
f8fc2ecf 3909 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3910 RExC_rxi->data->count = count + n;
3911 Copy(s, RExC_rxi->data->what + count, n, U8);
3912 return count;
c277df42
IZ
3913}
3914
f8149455 3915/*XXX: todo make this not included in a non debugging perl */
76234dfb 3916#ifndef PERL_IN_XSUB_RE
d88dccdf 3917void
864dbfa3 3918Perl_reginitcolors(pTHX)
d88dccdf 3919{
97aff369 3920 dVAR;
1df70142 3921 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3922 if (s) {
1df70142
AL
3923 char *t = savepv(s);
3924 int i = 0;
3925 PL_colors[0] = t;
d88dccdf 3926 while (++i < 6) {
1df70142
AL
3927 t = strchr(t, '\t');
3928 if (t) {
3929 *t = '\0';
3930 PL_colors[i] = ++t;
d88dccdf
IZ
3931 }
3932 else
1df70142 3933 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3934 }
3935 } else {
1df70142 3936 int i = 0;
b81d288d 3937 while (i < 6)
06b5626a 3938 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3939 }
3940 PL_colorset = 1;
3941}
76234dfb 3942#endif
8615cb43 3943
07be1b83 3944
786e8c11
YO
3945#ifdef TRIE_STUDY_OPT
3946#define CHECK_RESTUDY_GOTO \
3947 if ( \
3948 (data.flags & SCF_TRIE_RESTUDY) \
3949 && ! restudied++ \
3950 ) goto reStudy
3951#else
3952#define CHECK_RESTUDY_GOTO
3953#endif
f9f4320a 3954
a687059c 3955/*
e50aee73 3956 - pregcomp - compile a regular expression into internal code
a687059c
LW
3957 *
3958 * We can't allocate space until we know how big the compiled form will be,
3959 * but we can't compile it (and thus know how big it is) until we've got a
3960 * place to put the code. So we cheat: we compile it twice, once with code
3961 * generation turned off and size counting turned on, and once "for real".
3962 * This also means that we don't allocate space until we are sure that the
3963 * thing really will compile successfully, and we never have to move the
3964 * code and thus invalidate pointers into it. (Note that it has to be in
3965 * one piece because free() must be able to free it all.) [NB: not true in perl]
3966 *
3967 * Beware that the optimization-preparation code in here knows about some
3968 * of the structure of the compiled regexp. [I'll say.]
3969 */
b9b4dddf
YO
3970
3971
3972
f9f4320a 3973#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3974#define RE_ENGINE_PTR &PL_core_reg_engine
3975#else
f9f4320a
YO
3976extern const struct regexp_engine my_reg_engine;
3977#define RE_ENGINE_PTR &my_reg_engine
3978#endif
6d5c990f
RGS
3979
3980#ifndef PERL_IN_XSUB_RE
a687059c 3981regexp *
864dbfa3 3982Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3983{
97aff369 3984 dVAR;
6d5c990f 3985 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
3986 /* Dispatch a request to compile a regexp to correct
3987 regexp engine. */
f9f4320a
YO
3988 if (table) {
3989 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 3990 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 3991 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3992 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3993 DEBUG_COMPILE_r({
8d8756e7 3994 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3995 SvIV(*ptr));
3996 });
f2f78491 3997 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3998 }
b9b4dddf 3999 }
2a5d9b1d
RGS
4000 return Perl_re_compile(aTHX_ exp, xend, pm);
4001}
6d5c990f 4002#endif
2a5d9b1d
RGS
4003
4004regexp *
4005Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4006{
4007 dVAR;
a0d0e21e 4008 register regexp *r;
f8fc2ecf 4009 register regexp_internal *ri;
c277df42 4010 regnode *scan;
c277df42 4011 regnode *first;
a0d0e21e 4012 I32 flags;
a0d0e21e
LW
4013 I32 minlen = 0;
4014 I32 sawplus = 0;
4015 I32 sawopen = 0;
2c2d71f5 4016 scan_data_t data;
830247a4 4017 RExC_state_t RExC_state;
be8e71aa 4018 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4019#ifdef TRIE_STUDY_OPT
4020 int restudied= 0;
4021 RExC_state_t copyRExC_state;
4022#endif
2a5d9b1d 4023 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4024 DEBUG_r(if (!PL_colorset) reginitcolors());
4025
a0d0e21e 4026 if (exp == NULL)
c277df42 4027 FAIL("NULL regexp argument");
a0d0e21e 4028
a5961de5 4029 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4030
5cfc7842 4031 RExC_precomp = exp;
a3621e74 4032 DEBUG_COMPILE_r({
ab3bbdeb
YO
4033 SV *dsv= sv_newmortal();
4034 RE_PV_QUOTED_DECL(s, RExC_utf8,
4035 dsv, RExC_precomp, (xend - exp), 60);
4036 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4037 PL_colors[4],PL_colors[5],s);
a5961de5 4038 });
e2509266 4039 RExC_flags = pm->op_pmflags;
830247a4 4040 RExC_sawback = 0;
bbce6d69 4041
830247a4
IZ
4042 RExC_seen = 0;
4043 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4044 RExC_seen_evals = 0;
4045 RExC_extralen = 0;
c277df42 4046
bbce6d69 4047 /* First pass: determine size, legality. */
830247a4 4048 RExC_parse = exp;
fac92740 4049 RExC_start = exp;
830247a4
IZ
4050 RExC_end = xend;
4051 RExC_naughty = 0;
4052 RExC_npar = 1;
e2e6a0f1 4053 RExC_nestroot = 0;
830247a4
IZ
4054 RExC_size = 0L;
4055 RExC_emit = &PL_regdummy;
4056 RExC_whilem_seen = 0;
fc8cd66c 4057 RExC_charnames = NULL;
40d049e4
YO
4058 RExC_open_parens = NULL;
4059 RExC_close_parens = NULL;
4060 RExC_opend = NULL;
81714fb9 4061 RExC_paren_names = NULL;
1f1031fe
YO
4062#ifdef DEBUGGING
4063 RExC_paren_name_list = NULL;
4064#endif
40d049e4
YO
4065 RExC_recurse = NULL;
4066 RExC_recurse_count = 0;
81714fb9 4067
85ddcde9
JH
4068#if 0 /* REGC() is (currently) a NOP at the first pass.
4069 * Clever compilers notice this and complain. --jhi */
830247a4 4070 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4071#endif
3dab1dad
YO
4072 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4073 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4074 RExC_precomp = NULL;
a0d0e21e
LW
4075 return(NULL);
4076 }
07be1b83 4077 DEBUG_PARSE_r({
81714fb9
YO
4078 PerlIO_printf(Perl_debug_log,
4079 "Required size %"IVdf" nodes\n"
4080 "Starting second pass (creation)\n",
4081 (IV)RExC_size);
07be1b83
YO
4082 RExC_lastnum=0;
4083 RExC_lastparse=NULL;
4084 });
c277df42
IZ
4085 /* Small enough for pointer-storage convention?
4086 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4087 if (RExC_size >= 0x10000L && RExC_extralen)
4088 RExC_size += RExC_extralen;
c277df42 4089 else
830247a4
IZ
4090 RExC_extralen = 0;
4091 if (RExC_whilem_seen > 15)
4092 RExC_whilem_seen = 15;
a0d0e21e 4093
e2e6a0f1
YO
4094#ifdef DEBUGGING
4095 /* Make room for a sentinel value at the end of the program */
4096 RExC_size++;
4097#endif
4098
f9f4320a
YO
4099 /* Allocate space and zero-initialize. Note, the two step process
4100 of zeroing when in debug mode, thus anything assigned has to
4101 happen after that */
f8fc2ecf
YO
4102 Newxz(r, 1, regexp);
4103 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4104 char, regexp_internal);
4105 if ( r == NULL || ri == NULL )
b45f050a 4106 FAIL("Regexp out of space");
0f79a09d
GS
4107#ifdef DEBUGGING
4108 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4109 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4110#else
f8fc2ecf
YO
4111 /* bulk initialize base fields with 0. */
4112 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4113#endif
58e23c8d
YO
4114
4115 /* non-zero initialization begins here */
f8fc2ecf 4116 RXi_SET( r, ri );
f9f4320a 4117 r->engine= RE_ENGINE_PTR;
c277df42 4118 r->refcnt = 1;
bbce6d69 4119 r->prelen = xend - exp;
5cfc7842 4120 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4121 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4122 r->intflags = 0;
830247a4 4123 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4124
6bda09f9 4125 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4126 Newxz(RExC_open_parens, RExC_npar,regnode *);
4127 SAVEFREEPV(RExC_open_parens);
4128 Newxz(RExC_close_parens,RExC_npar,regnode *);
4129 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4130 }
4131
4132 /* Useful during FAIL. */
f8fc2ecf
YO
4133 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4134 if (ri->offsets) {
4135 ri->offsets[0] = RExC_size;
fac92740 4136 }
a3621e74 4137 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4138 "%s %"UVuf" bytes for offset annotations.\n",
f8fc2ecf 4139 ri->offsets ? "Got" : "Couldn't get",
392fbf5d 4140 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4141
830247a4 4142 RExC_rx = r;
f8fc2ecf 4143 RExC_rxi = ri;
bbce6d69 4144
4145 /* Second pass: emit code. */
e2509266 4146 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4147 RExC_parse = exp;
4148 RExC_end = xend;
4149 RExC_naughty = 0;
4150 RExC_npar = 1;
f8fc2ecf
YO
4151 RExC_emit_start = ri->program;
4152 RExC_emit = ri->program;
e2e6a0f1
YO
4153#ifdef DEBUGGING
4154 /* put a sentinal on the end of the program so we can check for
4155 overwrites */
f8fc2ecf 4156 ri->program[RExC_size].type = 255;
e2e6a0f1 4157#endif
2cd61cdb 4158 /* Store the count of eval-groups for security checks: */
f8149455 4159 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4160 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4161 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4162 return(NULL);
6bda09f9 4163
07be1b83
YO
4164 /* XXXX To minimize changes to RE engine we always allocate
4165 3-units-long substrs field. */
4166 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4167 if (RExC_recurse_count) {
4168 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4169 SAVEFREEPV(RExC_recurse);
4170 }
a0d0e21e 4171
07be1b83 4172reStudy:
1de06328 4173 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4174 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4175
07be1b83
YO
4176#ifdef TRIE_STUDY_OPT
4177 if ( restudied ) {
5d458dd8 4178 U32 seen=RExC_seen;
07be1b83 4179 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4180
4181 RExC_state = copyRExC_state;
4182 if (seen & REG_TOP_LEVEL_BRANCHES)
4183 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4184 else
4185 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4186 if (data.last_found) {
07be1b83 4187 SvREFCNT_dec(data.longest_fixed);
07be1b83 4188 SvREFCNT_dec(data.longest_float);
07be1b83 4189 SvREFCNT_dec(data.last_found);
1de06328 4190 }
40d049e4 4191 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4192 } else {
40d049e4 4193 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4194 copyRExC_state = RExC_state;
07be1b83 4195 }
40d049e4
YO
4196#else
4197 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4198#endif
fc8cd66c 4199
a0d0e21e 4200 /* Dig out information for optimizations. */
bbe252da 4201 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4202 pm->op_pmflags = RExC_flags;
a0ed51b3 4203 if (UTF)
bbe252da 4204 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4205 ri->regstclass = NULL;
830247a4 4206 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4207 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4208 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4209
1de06328
YO
4210 /* testing for BRANCH here tells us whether there is "must appear"
4211 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4212 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4213 I32 fake;
c5254dd6 4214 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4215 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4216 int stclass_flag;
07be1b83 4217 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4218
4219 first = scan;
c277df42 4220 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4221 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4222 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4223 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4224 /* for now we can't handle lookbehind IFMATCH*/
4225 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4226 (OP(first) == PLUS) ||
4227 (OP(first) == MINMOD) ||
653099ff 4228 /* An {n,m} with n>0 */
07be1b83
YO
4229 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4230 {
786e8c11 4231
a0d0e21e
LW
4232 if (OP(first) == PLUS)
4233 sawplus = 1;
4234 else
3dab1dad 4235 first += regarglen[OP(first)];
07be1b83
YO
4236 if (OP(first) == IFMATCH) {
4237 first = NEXTOPER(first);
4238 first += EXTRA_STEP_2ARGS;
7c167cea 4239 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4240 first = NEXTOPER(first);
a687059c
LW
4241 }
4242
a0d0e21e
LW
4243 /* Starting-point info. */
4244 again:
786e8c11 4245 DEBUG_PEEP("first:",first,0);
07be1b83 4246 /* Ignore EXACT as we deal with it later. */
3dab1dad 4247 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4248 if (OP(first) == EXACT)
6f207bd3 4249 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4250 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4251 ri->regstclass = first;
b3c9acc1 4252 }
07be1b83 4253#ifdef TRIE_STCLASS
786e8c11 4254 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4255 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4256 {
786e8c11 4257 regnode *trie_op;
07be1b83 4258 /* this can happen only on restudy */
786e8c11 4259 if ( OP(first) == TRIE ) {
c944940b 4260 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4261 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4262 StructCopy(first,trieop,struct regnode_1);
4263 trie_op=(regnode *)trieop;
4264 } else {
c944940b 4265 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4266 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4267 StructCopy(first,trieop,struct regnode_charclass);
4268 trie_op=(regnode *)trieop;
4269 }
1de06328 4270 OP(trie_op)+=2;
786e8c11 4271 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4272 ri->regstclass = trie_op;
07be1b83
YO
4273 }
4274#endif
bfed75c6 4275 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4276 ri->regstclass = first;
3dab1dad
YO
4277 else if (PL_regkind[OP(first)] == BOUND ||
4278 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4279 ri->regstclass = first;
3dab1dad 4280 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4281 r->extflags |= (OP(first) == MBOL
4282 ? RXf_ANCH_MBOL
cad2e5aa 4283 : (OP(first) == SBOL
bbe252da
YO
4284 ? RXf_ANCH_SBOL
4285 : RXf_ANCH_BOL));
a0d0e21e 4286 first = NEXTOPER(first);
774d564b 4287 goto again;
4288 }
4289 else if (OP(first) == GPOS) {
bbe252da 4290 r->extflags |= RXf_ANCH_GPOS;
774d564b 4291 first = NEXTOPER(first);
4292 goto again;
a0d0e21e 4293 }
cf2a2b69
YO
4294 else if ((!sawopen || !RExC_sawback) &&
4295 (OP(first) == STAR &&
3dab1dad 4296 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4297 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4298 {
4299 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4300 const int type =
4301 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4302 ? RXf_ANCH_MBOL
4303 : RXf_ANCH_SBOL;
4304 r->extflags |= type;
4305 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4306 first = NEXTOPER(first);
774d564b 4307 goto again;
a0d0e21e 4308 }
b81d288d 4309 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4310 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4311 /* x+ must match at the 1st pos of run of x's */
bbe252da 4312 r->intflags |= PREGf_SKIP;
a0d0e21e 4313
c277df42 4314 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4315#ifdef TRIE_STUDY_OPT
81714fb9 4316 DEBUG_PARSE_r(
be8e71aa
YO
4317 if (!restudied)
4318 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4319 (IV)(first - scan + 1))
4320 );
4321#else
81714fb9 4322 DEBUG_PARSE_r(
be8e71aa
YO
4323 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4324 (IV)(first - scan + 1))
4325 );
4326#endif
4327
4328
a0d0e21e
LW
4329 /*
4330 * If there's something expensive in the r.e., find the
4331 * longest literal string that must appear and make it the
4332 * regmust. Resolve ties in favor of later strings, since
4333 * the regstart check works with the beginning of the r.e.
4334 * and avoiding duplication strengthens checking. Not a
4335 * strong reason, but sufficient in the absence of others.
4336 * [Now we resolve ties in favor of the earlier string if
c277df42 4337 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4338 * earlier string may buy us something the later one won't.]
4339 */
de8c5301 4340
396482e1
GA
4341 data.longest_fixed = newSVpvs("");
4342 data.longest_float = newSVpvs("");
4343 data.last_found = newSVpvs("");
c277df42
IZ
4344 data.longest = &(data.longest_fixed);
4345 first = scan;
f8fc2ecf 4346 if (!ri->regstclass) {
830247a4 4347 cl_init(pRExC_state, &ch_class);
653099ff
GS
4348 data.start_class = &ch_class;
4349 stclass_flag = SCF_DO_STCLASS_AND;
4350 } else /* XXXX Check for BOUND? */
4351 stclass_flag = 0;
cb434fcc 4352 data.last_closep = &last_close;
de8c5301 4353
1de06328 4354 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4355 &data, -1, NULL, NULL,
4356 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4357
07be1b83 4358
786e8c11
YO
4359 CHECK_RESTUDY_GOTO;
4360
4361
830247a4 4362 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4363 && data.last_start_min == 0 && data.last_end > 0
830247a4 4364 && !RExC_seen_zerolen
2bf803e2 4365 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4366 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4367 r->extflags |= RXf_CHECK_ALL;
304ee84b 4368 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4369 SvREFCNT_dec(data.last_found);
4370
1de06328
YO
4371 /* Note that code very similar to this but for anchored string
4372 follows immediately below, changes may need to be made to both.
4373 Be careful.
4374 */
a0ed51b3 4375 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4376 if (longest_float_length
c277df42
IZ
4377 || (data.flags & SF_FL_BEFORE_EOL
4378 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4379 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4380 {
1182767e 4381 I32 t,ml;
cf93c79d 4382
1de06328 4383 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4384 && data.offset_fixed == data.offset_float_min
4385 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4386 goto remove_float; /* As in (a)+. */
4387
1de06328
YO
4388 /* copy the information about the longest float from the reg_scan_data
4389 over to the program. */
33b8afdf
JH
4390 if (SvUTF8(data.longest_float)) {
4391 r->float_utf8 = data.longest_float;
c445ea15 4392 r->float_substr = NULL;
33b8afdf
JH
4393 } else {
4394 r->float_substr = data.longest_float;
c445ea15 4395 r->float_utf8 = NULL;
33b8afdf 4396 }
1de06328
YO
4397 /* float_end_shift is how many chars that must be matched that
4398 follow this item. We calculate it ahead of time as once the
4399 lookbehind offset is added in we lose the ability to correctly
4400 calculate it.*/
4401 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4402 : (I32)longest_float_length;
1de06328
YO
4403 r->float_end_shift = ml - data.offset_float_min
4404 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4405 + data.lookbehind_float;
4406 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4407 r->float_max_offset = data.offset_float_max;
1182767e 4408 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4409 r->float_max_offset -= data.lookbehind_float;
4410
cf93c79d
IZ
4411 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4412 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4413 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4414 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4415 }
4416 else {
aca2d497 4417 remove_float:
c445ea15 4418 r->float_substr = r->float_utf8 = NULL;
c277df42 4419 SvREFCNT_dec(data.longest_float);
c5254dd6 4420 longest_float_length = 0;
a0d0e21e 4421 }
c277df42 4422
1de06328
YO
4423 /* Note that code very similar to this but for floating string
4424 is immediately above, changes may need to be made to both.
4425 Be careful.
4426 */
a0ed51b3 4427 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4428 if (longest_fixed_length
c277df42
IZ
4429 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4430 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4431 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4432 {
1182767e 4433 I32 t,ml;
cf93c79d 4434
1de06328
YO
4435 /* copy the information about the longest fixed
4436 from the reg_scan_data over to the program. */
33b8afdf
JH
4437 if (SvUTF8(data.longest_fixed)) {
4438 r->anchored_utf8 = data.longest_fixed;
c445ea15 4439 r->anchored_substr = NULL;
33b8afdf
JH
4440 } else {
4441 r->anchored_substr = data.longest_fixed;
c445ea15 4442 r->anchored_utf8 = NULL;
33b8afdf 4443 }
1de06328
YO
4444 /* fixed_end_shift is how many chars that must be matched that
4445 follow this item. We calculate it ahead of time as once the
4446 lookbehind offset is added in we lose the ability to correctly
4447 calculate it.*/
4448 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4449 : (I32)longest_fixed_length;
1de06328
YO
4450 r->anchored_end_shift = ml - data.offset_fixed
4451 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4452 + data.lookbehind_fixed;
4453 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4454
cf93c79d
IZ
4455 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4456 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4457 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4458 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4459 }
4460 else {
c445ea15 4461 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4462 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4463 longest_fixed_length = 0;
a0d0e21e 4464 }
f8fc2ecf
YO
4465 if (ri->regstclass
4466 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4467 ri->regstclass = NULL;
33b8afdf
JH
4468 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4469 && stclass_flag
653099ff 4470 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4471 && !cl_is_anything(data.start_class))
4472 {
2eccd3b2 4473 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4474
f8fc2ecf 4475 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4476 struct regnode_charclass_class);
4477 StructCopy(data.start_class,
f8fc2ecf 4478 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4479 struct regnode_charclass_class);
f8fc2ecf 4480 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4481 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4482 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4483 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4484 PerlIO_printf(Perl_debug_log,
a0288114 4485 "synthetic stclass \"%s\".\n",
3f7c398e 4486 SvPVX_const(sv));});
653099ff 4487 }
c277df42
IZ
4488
4489 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4490 if (longest_fixed_length > longest_float_length) {
1de06328 4491 r->check_end_shift = r->anchored_end_shift;
c277df42 4492 r->check_substr = r->anchored_substr;
33b8afdf 4493 r->check_utf8 = r->anchored_utf8;
c277df42 4494 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4495 if (r->extflags & RXf_ANCH_SINGLE)
4496 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4497 }
4498 else {
1de06328 4499 r->check_end_shift = r->float_end_shift;
c277df42 4500 r->check_substr = r->float_substr;
33b8afdf 4501 r->check_utf8 = r->float_utf8;
1de06328
YO
4502 r->check_offset_min = r->float_min_offset;
4503 r->check_offset_max = r->float_max_offset;
a0d0e21e 4504 }
30382c73
IZ
4505 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4506 This should be changed ASAP! */
bbe252da
YO
4507 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4508 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4509 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4510 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4511 }
1de06328
YO
4512 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4513 if ( (STRLEN)minlen < longest_float_length )
4514 minlen= longest_float_length;
4515 if ( (STRLEN)minlen < longest_fixed_length )
4516 minlen= longest_fixed_length;
4517 */
a0ed51b3
LW
4518 }
4519 else {
c277df42
IZ
4520 /* Several toplevels. Best we can is to set minlen. */
4521 I32 fake;
653099ff 4522 struct regnode_charclass_class ch_class;
cb434fcc 4523 I32 last_close = 0;
c277df42 4524
5d458dd8 4525 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4526
f8fc2ecf 4527 scan = ri->program + 1;
830247a4 4528 cl_init(pRExC_state, &ch_class);
653099ff 4529 data.start_class = &ch_class;
cb434fcc 4530 data.last_closep = &last_close;
07be1b83 4531
de8c5301 4532
1de06328 4533 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4534 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4535
786e8c11 4536 CHECK_RESTUDY_GOTO;
07be1b83 4537
33b8afdf 4538 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4539 = r->float_substr = r->float_utf8 = NULL;
653099ff 4540 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4541 && !cl_is_anything(data.start_class))
4542 {
2eccd3b2 4543 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4544
f8fc2ecf 4545 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4546 struct regnode_charclass_class);
4547 StructCopy(data.start_class,
f8fc2ecf 4548 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4549 struct regnode_charclass_class);
f8fc2ecf 4550 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4551 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4552 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4553 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4554 PerlIO_printf(Perl_debug_log,
a0288114 4555 "synthetic stclass \"%s\".\n",
3f7c398e 4556 SvPVX_const(sv));});
653099ff 4557 }
a0d0e21e
LW
4558 }
4559
1de06328
YO
4560 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4561 the "real" pattern. */
cf9788e3
RGS
4562 DEBUG_OPTIMISE_r({
4563 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4564 (IV)minlen, (IV)r->minlen);
cf9788e3 4565 });
de8c5301 4566 r->minlenret = minlen;
1de06328
YO
4567 if (r->minlen < minlen)
4568 r->minlen = minlen;
4569
b81d288d 4570 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4571 r->extflags |= RXf_GPOS_SEEN;
830247a4 4572 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4573 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4574 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4575 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4576 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4577 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4578 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4579 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4580 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4581 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4582 if (RExC_paren_names)
4583 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4584 else
4585 r->paren_names = NULL;
e357fc67
YO
4586 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4587 r->extflags |= RXf_WHITE;
4588 else if (r->prelen == 1 && r->precomp[0] == '^')
4589 r->extflags |= RXf_START_ONLY;
4590
1f1031fe
YO
4591#ifdef DEBUGGING
4592 if (RExC_paren_names) {
4593 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4594 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4595 } else
1f1031fe 4596#endif
cde0cee5 4597 ri->name_list_idx = 0;
1f1031fe 4598
40d049e4
YO
4599 if (RExC_recurse_count) {
4600 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4601 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4602 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4603 }
4604 }
a02a5408
JC
4605 Newxz(r->startp, RExC_npar, I32);
4606 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4607 /* assume we don't need to swap parens around before we match */
4608
be8e71aa
YO
4609 DEBUG_DUMP_r({
4610 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4611 regdump(r);
4612 });
f8fc2ecf
YO
4613 DEBUG_OFFSETS_r(if (ri->offsets) {
4614 const U32 len = ri->offsets[0];
8e9a8a48
YO
4615 U32 i;
4616 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 4617 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
8e9a8a48 4618 for (i = 1; i <= len; i++) {
f8fc2ecf 4619 if (ri->offsets[i*2-1] || ri->offsets[i*2])
8e9a8a48 4620 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
f8fc2ecf 4621 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
8e9a8a48
YO
4622 }
4623 PerlIO_printf(Perl_debug_log, "\n");
4624 });
a0d0e21e 4625 return(r);
a687059c
LW
4626}
4627
f9f4320a 4628#undef CORE_ONLY_BLOCK
f9f4320a 4629#undef RE_ENGINE_PTR
3dab1dad 4630
9af228c6 4631#ifndef PERL_IN_XSUB_RE
81714fb9 4632SV*
44a2ac75 4633Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
81714fb9 4634{
44a2ac75
YO
4635 AV *retarray = NULL;
4636 SV *ret;
4637 if (flags & 1)
4638 retarray=newAV();
4639
4640 if (from_re || PL_curpm) {
4641 const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
81714fb9
YO
4642 if (rx && rx->paren_names) {
4643 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4644 if (he_str) {
4645 IV i;
4646 SV* sv_dat=HeVAL(he_str);
4647 I32 *nums=(I32*)SvPVX(sv_dat);
4648 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4649 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4650 rx->endp[nums[i]] != -1)
4651 {
44a2ac75
YO
4652 ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
4653 if (!retarray)
4654 return ret;
4655 } else {
4656 ret = newSVsv(&PL_sv_undef);
4657 }
4658 if (retarray) {
4659 SvREFCNT_inc(ret);
4660 av_push(retarray, ret);
81714fb9
YO
4661 }
4662 }
44a2ac75
YO
4663 if (retarray)
4664 return (SV*)retarray;
81714fb9
YO
4665 }
4666 }
4667 }
44a2ac75
YO
4668 return NULL;
4669}
4670
4671SV*
4672Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
4673{
4674 char *s = NULL;
a9d504c3 4675 I32 i = 0;
44a2ac75
YO
4676 I32 s1, t1;
4677 SV *sv = usesv ? usesv : newSVpvs("");
cb5c6874 4678 PERL_UNUSED_ARG(flags);
44a2ac75 4679
cde0cee5
YO
4680 if (!rx->subbeg) {
4681 sv_setsv(sv,&PL_sv_undef);
4682 return sv;
4683 }
4684 else
4685 if (paren == -2 && rx->startp[0] != -1) {
44a2ac75
YO
4686 /* $` */
4687 i = rx->startp[0];
cde0cee5 4688 s = rx->subbeg;
44a2ac75
YO
4689 }
4690 else
cde0cee5 4691 if (paren == -1 && rx->endp[0] != -1) {
44a2ac75
YO
4692 /* $' */
4693 s = rx->subbeg + rx->endp[0];
4694 i = rx->sublen - rx->endp[0];
4695 }
4696 else
4697 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4698 (s1 = rx->startp[paren]) != -1 &&
4699 (t1 = rx->endp[paren]) != -1)
4700 {
4701 /* $& $1 ... */
4702 i = t1 - s1;
4703 s = rx->subbeg + s1;
cde0cee5
YO
4704 } else {
4705 sv_setsv(sv,&PL_sv_undef);
4706 return sv;
4707 }
4708 assert(rx->sublen >= (s - rx->subbeg) + i );
4709 if (i >= 0) {
4710 const int oldtainted = PL_tainted;
4711 TAINT_NOT;
4712 sv_setpvn(sv, s, i);
4713 PL_tainted = oldtainted;
4714 if ( (rx->extflags & RXf_CANY_SEEN)
4715 ? (RX_MATCH_UTF8(rx)
4716 && (!i || is_utf8_string((U8*)s, i)))
4717 : (RX_MATCH_UTF8(rx)) )
4718 {
4719 SvUTF8_on(sv);
4720 }
4721 else
4722 SvUTF8_off(sv);
4723 if (PL_tainting) {
4724 if (RX_MATCH_TAINTED(rx)) {
4725 if (SvTYPE(sv) >= SVt_PVMG) {
4726 MAGIC* const mg = SvMAGIC(sv);
4727 MAGIC* mgt;
4728 PL_tainted = 1;
4729 SvMAGIC_set(sv, mg->mg_moremagic);
4730 SvTAINT(sv);
4731 if ((mgt = SvMAGIC(sv))) {
4732 mg->mg_moremagic = mgt;
4733 SvMAGIC_set(sv, mg);
44a2ac75 4734 }
cde0cee5
YO
4735 } else {
4736 PL_tainted = 1;
4737 SvTAINT(sv);
4738 }
4739 } else
4740 SvTAINTED_off(sv);
44a2ac75 4741 }
81714fb9 4742 } else {
44a2ac75 4743 sv_setsv(sv,&PL_sv_undef);
81714fb9 4744 }
44a2ac75 4745 return sv;
81714fb9 4746}
9af228c6 4747#endif
0a4db386 4748
894be9b7 4749/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4750 * If flags is REG_RSN_RETURN_NULL returns null.
4751 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4752 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4753 * to the parsed name as looked up in the RExC_paren_names hash.
4754 * If there is an error throws a vFAIL().. type exception.
894be9b7 4755 */
0a4db386
YO
4756
4757#define REG_RSN_RETURN_NULL 0
4758#define REG_RSN_RETURN_NAME 1
4759#define REG_RSN_RETURN_DATA 2
4760
894be9b7
YO
4761STATIC SV*
4762S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4763 char *name_start = RExC_parse;
1f1031fe
YO
4764
4765 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4766 /* skip IDFIRST by using do...while */
4767 if (UTF)
4768 do {
4769 RExC_parse += UTF8SKIP(RExC_parse);
4770 } while (isALNUM_utf8((U8*)RExC_parse));
4771 else
4772 do {
4773 RExC_parse++;
4774 } while (isALNUM(*RExC_parse));
894be9b7 4775 }
1f1031fe 4776
0a4db386
YO
4777 if ( flags ) {
4778 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4779 (int)(RExC_parse - name_start)));
894be9b7 4780 if (UTF)
0a4db386
YO
4781 SvUTF8_on(sv_name);
4782 if ( flags == REG_RSN_RETURN_NAME)
4783 return sv_name;
4784 else if (flags==REG_RSN_RETURN_DATA) {
4785 HE *he_str = NULL;
4786 SV *sv_dat = NULL;
4787 if ( ! sv_name ) /* should not happen*/
4788 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4789 if (RExC_paren_names)
4790 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4791 if ( he_str )
4792 sv_dat = HeVAL(he_str);
4793 if ( ! sv_dat )
4794 vFAIL("Reference to nonexistent named group");
4795 return sv_dat;
4796 }
4797 else {
4798 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4799 }
4800 /* NOT REACHED */
894be9b7 4801 }
0a4db386 4802 return NULL;
894be9b7
YO
4803}
4804
3dab1dad
YO
4805#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4806 int rem=(int)(RExC_end - RExC_parse); \
4807 int cut; \
4808 int num; \
4809 int iscut=0; \
4810 if (rem>10) { \
4811 rem=10; \
4812 iscut=1; \
4813 } \
4814 cut=10-rem; \
4815 if (RExC_lastparse!=RExC_parse) \
4816 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4817 rem, RExC_parse, \
4818 cut + 4, \
4819 iscut ? "..." : "<" \
4820 ); \
4821 else \
4822 PerlIO_printf(Perl_debug_log,"%16s",""); \
4823 \
4824 if (SIZE_ONLY) \
4825 num=RExC_size; \
4826 else \
4827 num=REG_NODE_NUM(RExC_emit); \
4828 if (RExC_lastnum!=num) \
0a4db386 4829 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4830 else \
0a4db386 4831 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4832 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4833 (int)((depth*2)), "", \
3dab1dad
YO
4834 (funcname) \
4835 ); \
4836 RExC_lastnum=num; \
4837 RExC_lastparse=RExC_parse; \
4838})
4839
07be1b83
YO
4840
4841
3dab1dad
YO
4842#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4843 DEBUG_PARSE_MSG((funcname)); \
4844 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4845})
6bda09f9
YO
4846#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4847 DEBUG_PARSE_MSG((funcname)); \
4848 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4849})
a687059c
LW
4850/*
4851 - reg - regular expression, i.e. main body or parenthesized thing
4852 *
4853 * Caller must absorb opening parenthesis.
4854 *
4855 * Combining parenthesis handling with the base level of regular expression
4856 * is a trifle forced, but the need to tie the tails of the branches to what
4857 * follows makes it hard to avoid.
4858 */
07be1b83
YO
4859#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4860#ifdef DEBUGGING
4861#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4862#else
4863#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4864#endif
3dab1dad 4865
e2e6a0f1
YO
4866/* this idea is borrowed from STR_WITH_LEN in handy.h */
4867#define CHECK_WORD(s,v,l) \
4868 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4869
76e3520e 4870STATIC regnode *
3dab1dad 4871S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4872 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4873{
27da23d5 4874 dVAR;
c277df42
IZ
4875 register regnode *ret; /* Will be the head of the group. */
4876 register regnode *br;
4877 register regnode *lastbr;
cbbf8932 4878 register regnode *ender = NULL;
a0d0e21e 4879 register I32 parno = 0;
cbbf8932
AL
4880 I32 flags;
4881 const I32 oregflags = RExC_flags;
6136c704
AL
4882 bool have_branch = 0;
4883 bool is_open = 0;
9d1d55b5
JP
4884
4885 /* for (?g), (?gc), and (?o) warnings; warning
4886 about (?c) will warn about (?g) -- japhy */
4887
6136c704
AL
4888#define WASTED_O 0x01
4889#define WASTED_G 0x02
4890#define WASTED_C 0x04
4891#define WASTED_GC (0x02|0x04)
cbbf8932 4892 I32 wastedflags = 0x00;
9d1d55b5 4893
fac92740 4894 char * parse_start = RExC_parse; /* MJD */
a28509cc 4895 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4896
3dab1dad
YO
4897 GET_RE_DEBUG_FLAGS_DECL;
4898 DEBUG_PARSE("reg ");
4899
4900
821b33a5 4901 *flagp = 0; /* Tentatively. */
a0d0e21e 4902
9d1d55b5 4903
a0d0e21e
LW
4904 /* Make an OPEN node, if parenthesized. */
4905 if (paren) {
e2e6a0f1
YO
4906 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4907 char *start_verb = RExC_parse;
4908 STRLEN verb_len = 0;
4909 char *start_arg = NULL;
4910 unsigned char op = 0;
4911 int argok = 1;
4912 int internal_argval = 0; /* internal_argval is only useful if !argok */
4913 while ( *RExC_parse && *RExC_parse != ')' ) {
4914 if ( *RExC_parse == ':' ) {
4915 start_arg = RExC_parse + 1;
4916 break;
4917 }
4918 RExC_parse++;
4919 }
4920 ++start_verb;
4921 verb_len = RExC_parse - start_verb;
4922 if ( start_arg ) {
4923 RExC_parse++;
4924 while ( *RExC_parse && *RExC_parse != ')' )
4925 RExC_parse++;
4926 if ( *RExC_parse != ')' )
4927 vFAIL("Unterminated verb pattern argument");
4928 if ( RExC_parse == start_arg )
4929 start_arg = NULL;
4930 } else {
4931 if ( *RExC_parse != ')' )
4932 vFAIL("Unterminated verb pattern");
4933 }
5d458dd8 4934
e2e6a0f1
YO
4935 switch ( *start_verb ) {
4936 case 'A': /* (*ACCEPT) */
4937 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4938 op = ACCEPT;
4939 internal_argval = RExC_nestroot;
4940 }
4941 break;
4942 case 'C': /* (*COMMIT) */
4943 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4944 op = COMMIT;
e2e6a0f1
YO
4945 break;
4946 case 'F': /* (*FAIL) */
4947 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4948 op = OPFAIL;
4949 argok = 0;
4950 }
4951 break;
5d458dd8
YO
4952 case ':': /* (*:NAME) */
4953 case 'M': /* (*MARK:NAME) */
4954 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4955 op = MARKPOINT;
5d458dd8
YO
4956 argok = -1;
4957 }
4958 break;
4959 case 'P': /* (*PRUNE) */
4960 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4961 op = PRUNE;
e2e6a0f1 4962 break;
5d458dd8
YO
4963 case 'S': /* (*SKIP) */
4964 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4965 op = SKIP;
4966 break;
4967 case 'T': /* (*THEN) */
4968 /* [19:06] <TimToady> :: is then */
4969 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4970 op = CUTGROUP;
4971 RExC_seen |= REG_SEEN_CUTGROUP;
4972 }
e2e6a0f1
YO
4973 break;
4974 }
4975 if ( ! op ) {
4976 RExC_parse++;
4977 vFAIL3("Unknown verb pattern '%.*s'",
4978 verb_len, start_verb);
4979 }
4980 if ( argok ) {
4981 if ( start_arg && internal_argval ) {
4982 vFAIL3("Verb pattern '%.*s' may not have an argument",
4983 verb_len, start_verb);
4984 } else if ( argok < 0 && !start_arg ) {
4985 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4986 verb_len, start_verb);
4987 } else {
4988 ret = reganode(pRExC_state, op, internal_argval);
4989 if ( ! internal_argval && ! SIZE_ONLY ) {
4990 if (start_arg) {
4991 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4992 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 4993 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
4994 ret->flags = 0;
4995 } else {
4996 ret->flags = 1;
4997 }
4998 }
4999 }
5000 if (!internal_argval)
5001 RExC_seen |= REG_SEEN_VERBARG;
5002 } else if ( start_arg ) {
5003 vFAIL3("Verb pattern '%.*s' may not have an argument",
5004 verb_len, start_verb);
5005 } else {
5006 ret = reg_node(pRExC_state, op);
5007 }
5008 nextchar(pRExC_state);
5009 return ret;
5010 } else
fac92740 5011 if (*RExC_parse == '?') { /* (?...) */
6136c704 5012 bool is_logical = 0;
a28509cc 5013 const char * const seqstart = RExC_parse;
ca9dfc88 5014
830247a4
IZ
5015 RExC_parse++;
5016 paren = *RExC_parse++;
c277df42 5017 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5018 switch (paren) {
894be9b7 5019
1f1031fe
YO
5020 case 'P': /* (?P...) variants for those used to PCRE/Python */
5021 paren = *RExC_parse++;
5022 if ( paren == '<') /* (?P<...>) named capture */
5023 goto named_capture;
5024 else if (paren == '>') { /* (?P>name) named recursion */
5025 goto named_recursion;
5026 }
5027 else if (paren == '=') { /* (?P=...) named backref */
5028 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5029 you change this make sure you change that */
5030 char* name_start = RExC_parse;
5031 U32 num = 0;
5032 SV *sv_dat = reg_scan_name(pRExC_state,
5033 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5034 if (RExC_parse == name_start || *RExC_parse != ')')
5035 vFAIL2("Sequence %.3s... not terminated",parse_start);
5036
5037 if (!SIZE_ONLY) {
5038 num = add_data( pRExC_state, 1, "S" );
5039 RExC_rxi->data->data[num]=(void*)sv_dat;
5040 SvREFCNT_inc(sv_dat);
5041 }
5042 RExC_sawback = 1;
5043 ret = reganode(pRExC_state,
5044 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5045 num);
5046 *flagp |= HASWIDTH;
5047
5048 Set_Node_Offset(ret, parse_start+1);
5049 Set_Node_Cur_Length(ret); /* MJD */
5050
5051 nextchar(pRExC_state);
5052 return ret;
5053 }
5054 goto unknown;
fac92740 5055 case '<': /* (?<...) */
b81d288d 5056 if (*RExC_parse == '!')
c277df42 5057 paren = ',';
0a4db386 5058 else if (*RExC_parse != '=')
1f1031fe 5059 named_capture:
0a4db386 5060 { /* (?<...>) */
81714fb9 5061 char *name_start;
894be9b7 5062 SV *svname;
81714fb9
YO
5063 paren= '>';
5064 case '\'': /* (?'...') */
5065 name_start= RExC_parse;
0a4db386
YO
5066 svname = reg_scan_name(pRExC_state,
5067 SIZE_ONLY ? /* reverse test from the others */
5068 REG_RSN_RETURN_NAME :
5069 REG_RSN_RETURN_NULL);
81714fb9
YO
5070 if (RExC_parse == name_start)
5071 goto unknown;
5072 if (*RExC_parse != paren)
5073 vFAIL2("Sequence (?%c... not terminated",
5074 paren=='>' ? '<' : paren);
5075 if (SIZE_ONLY) {
e62cc96a
YO
5076 HE *he_str;
5077 SV *sv_dat = NULL;
894be9b7
YO
5078 if (!svname) /* shouldnt happen */
5079 Perl_croak(aTHX_
5080 "panic: reg_scan_name returned NULL");
81714fb9
YO
5081 if (!RExC_paren_names) {
5082 RExC_paren_names= newHV();
5083 sv_2mortal((SV*)RExC_paren_names);
1f1031fe
YO
5084#ifdef DEBUGGING
5085 RExC_paren_name_list= newAV();
5086 sv_2mortal((SV*)RExC_paren_name_list);
5087#endif
81714fb9
YO
5088 }
5089 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5090 if ( he_str )
81714fb9 5091 sv_dat = HeVAL(he_str);
e62cc96a 5092 if ( ! sv_dat ) {
81714fb9 5093 /* croak baby croak */
e62cc96a
YO
5094 Perl_croak(aTHX_
5095 "panic: paren_name hash element allocation failed");
5096 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
5097 IV count=SvIV(sv_dat);
5098 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
5099 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
5100 pv[count]=RExC_npar;
5101 SvIVX(sv_dat)++;
5102 } else {
5103 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5104 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5105 SvIOK_on(sv_dat);
5106 SvIVX(sv_dat)= 1;
e62cc96a 5107 }
1f1031fe
YO
5108#ifdef DEBUGGING
5109 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5110 SvREFCNT_dec(svname);
5111#endif
e62cc96a 5112
81714fb9
YO
5113 /*sv_dump(sv_dat);*/
5114 }
5115 nextchar(pRExC_state);
5116 paren = 1;
5117 goto capturing_parens;
5118 }
5119 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5120 RExC_parse++;
fac92740
MJD
5121 case '=': /* (?=...) */
5122 case '!': /* (?!...) */
830247a4 5123 RExC_seen_zerolen++;
e2e6a0f1
YO
5124 if (*RExC_parse == ')') {
5125 ret=reg_node(pRExC_state, OPFAIL);
5126 nextchar(pRExC_state);
5127 return ret;
5128 }
fac92740
MJD
5129 case ':': /* (?:...) */
5130 case '>': /* (?>...) */
a0d0e21e 5131 break;
fac92740
MJD
5132 case '$': /* (?$...) */
5133 case '@': /* (?@...) */
8615cb43 5134 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5135 break;
fac92740 5136 case '#': /* (?#...) */
830247a4
IZ
5137 while (*RExC_parse && *RExC_parse != ')')
5138 RExC_parse++;
5139 if (*RExC_parse != ')')
c277df42 5140 FAIL("Sequence (?#... not terminated");
830247a4 5141 nextchar(pRExC_state);
a0d0e21e
LW
5142 *flagp = TRYAGAIN;
5143 return NULL;
894be9b7
YO
5144 case '0' : /* (?0) */
5145 case 'R' : /* (?R) */
5146 if (*RExC_parse != ')')
6bda09f9 5147 FAIL("Sequence (?R) not terminated");
1a147d38 5148 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
5149 nextchar(pRExC_state);
5150 return ret;
5151 /*notreached*/
894be9b7
YO
5152 { /* named and numeric backreferences */
5153 I32 num;
894be9b7
YO
5154 case '&': /* (?&NAME) */
5155 parse_start = RExC_parse - 1;
1f1031fe 5156 named_recursion:
894be9b7 5157 {
0a4db386
YO
5158 SV *sv_dat = reg_scan_name(pRExC_state,
5159 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5160 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5161 }
5162 goto gen_recurse_regop;
5163 /* NOT REACHED */
542fa716
YO
5164 case '+':
5165 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5166 RExC_parse++;
5167 vFAIL("Illegal pattern");
5168 }
5169 goto parse_recursion;
5170 /* NOT REACHED*/
5171 case '-': /* (?-1) */
5172 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5173 RExC_parse--; /* rewind to let it be handled later */
5174 goto parse_flags;
5175 }
5176 /*FALLTHROUGH */
6bda09f9
YO
5177 case '1': case '2': case '3': case '4': /* (?1) */
5178 case '5': case '6': case '7': case '8': case '9':
5179 RExC_parse--;
542fa716 5180 parse_recursion:
894be9b7
YO
5181 num = atoi(RExC_parse);
5182 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5183 if (*RExC_parse == '-')
5184 RExC_parse++;
6bda09f9
YO
5185 while (isDIGIT(*RExC_parse))
5186 RExC_parse++;
5187 if (*RExC_parse!=')')
5188 vFAIL("Expecting close bracket");
894be9b7
YO
5189
5190 gen_recurse_regop:
542fa716
YO
5191 if ( paren == '-' ) {
5192 /*
5193 Diagram of capture buffer numbering.
5194 Top line is the normal capture buffer numbers
5195 Botton line is the negative indexing as from
5196 the X (the (?-2))
5197
5198 + 1 2 3 4 5 X 6 7
5199 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5200 - 5 4 3 2 1 X x x
5201
5202 */
5203 num = RExC_npar + num;
5204 if (num < 1) {
5205 RExC_parse++;
5206 vFAIL("Reference to nonexistent group");
5207 }
5208 } else if ( paren == '+' ) {
5209 num = RExC_npar + num - 1;
5210 }
5211
1a147d38 5212 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5213 if (!SIZE_ONLY) {
5214 if (num > (I32)RExC_rx->nparens) {
5215 RExC_parse++;
5216 vFAIL("Reference to nonexistent group");
5217 }
40d049e4 5218 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5219 RExC_emit++;
226de585 5220 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5221 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5222 } else {
6bda09f9 5223 RExC_size++;
6bda09f9 5224 }
0a4db386 5225 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5226 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5227 Set_Node_Offset(ret, parse_start); /* MJD */
5228
6bda09f9
YO
5229 nextchar(pRExC_state);
5230 return ret;
894be9b7
YO
5231 } /* named and numeric backreferences */
5232 /* NOT REACHED */
5233
fac92740 5234 case 'p': /* (?p...) */
9014280d 5235 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5236 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5237 /* FALL THROUGH*/
fac92740 5238 case '?': /* (??...) */
6136c704 5239 is_logical = 1;
438a3801
YST
5240 if (*RExC_parse != '{')
5241 goto unknown;
830247a4 5242 paren = *RExC_parse++;
0f5d15d6 5243 /* FALL THROUGH */
fac92740 5244 case '{': /* (?{...}) */
c277df42 5245 {
2eccd3b2
NC
5246 I32 count = 1;
5247 U32 n = 0;
c277df42 5248 char c;
830247a4 5249 char *s = RExC_parse;
c277df42 5250
830247a4
IZ
5251 RExC_seen_zerolen++;
5252 RExC_seen |= REG_SEEN_EVAL;
5253 while (count && (c = *RExC_parse)) {
6136c704
AL
5254 if (c == '\\') {
5255 if (RExC_parse[1])
5256 RExC_parse++;
5257 }
b81d288d 5258 else if (c == '{')
c277df42 5259 count++;
b81d288d 5260 else if (c == '}')
c277df42 5261 count--;
830247a4 5262 RExC_parse++;
c277df42 5263 }
6136c704 5264 if (*RExC_parse != ')') {
b81d288d 5265 RExC_parse = s;
b45f050a
JF
5266 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5267 }
c277df42 5268 if (!SIZE_ONLY) {
f3548bdc 5269 PAD *pad;
6136c704
AL
5270 OP_4tree *sop, *rop;
5271 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5272
569233ed
SB
5273 ENTER;
5274 Perl_save_re_context(aTHX);
f3548bdc 5275 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5276 sop->op_private |= OPpREFCOUNTED;
5277 /* re_dup will OpREFCNT_inc */
5278 OpREFCNT_set(sop, 1);
569233ed 5279 LEAVE;
c277df42 5280
830247a4 5281 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5282 RExC_rxi->data->data[n] = (void*)rop;
5283 RExC_rxi->data->data[n+1] = (void*)sop;
5284 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5285 SvREFCNT_dec(sv);
a0ed51b3 5286 }
e24b16f9 5287 else { /* First pass */
830247a4 5288 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5289 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5290 /* No compiled RE interpolated, has runtime
5291 components ===> unsafe. */
5292 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5293 if (PL_tainting && PL_tainted)
cc6b7395 5294 FAIL("Eval-group in insecure regular expression");
54df2634 5295#if PERL_VERSION > 8
923e4eb5 5296 if (IN_PERL_COMPILETIME)
b5c19bd7 5297 PL_cv_has_eval = 1;
54df2634 5298#endif
c277df42 5299 }
b5c19bd7 5300
830247a4 5301 nextchar(pRExC_state);
6136c704 5302 if (is_logical) {
830247a4 5303 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5304 if (!SIZE_ONLY)
5305 ret->flags = 2;
3dab1dad 5306 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5307 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5308 return ret;
5309 }
ccb2c380
MP
5310 ret = reganode(pRExC_state, EVAL, n);
5311 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5312 Set_Node_Offset(ret, parse_start);
5313 return ret;
c277df42 5314 }
fac92740 5315 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5316 {
0a4db386 5317 int is_define= 0;
fac92740 5318 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5319 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5320 || RExC_parse[1] == '<'
830247a4 5321 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5322 I32 flag;
5323
830247a4 5324 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5325 if (!SIZE_ONLY)
5326 ret->flags = 1;
3dab1dad 5327 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5328 goto insert_if;
b81d288d 5329 }
a0ed51b3 5330 }
0a4db386
YO
5331 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5332 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5333 {
5334 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5335 char *name_start= RExC_parse++;
2eccd3b2 5336 U32 num = 0;
0a4db386
YO
5337 SV *sv_dat=reg_scan_name(pRExC_state,
5338 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5339 if (RExC_parse == name_start || *RExC_parse != ch)
5340 vFAIL2("Sequence (?(%c... not terminated",
5341 (ch == '>' ? '<' : ch));
5342 RExC_parse++;
5343 if (!SIZE_ONLY) {
5344 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5345 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5346 SvREFCNT_inc(sv_dat);
5347 }
5348 ret = reganode(pRExC_state,NGROUPP,num);
5349 goto insert_if_check_paren;
5350 }
5351 else if (RExC_parse[0] == 'D' &&
5352 RExC_parse[1] == 'E' &&
5353 RExC_parse[2] == 'F' &&
5354 RExC_parse[3] == 'I' &&
5355 RExC_parse[4] == 'N' &&
5356 RExC_parse[5] == 'E')
5357 {
5358 ret = reganode(pRExC_state,DEFINEP,0);
5359 RExC_parse +=6 ;
5360 is_define = 1;
5361 goto insert_if_check_paren;
5362 }
5363 else if (RExC_parse[0] == 'R') {
5364 RExC_parse++;
5365 parno = 0;
5366 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5367 parno = atoi(RExC_parse++);
5368 while (isDIGIT(*RExC_parse))
5369 RExC_parse++;
5370 } else if (RExC_parse[0] == '&') {
5371 SV *sv_dat;
5372 RExC_parse++;
5373 sv_dat = reg_scan_name(pRExC_state,
5374 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5375 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5376 }
1a147d38 5377 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5378 goto insert_if_check_paren;
5379 }
830247a4 5380 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5381 /* (?(1)...) */
6136c704 5382 char c;
830247a4 5383 parno = atoi(RExC_parse++);
c277df42 5384
830247a4
IZ
5385 while (isDIGIT(*RExC_parse))
5386 RExC_parse++;
fac92740 5387 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5388
0a4db386 5389 insert_if_check_paren:
830247a4 5390 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5391 vFAIL("Switch condition not recognized");
c277df42 5392 insert_if:
3dab1dad
YO
5393 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5394 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5395 if (br == NULL)
830247a4 5396 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5397 else
3dab1dad 5398 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5399 c = *nextchar(pRExC_state);
d1b80229
IZ
5400 if (flags&HASWIDTH)
5401 *flagp |= HASWIDTH;
c277df42 5402 if (c == '|') {
0a4db386
YO
5403 if (is_define)
5404 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5405 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5406 regbranch(pRExC_state, &flags, 1,depth+1);
5407 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5408 if (flags&HASWIDTH)
5409 *flagp |= HASWIDTH;
830247a4 5410 c = *nextchar(pRExC_state);
a0ed51b3
LW
5411 }
5412 else
c277df42
IZ
5413 lastbr = NULL;
5414 if (c != ')')
8615cb43 5415 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5416 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5417 REGTAIL(pRExC_state, br, ender);
c277df42 5418 if (lastbr) {
3dab1dad
YO
5419 REGTAIL(pRExC_state, lastbr, ender);
5420 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5421 }
5422 else
3dab1dad 5423 REGTAIL(pRExC_state, ret, ender);
c277df42 5424 return ret;
a0ed51b3
LW
5425 }
5426 else {
830247a4 5427 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5428 }
5429 }
1b1626e4 5430 case 0:
830247a4 5431 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5432 vFAIL("Sequence (? incomplete");
1b1626e4 5433 break;
a0d0e21e 5434 default:
cde0cee5
YO
5435 --RExC_parse;
5436 parse_flags: /* (?i) */
5437 {
5438 U32 posflags = 0, negflags = 0;
5439 U32 *flagsp = &posflags;
5440
5441 while (*RExC_parse) {
5442 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
5443 /* (?g), (?gc) and (?o) are useless here
5444 and must be globally applied -- japhy */
cde0cee5
YO
5445 switch (*RExC_parse) {
5446 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5447 case 'o':
5448 case 'g':
9d1d55b5 5449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5450 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5451 if (! (wastedflags & wflagbit) ) {
5452 wastedflags |= wflagbit;
5453 vWARN5(
5454 RExC_parse + 1,
5455 "Useless (%s%c) - %suse /%c modifier",
5456 flagsp == &negflags ? "?-" : "?",
5457 *RExC_parse,
5458 flagsp == &negflags ? "don't " : "",
5459 *RExC_parse
5460 );
5461 }
5462 }
cde0cee5
YO
5463 break;
5464
5465 case 'c':
9d1d55b5 5466 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5467 if (! (wastedflags & WASTED_C) ) {
5468 wastedflags |= WASTED_GC;
9d1d55b5
JP
5469 vWARN3(
5470 RExC_parse + 1,
5471 "Useless (%sc) - %suse /gc modifier",
5472 flagsp == &negflags ? "?-" : "?",
5473 flagsp == &negflags ? "don't " : ""
5474 );
5475 }
5476 }
cde0cee5
YO
5477 break;
5478 case 'k':
5479 if (flagsp == &negflags) {
5480 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5481 vWARN(RExC_parse + 1,"Useless use of (?-k)");
5482 } else {
5483 *flagsp |= RXf_PMf_KEEPCOPY;
5484 }
5485 break;
5486 case '-':
5487 if (flagsp == &negflags)
5488 goto unknown;
5489 flagsp = &negflags;
5490 wastedflags = 0; /* reset so (?g-c) warns twice */
5491 break;
5492 case ':':
5493 paren = ':';
5494 /*FALLTHROUGH*/
5495 case ')':
5496 RExC_flags |= posflags;
5497 RExC_flags &= ~negflags;
5498 nextchar(pRExC_state);
5499 if (paren != ':') {
5500 *flagp = TRYAGAIN;
5501 return NULL;
5502 } else {
5503 ret = NULL;
5504 goto parse_rest;
5505 }
5506 /*NOTREACHED*/
5507 default:
5508 unknown:
5509 RExC_parse++;
5510 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5511 /*NOTREACHED*/
5512 }
830247a4 5513 ++RExC_parse;
48c036b1 5514 }
cde0cee5 5515 }} /* one for the default block, one for the switch */
a0d0e21e 5516 }
fac92740 5517 else { /* (...) */
81714fb9 5518 capturing_parens:
830247a4
IZ
5519 parno = RExC_npar;
5520 RExC_npar++;
e2e6a0f1 5521
830247a4 5522 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5523 if (!SIZE_ONLY ){
5524 if (!RExC_nestroot)
5525 RExC_nestroot = parno;
5526 if (RExC_seen & REG_SEEN_RECURSE) {
5527 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5528 "Setting open paren #%"IVdf" to %d\n",
5529 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5530 RExC_open_parens[parno-1]= ret;
5531 }
6bda09f9 5532 }
fac92740
MJD
5533 Set_Node_Length(ret, 1); /* MJD */
5534 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5535 is_open = 1;
a0d0e21e 5536 }
a0ed51b3 5537 }
fac92740 5538 else /* ! paren */
a0d0e21e 5539 ret = NULL;
cde0cee5
YO
5540
5541 parse_rest:
a0d0e21e 5542 /* Pick up the branches, linking them together. */
fac92740 5543 parse_start = RExC_parse; /* MJD */
3dab1dad 5544 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5545 /* branch_len = (paren != 0); */
2af232bd 5546
a0d0e21e
LW
5547 if (br == NULL)
5548 return(NULL);
830247a4
IZ
5549 if (*RExC_parse == '|') {
5550 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5551 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5552 }
fac92740 5553 else { /* MJD */
6bda09f9 5554 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5555 Set_Node_Length(br, paren != 0);
5556 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5557 }
c277df42
IZ
5558 have_branch = 1;
5559 if (SIZE_ONLY)
830247a4 5560 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5561 }
5562 else if (paren == ':') {
c277df42
IZ
5563 *flagp |= flags&SIMPLE;
5564 }
6136c704 5565 if (is_open) { /* Starts with OPEN. */
3dab1dad 5566 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5567 }
5568 else if (paren != '?') /* Not Conditional */
a0d0e21e 5569 ret = br;
32a0ca98 5570 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5571 lastbr = br;
830247a4
IZ
5572 while (*RExC_parse == '|') {
5573 if (!SIZE_ONLY && RExC_extralen) {
5574 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5575 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5576 }
5577 if (SIZE_ONLY)
830247a4
IZ
5578 RExC_extralen += 2; /* Account for LONGJMP. */
5579 nextchar(pRExC_state);
3dab1dad 5580 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5581
a687059c 5582 if (br == NULL)
a0d0e21e 5583 return(NULL);
3dab1dad 5584 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5585 lastbr = br;
821b33a5
IZ
5586 if (flags&HASWIDTH)
5587 *flagp |= HASWIDTH;
a687059c 5588 *flagp |= flags&SPSTART;
a0d0e21e
LW
5589 }
5590
c277df42
IZ
5591 if (have_branch || paren != ':') {
5592 /* Make a closing node, and hook it on the end. */
5593 switch (paren) {
5594 case ':':
830247a4 5595 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5596 break;
5597 case 1:
830247a4 5598 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5599 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5600 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5601 "Setting close paren #%"IVdf" to %d\n",
5602 (IV)parno, REG_NODE_NUM(ender)));
5603 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5604 if (RExC_nestroot == parno)
5605 RExC_nestroot = 0;
40d049e4 5606 }
fac92740
MJD
5607 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5608 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5609 break;
5610 case '<':
c277df42
IZ
5611 case ',':
5612 case '=':
5613 case '!':
c277df42 5614 *flagp &= ~HASWIDTH;
821b33a5
IZ
5615 /* FALL THROUGH */
5616 case '>':
830247a4 5617 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5618 break;
5619 case 0:
830247a4 5620 ender = reg_node(pRExC_state, END);
40d049e4
YO
5621 if (!SIZE_ONLY) {
5622 assert(!RExC_opend); /* there can only be one! */
5623 RExC_opend = ender;
5624 }
c277df42
IZ
5625 break;
5626 }
eaf3ca90 5627 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5628
9674d46a 5629 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5630 if (depth==1)
5631 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5632
c277df42 5633 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5634 for (br = ret; br; br = regnext(br)) {
5635 const U8 op = PL_regkind[OP(br)];
5636 if (op == BRANCH) {
07be1b83 5637 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5638 }
5639 else if (op == BRANCHJ) {
07be1b83 5640 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5641 }
c277df42
IZ
5642 }
5643 }
a0d0e21e 5644 }
c277df42
IZ
5645
5646 {
e1ec3a88
AL
5647 const char *p;
5648 static const char parens[] = "=!<,>";
c277df42
IZ
5649
5650 if (paren && (p = strchr(parens, paren))) {
eb160463 5651 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5652 int flag = (p - parens) > 1;
5653
5654 if (paren == '>')
5655 node = SUSPEND, flag = 0;
6bda09f9 5656 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5657 Set_Node_Cur_Length(ret);
5658 Set_Node_Offset(ret, parse_start + 1);
c277df42 5659 ret->flags = flag;
07be1b83 5660 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5661 }
a0d0e21e
LW
5662 }
5663
5664 /* Check for proper termination. */
ce3e6498 5665 if (paren) {
e2509266 5666 RExC_flags = oregflags;
830247a4
IZ
5667 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5668 RExC_parse = oregcomp_parse;
380a0633 5669 vFAIL("Unmatched (");
ce3e6498 5670 }
a0ed51b3 5671 }
830247a4
IZ
5672 else if (!paren && RExC_parse < RExC_end) {
5673 if (*RExC_parse == ')') {
5674 RExC_parse++;
380a0633 5675 vFAIL("Unmatched )");
a0ed51b3
LW
5676 }
5677 else
b45f050a 5678 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5679 /* NOTREACHED */
5680 }
a687059c 5681
a0d0e21e 5682 return(ret);
a687059c
LW
5683}
5684
5685/*
5686 - regbranch - one alternative of an | operator
5687 *
5688 * Implements the concatenation operator.
5689 */
76e3520e 5690STATIC regnode *
3dab1dad 5691S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5692{
97aff369 5693 dVAR;
c277df42
IZ
5694 register regnode *ret;
5695 register regnode *chain = NULL;
5696 register regnode *latest;
5697 I32 flags = 0, c = 0;
3dab1dad
YO
5698 GET_RE_DEBUG_FLAGS_DECL;
5699 DEBUG_PARSE("brnc");
b81d288d 5700 if (first)
c277df42
IZ
5701 ret = NULL;
5702 else {
b81d288d 5703 if (!SIZE_ONLY && RExC_extralen)
830247a4 5704 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5705 else {
830247a4 5706 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5707 Set_Node_Length(ret, 1);
5708 }
c277df42
IZ
5709 }
5710
b81d288d 5711 if (!first && SIZE_ONLY)
830247a4 5712 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5713
c277df42 5714 *flagp = WORST; /* Tentatively. */
a0d0e21e 5715
830247a4
IZ
5716 RExC_parse--;
5717 nextchar(pRExC_state);
5718 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5719 flags &= ~TRYAGAIN;
3dab1dad 5720 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5721 if (latest == NULL) {
5722 if (flags & TRYAGAIN)
5723 continue;
5724 return(NULL);
a0ed51b3
LW
5725 }
5726 else if (ret == NULL)
c277df42 5727 ret = latest;
a0d0e21e 5728 *flagp |= flags&HASWIDTH;
c277df42 5729 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5730 *flagp |= flags&SPSTART;
5731 else {
830247a4 5732 RExC_naughty++;
3dab1dad 5733 REGTAIL(pRExC_state, chain, latest);
a687059c 5734 }
a0d0e21e 5735 chain = latest;
c277df42
IZ
5736 c++;
5737 }
5738 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5739 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5740 if (ret == NULL)
5741 ret = chain;
5742 }
5743 if (c == 1) {
5744 *flagp |= flags&SIMPLE;
a0d0e21e 5745 }
a687059c 5746
d4c19fe8 5747 return ret;
a687059c
LW
5748}
5749
5750/*
5751 - regpiece - something followed by possible [*+?]
5752 *
5753 * Note that the branching code sequences used for ? and the general cases
5754 * of * and + are somewhat optimized: they use the same NOTHING node as
5755 * both the endmarker for their branch list and the body of the last branch.
5756 * It might seem that this node could be dispensed with entirely, but the
5757 * endmarker role is not redundant.
5758 */
76e3520e 5759STATIC regnode *
3dab1dad 5760S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5761{
97aff369 5762 dVAR;
c277df42 5763 register regnode *ret;
a0d0e21e
LW
5764 register char op;
5765 register char *next;
5766 I32 flags;
1df70142 5767 const char * const origparse = RExC_parse;
a0d0e21e 5768 I32 min;
c277df42 5769 I32 max = REG_INFTY;
fac92740 5770 char *parse_start;
10edeb5d 5771 const char *maxpos = NULL;
3dab1dad
YO
5772 GET_RE_DEBUG_FLAGS_DECL;
5773 DEBUG_PARSE("piec");
a0d0e21e 5774
3dab1dad 5775 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5776 if (ret == NULL) {
5777 if (flags & TRYAGAIN)
5778 *flagp |= TRYAGAIN;
5779 return(NULL);
5780 }
5781
830247a4 5782 op = *RExC_parse;
a0d0e21e 5783
830247a4 5784 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5785 maxpos = NULL;
fac92740 5786 parse_start = RExC_parse; /* MJD */
830247a4 5787 next = RExC_parse + 1;
a0d0e21e
LW
5788 while (isDIGIT(*next) || *next == ',') {
5789 if (*next == ',') {
5790 if (maxpos)
5791 break;
5792 else
5793 maxpos = next;
a687059c 5794 }
a0d0e21e
LW
5795 next++;
5796 }
5797 if (*next == '}') { /* got one */
5798 if (!maxpos)
5799 maxpos = next;
830247a4
IZ
5800 RExC_parse++;
5801 min = atoi(RExC_parse);
a0d0e21e
LW
5802 if (*maxpos == ',')
5803 maxpos++;
5804 else
830247a4 5805 maxpos = RExC_parse;
a0d0e21e
LW
5806 max = atoi(maxpos);
5807 if (!max && *maxpos != '0')
c277df42
IZ
5808 max = REG_INFTY; /* meaning "infinity" */
5809 else if (max >= REG_INFTY)
8615cb43 5810 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5811 RExC_parse = next;
5812 nextchar(pRExC_state);
a0d0e21e
LW
5813
5814 do_curly:
5815 if ((flags&SIMPLE)) {
830247a4 5816 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5817 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5818 Set_Node_Offset(ret, parse_start+1); /* MJD */
5819 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5820 }
5821 else {
3dab1dad 5822 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5823
5824 w->flags = 0;
3dab1dad 5825 REGTAIL(pRExC_state, ret, w);
830247a4 5826 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5827 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5828 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5829 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5830 }
6bda09f9 5831 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5832 /* MJD hk */
5833 Set_Node_Offset(ret, parse_start+1);
2af232bd 5834 Set_Node_Length(ret,
fac92740 5835 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5836
830247a4 5837 if (!SIZE_ONLY && RExC_extralen)
c277df42 5838 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5839 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5840 if (SIZE_ONLY)
830247a4
IZ
5841 RExC_whilem_seen++, RExC_extralen += 3;
5842 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5843 }
c277df42 5844 ret->flags = 0;
a0d0e21e
LW
5845
5846 if (min > 0)
821b33a5
IZ
5847 *flagp = WORST;
5848 if (max > 0)
5849 *flagp |= HASWIDTH;
a0d0e21e 5850 if (max && max < min)
8615cb43 5851 vFAIL("Can't do {n,m} with n > m");
c277df42 5852 if (!SIZE_ONLY) {
eb160463
GS
5853 ARG1_SET(ret, (U16)min);
5854 ARG2_SET(ret, (U16)max);
a687059c 5855 }
a687059c 5856
a0d0e21e 5857 goto nest_check;
a687059c 5858 }
a0d0e21e 5859 }
a687059c 5860
a0d0e21e
LW
5861 if (!ISMULT1(op)) {
5862 *flagp = flags;
a687059c 5863 return(ret);
a0d0e21e 5864 }
bb20fd44 5865
c277df42 5866#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5867
5868 /* if this is reinstated, don't forget to put this back into perldiag:
5869
5870 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5871
5872 (F) The part of the regexp subject to either the * or + quantifier
5873 could match an empty string. The {#} shows in the regular
5874 expression about where the problem was discovered.
5875
5876 */
5877
bb20fd44 5878 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5879 vFAIL("Regexp *+ operand could be empty");
b81d288d 5880#endif
bb20fd44 5881
fac92740 5882 parse_start = RExC_parse;
830247a4 5883 nextchar(pRExC_state);
a0d0e21e 5884
821b33a5 5885 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5886
5887 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5888 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5889 ret->flags = 0;
830247a4 5890 RExC_naughty += 4;
a0d0e21e
LW
5891 }
5892 else if (op == '*') {
5893 min = 0;
5894 goto do_curly;
a0ed51b3
LW
5895 }
5896 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5897 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5898 ret->flags = 0;
830247a4 5899 RExC_naughty += 3;
a0d0e21e
LW
5900 }
5901 else if (op == '+') {
5902 min = 1;
5903 goto do_curly;
a0ed51b3
LW
5904 }
5905 else if (op == '?') {
a0d0e21e
LW
5906 min = 0; max = 1;
5907 goto do_curly;
5908 }
5909 nest_check:
041457d9 5910 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5911 vWARN3(RExC_parse,
b45f050a 5912 "%.*s matches null string many times",
afd78fd5 5913 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5914 origparse);
a0d0e21e
LW
5915 }
5916
b9b4dddf 5917 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5918 nextchar(pRExC_state);
6bda09f9 5919 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5920 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5921 }
b9b4dddf
YO
5922#ifndef REG_ALLOW_MINMOD_SUSPEND
5923 else
5924#endif
5925 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5926 regnode *ender;
5927 nextchar(pRExC_state);
5928 ender = reg_node(pRExC_state, SUCCEED);
5929 REGTAIL(pRExC_state, ret, ender);
5930 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5931 ret->flags = 0;
5932 ender = reg_node(pRExC_state, TAIL);
5933 REGTAIL(pRExC_state, ret, ender);
5934 /*ret= ender;*/
5935 }
5936
5937 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5938 RExC_parse++;
b45f050a
JF
5939 vFAIL("Nested quantifiers");
5940 }
a0d0e21e
LW
5941
5942 return(ret);
a687059c
LW
5943}
5944
fc8cd66c
YO
5945
5946/* reg_namedseq(pRExC_state,UVp)
5947
5948 This is expected to be called by a parser routine that has
5949 recognized'\N' and needs to handle the rest. RExC_parse is
5950 expected to point at the first char following the N at the time
5951 of the call.
5952
5953 If valuep is non-null then it is assumed that we are parsing inside
5954 of a charclass definition and the first codepoint in the resolved
5955 string is returned via *valuep and the routine will return NULL.
5956 In this mode if a multichar string is returned from the charnames
5957 handler a warning will be issued, and only the first char in the
5958 sequence will be examined. If the string returned is zero length
5959 then the value of *valuep is undefined and NON-NULL will
5960 be returned to indicate failure. (This will NOT be a valid pointer
5961 to a regnode.)
5962
5963 If value is null then it is assumed that we are parsing normal text
5964 and inserts a new EXACT node into the program containing the resolved
5965 string and returns a pointer to the new node. If the string is
5966 zerolength a NOTHING node is emitted.
5967
5968 On success RExC_parse is set to the char following the endbrace.
5969 Parsing failures will generate a fatal errorvia vFAIL(...)
5970
5971 NOTE: We cache all results from the charnames handler locally in
5972 the RExC_charnames hash (created on first use) to prevent a charnames
5973 handler from playing silly-buggers and returning a short string and
5974 then a long string for a given pattern. Since the regexp program
5975 size is calculated during an initial parse this would result
5976 in a buffer overrun so we cache to prevent the charname result from
5977 changing during the course of the parse.
5978
5979 */
5980STATIC regnode *
5981S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5982{
5983 char * name; /* start of the content of the name */
5984 char * endbrace; /* endbrace following the name */
5985 SV *sv_str = NULL;
5986 SV *sv_name = NULL;
5987 STRLEN len; /* this has various purposes throughout the code */
5988 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5989 regnode *ret = NULL;
5990
5991 if (*RExC_parse != '{') {
5992 vFAIL("Missing braces on \\N{}");
5993 }
5994 name = RExC_parse+1;
5995 endbrace = strchr(RExC_parse, '}');
5996 if ( ! endbrace ) {
5997 RExC_parse++;
5998 vFAIL("Missing right brace on \\N{}");
5999 }
6000 RExC_parse = endbrace + 1;
6001
6002
6003 /* RExC_parse points at the beginning brace,
6004 endbrace points at the last */
6005 if ( name[0]=='U' && name[1]=='+' ) {
6006 /* its a "unicode hex" notation {U+89AB} */
6007 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6008 | PERL_SCAN_DISALLOW_PREFIX
6009 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6010 UV cp;
196f1508 6011 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6012 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6013 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6014 cp = 0xFFFD;
6015 }
6016 if (cp > 0xff)
6017 RExC_utf8 = 1;
6018 if ( valuep ) {
6019 *valuep = cp;
6020 return NULL;
6021 }
6022 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6023 } else {
6024 /* fetch the charnames handler for this scope */
6025 HV * const table = GvHV(PL_hintgv);
6026 SV **cvp= table ?
6027 hv_fetchs(table, "charnames", FALSE) :
6028 NULL;
6029 SV *cv= cvp ? *cvp : NULL;
6030 HE *he_str;
6031 int count;
6032 /* create an SV with the name as argument */
6033 sv_name = newSVpvn(name, endbrace - name);
6034
6035 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6036 vFAIL2("Constant(\\N{%s}) unknown: "
6037 "(possibly a missing \"use charnames ...\")",
6038 SvPVX(sv_name));
6039 }
6040 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6041 vFAIL2("Constant(\\N{%s}): "
6042 "$^H{charnames} is not defined",SvPVX(sv_name));
6043 }
6044
6045
6046
6047 if (!RExC_charnames) {
6048 /* make sure our cache is allocated */
6049 RExC_charnames = newHV();
6bda09f9 6050 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
6051 }
6052 /* see if we have looked this one up before */
6053 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6054 if ( he_str ) {
6055 sv_str = HeVAL(he_str);
6056 cached = 1;
6057 } else {
6058 dSP ;
6059
6060 ENTER ;
6061 SAVETMPS ;
6062 PUSHMARK(SP) ;
6063
6064 XPUSHs(sv_name);
6065
6066 PUTBACK ;
6067
6068 count= call_sv(cv, G_SCALAR);
6069
6070 if (count == 1) { /* XXXX is this right? dmq */
6071 sv_str = POPs;
6072 SvREFCNT_inc_simple_void(sv_str);
6073 }
6074
6075 SPAGAIN ;
6076 PUTBACK ;
6077 FREETMPS ;
6078 LEAVE ;
6079
6080 if ( !sv_str || !SvOK(sv_str) ) {
6081 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6082 "did not return a defined value",SvPVX(sv_name));
6083 }
6084 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6085 cached = 1;
6086 }
6087 }
6088 if (valuep) {
6089 char *p = SvPV(sv_str, len);
6090 if (len) {
6091 STRLEN numlen = 1;
6092 if ( SvUTF8(sv_str) ) {
196f1508 6093 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6094 if (*valuep > 0x7F)
6095 RExC_utf8 = 1;
6096 /* XXXX
6097 We have to turn on utf8 for high bit chars otherwise
6098 we get failures with
6099
6100 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6101 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6102
6103 This is different from what \x{} would do with the same
6104 codepoint, where the condition is > 0xFF.
6105 - dmq
6106 */
6107
6108
6109 } else {
6110 *valuep = (UV)*p;
6111 /* warn if we havent used the whole string? */
6112 }
6113 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6114 vWARN2(RExC_parse,
6115 "Ignoring excess chars from \\N{%s} in character class",
6116 SvPVX(sv_name)
6117 );
6118 }
6119 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6120 vWARN2(RExC_parse,
6121 "Ignoring zero length \\N{%s} in character class",
6122 SvPVX(sv_name)
6123 );
6124 }
6125 if (sv_name)
6126 SvREFCNT_dec(sv_name);
6127 if (!cached)
6128 SvREFCNT_dec(sv_str);
6129 return len ? NULL : (regnode *)&len;
6130 } else if(SvCUR(sv_str)) {
6131
6132 char *s;
6133 char *p, *pend;
6134 STRLEN charlen = 1;
6135 char * parse_start = name-3; /* needed for the offsets */
6136 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6137
6138 ret = reg_node(pRExC_state,
6139 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6140 s= STRING(ret);
6141
6142 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6143 sv_utf8_upgrade(sv_str);
6144 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6145 RExC_utf8= 1;
6146 }
6147
6148 p = SvPV(sv_str, len);
6149 pend = p + len;
6150 /* len is the length written, charlen is the size the char read */
6151 for ( len = 0; p < pend; p += charlen ) {
6152 if (UTF) {
196f1508 6153 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6154 if (FOLD) {
6155 STRLEN foldlen,numlen;
6156 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6157 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6158 /* Emit all the Unicode characters. */
6159
6160 for (foldbuf = tmpbuf;
6161 foldlen;
6162 foldlen -= numlen)
6163 {
6164 uvc = utf8_to_uvchr(foldbuf, &numlen);
6165 if (numlen > 0) {
6166 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6167 s += unilen;
6168 len += unilen;
6169 /* In EBCDIC the numlen
6170 * and unilen can differ. */
6171 foldbuf += numlen;
6172 if (numlen >= foldlen)
6173 break;
6174 }
6175 else
6176 break; /* "Can't happen." */
6177 }
6178 } else {
6179 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6180 if (unilen > 0) {
6181 s += unilen;
6182 len += unilen;
6183 }
6184 }
6185 } else {
6186 len++;
6187 REGC(*p, s++);
6188 }
6189 }
6190 if (SIZE_ONLY) {
6191 RExC_size += STR_SZ(len);
6192 } else {
6193 STR_LEN(ret) = len;
6194 RExC_emit += STR_SZ(len);
6195 }
6196 Set_Node_Cur_Length(ret); /* MJD */
6197 RExC_parse--;
6198 nextchar(pRExC_state);
6199 } else {
6200 ret = reg_node(pRExC_state,NOTHING);
6201 }
6202 if (!cached) {
6203 SvREFCNT_dec(sv_str);
6204 }
6205 if (sv_name) {
6206 SvREFCNT_dec(sv_name);
6207 }
6208 return ret;
6209
6210}
6211
6212
9e08bc66
TS
6213/*
6214 * reg_recode
6215 *
6216 * It returns the code point in utf8 for the value in *encp.
6217 * value: a code value in the source encoding
6218 * encp: a pointer to an Encode object
6219 *
6220 * If the result from Encode is not a single character,
6221 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6222 */
6223STATIC UV
6224S_reg_recode(pTHX_ const char value, SV **encp)
6225{
6226 STRLEN numlen = 1;
6227 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6228 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6229 : SvPVX(sv);
6230 const STRLEN newlen = SvCUR(sv);
6231 UV uv = UNICODE_REPLACEMENT;
6232
6233 if (newlen)
6234 uv = SvUTF8(sv)
6235 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6236 : *(U8*)s;
6237
6238 if (!newlen || numlen != newlen) {
6239 uv = UNICODE_REPLACEMENT;
6240 if (encp)
6241 *encp = NULL;
6242 }
6243 return uv;
6244}
6245
fc8cd66c 6246
a687059c
LW
6247/*
6248 - regatom - the lowest level
ee9b8eae
YO
6249
6250 Try to identify anything special at the start of the pattern. If there
6251 is, then handle it as required. This may involve generating a single regop,
6252 such as for an assertion; or it may involve recursing, such as to
6253 handle a () structure.
6254
6255 If the string doesn't start with something special then we gobble up
6256 as much literal text as we can.
6257
6258 Once we have been able to handle whatever type of thing started the
6259 sequence, we return.
6260
6261 Note: we have to be careful with escapes, as they can be both literal
6262 and special, and in the case of \10 and friends can either, depending
6263 on context. Specifically there are two seperate switches for handling
6264 escape sequences, with the one for handling literal escapes requiring
6265 a dummy entry for all of the special escapes that are actually handled
6266 by the other.
6267*/
6268
76e3520e 6269STATIC regnode *
3dab1dad 6270S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6271{
97aff369 6272 dVAR;
cbbf8932 6273 register regnode *ret = NULL;
a0d0e21e 6274 I32 flags;
45948336 6275 char *parse_start = RExC_parse;
3dab1dad
YO
6276 GET_RE_DEBUG_FLAGS_DECL;
6277 DEBUG_PARSE("atom");
a0d0e21e
LW
6278 *flagp = WORST; /* Tentatively. */
6279
ee9b8eae 6280
a0d0e21e 6281tryagain:
830247a4 6282 switch (*RExC_parse) {
a0d0e21e 6283 case '^':
830247a4
IZ
6284 RExC_seen_zerolen++;
6285 nextchar(pRExC_state);
bbe252da 6286 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6287 ret = reg_node(pRExC_state, MBOL);
bbe252da 6288 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6289 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6290 else
830247a4 6291 ret = reg_node(pRExC_state, BOL);
fac92740 6292 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6293 break;
6294 case '$':
830247a4 6295 nextchar(pRExC_state);
b81d288d 6296 if (*RExC_parse)
830247a4 6297 RExC_seen_zerolen++;
bbe252da 6298 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6299 ret = reg_node(pRExC_state, MEOL);
bbe252da 6300 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6301 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6302 else
830247a4 6303 ret = reg_node(pRExC_state, EOL);
fac92740 6304 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6305 break;
6306 case '.':
830247a4 6307 nextchar(pRExC_state);
bbe252da 6308 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6309 ret = reg_node(pRExC_state, SANY);
6310 else
6311 ret = reg_node(pRExC_state, REG_ANY);
6312 *flagp |= HASWIDTH|SIMPLE;
830247a4 6313 RExC_naughty++;
fac92740 6314 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6315 break;
6316 case '[':
b45f050a 6317 {
3dab1dad
YO
6318 char * const oregcomp_parse = ++RExC_parse;
6319 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6320 if (*RExC_parse != ']') {
6321 RExC_parse = oregcomp_parse;
b45f050a
JF
6322 vFAIL("Unmatched [");
6323 }
830247a4 6324 nextchar(pRExC_state);
a0d0e21e 6325 *flagp |= HASWIDTH|SIMPLE;
fac92740 6326 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6327 break;
b45f050a 6328 }
a0d0e21e 6329 case '(':
830247a4 6330 nextchar(pRExC_state);
3dab1dad 6331 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6332 if (ret == NULL) {
bf93d4cc 6333 if (flags & TRYAGAIN) {
830247a4 6334 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6335 /* Make parent create an empty node if needed. */
6336 *flagp |= TRYAGAIN;
6337 return(NULL);
6338 }
a0d0e21e 6339 goto tryagain;
bf93d4cc 6340 }
a0d0e21e
LW
6341 return(NULL);
6342 }
c277df42 6343 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6344 break;
6345 case '|':
6346 case ')':
6347 if (flags & TRYAGAIN) {
6348 *flagp |= TRYAGAIN;
6349 return NULL;
6350 }
b45f050a 6351 vFAIL("Internal urp");
a0d0e21e
LW
6352 /* Supposed to be caught earlier. */
6353 break;
85afd4ae 6354 case '{':
830247a4
IZ
6355 if (!regcurly(RExC_parse)) {
6356 RExC_parse++;
85afd4ae
CS
6357 goto defchar;
6358 }
6359 /* FALL THROUGH */
a0d0e21e
LW
6360 case '?':
6361 case '+':
6362 case '*':
830247a4 6363 RExC_parse++;
b45f050a 6364 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6365 break;
6366 case '\\':
ee9b8eae
YO
6367 /* Special Escapes
6368
6369 This switch handles escape sequences that resolve to some kind
6370 of special regop and not to literal text. Escape sequnces that
6371 resolve to literal text are handled below in the switch marked
6372 "Literal Escapes".
6373
6374 Every entry in this switch *must* have a corresponding entry
6375 in the literal escape switch. However, the opposite is not
6376 required, as the default for this switch is to jump to the
6377 literal text handling code.
6378 */
830247a4 6379 switch (*++RExC_parse) {
ee9b8eae 6380 /* Special Escapes */
a0d0e21e 6381 case 'A':
830247a4
IZ
6382 RExC_seen_zerolen++;
6383 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6384 *flagp |= SIMPLE;
ee9b8eae 6385 goto finish_meta_pat;
a0d0e21e 6386 case 'G':
830247a4
IZ
6387 ret = reg_node(pRExC_state, GPOS);
6388 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6389 *flagp |= SIMPLE;
ee9b8eae
YO
6390 goto finish_meta_pat;
6391 case 'K':
6392 RExC_seen_zerolen++;
6393 ret = reg_node(pRExC_state, KEEPS);
6394 *flagp |= SIMPLE;
6395 goto finish_meta_pat;
a0d0e21e 6396 case 'Z':
830247a4 6397 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6398 *flagp |= SIMPLE;
a1917ab9 6399 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6400 goto finish_meta_pat;
b85d18e9 6401 case 'z':
830247a4 6402 ret = reg_node(pRExC_state, EOS);
b85d18e9 6403 *flagp |= SIMPLE;
830247a4 6404 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6405 goto finish_meta_pat;
4a2d328f 6406 case 'C':
f33976b4
DB
6407 ret = reg_node(pRExC_state, CANY);
6408 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6409 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6410 goto finish_meta_pat;
a0ed51b3 6411 case 'X':
830247a4 6412 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6413 *flagp |= HASWIDTH;
ee9b8eae 6414 goto finish_meta_pat;
a0d0e21e 6415 case 'w':
eb160463 6416 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6417 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6418 goto finish_meta_pat;
a0d0e21e 6419 case 'W':
eb160463 6420 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6421 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6422 goto finish_meta_pat;
a0d0e21e 6423 case 'b':
830247a4
IZ
6424 RExC_seen_zerolen++;
6425 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6426 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6427 *flagp |= SIMPLE;
ee9b8eae 6428 goto finish_meta_pat;
a0d0e21e 6429 case 'B':
830247a4
IZ
6430 RExC_seen_zerolen++;
6431 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6432 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6433 *flagp |= SIMPLE;
ee9b8eae 6434 goto finish_meta_pat;
a0d0e21e 6435 case 's':
eb160463 6436 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6437 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6438 goto finish_meta_pat;
a0d0e21e 6439 case 'S':
eb160463 6440 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6441 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6442 goto finish_meta_pat;
a0d0e21e 6443 case 'd':
ffc61ed2 6444 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6445 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6446 goto finish_meta_pat;
a0d0e21e 6447 case 'D':
ffc61ed2 6448 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6449 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
6450 goto finish_meta_pat;
6451 case 'v':
6452 ret = reganode(pRExC_state, PRUNE, 0);
6453 ret->flags = 1;
6454 *flagp |= SIMPLE;
6455 goto finish_meta_pat;
6456 case 'V':
6457 ret = reganode(pRExC_state, SKIP, 0);
6458 ret->flags = 1;
6459 *flagp |= SIMPLE;
6460 finish_meta_pat:
830247a4 6461 nextchar(pRExC_state);
fac92740 6462 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 6463 break;
a14b48bc
LW
6464 case 'p':
6465 case 'P':
3568d838 6466 {
3dab1dad 6467 char* const oldregxend = RExC_end;
ccb2c380 6468 char* parse_start = RExC_parse - 2;
a14b48bc 6469
830247a4 6470 if (RExC_parse[1] == '{') {
3568d838 6471 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6472 RExC_end = strchr(RExC_parse, '}');
6473 if (!RExC_end) {
3dab1dad 6474 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6475 RExC_parse += 2;
6476 RExC_end = oldregxend;
0da60cf5 6477 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6478 }
830247a4 6479 RExC_end++;
a14b48bc 6480 }
af6f566e 6481 else {
830247a4 6482 RExC_end = RExC_parse + 2;
af6f566e
HS
6483 if (RExC_end > oldregxend)
6484 RExC_end = oldregxend;
6485 }
830247a4 6486 RExC_parse--;
a14b48bc 6487
3dab1dad 6488 ret = regclass(pRExC_state,depth+1);
a14b48bc 6489
830247a4
IZ
6490 RExC_end = oldregxend;
6491 RExC_parse--;
ccb2c380
MP
6492
6493 Set_Node_Offset(ret, parse_start + 2);
6494 Set_Node_Cur_Length(ret);
830247a4 6495 nextchar(pRExC_state);
a14b48bc
LW
6496 *flagp |= HASWIDTH|SIMPLE;
6497 }
6498 break;
fc8cd66c
YO
6499 case 'N':
6500 /* Handle \N{NAME} here and not below because it can be
6501 multicharacter. join_exact() will join them up later on.
6502 Also this makes sure that things like /\N{BLAH}+/ and
6503 \N{BLAH} being multi char Just Happen. dmq*/
6504 ++RExC_parse;
6505 ret= reg_namedseq(pRExC_state, NULL);
6506 break;
0a4db386 6507 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 6508 parse_named_seq:
81714fb9
YO
6509 {
6510 char ch= RExC_parse[1];
1f1031fe
YO
6511 if (ch != '<' && ch != '\'' && ch != '{') {
6512 RExC_parse++;
6513 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 6514 } else {
1f1031fe
YO
6515 /* this pretty much dupes the code for (?P=...) in reg(), if
6516 you change this make sure you change that */
81714fb9 6517 char* name_start = (RExC_parse += 2);
2eccd3b2 6518 U32 num = 0;
0a4db386
YO
6519 SV *sv_dat = reg_scan_name(pRExC_state,
6520 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 6521 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 6522 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
6523 vFAIL2("Sequence %.3s... not terminated",parse_start);
6524
6525 if (!SIZE_ONLY) {
6526 num = add_data( pRExC_state, 1, "S" );
6527 RExC_rxi->data->data[num]=(void*)sv_dat;
6528 SvREFCNT_inc(sv_dat);
6529 }
6530
81714fb9
YO
6531 RExC_sawback = 1;
6532 ret = reganode(pRExC_state,
6533 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6534 num);
6535 *flagp |= HASWIDTH;
1f1031fe 6536
81714fb9
YO
6537 /* override incorrect value set in reganode MJD */
6538 Set_Node_Offset(ret, parse_start+1);
6539 Set_Node_Cur_Length(ret); /* MJD */
6540 nextchar(pRExC_state);
1f1031fe 6541
81714fb9
YO
6542 }
6543 break;
1f1031fe 6544 }
2bf803e2 6545 case 'g':
a0d0e21e
LW
6546 case '1': case '2': case '3': case '4':
6547 case '5': case '6': case '7': case '8': case '9':
6548 {
c74340f9 6549 I32 num;
2bf803e2
YO
6550 bool isg = *RExC_parse == 'g';
6551 bool isrel = 0;
6552 bool hasbrace = 0;
6553 if (isg) {
c74340f9 6554 RExC_parse++;
2bf803e2
YO
6555 if (*RExC_parse == '{') {
6556 RExC_parse++;
6557 hasbrace = 1;
6558 }
6559 if (*RExC_parse == '-') {
6560 RExC_parse++;
6561 isrel = 1;
6562 }
1f1031fe
YO
6563 if (hasbrace && !isDIGIT(*RExC_parse)) {
6564 if (isrel) RExC_parse--;
6565 RExC_parse -= 2;
6566 goto parse_named_seq;
6567 } }
c74340f9
YO
6568 num = atoi(RExC_parse);
6569 if (isrel) {
5624f11d 6570 num = RExC_npar - num;
c74340f9
YO
6571 if (num < 1)
6572 vFAIL("Reference to nonexistent or unclosed group");
6573 }
2bf803e2 6574 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6575 goto defchar;
6576 else {
3dab1dad 6577 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6578 while (isDIGIT(*RExC_parse))
6579 RExC_parse++;
1f1031fe
YO
6580 if (parse_start == RExC_parse - 1)
6581 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
6582 if (hasbrace) {
6583 if (*RExC_parse != '}')
6584 vFAIL("Unterminated \\g{...} pattern");
6585 RExC_parse++;
6586 }
c74340f9
YO
6587 if (!SIZE_ONLY) {
6588 if (num > (I32)RExC_rx->nparens)
6589 vFAIL("Reference to nonexistent group");
c74340f9 6590 }
830247a4 6591 RExC_sawback = 1;
eb160463
GS
6592 ret = reganode(pRExC_state,
6593 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6594 num);
a0d0e21e 6595 *flagp |= HASWIDTH;
2af232bd 6596
fac92740 6597 /* override incorrect value set in reganode MJD */
2af232bd 6598 Set_Node_Offset(ret, parse_start+1);
fac92740 6599 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6600 RExC_parse--;
6601 nextchar(pRExC_state);
a0d0e21e
LW
6602 }
6603 }
6604 break;
6605 case '\0':
830247a4 6606 if (RExC_parse >= RExC_end)
b45f050a 6607 FAIL("Trailing \\");
a0d0e21e
LW
6608 /* FALL THROUGH */
6609 default:
a0288114 6610 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6611 back into the quick-grab loop below */
45948336 6612 parse_start--;
a0d0e21e
LW
6613 goto defchar;
6614 }
6615 break;
4633a7c4
LW
6616
6617 case '#':
bbe252da 6618 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6619 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6620 RExC_parse++;
830247a4 6621 if (RExC_parse < RExC_end)
4633a7c4
LW
6622 goto tryagain;
6623 }
6624 /* FALL THROUGH */
6625
a0d0e21e 6626 default: {
ba210ebe 6627 register STRLEN len;
58ae7d3f 6628 register UV ender;
a0d0e21e 6629 register char *p;
3dab1dad 6630 char *s;
80aecb99 6631 STRLEN foldlen;
89ebb4a3 6632 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6633
6634 parse_start = RExC_parse - 1;
a0d0e21e 6635
830247a4 6636 RExC_parse++;
a0d0e21e
LW
6637
6638 defchar:
58ae7d3f 6639 ender = 0;
eb160463
GS
6640 ret = reg_node(pRExC_state,
6641 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6642 s = STRING(ret);
830247a4
IZ
6643 for (len = 0, p = RExC_parse - 1;
6644 len < 127 && p < RExC_end;
a0d0e21e
LW
6645 len++)
6646 {
3dab1dad 6647 char * const oldp = p;
5b5a24f7 6648
bbe252da 6649 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6650 p = regwhite(p, RExC_end);
a0d0e21e
LW
6651 switch (*p) {
6652 case '^':
6653 case '$':
6654 case '.':
6655 case '[':
6656 case '(':
6657 case ')':
6658 case '|':
6659 goto loopdone;
6660 case '\\':
ee9b8eae
YO
6661 /* Literal Escapes Switch
6662
6663 This switch is meant to handle escape sequences that
6664 resolve to a literal character.
6665
6666 Every escape sequence that represents something
6667 else, like an assertion or a char class, is handled
6668 in the switch marked 'Special Escapes' above in this
6669 routine, but also has an entry here as anything that
6670 isn't explicitly mentioned here will be treated as
6671 an unescaped equivalent literal.
6672 */
6673
a0d0e21e 6674 switch (*++p) {
ee9b8eae
YO
6675 /* These are all the special escapes. */
6676 case 'A': /* Start assertion */
6677 case 'b': case 'B': /* Word-boundary assertion*/
6678 case 'C': /* Single char !DANGEROUS! */
6679 case 'd': case 'D': /* digit class */
6680 case 'g': case 'G': /* generic-backref, pos assertion */
6681 case 'k': case 'K': /* named backref, keep marker */
6682 case 'N': /* named char sequence */
6683 case 'p': case 'P': /* unicode property */
6684 case 's': case 'S': /* space class */
6685 case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
6686 case 'w': case 'W': /* word class */
6687 case 'X': /* eXtended Unicode "combining character sequence" */
6688 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
6689 --p;
6690 goto loopdone;
ee9b8eae
YO
6691
6692 /* Anything after here is an escape that resolves to a
6693 literal. (Except digits, which may or may not)
6694 */
a0d0e21e
LW
6695 case 'n':
6696 ender = '\n';
6697 p++;
a687059c 6698 break;
a0d0e21e
LW
6699 case 'r':
6700 ender = '\r';
6701 p++;
a687059c 6702 break;
a0d0e21e
LW
6703 case 't':
6704 ender = '\t';
6705 p++;
a687059c 6706 break;
a0d0e21e
LW
6707 case 'f':
6708 ender = '\f';
6709 p++;
a687059c 6710 break;
a0d0e21e 6711 case 'e':
c7f1f016 6712 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6713 p++;
a687059c 6714 break;
a0d0e21e 6715 case 'a':
c7f1f016 6716 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6717 p++;
a687059c 6718 break;
a0d0e21e 6719 case 'x':
a0ed51b3 6720 if (*++p == '{') {
1df70142 6721 char* const e = strchr(p, '}');
b81d288d 6722
b45f050a 6723 if (!e) {
830247a4 6724 RExC_parse = p + 1;
b45f050a
JF
6725 vFAIL("Missing right brace on \\x{}");
6726 }
de5f0749 6727 else {
a4c04bdc
NC
6728 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6729 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6730 STRLEN numlen = e - p - 1;
53305cf1 6731 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6732 if (ender > 0xff)
6733 RExC_utf8 = 1;
a0ed51b3
LW
6734 p = e + 1;
6735 }
a0ed51b3
LW
6736 }
6737 else {
a4c04bdc 6738 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6739 STRLEN numlen = 2;
53305cf1 6740 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6741 p += numlen;
6742 }
9e08bc66
TS
6743 if (PL_encoding && ender < 0x100)
6744 goto recode_encoding;
a687059c 6745 break;
a0d0e21e
LW
6746 case 'c':
6747 p++;
bbce6d69 6748 ender = UCHARAT(p++);
6749 ender = toCTRL(ender);
a687059c 6750 break;
a0d0e21e
LW
6751 case '0': case '1': case '2': case '3':case '4':
6752 case '5': case '6': case '7': case '8':case '9':
6753 if (*p == '0' ||
830247a4 6754 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6755 I32 flags = 0;
1df70142 6756 STRLEN numlen = 3;
53305cf1 6757 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6758 p += numlen;
6759 }
6760 else {
6761 --p;
6762 goto loopdone;
a687059c 6763 }
9e08bc66
TS
6764 if (PL_encoding && ender < 0x100)
6765 goto recode_encoding;
6766 break;
6767 recode_encoding:
6768 {
6769 SV* enc = PL_encoding;
6770 ender = reg_recode((const char)(U8)ender, &enc);
6771 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6772 vWARN(p, "Invalid escape in the specified encoding");
6773 RExC_utf8 = 1;
6774 }
a687059c 6775 break;
a0d0e21e 6776 case '\0':
830247a4 6777 if (p >= RExC_end)
b45f050a 6778 FAIL("Trailing \\");
a687059c 6779 /* FALL THROUGH */
a0d0e21e 6780 default:
041457d9 6781 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6782 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6783 goto normal_default;
a0d0e21e
LW
6784 }
6785 break;
a687059c 6786 default:
a0ed51b3 6787 normal_default:
fd400ab9 6788 if (UTF8_IS_START(*p) && UTF) {
1df70142 6789 STRLEN numlen;
5e12f4fb 6790 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6791 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6792 p += numlen;
6793 }
6794 else
6795 ender = *p++;
a0d0e21e 6796 break;
a687059c 6797 }
bbe252da 6798 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6799 p = regwhite(p, RExC_end);
60a8b682
JH
6800 if (UTF && FOLD) {
6801 /* Prime the casefolded buffer. */
ac7e0132 6802 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6803 }
a0d0e21e
LW
6804 if (ISMULT2(p)) { /* Back off on ?+*. */
6805 if (len)
6806 p = oldp;
16ea2a2e 6807 else if (UTF) {
80aecb99 6808 if (FOLD) {
60a8b682 6809 /* Emit all the Unicode characters. */
1df70142 6810 STRLEN numlen;
80aecb99
JH
6811 for (foldbuf = tmpbuf;
6812 foldlen;
6813 foldlen -= numlen) {
6814 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6815 if (numlen > 0) {
71207a34 6816 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6817 s += unilen;
6818 len += unilen;
6819 /* In EBCDIC the numlen
6820 * and unilen can differ. */
9dc45d57 6821 foldbuf += numlen;
47654450
JH
6822 if (numlen >= foldlen)
6823 break;
9dc45d57
JH
6824 }
6825 else
6826 break; /* "Can't happen." */
80aecb99
JH
6827 }
6828 }
6829 else {
71207a34 6830 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6831 if (unilen > 0) {
0ebc6274
JH
6832 s += unilen;
6833 len += unilen;
9dc45d57 6834 }
80aecb99 6835 }
a0ed51b3 6836 }
a0d0e21e
LW
6837 else {
6838 len++;
eb160463 6839 REGC((char)ender, s++);
a0d0e21e
LW
6840 }
6841 break;
a687059c 6842 }
16ea2a2e 6843 if (UTF) {
80aecb99 6844 if (FOLD) {
60a8b682 6845 /* Emit all the Unicode characters. */
1df70142 6846 STRLEN numlen;
80aecb99
JH
6847 for (foldbuf = tmpbuf;
6848 foldlen;
6849 foldlen -= numlen) {
6850 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6851 if (numlen > 0) {
71207a34 6852 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6853 len += unilen;
6854 s += unilen;
6855 /* In EBCDIC the numlen
6856 * and unilen can differ. */
9dc45d57 6857 foldbuf += numlen;
47654450
JH
6858 if (numlen >= foldlen)
6859 break;
9dc45d57
JH
6860 }
6861 else
6862 break;
80aecb99
JH
6863 }
6864 }
6865 else {
71207a34 6866 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6867 if (unilen > 0) {
0ebc6274
JH
6868 s += unilen;
6869 len += unilen;
9dc45d57 6870 }
80aecb99
JH
6871 }
6872 len--;
a0ed51b3
LW
6873 }
6874 else
eb160463 6875 REGC((char)ender, s++);
a0d0e21e
LW
6876 }
6877 loopdone:
830247a4 6878 RExC_parse = p - 1;
fac92740 6879 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6880 nextchar(pRExC_state);
793db0cb
JH
6881 {
6882 /* len is STRLEN which is unsigned, need to copy to signed */
6883 IV iv = len;
6884 if (iv < 0)
6885 vFAIL("Internal disaster");
6886 }
a0d0e21e
LW
6887 if (len > 0)
6888 *flagp |= HASWIDTH;
090f7165 6889 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6890 *flagp |= SIMPLE;
3dab1dad 6891
cd439c50 6892 if (SIZE_ONLY)
830247a4 6893 RExC_size += STR_SZ(len);
3dab1dad
YO
6894 else {
6895 STR_LEN(ret) = len;
830247a4 6896 RExC_emit += STR_SZ(len);
07be1b83 6897 }
3dab1dad 6898 }
a0d0e21e
LW
6899 break;
6900 }
a687059c 6901
a0d0e21e 6902 return(ret);
a687059c
LW
6903}
6904
873ef191 6905STATIC char *
5f66b61c 6906S_regwhite(char *p, const char *e)
5b5a24f7
CS
6907{
6908 while (p < e) {
6909 if (isSPACE(*p))
6910 ++p;
6911 else if (*p == '#') {
6912 do {
6913 p++;
6914 } while (p < e && *p != '\n');
6915 }
6916 else
6917 break;
6918 }
6919 return p;
6920}
6921
b8c5462f
JH
6922/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6923 Character classes ([:foo:]) can also be negated ([:^foo:]).
6924 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6925 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6926 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6927
6928#define POSIXCC_DONE(c) ((c) == ':')
6929#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6930#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6931
b8c5462f 6932STATIC I32
830247a4 6933S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6934{
97aff369 6935 dVAR;
936ed897 6936 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6937
830247a4 6938 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6939 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6940 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6941 const char c = UCHARAT(RExC_parse);
097eb12c 6942 char* const s = RExC_parse++;
b81d288d 6943
9a86a77b 6944 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6945 RExC_parse++;
6946 if (RExC_parse == RExC_end)
620e46c5 6947 /* Grandfather lone [:, [=, [. */
830247a4 6948 RExC_parse = s;
620e46c5 6949 else {
3dab1dad 6950 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6951 assert(*t == c);
6952
9a86a77b 6953 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6954 const char *posixcc = s + 1;
830247a4 6955 RExC_parse++; /* skip over the ending ] */
3dab1dad 6956
b8c5462f 6957 if (*s == ':') {
1df70142
AL
6958 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6959 const I32 skip = t - posixcc;
80916619
NC
6960
6961 /* Initially switch on the length of the name. */
6962 switch (skip) {
6963 case 4:
3dab1dad
YO
6964 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6965 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6966 break;
80916619
NC
6967 case 5:
6968 /* Names all of length 5. */
6969 /* alnum alpha ascii blank cntrl digit graph lower
6970 print punct space upper */
6971 /* Offset 4 gives the best switch position. */
6972 switch (posixcc[4]) {
6973 case 'a':
3dab1dad
YO
6974 if (memEQ(posixcc, "alph", 4)) /* alpha */
6975 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6976 break;
6977 case 'e':
3dab1dad
YO
6978 if (memEQ(posixcc, "spac", 4)) /* space */
6979 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6980 break;
6981 case 'h':
3dab1dad
YO
6982 if (memEQ(posixcc, "grap", 4)) /* graph */
6983 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6984 break;
6985 case 'i':
3dab1dad
YO
6986 if (memEQ(posixcc, "asci", 4)) /* ascii */
6987 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6988 break;
6989 case 'k':
3dab1dad
YO
6990 if (memEQ(posixcc, "blan", 4)) /* blank */
6991 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6992 break;
6993 case 'l':
3dab1dad
YO
6994 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6995 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6996 break;
6997 case 'm':
3dab1dad
YO
6998 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6999 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7000 break;
7001 case 'r':
3dab1dad
YO
7002 if (memEQ(posixcc, "lowe", 4)) /* lower */
7003 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7004 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7005 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7006 break;
7007 case 't':
3dab1dad
YO
7008 if (memEQ(posixcc, "digi", 4)) /* digit */
7009 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7010 else if (memEQ(posixcc, "prin", 4)) /* print */
7011 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7012 else if (memEQ(posixcc, "punc", 4)) /* punct */
7013 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7014 break;
b8c5462f
JH
7015 }
7016 break;
80916619 7017 case 6:
3dab1dad
YO
7018 if (memEQ(posixcc, "xdigit", 6))
7019 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7020 break;
7021 }
80916619
NC
7022
7023 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7024 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7025 t - s - 1, s + 1);
80916619
NC
7026 assert (posixcc[skip] == ':');
7027 assert (posixcc[skip+1] == ']');
b45f050a 7028 } else if (!SIZE_ONLY) {
b8c5462f 7029 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7030
830247a4 7031 /* adjust RExC_parse so the warning shows after
b45f050a 7032 the class closes */
9a86a77b 7033 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7034 RExC_parse++;
b45f050a
JF
7035 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7036 }
b8c5462f
JH
7037 } else {
7038 /* Maternal grandfather:
7039 * "[:" ending in ":" but not in ":]" */
830247a4 7040 RExC_parse = s;
767d463e 7041 }
620e46c5
JH
7042 }
7043 }
7044
b8c5462f
JH
7045 return namedclass;
7046}
7047
7048STATIC void
830247a4 7049S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7050{
97aff369 7051 dVAR;
3dab1dad 7052 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7053 const char *s = RExC_parse;
7054 const char c = *s++;
b8c5462f 7055
3dab1dad 7056 while (isALNUM(*s))
b8c5462f
JH
7057 s++;
7058 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7059 if (ckWARN(WARN_REGEXP))
7060 vWARN3(s+2,
7061 "POSIX syntax [%c %c] belongs inside character classes",
7062 c, c);
b45f050a
JF
7063
7064 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7065 if (POSIXCC_NOTYET(c)) {
830247a4 7066 /* adjust RExC_parse so the error shows after
b45f050a 7067 the class closes */
9a86a77b 7068 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7069 NOOP;
b45f050a
JF
7070 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7071 }
b8c5462f
JH
7072 }
7073 }
620e46c5
JH
7074}
7075
7f6f358c 7076
89836f1f
YO
7077#define _C_C_T_(NAME,TEST,WORD) \
7078ANYOF_##NAME: \
7079 if (LOC) \
7080 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7081 else { \
7082 for (value = 0; value < 256; value++) \
7083 if (TEST) \
7084 ANYOF_BITMAP_SET(ret, value); \
7085 } \
7086 yesno = '+'; \
7087 what = WORD; \
7088 break; \
7089case ANYOF_N##NAME: \
7090 if (LOC) \
7091 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7092 else { \
7093 for (value = 0; value < 256; value++) \
7094 if (!TEST) \
7095 ANYOF_BITMAP_SET(ret, value); \
7096 } \
7097 yesno = '!'; \
7098 what = WORD; \
7099 break
7100
7101
7f6f358c
YO
7102/*
7103 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7104 matches the pattern or if the pattern matches a single char only and
7105 that char is < 256 and we are case insensitive then we produce an
7106 EXACT node instead.
7f6f358c 7107*/
89836f1f 7108
76e3520e 7109STATIC regnode *
3dab1dad 7110S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7111{
97aff369 7112 dVAR;
9ef43ace 7113 register UV value = 0;
9a86a77b 7114 register UV nextvalue;
3568d838 7115 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7116 register IV range = 0;
c277df42 7117 register regnode *ret;
ba210ebe 7118 STRLEN numlen;
ffc61ed2 7119 IV namedclass;
cbbf8932 7120 char *rangebegin = NULL;
936ed897 7121 bool need_class = 0;
c445ea15 7122 SV *listsv = NULL;
ffc61ed2 7123 UV n;
9e55ce06 7124 bool optimize_invert = TRUE;
cbbf8932 7125 AV* unicode_alternate = NULL;
1b2d223b
JH
7126#ifdef EBCDIC
7127 UV literal_endpoint = 0;
7128#endif
7f6f358c 7129 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7130
3dab1dad 7131 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7132 case we need to change the emitted regop to an EXACT. */
07be1b83 7133 const char * orig_parse = RExC_parse;
72f13be8 7134 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
7135#ifndef DEBUGGING
7136 PERL_UNUSED_ARG(depth);
7137#endif
72f13be8 7138
3dab1dad 7139 DEBUG_PARSE("clas");
7f6f358c
YO
7140
7141 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7142 ret = reganode(pRExC_state, ANYOF, 0);
7143
7144 if (!SIZE_ONLY)
7145 ANYOF_FLAGS(ret) = 0;
7146
9a86a77b 7147 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7148 RExC_naughty++;
7149 RExC_parse++;
7150 if (!SIZE_ONLY)
7151 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7152 }
a0d0e21e 7153
73060fc4 7154 if (SIZE_ONLY) {
830247a4 7155 RExC_size += ANYOF_SKIP;
73060fc4
JH
7156 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7157 }
936ed897 7158 else {
830247a4 7159 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7160 if (FOLD)
7161 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7162 if (LOC)
7163 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7164 ANYOF_BITMAP_ZERO(ret);
396482e1 7165 listsv = newSVpvs("# comment\n");
a0d0e21e 7166 }
b8c5462f 7167
9a86a77b
JH
7168 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7169
b938889d 7170 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7171 checkposixcc(pRExC_state);
b8c5462f 7172
f064b6ad
HS
7173 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7174 if (UCHARAT(RExC_parse) == ']')
7175 goto charclassloop;
ffc61ed2 7176
fc8cd66c 7177parseit:
9a86a77b 7178 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7179
7180 charclassloop:
7181
7182 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7183
73b437c8 7184 if (!range)
830247a4 7185 rangebegin = RExC_parse;
ffc61ed2 7186 if (UTF) {
5e12f4fb 7187 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7188 RExC_end - RExC_parse,
9f7f3913 7189 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7190 RExC_parse += numlen;
7191 }
7192 else
7193 value = UCHARAT(RExC_parse++);
7f6f358c 7194
9a86a77b
JH
7195 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7196 if (value == '[' && POSIXCC(nextvalue))
830247a4 7197 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7198 else if (value == '\\') {
ffc61ed2 7199 if (UTF) {
5e12f4fb 7200 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7201 RExC_end - RExC_parse,
9f7f3913 7202 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7203 RExC_parse += numlen;
7204 }
7205 else
7206 value = UCHARAT(RExC_parse++);
470c3474 7207 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7208 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7209 * be a problem later if we want switch on Unicode.
7210 * A similar issue a little bit later when switching on
7211 * namedclass. --jhi */
ffc61ed2 7212 switch ((I32)value) {
b8c5462f
JH
7213 case 'w': namedclass = ANYOF_ALNUM; break;
7214 case 'W': namedclass = ANYOF_NALNUM; break;
7215 case 's': namedclass = ANYOF_SPACE; break;
7216 case 'S': namedclass = ANYOF_NSPACE; break;
7217 case 'd': namedclass = ANYOF_DIGIT; break;
7218 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
7219 case 'N': /* Handle \N{NAME} in class */
7220 {
7221 /* We only pay attention to the first char of
7222 multichar strings being returned. I kinda wonder
7223 if this makes sense as it does change the behaviour
7224 from earlier versions, OTOH that behaviour was broken
7225 as well. */
7226 UV v; /* value is register so we cant & it /grrr */
7227 if (reg_namedseq(pRExC_state, &v)) {
7228 goto parseit;
7229 }
7230 value= v;
7231 }
7232 break;
ffc61ed2
JH
7233 case 'p':
7234 case 'P':
3dab1dad
YO
7235 {
7236 char *e;
af6f566e 7237 if (RExC_parse >= RExC_end)
2a4859cd 7238 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7239 if (*RExC_parse == '{') {
1df70142 7240 const U8 c = (U8)value;
ffc61ed2
JH
7241 e = strchr(RExC_parse++, '}');
7242 if (!e)
0da60cf5 7243 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7244 while (isSPACE(UCHARAT(RExC_parse)))
7245 RExC_parse++;
7246 if (e == RExC_parse)
0da60cf5 7247 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7248 n = e - RExC_parse;
ab13f0c7
JH
7249 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7250 n--;
ffc61ed2
JH
7251 }
7252 else {
7253 e = RExC_parse;
7254 n = 1;
7255 }
7256 if (!SIZE_ONLY) {
ab13f0c7
JH
7257 if (UCHARAT(RExC_parse) == '^') {
7258 RExC_parse++;
7259 n--;
7260 value = value == 'p' ? 'P' : 'p'; /* toggle */
7261 while (isSPACE(UCHARAT(RExC_parse))) {
7262 RExC_parse++;
7263 n--;
7264 }
7265 }
097eb12c
AL
7266 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7267 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7268 }
7269 RExC_parse = e + 1;
7270 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7271 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7272 }
f81125e2 7273 break;
b8c5462f
JH
7274 case 'n': value = '\n'; break;
7275 case 'r': value = '\r'; break;
7276 case 't': value = '\t'; break;
7277 case 'f': value = '\f'; break;
7278 case 'b': value = '\b'; break;
c7f1f016
NIS
7279 case 'e': value = ASCII_TO_NATIVE('\033');break;
7280 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7281 case 'x':
ffc61ed2 7282 if (*RExC_parse == '{') {
a4c04bdc
NC
7283 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7284 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7285 char * const e = strchr(RExC_parse++, '}');
b81d288d 7286 if (!e)
ffc61ed2 7287 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7288
7289 numlen = e - RExC_parse;
7290 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7291 RExC_parse = e + 1;
7292 }
7293 else {
a4c04bdc 7294 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7295 numlen = 2;
7296 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7297 RExC_parse += numlen;
7298 }
9e08bc66
TS
7299 if (PL_encoding && value < 0x100)
7300 goto recode_encoding;
b8c5462f
JH
7301 break;
7302 case 'c':
830247a4 7303 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7304 value = toCTRL(value);
7305 break;
7306 case '0': case '1': case '2': case '3': case '4':
7307 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7308 {
7309 I32 flags = 0;
7310 numlen = 3;
7311 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7312 RExC_parse += numlen;
7313 if (PL_encoding && value < 0x100)
7314 goto recode_encoding;
7315 break;
7316 }
7317 recode_encoding:
7318 {
7319 SV* enc = PL_encoding;
7320 value = reg_recode((const char)(U8)value, &enc);
7321 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7322 vWARN(RExC_parse,
7323 "Invalid escape in the specified encoding");
7324 break;
7325 }
1028017a 7326 default:
041457d9 7327 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7328 vWARN2(RExC_parse,
7329 "Unrecognized escape \\%c in character class passed through",
7330 (int)value);
1028017a 7331 break;
b8c5462f 7332 }
ffc61ed2 7333 } /* end of \blah */
1b2d223b
JH
7334#ifdef EBCDIC
7335 else
7336 literal_endpoint++;
7337#endif
ffc61ed2
JH
7338
7339 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7340
7341 if (!SIZE_ONLY && !need_class)
936ed897 7342 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7343
936ed897 7344 need_class = 1;
ffc61ed2
JH
7345
7346 /* a bad range like a-\d, a-[:digit:] ? */
7347 if (range) {
73b437c8 7348 if (!SIZE_ONLY) {
afd78fd5 7349 if (ckWARN(WARN_REGEXP)) {
097eb12c 7350 const int w =
afd78fd5
JH
7351 RExC_parse >= rangebegin ?
7352 RExC_parse - rangebegin : 0;
830247a4 7353 vWARN4(RExC_parse,
b45f050a 7354 "False [] range \"%*.*s\"",
097eb12c 7355 w, w, rangebegin);
afd78fd5 7356 }
3568d838
JH
7357 if (prevvalue < 256) {
7358 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7359 ANYOF_BITMAP_SET(ret, '-');
7360 }
7361 else {
7362 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7363 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7364 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7365 }
b8c5462f 7366 }
ffc61ed2
JH
7367
7368 range = 0; /* this was not a true range */
73b437c8 7369 }
ffc61ed2 7370
89836f1f
YO
7371
7372
73b437c8 7373 if (!SIZE_ONLY) {
c49a72a9
NC
7374 const char *what = NULL;
7375 char yesno = 0;
7376
3568d838
JH
7377 if (namedclass > OOB_NAMEDCLASS)
7378 optimize_invert = FALSE;
e2962f66
JH
7379 /* Possible truncation here but in some 64-bit environments
7380 * the compiler gets heartburn about switch on 64-bit values.
7381 * A similar issue a little earlier when switching on value.
98f323fa 7382 * --jhi */
e2962f66 7383 switch ((I32)namedclass) {
89836f1f
YO
7384 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7385 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7386 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7387 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7388 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7389 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7390 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7391 case _C_C_T_(PRINT, isPRINT(value), "Print");
7392 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7393 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7394 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7395 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7396 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
73b437c8
JH
7397 case ANYOF_ASCII:
7398 if (LOC)
936ed897 7399 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7400 else {
c7f1f016 7401#ifndef EBCDIC
1ba5c669
JH
7402 for (value = 0; value < 128; value++)
7403 ANYOF_BITMAP_SET(ret, value);
7404#else /* EBCDIC */
ffbc6a93 7405 for (value = 0; value < 256; value++) {
3a3c4447
JH
7406 if (isASCII(value))
7407 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7408 }
1ba5c669 7409#endif /* EBCDIC */
73b437c8 7410 }
c49a72a9
NC
7411 yesno = '+';
7412 what = "ASCII";
73b437c8
JH
7413 break;
7414 case ANYOF_NASCII:
7415 if (LOC)
936ed897 7416 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7417 else {
c7f1f016 7418#ifndef EBCDIC
1ba5c669
JH
7419 for (value = 128; value < 256; value++)
7420 ANYOF_BITMAP_SET(ret, value);
7421#else /* EBCDIC */
ffbc6a93 7422 for (value = 0; value < 256; value++) {
3a3c4447
JH
7423 if (!isASCII(value))
7424 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7425 }
1ba5c669 7426#endif /* EBCDIC */
73b437c8 7427 }
c49a72a9
NC
7428 yesno = '!';
7429 what = "ASCII";
89836f1f 7430 break;
ffc61ed2
JH
7431 case ANYOF_DIGIT:
7432 if (LOC)
7433 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7434 else {
7435 /* consecutive digits assumed */
7436 for (value = '0'; value <= '9'; value++)
7437 ANYOF_BITMAP_SET(ret, value);
7438 }
c49a72a9
NC
7439 yesno = '+';
7440 what = "Digit";
ffc61ed2
JH
7441 break;
7442 case ANYOF_NDIGIT:
7443 if (LOC)
7444 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7445 else {
7446 /* consecutive digits assumed */
7447 for (value = 0; value < '0'; value++)
7448 ANYOF_BITMAP_SET(ret, value);
7449 for (value = '9' + 1; value < 256; value++)
7450 ANYOF_BITMAP_SET(ret, value);
7451 }
c49a72a9
NC
7452 yesno = '!';
7453 what = "Digit";
89836f1f 7454 break;
f81125e2
JP
7455 case ANYOF_MAX:
7456 /* this is to handle \p and \P */
7457 break;
73b437c8 7458 default:
b45f050a 7459 vFAIL("Invalid [::] class");
73b437c8 7460 break;
b8c5462f 7461 }
c49a72a9
NC
7462 if (what) {
7463 /* Strings such as "+utf8::isWord\n" */
7464 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7465 }
b8c5462f 7466 if (LOC)
936ed897 7467 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7468 continue;
a0d0e21e 7469 }
ffc61ed2
JH
7470 } /* end of namedclass \blah */
7471
a0d0e21e 7472 if (range) {
eb160463 7473 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7474 const int w = RExC_parse - rangebegin;
7475 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7476 range = 0; /* not a valid range */
73b437c8 7477 }
a0d0e21e
LW
7478 }
7479 else {
3568d838 7480 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7481 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7482 RExC_parse[1] != ']') {
7483 RExC_parse++;
ffc61ed2
JH
7484
7485 /* a bad range like \w-, [:word:]- ? */
7486 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7487 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7488 const int w =
afd78fd5
JH
7489 RExC_parse >= rangebegin ?
7490 RExC_parse - rangebegin : 0;
830247a4 7491 vWARN4(RExC_parse,
b45f050a 7492 "False [] range \"%*.*s\"",
097eb12c 7493 w, w, rangebegin);
afd78fd5 7494 }
73b437c8 7495 if (!SIZE_ONLY)
936ed897 7496 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7497 } else
ffc61ed2
JH
7498 range = 1; /* yeah, it's a range! */
7499 continue; /* but do it the next time */
a0d0e21e 7500 }
a687059c 7501 }
ffc61ed2 7502
93733859 7503 /* now is the next time */
07be1b83 7504 /*stored += (value - prevvalue + 1);*/
ae5c130c 7505 if (!SIZE_ONLY) {
3568d838 7506 if (prevvalue < 256) {
1df70142 7507 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7508 IV i;
3568d838 7509#ifdef EBCDIC
1b2d223b
JH
7510 /* In EBCDIC [\x89-\x91] should include
7511 * the \x8e but [i-j] should not. */
7512 if (literal_endpoint == 2 &&
7513 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7514 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7515 {
3568d838
JH
7516 if (isLOWER(prevvalue)) {
7517 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7518 if (isLOWER(i))
7519 ANYOF_BITMAP_SET(ret, i);
7520 } else {
3568d838 7521 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7522 if (isUPPER(i))
7523 ANYOF_BITMAP_SET(ret, i);
7524 }
8ada0baa 7525 }
ffc61ed2 7526 else
8ada0baa 7527#endif
07be1b83
YO
7528 for (i = prevvalue; i <= ceilvalue; i++) {
7529 if (!ANYOF_BITMAP_TEST(ret,i)) {
7530 stored++;
7531 ANYOF_BITMAP_SET(ret, i);
7532 }
7533 }
3568d838 7534 }
a5961de5 7535 if (value > 255 || UTF) {
1df70142
AL
7536 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7537 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7538 stored+=2; /* can't optimize this class */
ffc61ed2 7539 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7540 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7541 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7542 prevnatvalue, natvalue);
7543 }
7544 else if (prevnatvalue == natvalue) {
7545 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7546 if (FOLD) {
89ebb4a3 7547 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7548 STRLEN foldlen;
1df70142 7549 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7550
e294cc5d
JH
7551#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7552 if (RExC_precomp[0] == ':' &&
7553 RExC_precomp[1] == '[' &&
7554 (f == 0xDF || f == 0x92)) {
7555 f = NATIVE_TO_UNI(f);
7556 }
7557#endif
c840d2a2
JH
7558 /* If folding and foldable and a single
7559 * character, insert also the folded version
7560 * to the charclass. */
9e55ce06 7561 if (f != value) {
e294cc5d
JH
7562#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7563 if ((RExC_precomp[0] == ':' &&
7564 RExC_precomp[1] == '[' &&
7565 (f == 0xA2 &&
7566 (value == 0xFB05 || value == 0xFB06))) ?
7567 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7568 foldlen == (STRLEN)UNISKIP(f) )
7569#else
eb160463 7570 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7571#endif
9e55ce06
JH
7572 Perl_sv_catpvf(aTHX_ listsv,
7573 "%04"UVxf"\n", f);
7574 else {
7575 /* Any multicharacter foldings
7576 * require the following transform:
7577 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7578 * where E folds into "pq" and F folds
7579 * into "rst", all other characters
7580 * fold to single characters. We save
7581 * away these multicharacter foldings,
7582 * to be later saved as part of the
7583 * additional "s" data. */
7584 SV *sv;
7585
7586 if (!unicode_alternate)
7587 unicode_alternate = newAV();
7588 sv = newSVpvn((char*)foldbuf, foldlen);
7589 SvUTF8_on(sv);
7590 av_push(unicode_alternate, sv);
7591 }
7592 }
254ba52a 7593
60a8b682
JH
7594 /* If folding and the value is one of the Greek
7595 * sigmas insert a few more sigmas to make the
7596 * folding rules of the sigmas to work right.
7597 * Note that not all the possible combinations
7598 * are handled here: some of them are handled
9e55ce06
JH
7599 * by the standard folding rules, and some of
7600 * them (literal or EXACTF cases) are handled
7601 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7602 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7603 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7604 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7605 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7606 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7607 }
7608 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7609 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7610 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7611 }
7612 }
ffc61ed2 7613 }
1b2d223b
JH
7614#ifdef EBCDIC
7615 literal_endpoint = 0;
7616#endif
8ada0baa 7617 }
ffc61ed2
JH
7618
7619 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7620 }
ffc61ed2 7621
936ed897 7622 if (need_class) {
4f66b38d 7623 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7624 if (SIZE_ONLY)
830247a4 7625 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7626 else
830247a4 7627 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7628 }
ffc61ed2 7629
7f6f358c
YO
7630
7631 if (SIZE_ONLY)
7632 return ret;
7633 /****** !SIZE_ONLY AFTER HERE *********/
7634
7635 if( stored == 1 && value < 256
7636 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7637 ) {
7638 /* optimize single char class to an EXACT node
7639 but *only* when its not a UTF/high char */
07be1b83
YO
7640 const char * cur_parse= RExC_parse;
7641 RExC_emit = (regnode *)orig_emit;
7642 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7643 ret = reg_node(pRExC_state,
7644 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7645 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7646 *STRING(ret)= (char)value;
7647 STR_LEN(ret)= 1;
7648 RExC_emit += STR_SZ(1);
7649 return ret;
7650 }
ae5c130c 7651 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7652 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7653 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7654 ) {
a0ed51b3 7655 for (value = 0; value < 256; ++value) {
936ed897 7656 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7657 UV fold = PL_fold[value];
ffc61ed2
JH
7658
7659 if (fold != value)
7660 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7661 }
7662 }
936ed897 7663 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7664 }
ffc61ed2 7665
ae5c130c 7666 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7667 if (optimize_invert &&
ffc61ed2
JH
7668 /* If the only flag is inversion. */
7669 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7670 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7671 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7672 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7673 }
7f6f358c 7674 {
097eb12c 7675 AV * const av = newAV();
ffc61ed2 7676 SV *rv;
9e55ce06 7677 /* The 0th element stores the character class description
6a0407ee 7678 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7679 * to initialize the appropriate swash (which gets stored in
7680 * the 1st element), and also useful for dumping the regnode.
7681 * The 2nd element stores the multicharacter foldings,
6a0407ee 7682 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7683 av_store(av, 0, listsv);
7684 av_store(av, 1, NULL);
9e55ce06 7685 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7686 rv = newRV_noinc((SV*)av);
19860706 7687 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7688 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7689 ARG_SET(ret, n);
a0ed51b3 7690 }
a0ed51b3
LW
7691 return ret;
7692}
89836f1f
YO
7693#undef _C_C_T_
7694
a0ed51b3 7695
76e3520e 7696STATIC char*
830247a4 7697S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7698{
097eb12c 7699 char* const retval = RExC_parse++;
a0d0e21e 7700
4633a7c4 7701 for (;;) {
830247a4
IZ
7702 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7703 RExC_parse[2] == '#') {
e994fd66
AE
7704 while (*RExC_parse != ')') {
7705 if (RExC_parse == RExC_end)
7706 FAIL("Sequence (?#... not terminated");
830247a4 7707 RExC_parse++;
e994fd66 7708 }
830247a4 7709 RExC_parse++;
4633a7c4
LW
7710 continue;
7711 }
bbe252da 7712 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7713 if (isSPACE(*RExC_parse)) {
7714 RExC_parse++;
748a9306
LW
7715 continue;
7716 }
830247a4 7717 else if (*RExC_parse == '#') {
e994fd66
AE
7718 while (RExC_parse < RExC_end)
7719 if (*RExC_parse++ == '\n') break;
748a9306
LW
7720 continue;
7721 }
748a9306 7722 }
4633a7c4 7723 return retval;
a0d0e21e 7724 }
a687059c
LW
7725}
7726
7727/*
c277df42 7728- reg_node - emit a node
a0d0e21e 7729*/
76e3520e 7730STATIC regnode * /* Location. */
830247a4 7731S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7732{
97aff369 7733 dVAR;
c277df42 7734 register regnode *ptr;
504618e9 7735 regnode * const ret = RExC_emit;
07be1b83 7736 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7737
c277df42 7738 if (SIZE_ONLY) {
830247a4
IZ
7739 SIZE_ALIGN(RExC_size);
7740 RExC_size += 1;
a0d0e21e
LW
7741 return(ret);
7742 }
e2e6a0f1
YO
7743#ifdef DEBUGGING
7744 if (OP(RExC_emit) == 255)
7745 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7746 reg_name[op], OP(RExC_emit));
7747#endif
c277df42 7748 NODE_ALIGN_FILL(ret);
a0d0e21e 7749 ptr = ret;
c277df42 7750 FILL_ADVANCE_NODE(ptr, op);
fac92740 7751 if (RExC_offsets) { /* MJD */
07be1b83 7752 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7753 "reg_node", __LINE__,
7754 reg_name[op],
07be1b83
YO
7755 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7756 ? "Overwriting end of array!\n" : "OK",
7757 (UV)(RExC_emit - RExC_emit_start),
7758 (UV)(RExC_parse - RExC_start),
7759 (UV)RExC_offsets[0]));
ccb2c380 7760 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7761 }
07be1b83 7762
830247a4 7763 RExC_emit = ptr;
a0d0e21e 7764 return(ret);
a687059c
LW
7765}
7766
7767/*
a0d0e21e
LW
7768- reganode - emit a node with an argument
7769*/
76e3520e 7770STATIC regnode * /* Location. */
830247a4 7771S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7772{
97aff369 7773 dVAR;
c277df42 7774 register regnode *ptr;
504618e9 7775 regnode * const ret = RExC_emit;
07be1b83 7776 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7777
c277df42 7778 if (SIZE_ONLY) {
830247a4
IZ
7779 SIZE_ALIGN(RExC_size);
7780 RExC_size += 2;
6bda09f9
YO
7781 /*
7782 We can't do this:
7783
7784 assert(2==regarglen[op]+1);
7785
7786 Anything larger than this has to allocate the extra amount.
7787 If we changed this to be:
7788
7789 RExC_size += (1 + regarglen[op]);
7790
7791 then it wouldn't matter. Its not clear what side effect
7792 might come from that so its not done so far.
7793 -- dmq
7794 */
a0d0e21e
LW
7795 return(ret);
7796 }
e2e6a0f1
YO
7797#ifdef DEBUGGING
7798 if (OP(RExC_emit) == 255)
7799 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7800#endif
c277df42 7801 NODE_ALIGN_FILL(ret);
a0d0e21e 7802 ptr = ret;
c277df42 7803 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7804 if (RExC_offsets) { /* MJD */
07be1b83 7805 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7806 "reganode",
ccb2c380
MP
7807 __LINE__,
7808 reg_name[op],
07be1b83 7809 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7810 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7811 (UV)(RExC_emit - RExC_emit_start),
7812 (UV)(RExC_parse - RExC_start),
7813 (UV)RExC_offsets[0]));
ccb2c380 7814 Set_Cur_Node_Offset;
fac92740
MJD
7815 }
7816
830247a4 7817 RExC_emit = ptr;
a0d0e21e 7818 return(ret);
fe14fcc3
LW
7819}
7820
7821/*
cd439c50 7822- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7823*/
71207a34
AL
7824STATIC STRLEN
7825S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7826{
97aff369 7827 dVAR;
71207a34 7828 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7829}
7830
7831/*
a0d0e21e
LW
7832- reginsert - insert an operator in front of already-emitted operand
7833*
7834* Means relocating the operand.
7835*/
76e3520e 7836STATIC void
6bda09f9 7837S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7838{
97aff369 7839 dVAR;
c277df42
IZ
7840 register regnode *src;
7841 register regnode *dst;
7842 register regnode *place;
504618e9 7843 const int offset = regarglen[(U8)op];
6bda09f9 7844 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7845 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7846/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7847 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7848 if (SIZE_ONLY) {
6bda09f9 7849 RExC_size += size;
a0d0e21e
LW
7850 return;
7851 }
a687059c 7852
830247a4 7853 src = RExC_emit;
6bda09f9 7854 RExC_emit += size;
830247a4 7855 dst = RExC_emit;
40d049e4 7856 if (RExC_open_parens) {
6bda09f9 7857 int paren;
6d99fb9b 7858 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7859 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7860 if ( RExC_open_parens[paren] >= opnd ) {
7861 DEBUG_PARSE_FMT("open"," - %d",size);
7862 RExC_open_parens[paren] += size;
7863 } else {
7864 DEBUG_PARSE_FMT("open"," - %s","ok");
7865 }
7866 if ( RExC_close_parens[paren] >= opnd ) {
7867 DEBUG_PARSE_FMT("close"," - %d",size);
7868 RExC_close_parens[paren] += size;
7869 } else {
7870 DEBUG_PARSE_FMT("close"," - %s","ok");
7871 }
7872 }
6bda09f9 7873 }
40d049e4 7874
fac92740 7875 while (src > opnd) {
c277df42 7876 StructCopy(--src, --dst, regnode);
fac92740 7877 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7878 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7879 "reg_insert",
ccb2c380
MP
7880 __LINE__,
7881 reg_name[op],
07be1b83
YO
7882 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7883 ? "Overwriting end of array!\n" : "OK",
7884 (UV)(src - RExC_emit_start),
7885 (UV)(dst - RExC_emit_start),
7886 (UV)RExC_offsets[0]));
ccb2c380
MP
7887 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7888 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7889 }
7890 }
7891
a0d0e21e
LW
7892
7893 place = opnd; /* Op node, where operand used to be. */
fac92740 7894 if (RExC_offsets) { /* MJD */
07be1b83 7895 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7896 "reginsert",
ccb2c380
MP
7897 __LINE__,
7898 reg_name[op],
07be1b83 7899 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7900 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7901 (UV)(place - RExC_emit_start),
7902 (UV)(RExC_parse - RExC_start),
786e8c11 7903 (UV)RExC_offsets[0]));
ccb2c380 7904 Set_Node_Offset(place, RExC_parse);
45948336 7905 Set_Node_Length(place, 1);
fac92740 7906 }
c277df42
IZ
7907 src = NEXTOPER(place);
7908 FILL_ADVANCE_NODE(place, op);
7909 Zero(src, offset, regnode);
a687059c
LW
7910}
7911
7912/*
c277df42 7913- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7914- SEE ALSO: regtail_study
a0d0e21e 7915*/
097eb12c 7916/* TODO: All three parms should be const */
76e3520e 7917STATIC void
3dab1dad 7918S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7919{
97aff369 7920 dVAR;
c277df42 7921 register regnode *scan;
72f13be8 7922 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7923#ifndef DEBUGGING
7924 PERL_UNUSED_ARG(depth);
7925#endif
a0d0e21e 7926
c277df42 7927 if (SIZE_ONLY)
a0d0e21e
LW
7928 return;
7929
7930 /* Find last node. */
7931 scan = p;
7932 for (;;) {
504618e9 7933 regnode * const temp = regnext(scan);
3dab1dad
YO
7934 DEBUG_PARSE_r({
7935 SV * const mysv=sv_newmortal();
7936 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7937 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7938 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7939 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7940 (temp == NULL ? "->" : ""),
7941 (temp == NULL ? reg_name[OP(val)] : "")
7942 );
3dab1dad
YO
7943 });
7944 if (temp == NULL)
7945 break;
7946 scan = temp;
7947 }
7948
7949 if (reg_off_by_arg[OP(scan)]) {
7950 ARG_SET(scan, val - scan);
7951 }
7952 else {
7953 NEXT_OFF(scan) = val - scan;
7954 }
7955}
7956
07be1b83 7957#ifdef DEBUGGING
3dab1dad
YO
7958/*
7959- regtail_study - set the next-pointer at the end of a node chain of p to val.
7960- Look for optimizable sequences at the same time.
7961- currently only looks for EXACT chains.
07be1b83
YO
7962
7963This is expermental code. The idea is to use this routine to perform
7964in place optimizations on branches and groups as they are constructed,
7965with the long term intention of removing optimization from study_chunk so
7966that it is purely analytical.
7967
7968Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7969to control which is which.
7970
3dab1dad
YO
7971*/
7972/* TODO: All four parms should be const */
07be1b83 7973
3dab1dad
YO
7974STATIC U8
7975S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7976{
7977 dVAR;
7978 register regnode *scan;
07be1b83
YO
7979 U8 exact = PSEUDO;
7980#ifdef EXPERIMENTAL_INPLACESCAN
7981 I32 min = 0;
7982#endif
7983
3dab1dad
YO
7984 GET_RE_DEBUG_FLAGS_DECL;
7985
07be1b83 7986
3dab1dad
YO
7987 if (SIZE_ONLY)
7988 return exact;
7989
7990 /* Find last node. */
7991
7992 scan = p;
7993 for (;;) {
7994 regnode * const temp = regnext(scan);
07be1b83
YO
7995#ifdef EXPERIMENTAL_INPLACESCAN
7996 if (PL_regkind[OP(scan)] == EXACT)
7997 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7998 return EXACT;
7999#endif
3dab1dad
YO
8000 if ( exact ) {
8001 switch (OP(scan)) {
8002 case EXACT:
8003 case EXACTF:
8004 case EXACTFL:
8005 if( exact == PSEUDO )
8006 exact= OP(scan);
07be1b83
YO
8007 else if ( exact != OP(scan) )
8008 exact= 0;
3dab1dad
YO
8009 case NOTHING:
8010 break;
8011 default:
8012 exact= 0;
8013 }
8014 }
8015 DEBUG_PARSE_r({
8016 SV * const mysv=sv_newmortal();
8017 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8018 regprop(RExC_rx, mysv, scan);
eaf3ca90 8019 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8020 SvPV_nolen_const(mysv),
eaf3ca90
YO
8021 REG_NODE_NUM(scan),
8022 reg_name[exact]);
3dab1dad 8023 });
a0d0e21e
LW
8024 if (temp == NULL)
8025 break;
8026 scan = temp;
8027 }
07be1b83
YO
8028 DEBUG_PARSE_r({
8029 SV * const mysv_val=sv_newmortal();
8030 DEBUG_PARSE_MSG("");
8031 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8032 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8033 SvPV_nolen_const(mysv_val),
8034 (IV)REG_NODE_NUM(val),
8035 (IV)(val - scan)
07be1b83
YO
8036 );
8037 });
c277df42
IZ
8038 if (reg_off_by_arg[OP(scan)]) {
8039 ARG_SET(scan, val - scan);
a0ed51b3
LW
8040 }
8041 else {
c277df42
IZ
8042 NEXT_OFF(scan) = val - scan;
8043 }
3dab1dad
YO
8044
8045 return exact;
a687059c 8046}
07be1b83 8047#endif
a687059c
LW
8048
8049/*
a687059c
LW
8050 - regcurly - a little FSA that accepts {\d+,?\d*}
8051 */
79072805 8052STATIC I32
5f66b61c 8053S_regcurly(register const char *s)
a687059c
LW
8054{
8055 if (*s++ != '{')
8056 return FALSE;
f0fcb552 8057 if (!isDIGIT(*s))
a687059c 8058 return FALSE;
f0fcb552 8059 while (isDIGIT(*s))
a687059c
LW
8060 s++;
8061 if (*s == ',')
8062 s++;
f0fcb552 8063 while (isDIGIT(*s))
a687059c
LW
8064 s++;
8065 if (*s != '}')
8066 return FALSE;
8067 return TRUE;
8068}
8069
a687059c
LW
8070
8071/*
fd181c75 8072 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8073 */
8074void
097eb12c 8075Perl_regdump(pTHX_ const regexp *r)
a687059c 8076{
35ff7856 8077#ifdef DEBUGGING
97aff369 8078 dVAR;
c445ea15 8079 SV * const sv = sv_newmortal();
ab3bbdeb 8080 SV *dsv= sv_newmortal();
f8fc2ecf 8081 RXi_GET_DECL(r,ri);
a687059c 8082
f8fc2ecf 8083 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8084
8085 /* Header fields of interest. */
ab3bbdeb
YO
8086 if (r->anchored_substr) {
8087 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8088 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8089 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8090 "anchored %s%s at %"IVdf" ",
8091 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8092 (IV)r->anchored_offset);
ab3bbdeb
YO
8093 } else if (r->anchored_utf8) {
8094 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8095 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8096 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8097 "anchored utf8 %s%s at %"IVdf" ",
8098 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8099 (IV)r->anchored_offset);
ab3bbdeb
YO
8100 }
8101 if (r->float_substr) {
8102 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8103 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8104 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8105 "floating %s%s at %"IVdf"..%"UVuf" ",
8106 s, RE_SV_TAIL(r->float_substr),
7b0972df 8107 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8108 } else if (r->float_utf8) {
8109 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8110 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8111 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8112 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8113 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8114 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8115 }
33b8afdf 8116 if (r->check_substr || r->check_utf8)
b81d288d 8117 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8118 (const char *)
8119 (r->check_substr == r->float_substr
8120 && r->check_utf8 == r->float_utf8
8121 ? "(checking floating" : "(checking anchored"));
bbe252da 8122 if (r->extflags & RXf_NOSCAN)
c277df42 8123 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8124 if (r->extflags & RXf_CHECK_ALL)
c277df42 8125 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8126 if (r->check_substr || r->check_utf8)
c277df42
IZ
8127 PerlIO_printf(Perl_debug_log, ") ");
8128
f8fc2ecf
YO
8129 if (ri->regstclass) {
8130 regprop(r, sv, ri->regstclass);
1de06328 8131 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8132 }
bbe252da 8133 if (r->extflags & RXf_ANCH) {
774d564b 8134 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8135 if (r->extflags & RXf_ANCH_BOL)
774d564b 8136 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8137 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8138 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8139 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8140 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8141 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8142 PerlIO_printf(Perl_debug_log, "(GPOS)");
8143 PerlIO_putc(Perl_debug_log, ' ');
8144 }
bbe252da 8145 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8146 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8147 if (r->intflags & PREGf_SKIP)
760ac839 8148 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8149 if (r->intflags & PREGf_IMPLICIT)
760ac839 8150 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8151 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8152 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8153 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8154 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8155#else
96a5add6 8156 PERL_UNUSED_CONTEXT;
65e66c80 8157 PERL_UNUSED_ARG(r);
17c3b450 8158#endif /* DEBUGGING */
a687059c
LW
8159}
8160
8161/*
a0d0e21e
LW
8162- regprop - printable representation of opcode
8163*/
46fc3d4c 8164void
32fc9b6a 8165Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8166{
35ff7856 8167#ifdef DEBUGGING
97aff369 8168 dVAR;
9b155405 8169 register int k;
f8fc2ecf 8170 RXi_GET_DECL(prog,progi);
1de06328 8171 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8172
a0d0e21e 8173
54dc92de 8174 sv_setpvn(sv, "", 0);
8aa23a47 8175
03363afd 8176 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8177 /* It would be nice to FAIL() here, but this may be called from
8178 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8179 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8180 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8181
3dab1dad 8182 k = PL_regkind[OP(o)];
9b155405 8183
2a782b5b 8184 if (k == EXACT) {
396482e1 8185 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8186 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8187 * is a crude hack but it may be the best for now since
8188 * we have no flag "this EXACTish node was UTF-8"
8189 * --jhi */
8190 const char * const s =
ddc5bc0f 8191 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8192 PL_colors[0], PL_colors[1],
8193 PERL_PV_ESCAPE_UNI_DETECT |
8194 PERL_PV_PRETTY_ELIPSES |
8195 PERL_PV_PRETTY_LTGT
8196 );
8197 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8198 } else if (k == TRIE) {
3dab1dad 8199 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8200 * progi->data isn't available here */
1de06328 8201 const char op = OP(o);
647f639f 8202 const U32 n = ARG(o);
1de06328 8203 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8204 (reg_ac_data *)progi->data->data[n] :
1de06328 8205 NULL;
3251b653
NC
8206 const reg_trie_data * const trie
8207 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8208
8209 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8210 DEBUG_TRIE_COMPILE_r(
8211 Perl_sv_catpvf(aTHX_ sv,
8212 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8213 (UV)trie->startstate,
1e2e3d02 8214 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8215 (UV)trie->wordcount,
8216 (UV)trie->minlen,
8217 (UV)trie->maxlen,
8218 (UV)TRIE_CHARCOUNT(trie),
8219 (UV)trie->uniquecharcount
8220 )
8221 );
8222 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8223 int i;
8224 int rangestart = -1;
f46cb337 8225 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8226 Perl_sv_catpvf(aTHX_ sv, "[");
8227 for (i = 0; i <= 256; i++) {
8228 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8229 if (rangestart == -1)
8230 rangestart = i;
8231 } else if (rangestart != -1) {
8232 if (i <= rangestart + 3)
8233 for (; rangestart < i; rangestart++)
8234 put_byte(sv, rangestart);
8235 else {
8236 put_byte(sv, rangestart);
8237 sv_catpvs(sv, "-");
8238 put_byte(sv, i - 1);
8239 }
8240 rangestart = -1;
8241 }
8242 }
8243 Perl_sv_catpvf(aTHX_ sv, "]");
8244 }
8245
a3621e74 8246 } else if (k == CURLY) {
cb434fcc 8247 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8248 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8249 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8250 }
2c2d71f5
JH
8251 else if (k == WHILEM && o->flags) /* Ordinal/of */
8252 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 8253 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 8254 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1f1031fe 8255 if ( prog->paren_names ) {
ee9b8eae
YO
8256 if ( k != REF || OP(o) < NREF) {
8257 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8258 SV **name= av_fetch(list, ARG(o), 0 );
8259 if (name)
8260 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8261 }
8262 else {
8263 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8264 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8265 I32 *nums=(I32*)SvPVX(sv_dat);
8266 SV **name= av_fetch(list, nums[0], 0 );
8267 I32 n;
8268 if (name) {
8269 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8270 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8271 (n ? "," : ""), (IV)nums[n]);
8272 }
8273 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 8274 }
1f1031fe 8275 }
ee9b8eae 8276 }
1f1031fe 8277 } else if (k == GOSUB)
6bda09f9 8278 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8279 else if (k == VERB) {
8280 if (!o->flags)
8281 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
be2597df 8282 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
e2e6a0f1 8283 } else if (k == LOGICAL)
04ebc1ab 8284 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8285 else if (k == ANYOF) {
8286 int i, rangestart = -1;
2d03de9c 8287 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8288
8289 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8290 static const char * const anyofs[] = {
653099ff
GS
8291 "\\w",
8292 "\\W",
8293 "\\s",
8294 "\\S",
8295 "\\d",
8296 "\\D",
8297 "[:alnum:]",
8298 "[:^alnum:]",
8299 "[:alpha:]",
8300 "[:^alpha:]",
8301 "[:ascii:]",
8302 "[:^ascii:]",
8303 "[:ctrl:]",
8304 "[:^ctrl:]",
8305 "[:graph:]",
8306 "[:^graph:]",
8307 "[:lower:]",
8308 "[:^lower:]",
8309 "[:print:]",
8310 "[:^print:]",
8311 "[:punct:]",
8312 "[:^punct:]",
8313 "[:upper:]",
aaa51d5e 8314 "[:^upper:]",
653099ff 8315 "[:xdigit:]",
aaa51d5e
JF
8316 "[:^xdigit:]",
8317 "[:space:]",
8318 "[:^space:]",
8319 "[:blank:]",
8320 "[:^blank:]"
653099ff
GS
8321 };
8322
19860706 8323 if (flags & ANYOF_LOCALE)
396482e1 8324 sv_catpvs(sv, "{loc}");
19860706 8325 if (flags & ANYOF_FOLD)
396482e1 8326 sv_catpvs(sv, "{i}");
653099ff 8327 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8328 if (flags & ANYOF_INVERT)
396482e1 8329 sv_catpvs(sv, "^");
ffc61ed2
JH
8330 for (i = 0; i <= 256; i++) {
8331 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8332 if (rangestart == -1)
8333 rangestart = i;
8334 } else if (rangestart != -1) {
8335 if (i <= rangestart + 3)
8336 for (; rangestart < i; rangestart++)
653099ff 8337 put_byte(sv, rangestart);
ffc61ed2
JH
8338 else {
8339 put_byte(sv, rangestart);
396482e1 8340 sv_catpvs(sv, "-");
ffc61ed2 8341 put_byte(sv, i - 1);
653099ff 8342 }
ffc61ed2 8343 rangestart = -1;
653099ff 8344 }
847a199f 8345 }
ffc61ed2
JH
8346
8347 if (o->flags & ANYOF_CLASS)
bb7a0f54 8348 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8349 if (ANYOF_CLASS_TEST(o,i))
8350 sv_catpv(sv, anyofs[i]);
8351
8352 if (flags & ANYOF_UNICODE)
396482e1 8353 sv_catpvs(sv, "{unicode}");
1aa99e6b 8354 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8355 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8356
8357 {
8358 SV *lv;
32fc9b6a 8359 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8360
ffc61ed2
JH
8361 if (lv) {
8362 if (sw) {
89ebb4a3 8363 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8364
ffc61ed2 8365 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8366 uvchr_to_utf8(s, i);
ffc61ed2 8367
3568d838 8368 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8369 if (rangestart == -1)
8370 rangestart = i;
8371 } else if (rangestart != -1) {
ffc61ed2
JH
8372 if (i <= rangestart + 3)
8373 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8374 const U8 * const e = uvchr_to_utf8(s,rangestart);
8375 U8 *p;
8376 for(p = s; p < e; p++)
ffc61ed2
JH
8377 put_byte(sv, *p);
8378 }
8379 else {
2d03de9c
AL
8380 const U8 *e = uvchr_to_utf8(s,rangestart);
8381 U8 *p;
8382 for (p = s; p < e; p++)
ffc61ed2 8383 put_byte(sv, *p);
396482e1 8384 sv_catpvs(sv, "-");
2d03de9c
AL
8385 e = uvchr_to_utf8(s, i-1);
8386 for (p = s; p < e; p++)
1df70142 8387 put_byte(sv, *p);
ffc61ed2
JH
8388 }
8389 rangestart = -1;
8390 }
19860706 8391 }
ffc61ed2 8392
396482e1 8393 sv_catpvs(sv, "..."); /* et cetera */
19860706 8394 }
fde631ed 8395
ffc61ed2 8396 {
2e0de35c 8397 char *s = savesvpv(lv);
c445ea15 8398 char * const origs = s;
b81d288d 8399
3dab1dad
YO
8400 while (*s && *s != '\n')
8401 s++;
b81d288d 8402
ffc61ed2 8403 if (*s == '\n') {
2d03de9c 8404 const char * const t = ++s;
ffc61ed2
JH
8405
8406 while (*s) {
8407 if (*s == '\n')
8408 *s = ' ';
8409 s++;
8410 }
8411 if (s[-1] == ' ')
8412 s[-1] = 0;
8413
8414 sv_catpv(sv, t);
fde631ed 8415 }
b81d288d 8416
ffc61ed2 8417 Safefree(origs);
fde631ed
JH
8418 }
8419 }
653099ff 8420 }
ffc61ed2 8421
653099ff
GS
8422 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8423 }
9b155405 8424 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8425 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8426#else
96a5add6 8427 PERL_UNUSED_CONTEXT;
65e66c80
SP
8428 PERL_UNUSED_ARG(sv);
8429 PERL_UNUSED_ARG(o);
f9049ba1 8430 PERL_UNUSED_ARG(prog);
17c3b450 8431#endif /* DEBUGGING */
35ff7856 8432}
a687059c 8433
cad2e5aa
JH
8434SV *
8435Perl_re_intuit_string(pTHX_ regexp *prog)
8436{ /* Assume that RE_INTUIT is set */
97aff369 8437 dVAR;
a3621e74 8438 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8439 PERL_UNUSED_CONTEXT;
8440
a3621e74 8441 DEBUG_COMPILE_r(
cfd0369c 8442 {
2d03de9c 8443 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8444 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8445
8446 if (!PL_colorset) reginitcolors();
8447 PerlIO_printf(Perl_debug_log,
a0288114 8448 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8449 PL_colors[4],
8450 prog->check_substr ? "" : "utf8 ",
8451 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8452 s,
8453 PL_colors[1],
8454 (strlen(s) > 60 ? "..." : ""));
8455 } );
8456
33b8afdf 8457 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8458}
8459
84da74a7 8460/*
f8149455 8461 pregfree()
84da74a7 8462
f8149455
YO
8463 handles refcounting and freeing the perl core regexp structure. When
8464 it is necessary to actually free the structure the first thing it
8465 does is call the 'free' method of the regexp_engine associated to to
8466 the regexp, allowing the handling of the void *pprivate; member
8467 first. (This routine is not overridable by extensions, which is why
8468 the extensions free is called first.)
8469
8470 See regdupe and regdupe_internal if you change anything here.
84da74a7 8471*/
f8149455 8472#ifndef PERL_IN_XSUB_RE
2b69d0c2 8473void
864dbfa3 8474Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8475{
27da23d5 8476 dVAR;
fc32ee4a 8477 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8478
7821416a
IZ
8479 if (!r || (--r->refcnt > 0))
8480 return;
f8149455
YO
8481
8482 CALLREGFREE_PVT(r); /* free the private data */
8483
43c5f42d
NC
8484 /* gcov results gave these as non-null 100% of the time, so there's no
8485 optimisation in checking them before calling Safefree */
8486 Safefree(r->precomp);
ed252734 8487 RX_MATCH_COPY_FREE(r);
f8c7b90f 8488#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8489 if (r->saved_copy)
8490 SvREFCNT_dec(r->saved_copy);
8491#endif
a193d654
GS
8492 if (r->substrs) {
8493 if (r->anchored_substr)
8494 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8495 if (r->anchored_utf8)
8496 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8497 if (r->float_substr)
8498 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8499 if (r->float_utf8)
8500 SvREFCNT_dec(r->float_utf8);
2779dcf1 8501 Safefree(r->substrs);
a193d654 8502 }
81714fb9
YO
8503 if (r->paren_names)
8504 SvREFCNT_dec(r->paren_names);
f8149455
YO
8505
8506 Safefree(r->startp);
8507 Safefree(r->endp);
8508 Safefree(r);
8509}
8510#endif
8511
8512/* regfree_internal()
8513
8514 Free the private data in a regexp. This is overloadable by
8515 extensions. Perl takes care of the regexp structure in pregfree(),
8516 this covers the *pprivate pointer which technically perldoesnt
8517 know about, however of course we have to handle the
8518 regexp_internal structure when no extension is in use.
8519
8520 Note this is called before freeing anything in the regexp
8521 structure.
8522 */
8523
8524void
8525Perl_regfree_internal(pTHX_ struct regexp *r)
8526{
8527 dVAR;
8528 RXi_GET_DECL(r,ri);
8529 GET_RE_DEBUG_FLAGS_DECL;
8530
8531 DEBUG_COMPILE_r({
8532 if (!PL_colorset)
8533 reginitcolors();
8534 {
8535 SV *dsv= sv_newmortal();
8536 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8537 dsv, r->precomp, r->prelen, 60);
8538 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8539 PL_colors[4],PL_colors[5],s);
8540 }
8541 });
8542
8543 Safefree(ri->offsets); /* 20010421 MJD */
f8fc2ecf
YO
8544 if (ri->data) {
8545 int n = ri->data->count;
f3548bdc
DM
8546 PAD* new_comppad = NULL;
8547 PAD* old_comppad;
4026c95a 8548 PADOFFSET refcnt;
dfad63ad 8549
c277df42 8550 while (--n >= 0) {
261faec3 8551 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8552 switch (ri->data->what[n]) {
c277df42 8553 case 's':
81714fb9 8554 case 'S':
55eed653 8555 case 'u':
f8fc2ecf 8556 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8557 break;
653099ff 8558 case 'f':
f8fc2ecf 8559 Safefree(ri->data->data[n]);
653099ff 8560 break;
dfad63ad 8561 case 'p':
f8fc2ecf 8562 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8563 break;
c277df42 8564 case 'o':
dfad63ad 8565 if (new_comppad == NULL)
cea2e8a9 8566 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8567 PAD_SAVE_LOCAL(old_comppad,
8568 /* Watch out for global destruction's random ordering. */
c445ea15 8569 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8570 );
b34c0dd4 8571 OP_REFCNT_LOCK;
f8fc2ecf 8572 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8573 OP_REFCNT_UNLOCK;
8574 if (!refcnt)
f8fc2ecf 8575 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8576
f3548bdc 8577 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8578 SvREFCNT_dec((SV*)new_comppad);
8579 new_comppad = NULL;
c277df42
IZ
8580 break;
8581 case 'n':
9e55ce06 8582 break;
07be1b83 8583 case 'T':
be8e71aa
YO
8584 { /* Aho Corasick add-on structure for a trie node.
8585 Used in stclass optimization only */
07be1b83 8586 U32 refcount;
f8fc2ecf 8587 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8588 OP_REFCNT_LOCK;
8589 refcount = --aho->refcount;
8590 OP_REFCNT_UNLOCK;
8591 if ( !refcount ) {
446bd890
NC
8592 PerlMemShared_free(aho->states);
8593 PerlMemShared_free(aho->fail);
446bd890
NC
8594 /* do this last!!!! */
8595 PerlMemShared_free(ri->data->data[n]);
8596 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8597 }
8598 }
8599 break;
a3621e74 8600 case 't':
07be1b83 8601 {
be8e71aa 8602 /* trie structure. */
07be1b83 8603 U32 refcount;
f8fc2ecf 8604 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8605 OP_REFCNT_LOCK;
8606 refcount = --trie->refcount;
8607 OP_REFCNT_UNLOCK;
8608 if ( !refcount ) {
446bd890 8609 PerlMemShared_free(trie->charmap);
446bd890
NC
8610 PerlMemShared_free(trie->states);
8611 PerlMemShared_free(trie->trans);
07be1b83 8612 if (trie->bitmap)
446bd890 8613 PerlMemShared_free(trie->bitmap);
07be1b83 8614 if (trie->wordlen)
446bd890 8615 PerlMemShared_free(trie->wordlen);
786e8c11 8616 if (trie->jump)
446bd890 8617 PerlMemShared_free(trie->jump);
786e8c11 8618 if (trie->nextword)
446bd890 8619 PerlMemShared_free(trie->nextword);
446bd890
NC
8620 /* do this last!!!! */
8621 PerlMemShared_free(ri->data->data[n]);
a3621e74 8622 }
07be1b83
YO
8623 }
8624 break;
c277df42 8625 default:
f8fc2ecf 8626 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8627 }
8628 }
f8fc2ecf
YO
8629 Safefree(ri->data->what);
8630 Safefree(ri->data);
a0d0e21e 8631 }
f8fc2ecf
YO
8632 if (ri->swap) {
8633 Safefree(ri->swap->startp);
8634 Safefree(ri->swap->endp);
8635 Safefree(ri->swap);
c74340f9 8636 }
f8fc2ecf 8637 Safefree(ri);
a687059c 8638}
c277df42 8639
84da74a7
YO
8640#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8641#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8642#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8643#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8644
8645/*
8646 regdupe - duplicate a regexp.
8647
8648 This routine is called by sv.c's re_dup and is expected to clone a
8649 given regexp structure. It is a no-op when not under USE_ITHREADS.
8650 (Originally this *was* re_dup() for change history see sv.c)
8651
f8149455
YO
8652 After all of the core data stored in struct regexp is duplicated
8653 the regexp_engine.dupe method is used to copy any private data
8654 stored in the *pprivate pointer. This allows extensions to handle
8655 any duplication it needs to do.
8656
8657 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8658*/
a3c0e9ca 8659#if defined(USE_ITHREADS)
f8149455 8660#ifndef PERL_IN_XSUB_RE
84da74a7 8661regexp *
f8149455 8662Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8663{
84da74a7 8664 dVAR;
f8fc2ecf 8665 regexp *ret;
f8149455 8666 int i, npar;
84da74a7 8667 struct reg_substr_datum *s;
644c02aa 8668
84da74a7
YO
8669 if (!r)
8670 return (REGEXP *)NULL;
8671
8672 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8673 return ret;
8674
f8149455 8675
84da74a7 8676 npar = r->nparens+1;
f8fc2ecf 8677 Newxz(ret, 1, regexp);
84da74a7
YO
8678 Newx(ret->startp, npar, I32);
8679 Copy(r->startp, ret->startp, npar, I32);
8680 Newx(ret->endp, npar, I32);
f8149455 8681 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8682
c945c181 8683 if (r->substrs) {
785a26d5
YO
8684 Newx(ret->substrs, 1, struct reg_substr_data);
8685 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8686 s->min_offset = r->substrs->data[i].min_offset;
8687 s->max_offset = r->substrs->data[i].max_offset;
8688 s->end_shift = r->substrs->data[i].end_shift;
8689 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8690 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8691 }
8692 } else
8693 ret->substrs = NULL;
f8149455
YO
8694
8695 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8696 ret->refcnt = r->refcnt;
8697 ret->minlen = r->minlen;
8698 ret->minlenret = r->minlenret;
8699 ret->prelen = r->prelen;
8700 ret->nparens = r->nparens;
8701 ret->lastparen = r->lastparen;
8702 ret->lastcloseparen = r->lastcloseparen;
8703 ret->intflags = r->intflags;
8704 ret->extflags = r->extflags;
8705
8706 ret->sublen = r->sublen;
8707
8708 ret->engine = r->engine;
8709
8710 ret->paren_names = hv_dup_inc(r->paren_names, param);
8711
8712 if (RX_MATCH_COPIED(ret))
8713 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8714 else
8715 ret->subbeg = NULL;
8716#ifdef PERL_OLD_COPY_ON_WRITE
8717 ret->saved_copy = NULL;
8718#endif
8719
8720 ret->pprivate = r->pprivate;
785a26d5
YO
8721 if (ret->pprivate)
8722 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8723
8724 ptr_table_store(PL_ptr_table, r, ret);
8725 return ret;
8726}
8727#endif /* PERL_IN_XSUB_RE */
8728
8729/*
8730 regdupe_internal()
8731
8732 This is the internal complement to regdupe() which is used to copy
8733 the structure pointed to by the *pprivate pointer in the regexp.
8734 This is the core version of the extension overridable cloning hook.
8735 The regexp structure being duplicated will be copied by perl prior
8736 to this and will be provided as the regexp *r argument, however
8737 with the /old/ structures pprivate pointer value. Thus this routine
8738 may override any copying normally done by perl.
8739
8740 It returns a pointer to the new regexp_internal structure.
8741*/
8742
8743void *
8744Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8745{
8746 dVAR;
8747 regexp_internal *reti;
8748 int len, npar;
8749 RXi_GET_DECL(r,ri);
8750
8751 npar = r->nparens+1;
8752 len = ri->offsets[0];
8753
8754 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8755 Copy(ri->program, reti->program, len+1, regnode);
8756
8757 if(ri->swap) {
8758 Newx(reti->swap, 1, regexp_paren_ofs);
8759 /* no need to copy these */
8760 Newx(reti->swap->startp, npar, I32);
8761 Newx(reti->swap->endp, npar, I32);
8762 } else {
8763 reti->swap = NULL;
8764 }
8765
84da74a7 8766
f8fc2ecf
YO
8767 reti->regstclass = NULL;
8768 if (ri->data) {
84da74a7 8769 struct reg_data *d;
f8fc2ecf 8770 const int count = ri->data->count;
84da74a7
YO
8771 int i;
8772
8773 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8774 char, struct reg_data);
8775 Newx(d->what, count, U8);
8776
8777 d->count = count;
8778 for (i = 0; i < count; i++) {
f8fc2ecf 8779 d->what[i] = ri->data->what[i];
84da74a7 8780 switch (d->what[i]) {
55eed653 8781 /* legal options are one of: sSfpontTu
84da74a7
YO
8782 see also regcomp.h and pregfree() */
8783 case 's':
81714fb9 8784 case 'S':
0536c0a7 8785 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8786 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8787 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8788 break;
84da74a7
YO
8789 case 'f':
8790 /* This is cheating. */
8791 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8792 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8793 struct regnode_charclass_class);
f8fc2ecf 8794 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8795 break;
8796 case 'o':
bbe252da
YO
8797 /* Compiled op trees are readonly and in shared memory,
8798 and can thus be shared without duplication. */
84da74a7 8799 OP_REFCNT_LOCK;
f8fc2ecf 8800 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8801 OP_REFCNT_UNLOCK;
8802 break;
23eab42c
NC
8803 case 'T':
8804 /* Trie stclasses are readonly and can thus be shared
8805 * without duplication. We free the stclass in pregfree
8806 * when the corresponding reg_ac_data struct is freed.
8807 */
8808 reti->regstclass= ri->regstclass;
8809 /* Fall through */
84da74a7 8810 case 't':
84da74a7 8811 OP_REFCNT_LOCK;
0536c0a7 8812 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8813 OP_REFCNT_UNLOCK;
0536c0a7
NC
8814 /* Fall through */
8815 case 'n':
8816 d->data[i] = ri->data->data[i];
84da74a7 8817 break;
84da74a7 8818 default:
f8fc2ecf 8819 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8820 }
8821 }
8822
f8fc2ecf 8823 reti->data = d;
84da74a7
YO
8824 }
8825 else
f8fc2ecf 8826 reti->data = NULL;
84da74a7 8827
cde0cee5
YO
8828 reti->name_list_idx = ri->name_list_idx;
8829
f8fc2ecf
YO
8830 Newx(reti->offsets, 2*len+1, U32);
8831 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
81714fb9 8832
f8149455 8833 return (void*)reti;
84da74a7 8834}
f8149455
YO
8835
8836#endif /* USE_ITHREADS */
84da74a7 8837
de8c5301
YO
8838/*
8839 reg_stringify()
8840
8841 converts a regexp embedded in a MAGIC struct to its stringified form,
8842 caching the converted form in the struct and returns the cached
8843 string.
8844
8845 If lp is nonnull then it is used to return the length of the
8846 resulting string
8847
8848 If flags is nonnull and the returned string contains UTF8 then
f8149455 8849 (*flags & 1) will be true.
de8c5301
YO
8850
8851 If haseval is nonnull then it is used to return whether the pattern
8852 contains evals.
8853
8854 Normally called via macro:
8855
f8149455 8856 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
8857
8858 And internally with
8859
f8149455 8860 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
8861
8862 See sv_2pv_flags() in sv.c for an example of internal usage.
8863
8864 */
f8149455 8865#ifndef PERL_IN_XSUB_RE
de8c5301
YO
8866char *
8867Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8868 dVAR;
8869 const regexp * const re = (regexp *)mg->mg_obj;
f8149455 8870
de8c5301 8871 if (!mg->mg_ptr) {
a20207d7 8872 const char *fptr = STD_PAT_MODS; /*"msix"*/
cde0cee5 8873 char reflags[7];
de8c5301 8874 char ch;
cde0cee5
YO
8875 bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
8876 bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
8877 U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12);
8878 bool need_newline = 0;
8879 int left = 0;
8880 int right = 4 + hask;
8881 if (hask)
a20207d7 8882 reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/
de8c5301
YO
8883 while((ch = *fptr++)) {
8884 if(reganch & 1) {
8885 reflags[left++] = ch;
8886 }
8887 else {
8888 reflags[right--] = ch;
8889 }
8890 reganch >>= 1;
8891 }
cde0cee5 8892 if(hasm) {
de8c5301 8893 reflags[left] = '-';
cde0cee5 8894 left = 5 + hask;
de8c5301 8895 }
cde0cee5 8896 /* printf("[%*.7s]\n",left,reflags); */
de8c5301
YO
8897 mg->mg_len = re->prelen + 4 + left;
8898 /*
8899 * If /x was used, we have to worry about a regex ending with a
8900 * comment later being embedded within another regex. If so, we don't
8901 * want this regex's "commentization" to leak out to the right part of
8902 * the enclosing regex, we must cap it with a newline.
8903 *
8904 * So, if /x was used, we scan backwards from the end of the regex. If
8905 * we find a '#' before we find a newline, we need to add a newline
8906 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8907 * we don't need to add anything. -jfriedl
8908 */
bbe252da 8909 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8910 const char *endptr = re->precomp + re->prelen;
8911 while (endptr >= re->precomp) {
8912 const char c = *(endptr--);
8913 if (c == '\n')
8914 break; /* don't need another */
8915 if (c == '#') {
8916 /* we end while in a comment, so we need a newline */
8917 mg->mg_len++; /* save space for it */
8918 need_newline = 1; /* note to add it */
8919 break;
8920 }
8921 }
8922 }
8923
8924 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8925 mg->mg_ptr[0] = '(';
8926 mg->mg_ptr[1] = '?';
8927 Copy(reflags, mg->mg_ptr+2, left, char);
8928 *(mg->mg_ptr+left+2) = ':';
8929 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8930 if (need_newline)
8931 mg->mg_ptr[mg->mg_len - 2] = '\n';
8932 mg->mg_ptr[mg->mg_len - 1] = ')';
8933 mg->mg_ptr[mg->mg_len] = 0;
8934 }
8935 if (haseval)
f8149455 8936 *haseval = re->seen_evals;
de8c5301 8937 if (flags)
bbe252da 8938 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8939
8940 if (lp)
8941 *lp = mg->mg_len;
8942 return mg->mg_ptr;
8943}
8944
c277df42
IZ
8945/*
8946 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8947 */
8948regnode *
864dbfa3 8949Perl_regnext(pTHX_ register regnode *p)
c277df42 8950{
97aff369 8951 dVAR;
c277df42
IZ
8952 register I32 offset;
8953
f8fc2ecf 8954 if (!p)
c277df42
IZ
8955 return(NULL);
8956
8957 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8958 if (offset == 0)
8959 return(NULL);
8960
c277df42 8961 return(p+offset);
c277df42 8962}
76234dfb 8963#endif
c277df42 8964
01f988be 8965STATIC void
cea2e8a9 8966S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8967{
8968 va_list args;
8969 STRLEN l1 = strlen(pat1);
8970 STRLEN l2 = strlen(pat2);
8971 char buf[512];
06bf62c7 8972 SV *msv;
73d840c0 8973 const char *message;
c277df42
IZ
8974
8975 if (l1 > 510)
8976 l1 = 510;
8977 if (l1 + l2 > 510)
8978 l2 = 510 - l1;
8979 Copy(pat1, buf, l1 , char);
8980 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8981 buf[l1 + l2] = '\n';
8982 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8983#ifdef I_STDARG
8984 /* ANSI variant takes additional second argument */
c277df42 8985 va_start(args, pat2);
8736538c
AS
8986#else
8987 va_start(args);
8988#endif
5a844595 8989 msv = vmess(buf, &args);
c277df42 8990 va_end(args);
cfd0369c 8991 message = SvPV_const(msv,l1);
c277df42
IZ
8992 if (l1 > 512)
8993 l1 = 512;
8994 Copy(message, buf, l1 , char);
197cf9b9 8995 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8996 Perl_croak(aTHX_ "%s", buf);
c277df42 8997}
a0ed51b3
LW
8998
8999/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9000
76234dfb 9001#ifndef PERL_IN_XSUB_RE
a0ed51b3 9002void
864dbfa3 9003Perl_save_re_context(pTHX)
b81d288d 9004{
97aff369 9005 dVAR;
1ade1aa1
NC
9006
9007 struct re_save_state *state;
9008
9009 SAVEVPTR(PL_curcop);
9010 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9011
9012 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9013 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9014 SSPUSHINT(SAVEt_RE_STATE);
9015
46ab3289 9016 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9017
a0ed51b3 9018 PL_reg_start_tmp = 0;
a0ed51b3 9019 PL_reg_start_tmpl = 0;
c445ea15 9020 PL_reg_oldsaved = NULL;
a5db57d6 9021 PL_reg_oldsavedlen = 0;
a5db57d6 9022 PL_reg_maxiter = 0;
a5db57d6 9023 PL_reg_leftiter = 0;
c445ea15 9024 PL_reg_poscache = NULL;
a5db57d6 9025 PL_reg_poscache_size = 0;
1ade1aa1
NC
9026#ifdef PERL_OLD_COPY_ON_WRITE
9027 PL_nrs = NULL;
9028#endif
ada6e8a9 9029
c445ea15
AL
9030 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9031 if (PL_curpm) {
9032 const REGEXP * const rx = PM_GETRE(PL_curpm);
9033 if (rx) {
1df70142 9034 U32 i;
ada6e8a9 9035 for (i = 1; i <= rx->nparens; i++) {
1df70142 9036 char digits[TYPE_CHARS(long)];
d9fad198 9037 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9038 GV *const *const gvp
9039 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9040
b37c2d43
AL
9041 if (gvp) {
9042 GV * const gv = *gvp;
9043 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9044 save_scalar(gv);
49f27e4b 9045 }
ada6e8a9
AMS
9046 }
9047 }
9048 }
a0ed51b3 9049}
76234dfb 9050#endif
51371543 9051
51371543 9052static void
acfe0abc 9053clear_re(pTHX_ void *r)
51371543 9054{
97aff369 9055 dVAR;
51371543
GS
9056 ReREFCNT_dec((regexp *)r);
9057}
ffbc6a93 9058
a28509cc
AL
9059#ifdef DEBUGGING
9060
9061STATIC void
9062S_put_byte(pTHX_ SV *sv, int c)
9063{
9064 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9065 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9066 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9067 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9068 else
9069 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9070}
9071
786e8c11 9072
3dab1dad
YO
9073#define CLEAR_OPTSTART \
9074 if (optstart) STMT_START { \
70685ca0 9075 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9076 optstart=NULL; \
9077 } STMT_END
9078
786e8c11 9079#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9080
b5a2f8d8
NC
9081STATIC const regnode *
9082S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9083 const regnode *last, const regnode *plast,
9084 SV* sv, I32 indent, U32 depth)
a28509cc 9085{
97aff369 9086 dVAR;
786e8c11 9087 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9088 register const regnode *next;
3dab1dad 9089 const regnode *optstart= NULL;
1f1031fe 9090
f8fc2ecf 9091 RXi_GET_DECL(r,ri);
3dab1dad 9092 GET_RE_DEBUG_FLAGS_DECL;
1f1031fe 9093
786e8c11
YO
9094#ifdef DEBUG_DUMPUNTIL
9095 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9096 last ? last-start : 0,plast ? plast-start : 0);
9097#endif
9098
9099 if (plast && plast < last)
9100 last= plast;
9101
9102 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9103 /* While that wasn't END last time... */
a28509cc
AL
9104 NODE_ALIGN(node);
9105 op = OP(node);
de734bd5 9106 if (op == CLOSE || op == WHILEM)
786e8c11 9107 indent--;
b5a2f8d8 9108 next = regnext((regnode *)node);
1f1031fe 9109
a28509cc 9110 /* Where, what. */
8e11feef 9111 if (OP(node) == OPTIMIZED) {
e68ec53f 9112 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9113 optstart = node;
3dab1dad 9114 else
8e11feef 9115 goto after_print;
3dab1dad
YO
9116 } else
9117 CLEAR_OPTSTART;
1f1031fe 9118
32fc9b6a 9119 regprop(r, sv, node);
a28509cc 9120 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9121 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
9122
9123 if (OP(node) != OPTIMIZED) {
9124 if (next == NULL) /* Next ptr. */
9125 PerlIO_printf(Perl_debug_log, " (0)");
9126 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9127 PerlIO_printf(Perl_debug_log, " (FAIL)");
9128 else
9129 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9130 (void)PerlIO_putc(Perl_debug_log, '\n');
9131 }
9132
a28509cc
AL
9133 after_print:
9134 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9135 assert(next);
9136 {
9137 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9138 ? regnext((regnode *)next)
9139 : next);
be8e71aa
YO
9140 if (last && nnode > last)
9141 nnode = last;
786e8c11 9142 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9143 }
a28509cc
AL
9144 }
9145 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9146 assert(next);
786e8c11 9147 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9148 }
9149 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9150 const regnode *this_trie = node;
1de06328 9151 const char op = OP(node);
647f639f 9152 const U32 n = ARG(node);
1de06328 9153 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9154 (reg_ac_data *)ri->data->data[n] :
1de06328 9155 NULL;
3251b653
NC
9156 const reg_trie_data * const trie =
9157 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9158#ifdef DEBUGGING
9159 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9160#endif
786e8c11 9161 const regnode *nextbranch= NULL;
a28509cc 9162 I32 word_idx;
1de06328 9163 sv_setpvn(sv, "", 0);
786e8c11 9164 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9165 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9166
9167 PerlIO_printf(Perl_debug_log, "%*s%s ",
9168 (int)(2*(indent+3)), "",
9169 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9170 PL_colors[0], PL_colors[1],
9171 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9172 PERL_PV_PRETTY_ELIPSES |
7f69552c 9173 PERL_PV_PRETTY_LTGT
786e8c11
YO
9174 )
9175 : "???"
9176 );
9177 if (trie->jump) {
40d049e4 9178 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9179 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9180 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9181 if (dist) {
9182 if (!nextbranch)
24b23f37 9183 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9184 DUMPUNTIL(this_trie + dist, nextbranch);
9185 }
786e8c11
YO
9186 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9187 nextbranch= regnext((regnode *)nextbranch);
9188 } else {
9189 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9190 }
786e8c11
YO
9191 }
9192 if (last && next > last)
9193 node= last;
9194 else
9195 node= next;
a28509cc 9196 }
786e8c11
YO
9197 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9198 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9199 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9200 }
9201 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9202 assert(next);
786e8c11 9203 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9204 }
9205 else if ( op == PLUS || op == STAR) {
786e8c11 9206 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9207 }
9208 else if (op == ANYOF) {
9209 /* arglen 1 + class block */
9210 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9211 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9212 node = NEXTOPER(node);
9213 }
9214 else if (PL_regkind[(U8)op] == EXACT) {
9215 /* Literal string, where present. */
9216 node += NODE_SZ_STR(node) - 1;
9217 node = NEXTOPER(node);
9218 }
9219 else {
9220 node = NEXTOPER(node);
9221 node += regarglen[(U8)op];
9222 }
9223 if (op == CURLYX || op == OPEN)
786e8c11 9224 indent++;
a28509cc 9225 }
3dab1dad 9226 CLEAR_OPTSTART;
786e8c11 9227#ifdef DEBUG_DUMPUNTIL
70685ca0 9228 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9229#endif
1de06328 9230 return node;
a28509cc
AL
9231}
9232
9233#endif /* DEBUGGING */
9234
241d1a3b
NC
9235/*
9236 * Local variables:
9237 * c-indentation-style: bsd
9238 * c-basic-offset: 4
9239 * indent-tabs-mode: t
9240 * End:
9241 *
37442d52
RGS
9242 * ex: set ts=8 sts=4 sw=4 noet:
9243 */